1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3 * Copyright (c) 2009, Microsoft Corporation.
4 *
5 * Authors:
6 * Haiyang Zhang <haiyangz@microsoft.com>
7 * Hank Janssen <hjanssen@microsoft.com>
8 * K. Y. Srinivasan <kys@microsoft.com>
9 */
10 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
11
12 #include <linux/init.h>
13 #include <linux/module.h>
14 #include <linux/device.h>
15 #include <linux/platform_device.h>
16 #include <linux/interrupt.h>
17 #include <linux/sysctl.h>
18 #include <linux/slab.h>
19 #include <linux/acpi.h>
20 #include <linux/completion.h>
21 #include <linux/hyperv.h>
22 #include <linux/kernel_stat.h>
23 #include <linux/of_address.h>
24 #include <linux/clockchips.h>
25 #include <linux/cpu.h>
26 #include <linux/sched/isolation.h>
27 #include <linux/sched/task_stack.h>
28 #include <linux/smpboot.h>
29
30 #include <linux/delay.h>
31 #include <linux/panic_notifier.h>
32 #include <linux/ptrace.h>
33 #include <linux/sysfb.h>
34 #include <linux/efi.h>
35 #include <linux/random.h>
36 #include <linux/kernel.h>
37 #include <linux/syscore_ops.h>
38 #include <linux/dma-map-ops.h>
39 #include <linux/pci.h>
40 #include <linux/export.h>
41 #include <clocksource/hyperv_timer.h>
42 #include <asm/mshyperv.h>
43 #include "hyperv_vmbus.h"
44
45 struct vmbus_dynid {
46 struct list_head node;
47 struct hv_vmbus_device_id id;
48 };
49
50 /* VMBus Root Device */
51 static struct device *vmbus_root_device;
52
53 static int hyperv_cpuhp_online;
54
55 static DEFINE_PER_CPU(long, vmbus_evt);
56
57 /* Values parsed from ACPI DSDT */
58 int vmbus_irq;
59 int vmbus_interrupt;
60
61 /*
62 * If the Confidential VMBus is used, the data on the "wire" is not
63 * visible to either the host or the hypervisor.
64 */
65 static bool is_confidential;
66
vmbus_is_confidential(void)67 bool vmbus_is_confidential(void)
68 {
69 return is_confidential;
70 }
71 EXPORT_SYMBOL_GPL(vmbus_is_confidential);
72
73 /*
74 * The panic notifier below is responsible solely for unloading the
75 * vmbus connection, which is necessary in a panic event.
76 *
77 * Notice an intrincate relation of this notifier with Hyper-V
78 * framebuffer panic notifier exists - we need vmbus connection alive
79 * there in order to succeed, so we need to order both with each other
80 * [see hvfb_on_panic()] - this is done using notifiers' priorities.
81 */
hv_panic_vmbus_unload(struct notifier_block * nb,unsigned long val,void * args)82 static int hv_panic_vmbus_unload(struct notifier_block *nb, unsigned long val,
83 void *args)
84 {
85 vmbus_initiate_unload(true);
86 return NOTIFY_DONE;
87 }
88 static struct notifier_block hyperv_panic_vmbus_unload_block = {
89 .notifier_call = hv_panic_vmbus_unload,
90 .priority = INT_MIN + 1, /* almost the latest one to execute */
91 };
92
/* MMIO resource bookkeeping; the users live outside this part of the file. */
static const char *fb_mmio_name = "fb_range";
static struct resource *fb_mmio;
static struct resource *hyperv_mmio;
/* NOTE(review): presumably serializes access to hyperv_mmio - confirm. */
static DEFINE_MUTEX(hyperv_mmio_lock);
97
hv_get_vmbus_root_device(void)98 struct device *hv_get_vmbus_root_device(void)
99 {
100 return vmbus_root_device;
101 }
102 EXPORT_SYMBOL_GPL(hv_get_vmbus_root_device);
103
vmbus_exists(void)104 static int vmbus_exists(void)
105 {
106 if (vmbus_root_device == NULL)
107 return -ENODEV;
108
109 return 0;
110 }
111
channel_monitor_group(const struct vmbus_channel * channel)112 static u8 channel_monitor_group(const struct vmbus_channel *channel)
113 {
114 return (u8)channel->offermsg.monitorid / 32;
115 }
116
channel_monitor_offset(const struct vmbus_channel * channel)117 static u8 channel_monitor_offset(const struct vmbus_channel *channel)
118 {
119 return (u8)channel->offermsg.monitorid % 32;
120 }
121
channel_pending(const struct vmbus_channel * channel,const struct hv_monitor_page * monitor_page)122 static u32 channel_pending(const struct vmbus_channel *channel,
123 const struct hv_monitor_page *monitor_page)
124 {
125 u8 monitor_group = channel_monitor_group(channel);
126
127 return monitor_page->trigger_group[monitor_group].pending;
128 }
129
channel_latency(const struct vmbus_channel * channel,const struct hv_monitor_page * monitor_page)130 static u32 channel_latency(const struct vmbus_channel *channel,
131 const struct hv_monitor_page *monitor_page)
132 {
133 u8 monitor_group = channel_monitor_group(channel);
134 u8 monitor_offset = channel_monitor_offset(channel);
135
136 return monitor_page->latency[monitor_group][monitor_offset];
137 }
138
channel_conn_id(struct vmbus_channel * channel,struct hv_monitor_page * monitor_page)139 static u32 channel_conn_id(struct vmbus_channel *channel,
140 struct hv_monitor_page *monitor_page)
141 {
142 u8 monitor_group = channel_monitor_group(channel);
143 u8 monitor_offset = channel_monitor_offset(channel);
144
145 return monitor_page->parameter[monitor_group][monitor_offset].connectionid.u.id;
146 }
147
id_show(struct device * dev,struct device_attribute * dev_attr,char * buf)148 static ssize_t id_show(struct device *dev, struct device_attribute *dev_attr,
149 char *buf)
150 {
151 struct hv_device *hv_dev = device_to_hv_device(dev);
152
153 if (!hv_dev->channel)
154 return -ENODEV;
155 return sysfs_emit(buf, "%d\n", hv_dev->channel->offermsg.child_relid);
156 }
157 static DEVICE_ATTR_RO(id);
158
state_show(struct device * dev,struct device_attribute * dev_attr,char * buf)159 static ssize_t state_show(struct device *dev, struct device_attribute *dev_attr,
160 char *buf)
161 {
162 struct hv_device *hv_dev = device_to_hv_device(dev);
163
164 if (!hv_dev->channel)
165 return -ENODEV;
166 return sysfs_emit(buf, "%d\n", hv_dev->channel->state);
167 }
168 static DEVICE_ATTR_RO(state);
169
monitor_id_show(struct device * dev,struct device_attribute * dev_attr,char * buf)170 static ssize_t monitor_id_show(struct device *dev,
171 struct device_attribute *dev_attr, char *buf)
172 {
173 struct hv_device *hv_dev = device_to_hv_device(dev);
174
175 if (!hv_dev->channel)
176 return -ENODEV;
177 return sysfs_emit(buf, "%d\n", hv_dev->channel->offermsg.monitorid);
178 }
179 static DEVICE_ATTR_RO(monitor_id);
180
class_id_show(struct device * dev,struct device_attribute * dev_attr,char * buf)181 static ssize_t class_id_show(struct device *dev,
182 struct device_attribute *dev_attr, char *buf)
183 {
184 struct hv_device *hv_dev = device_to_hv_device(dev);
185
186 if (!hv_dev->channel)
187 return -ENODEV;
188 return sysfs_emit(buf, "{%pUl}\n",
189 &hv_dev->channel->offermsg.offer.if_type);
190 }
191 static DEVICE_ATTR_RO(class_id);
192
device_id_show(struct device * dev,struct device_attribute * dev_attr,char * buf)193 static ssize_t device_id_show(struct device *dev,
194 struct device_attribute *dev_attr, char *buf)
195 {
196 struct hv_device *hv_dev = device_to_hv_device(dev);
197
198 if (!hv_dev->channel)
199 return -ENODEV;
200 return sysfs_emit(buf, "{%pUl}\n",
201 &hv_dev->channel->offermsg.offer.if_instance);
202 }
203 static DEVICE_ATTR_RO(device_id);
204
modalias_show(struct device * dev,struct device_attribute * dev_attr,char * buf)205 static ssize_t modalias_show(struct device *dev,
206 struct device_attribute *dev_attr, char *buf)
207 {
208 struct hv_device *hv_dev = device_to_hv_device(dev);
209
210 return sysfs_emit(buf, "vmbus:%*phN\n", UUID_SIZE, &hv_dev->dev_type);
211 }
212 static DEVICE_ATTR_RO(modalias);
213
#ifdef CONFIG_NUMA
/*
 * numa_node_show - sysfs "numa_node" attribute: the NUMA node of the CPU the
 * channel is targeted at.
 *
 * Returns the number of bytes emitted, or -ENODEV if the device has no
 * channel.
 */
static ssize_t numa_node_show(struct device *dev,
			      struct device_attribute *attr, char *buf)
{
	struct vmbus_channel *chan = device_to_hv_device(dev)->channel;

	if (chan == NULL)
		return -ENODEV;

	return sysfs_emit(buf, "%d\n", cpu_to_node(chan->target_cpu));
}
static DEVICE_ATTR_RO(numa_node);
#endif
227
server_monitor_pending_show(struct device * dev,struct device_attribute * dev_attr,char * buf)228 static ssize_t server_monitor_pending_show(struct device *dev,
229 struct device_attribute *dev_attr,
230 char *buf)
231 {
232 struct hv_device *hv_dev = device_to_hv_device(dev);
233
234 if (!hv_dev->channel)
235 return -ENODEV;
236 return sysfs_emit(buf, "%d\n", channel_pending(hv_dev->channel,
237 vmbus_connection.monitor_pages[0]));
238 }
239 static DEVICE_ATTR_RO(server_monitor_pending);
240
client_monitor_pending_show(struct device * dev,struct device_attribute * dev_attr,char * buf)241 static ssize_t client_monitor_pending_show(struct device *dev,
242 struct device_attribute *dev_attr,
243 char *buf)
244 {
245 struct hv_device *hv_dev = device_to_hv_device(dev);
246
247 if (!hv_dev->channel)
248 return -ENODEV;
249 return sysfs_emit(buf, "%d\n", channel_pending(hv_dev->channel,
250 vmbus_connection.monitor_pages[1]));
251 }
252 static DEVICE_ATTR_RO(client_monitor_pending);
253
server_monitor_latency_show(struct device * dev,struct device_attribute * dev_attr,char * buf)254 static ssize_t server_monitor_latency_show(struct device *dev,
255 struct device_attribute *dev_attr,
256 char *buf)
257 {
258 struct hv_device *hv_dev = device_to_hv_device(dev);
259
260 if (!hv_dev->channel)
261 return -ENODEV;
262 return sysfs_emit(buf, "%d\n", channel_latency(hv_dev->channel,
263 vmbus_connection.monitor_pages[0]));
264 }
265 static DEVICE_ATTR_RO(server_monitor_latency);
266
client_monitor_latency_show(struct device * dev,struct device_attribute * dev_attr,char * buf)267 static ssize_t client_monitor_latency_show(struct device *dev,
268 struct device_attribute *dev_attr,
269 char *buf)
270 {
271 struct hv_device *hv_dev = device_to_hv_device(dev);
272
273 if (!hv_dev->channel)
274 return -ENODEV;
275 return sysfs_emit(buf, "%d\n", channel_latency(hv_dev->channel,
276 vmbus_connection.monitor_pages[1]));
277 }
278 static DEVICE_ATTR_RO(client_monitor_latency);
279
server_monitor_conn_id_show(struct device * dev,struct device_attribute * dev_attr,char * buf)280 static ssize_t server_monitor_conn_id_show(struct device *dev,
281 struct device_attribute *dev_attr,
282 char *buf)
283 {
284 struct hv_device *hv_dev = device_to_hv_device(dev);
285
286 if (!hv_dev->channel)
287 return -ENODEV;
288 return sysfs_emit(buf, "%d\n", channel_conn_id(hv_dev->channel,
289 vmbus_connection.monitor_pages[0]));
290 }
291 static DEVICE_ATTR_RO(server_monitor_conn_id);
292
client_monitor_conn_id_show(struct device * dev,struct device_attribute * dev_attr,char * buf)293 static ssize_t client_monitor_conn_id_show(struct device *dev,
294 struct device_attribute *dev_attr,
295 char *buf)
296 {
297 struct hv_device *hv_dev = device_to_hv_device(dev);
298
299 if (!hv_dev->channel)
300 return -ENODEV;
301 return sysfs_emit(buf, "%d\n", channel_conn_id(hv_dev->channel,
302 vmbus_connection.monitor_pages[1]));
303 }
304 static DEVICE_ATTR_RO(client_monitor_conn_id);
305
out_intr_mask_show(struct device * dev,struct device_attribute * dev_attr,char * buf)306 static ssize_t out_intr_mask_show(struct device *dev,
307 struct device_attribute *dev_attr, char *buf)
308 {
309 struct hv_device *hv_dev = device_to_hv_device(dev);
310 struct hv_ring_buffer_debug_info outbound;
311 int ret;
312
313 if (!hv_dev->channel)
314 return -ENODEV;
315
316 ret = hv_ringbuffer_get_debuginfo(&hv_dev->channel->outbound,
317 &outbound);
318 if (ret < 0)
319 return ret;
320
321 return sysfs_emit(buf, "%d\n", outbound.current_interrupt_mask);
322 }
323 static DEVICE_ATTR_RO(out_intr_mask);
324
out_read_index_show(struct device * dev,struct device_attribute * dev_attr,char * buf)325 static ssize_t out_read_index_show(struct device *dev,
326 struct device_attribute *dev_attr, char *buf)
327 {
328 struct hv_device *hv_dev = device_to_hv_device(dev);
329 struct hv_ring_buffer_debug_info outbound;
330 int ret;
331
332 if (!hv_dev->channel)
333 return -ENODEV;
334
335 ret = hv_ringbuffer_get_debuginfo(&hv_dev->channel->outbound,
336 &outbound);
337 if (ret < 0)
338 return ret;
339 return sysfs_emit(buf, "%u\n", outbound.current_read_index);
340 }
341 static DEVICE_ATTR_RO(out_read_index);
342
out_write_index_show(struct device * dev,struct device_attribute * dev_attr,char * buf)343 static ssize_t out_write_index_show(struct device *dev,
344 struct device_attribute *dev_attr,
345 char *buf)
346 {
347 struct hv_device *hv_dev = device_to_hv_device(dev);
348 struct hv_ring_buffer_debug_info outbound;
349 int ret;
350
351 if (!hv_dev->channel)
352 return -ENODEV;
353
354 ret = hv_ringbuffer_get_debuginfo(&hv_dev->channel->outbound,
355 &outbound);
356 if (ret < 0)
357 return ret;
358 return sysfs_emit(buf, "%u\n", outbound.current_write_index);
359 }
360 static DEVICE_ATTR_RO(out_write_index);
361
out_read_bytes_avail_show(struct device * dev,struct device_attribute * dev_attr,char * buf)362 static ssize_t out_read_bytes_avail_show(struct device *dev,
363 struct device_attribute *dev_attr,
364 char *buf)
365 {
366 struct hv_device *hv_dev = device_to_hv_device(dev);
367 struct hv_ring_buffer_debug_info outbound;
368 int ret;
369
370 if (!hv_dev->channel)
371 return -ENODEV;
372
373 ret = hv_ringbuffer_get_debuginfo(&hv_dev->channel->outbound,
374 &outbound);
375 if (ret < 0)
376 return ret;
377 return sysfs_emit(buf, "%d\n", outbound.bytes_avail_toread);
378 }
379 static DEVICE_ATTR_RO(out_read_bytes_avail);
380
out_write_bytes_avail_show(struct device * dev,struct device_attribute * dev_attr,char * buf)381 static ssize_t out_write_bytes_avail_show(struct device *dev,
382 struct device_attribute *dev_attr,
383 char *buf)
384 {
385 struct hv_device *hv_dev = device_to_hv_device(dev);
386 struct hv_ring_buffer_debug_info outbound;
387 int ret;
388
389 if (!hv_dev->channel)
390 return -ENODEV;
391
392 ret = hv_ringbuffer_get_debuginfo(&hv_dev->channel->outbound,
393 &outbound);
394 if (ret < 0)
395 return ret;
396 return sysfs_emit(buf, "%d\n", outbound.bytes_avail_towrite);
397 }
398 static DEVICE_ATTR_RO(out_write_bytes_avail);
399
in_intr_mask_show(struct device * dev,struct device_attribute * dev_attr,char * buf)400 static ssize_t in_intr_mask_show(struct device *dev,
401 struct device_attribute *dev_attr, char *buf)
402 {
403 struct hv_device *hv_dev = device_to_hv_device(dev);
404 struct hv_ring_buffer_debug_info inbound;
405 int ret;
406
407 if (!hv_dev->channel)
408 return -ENODEV;
409
410 ret = hv_ringbuffer_get_debuginfo(&hv_dev->channel->inbound, &inbound);
411 if (ret < 0)
412 return ret;
413
414 return sysfs_emit(buf, "%d\n", inbound.current_interrupt_mask);
415 }
416 static DEVICE_ATTR_RO(in_intr_mask);
417
in_read_index_show(struct device * dev,struct device_attribute * dev_attr,char * buf)418 static ssize_t in_read_index_show(struct device *dev,
419 struct device_attribute *dev_attr, char *buf)
420 {
421 struct hv_device *hv_dev = device_to_hv_device(dev);
422 struct hv_ring_buffer_debug_info inbound;
423 int ret;
424
425 if (!hv_dev->channel)
426 return -ENODEV;
427
428 ret = hv_ringbuffer_get_debuginfo(&hv_dev->channel->inbound, &inbound);
429 if (ret < 0)
430 return ret;
431
432 return sysfs_emit(buf, "%d\n", inbound.current_read_index);
433 }
434 static DEVICE_ATTR_RO(in_read_index);
435
in_write_index_show(struct device * dev,struct device_attribute * dev_attr,char * buf)436 static ssize_t in_write_index_show(struct device *dev,
437 struct device_attribute *dev_attr, char *buf)
438 {
439 struct hv_device *hv_dev = device_to_hv_device(dev);
440 struct hv_ring_buffer_debug_info inbound;
441 int ret;
442
443 if (!hv_dev->channel)
444 return -ENODEV;
445
446 ret = hv_ringbuffer_get_debuginfo(&hv_dev->channel->inbound, &inbound);
447 if (ret < 0)
448 return ret;
449
450 return sysfs_emit(buf, "%d\n", inbound.current_write_index);
451 }
452 static DEVICE_ATTR_RO(in_write_index);
453
in_read_bytes_avail_show(struct device * dev,struct device_attribute * dev_attr,char * buf)454 static ssize_t in_read_bytes_avail_show(struct device *dev,
455 struct device_attribute *dev_attr,
456 char *buf)
457 {
458 struct hv_device *hv_dev = device_to_hv_device(dev);
459 struct hv_ring_buffer_debug_info inbound;
460 int ret;
461
462 if (!hv_dev->channel)
463 return -ENODEV;
464
465 ret = hv_ringbuffer_get_debuginfo(&hv_dev->channel->inbound, &inbound);
466 if (ret < 0)
467 return ret;
468
469 return sysfs_emit(buf, "%d\n", inbound.bytes_avail_toread);
470 }
471 static DEVICE_ATTR_RO(in_read_bytes_avail);
472
in_write_bytes_avail_show(struct device * dev,struct device_attribute * dev_attr,char * buf)473 static ssize_t in_write_bytes_avail_show(struct device *dev,
474 struct device_attribute *dev_attr,
475 char *buf)
476 {
477 struct hv_device *hv_dev = device_to_hv_device(dev);
478 struct hv_ring_buffer_debug_info inbound;
479 int ret;
480
481 if (!hv_dev->channel)
482 return -ENODEV;
483
484 ret = hv_ringbuffer_get_debuginfo(&hv_dev->channel->inbound, &inbound);
485 if (ret < 0)
486 return ret;
487
488 return sysfs_emit(buf, "%d\n", inbound.bytes_avail_towrite);
489 }
490 static DEVICE_ATTR_RO(in_write_bytes_avail);
491
channel_vp_mapping_show(struct device * dev,struct device_attribute * dev_attr,char * buf)492 static ssize_t channel_vp_mapping_show(struct device *dev,
493 struct device_attribute *dev_attr,
494 char *buf)
495 {
496 struct hv_device *hv_dev = device_to_hv_device(dev);
497 struct vmbus_channel *channel = hv_dev->channel, *cur_sc;
498 int n_written;
499 struct list_head *cur;
500
501 if (!channel)
502 return -ENODEV;
503
504 mutex_lock(&vmbus_connection.channel_mutex);
505
506 n_written = sysfs_emit(buf, "%u:%u\n",
507 channel->offermsg.child_relid,
508 channel->target_cpu);
509
510 list_for_each(cur, &channel->sc_list) {
511
512 cur_sc = list_entry(cur, struct vmbus_channel, sc_list);
513 n_written += sysfs_emit_at(buf, n_written, "%u:%u\n",
514 cur_sc->offermsg.child_relid,
515 cur_sc->target_cpu);
516 }
517
518 mutex_unlock(&vmbus_connection.channel_mutex);
519
520 return n_written;
521 }
522 static DEVICE_ATTR_RO(channel_vp_mapping);
523
vendor_show(struct device * dev,struct device_attribute * dev_attr,char * buf)524 static ssize_t vendor_show(struct device *dev,
525 struct device_attribute *dev_attr,
526 char *buf)
527 {
528 struct hv_device *hv_dev = device_to_hv_device(dev);
529
530 return sysfs_emit(buf, "0x%x\n", hv_dev->vendor_id);
531 }
532 static DEVICE_ATTR_RO(vendor);
533
device_show(struct device * dev,struct device_attribute * dev_attr,char * buf)534 static ssize_t device_show(struct device *dev,
535 struct device_attribute *dev_attr,
536 char *buf)
537 {
538 struct hv_device *hv_dev = device_to_hv_device(dev);
539
540 return sysfs_emit(buf, "0x%x\n", hv_dev->device_id);
541 }
542 static DEVICE_ATTR_RO(device);
543
driver_override_store(struct device * dev,struct device_attribute * attr,const char * buf,size_t count)544 static ssize_t driver_override_store(struct device *dev,
545 struct device_attribute *attr,
546 const char *buf, size_t count)
547 {
548 struct hv_device *hv_dev = device_to_hv_device(dev);
549 int ret;
550
551 ret = driver_set_override(dev, &hv_dev->driver_override, buf, count);
552 if (ret)
553 return ret;
554
555 return count;
556 }
557
driver_override_show(struct device * dev,struct device_attribute * attr,char * buf)558 static ssize_t driver_override_show(struct device *dev,
559 struct device_attribute *attr, char *buf)
560 {
561 struct hv_device *hv_dev = device_to_hv_device(dev);
562 ssize_t len;
563
564 device_lock(dev);
565 len = sysfs_emit(buf, "%s\n", hv_dev->driver_override);
566 device_unlock(dev);
567
568 return len;
569 }
570 static DEVICE_ATTR_RW(driver_override);
571
572 /* Set up per device attributes in /sys/bus/vmbus/devices/<bus device> */
573 static struct attribute *vmbus_dev_attrs[] = {
574 &dev_attr_id.attr,
575 &dev_attr_state.attr,
576 &dev_attr_monitor_id.attr,
577 &dev_attr_class_id.attr,
578 &dev_attr_device_id.attr,
579 &dev_attr_modalias.attr,
580 #ifdef CONFIG_NUMA
581 &dev_attr_numa_node.attr,
582 #endif
583 &dev_attr_server_monitor_pending.attr,
584 &dev_attr_client_monitor_pending.attr,
585 &dev_attr_server_monitor_latency.attr,
586 &dev_attr_client_monitor_latency.attr,
587 &dev_attr_server_monitor_conn_id.attr,
588 &dev_attr_client_monitor_conn_id.attr,
589 &dev_attr_out_intr_mask.attr,
590 &dev_attr_out_read_index.attr,
591 &dev_attr_out_write_index.attr,
592 &dev_attr_out_read_bytes_avail.attr,
593 &dev_attr_out_write_bytes_avail.attr,
594 &dev_attr_in_intr_mask.attr,
595 &dev_attr_in_read_index.attr,
596 &dev_attr_in_write_index.attr,
597 &dev_attr_in_read_bytes_avail.attr,
598 &dev_attr_in_write_bytes_avail.attr,
599 &dev_attr_channel_vp_mapping.attr,
600 &dev_attr_vendor.attr,
601 &dev_attr_device.attr,
602 &dev_attr_driver_override.attr,
603 NULL,
604 };
605
606 /*
607 * Device-level attribute_group callback function. Returns the permission for
608 * each attribute, and returns 0 if an attribute is not visible.
609 */
vmbus_dev_attr_is_visible(struct kobject * kobj,struct attribute * attr,int idx)610 static umode_t vmbus_dev_attr_is_visible(struct kobject *kobj,
611 struct attribute *attr, int idx)
612 {
613 struct device *dev = kobj_to_dev(kobj);
614 const struct hv_device *hv_dev = device_to_hv_device(dev);
615
616 /* Hide the monitor attributes if the monitor mechanism is not used. */
617 if (!hv_dev->channel->offermsg.monitor_allocated &&
618 (attr == &dev_attr_monitor_id.attr ||
619 attr == &dev_attr_server_monitor_pending.attr ||
620 attr == &dev_attr_client_monitor_pending.attr ||
621 attr == &dev_attr_server_monitor_latency.attr ||
622 attr == &dev_attr_client_monitor_latency.attr ||
623 attr == &dev_attr_server_monitor_conn_id.attr ||
624 attr == &dev_attr_client_monitor_conn_id.attr))
625 return 0;
626
627 return attr->mode;
628 }
629
630 static const struct attribute_group vmbus_dev_group = {
631 .attrs = vmbus_dev_attrs,
632 .is_visible = vmbus_dev_attr_is_visible
633 };
634 __ATTRIBUTE_GROUPS(vmbus_dev);
635
636 /* Set up the attribute for /sys/bus/vmbus/hibernation */
hibernation_show(const struct bus_type * bus,char * buf)637 static ssize_t hibernation_show(const struct bus_type *bus, char *buf)
638 {
639 return sprintf(buf, "%d\n", !!hv_is_hibernation_supported());
640 }
641
642 static BUS_ATTR_RO(hibernation);
643
644 static struct attribute *vmbus_bus_attrs[] = {
645 &bus_attr_hibernation.attr,
646 NULL,
647 };
648 static const struct attribute_group vmbus_bus_group = {
649 .attrs = vmbus_bus_attrs,
650 };
651 __ATTRIBUTE_GROUPS(vmbus_bus);
652
653 /*
654 * vmbus_uevent - add uevent for our device
655 *
656 * This routine is invoked when a device is added or removed on the vmbus to
657 * generate a uevent to udev in the userspace. The udev will then look at its
658 * rule and the uevent generated here to load the appropriate driver
659 *
660 * The alias string will be of the form vmbus:guid where guid is the string
661 * representation of the device guid (each byte of the guid will be
662 * represented with two hex characters.
663 */
vmbus_uevent(const struct device * device,struct kobj_uevent_env * env)664 static int vmbus_uevent(const struct device *device, struct kobj_uevent_env *env)
665 {
666 const struct hv_device *dev = device_to_hv_device(device);
667 const char *format = "MODALIAS=vmbus:%*phN";
668
669 return add_uevent_var(env, format, UUID_SIZE, &dev->dev_type);
670 }
671
672 static const struct hv_vmbus_device_id *
hv_vmbus_dev_match(const struct hv_vmbus_device_id * id,const guid_t * guid)673 hv_vmbus_dev_match(const struct hv_vmbus_device_id *id, const guid_t *guid)
674 {
675 if (id == NULL)
676 return NULL; /* empty device table */
677
678 for (; !guid_is_null(&id->guid); id++)
679 if (guid_equal(&id->guid, guid))
680 return id;
681
682 return NULL;
683 }
684
685 static const struct hv_vmbus_device_id *
hv_vmbus_dynid_match(struct hv_driver * drv,const guid_t * guid)686 hv_vmbus_dynid_match(struct hv_driver *drv, const guid_t *guid)
687 {
688 const struct hv_vmbus_device_id *id = NULL;
689 struct vmbus_dynid *dynid;
690
691 spin_lock(&drv->dynids.lock);
692 list_for_each_entry(dynid, &drv->dynids.list, node) {
693 if (guid_equal(&dynid->id.guid, guid)) {
694 id = &dynid->id;
695 break;
696 }
697 }
698 spin_unlock(&drv->dynids.lock);
699
700 return id;
701 }
702
703 static const struct hv_vmbus_device_id vmbus_device_null;
704
705 /*
706 * Return a matching hv_vmbus_device_id pointer.
707 * If there is no match, return NULL.
708 */
hv_vmbus_get_id(const struct hv_driver * drv,struct hv_device * dev)709 static const struct hv_vmbus_device_id *hv_vmbus_get_id(const struct hv_driver *drv,
710 struct hv_device *dev)
711 {
712 const guid_t *guid = &dev->dev_type;
713 const struct hv_vmbus_device_id *id;
714
715 /* When driver_override is set, only bind to the matching driver */
716 if (dev->driver_override && strcmp(dev->driver_override, drv->name))
717 return NULL;
718
719 /* Look at the dynamic ids first, before the static ones */
720 id = hv_vmbus_dynid_match((struct hv_driver *)drv, guid);
721 if (!id)
722 id = hv_vmbus_dev_match(drv->id_table, guid);
723
724 /* driver_override will always match, send a dummy id */
725 if (!id && dev->driver_override)
726 id = &vmbus_device_null;
727
728 return id;
729 }
730
731 /* vmbus_add_dynid - add a new device ID to this driver and re-probe devices
732 *
733 * This function can race with vmbus_device_register(). This function is
734 * typically running on a user thread in response to writing to the "new_id"
735 * sysfs entry for a driver. vmbus_device_register() is running on a
736 * workqueue thread in response to the Hyper-V host offering a device to the
737 * guest. This function calls driver_attach(), which looks for an existing
738 * device matching the new id, and attaches the driver to which the new id
739 * has been assigned. vmbus_device_register() calls device_register(), which
740 * looks for a driver that matches the device being registered. If both
741 * operations are running simultaneously, the device driver probe function runs
742 * on whichever thread establishes the linkage between the driver and device.
743 *
744 * In most cases, it doesn't matter which thread runs the driver probe
745 * function. But if vmbus_device_register() does not find a matching driver,
746 * it proceeds to create the "channels" subdirectory and numbered per-channel
747 * subdirectory in sysfs. While that multi-step creation is in progress, this
748 * function could run the driver probe function. If the probe function checks
749 * for, or operates on, entries in the "channels" subdirectory, including by
750 * calling hv_create_ring_sysfs(), the operation may or may not succeed
751 * depending on the race. The race can't create a kernel failure in VMBus
752 * or device subsystem code, but probe functions in VMBus drivers doing such
753 * operations must be prepared for the failure case.
754 */
static int vmbus_add_dynid(struct hv_driver *drv, guid_t *guid)
{
	struct vmbus_dynid *entry = kzalloc_obj(*entry);

	if (entry == NULL)
		return -ENOMEM;

	/* Only the GUID is set; the rest of the entry stays zeroed. */
	entry->id.guid = *guid;

	spin_lock(&drv->dynids.lock);
	list_add_tail(&entry->node, &drv->dynids.list);
	spin_unlock(&drv->dynids.lock);

	/* Let the driver core look for devices the new ID now matches. */
	return driver_attach(&drv->driver);
}
771
vmbus_free_dynids(struct hv_driver * drv)772 static void vmbus_free_dynids(struct hv_driver *drv)
773 {
774 struct vmbus_dynid *dynid, *n;
775
776 spin_lock(&drv->dynids.lock);
777 list_for_each_entry_safe(dynid, n, &drv->dynids.list, node) {
778 list_del(&dynid->node);
779 kfree(dynid);
780 }
781 spin_unlock(&drv->dynids.lock);
782 }
783
784 /*
785 * store_new_id - sysfs frontend to vmbus_add_dynid()
786 *
787 * Allow GUIDs to be added to an existing driver via sysfs.
788 */
new_id_store(struct device_driver * driver,const char * buf,size_t count)789 static ssize_t new_id_store(struct device_driver *driver, const char *buf,
790 size_t count)
791 {
792 struct hv_driver *drv = drv_to_hv_drv(driver);
793 guid_t guid;
794 ssize_t retval;
795
796 retval = guid_parse(buf, &guid);
797 if (retval)
798 return retval;
799
800 if (hv_vmbus_dynid_match(drv, &guid))
801 return -EEXIST;
802
803 retval = vmbus_add_dynid(drv, &guid);
804 if (retval)
805 return retval;
806 return count;
807 }
808 static DRIVER_ATTR_WO(new_id);
809
810 /*
811 * store_remove_id - remove a PCI device ID from this driver
812 *
813 * Removes a dynamic pci device ID to this driver.
814 */
remove_id_store(struct device_driver * driver,const char * buf,size_t count)815 static ssize_t remove_id_store(struct device_driver *driver, const char *buf,
816 size_t count)
817 {
818 struct hv_driver *drv = drv_to_hv_drv(driver);
819 struct vmbus_dynid *dynid, *n;
820 guid_t guid;
821 ssize_t retval;
822
823 retval = guid_parse(buf, &guid);
824 if (retval)
825 return retval;
826
827 retval = -ENODEV;
828 spin_lock(&drv->dynids.lock);
829 list_for_each_entry_safe(dynid, n, &drv->dynids.list, node) {
830 struct hv_vmbus_device_id *id = &dynid->id;
831
832 if (guid_equal(&id->guid, &guid)) {
833 list_del(&dynid->node);
834 kfree(dynid);
835 retval = count;
836 break;
837 }
838 }
839 spin_unlock(&drv->dynids.lock);
840
841 return retval;
842 }
843 static DRIVER_ATTR_WO(remove_id);
844
845 static struct attribute *vmbus_drv_attrs[] = {
846 &driver_attr_new_id.attr,
847 &driver_attr_remove_id.attr,
848 NULL,
849 };
850 ATTRIBUTE_GROUPS(vmbus_drv);
851
852
853 /*
854 * vmbus_match - Attempt to match the specified device to the specified driver
855 */
vmbus_match(struct device * device,const struct device_driver * driver)856 static int vmbus_match(struct device *device, const struct device_driver *driver)
857 {
858 const struct hv_driver *drv = drv_to_hv_drv(driver);
859 struct hv_device *hv_dev = device_to_hv_device(device);
860
861 /* The hv_sock driver handles all hv_sock offers. */
862 if (is_hvsock_channel(hv_dev->channel))
863 return drv->hvsock;
864
865 if (hv_vmbus_get_id(drv, hv_dev))
866 return 1;
867
868 return 0;
869 }
870
871 /*
872 * vmbus_probe - Add the new vmbus's child device
873 */
vmbus_probe(struct device * child_device)874 static int vmbus_probe(struct device *child_device)
875 {
876 int ret = 0;
877 struct hv_driver *drv =
878 drv_to_hv_drv(child_device->driver);
879 struct hv_device *dev = device_to_hv_device(child_device);
880 const struct hv_vmbus_device_id *dev_id;
881
882 dev_id = hv_vmbus_get_id(drv, dev);
883 if (drv->probe) {
884 ret = drv->probe(dev, dev_id);
885 if (ret != 0)
886 pr_err("probe failed for device %s (%d)\n",
887 dev_name(child_device), ret);
888
889 } else {
890 pr_err("probe not set for driver %s\n",
891 dev_name(child_device));
892 ret = -ENODEV;
893 }
894 return ret;
895 }
896
897 /*
898 * vmbus_dma_configure -- Configure DMA coherence for VMbus device
899 */
vmbus_dma_configure(struct device * child_device)900 static int vmbus_dma_configure(struct device *child_device)
901 {
902 /*
903 * On ARM64, propagate the DMA coherence setting from the top level
904 * VMbus ACPI device to the child VMbus device being added here.
905 * On x86/x64 coherence is assumed and these calls have no effect.
906 */
907 hv_setup_dma_ops(child_device,
908 device_get_dma_attr(vmbus_root_device) == DEV_DMA_COHERENT);
909 return 0;
910 }
911
912 /*
913 * vmbus_remove - Remove a vmbus device
914 */
vmbus_remove(struct device * child_device)915 static void vmbus_remove(struct device *child_device)
916 {
917 struct hv_driver *drv;
918 struct hv_device *dev = device_to_hv_device(child_device);
919
920 if (child_device->driver) {
921 drv = drv_to_hv_drv(child_device->driver);
922 if (drv->remove)
923 drv->remove(dev);
924 }
925 }
926
927 /*
928 * vmbus_shutdown - Shutdown a vmbus device
929 */
vmbus_shutdown(struct device * child_device)930 static void vmbus_shutdown(struct device *child_device)
931 {
932 struct hv_driver *drv;
933 struct hv_device *dev = device_to_hv_device(child_device);
934
935
936 /* The device may not be attached yet */
937 if (!child_device->driver)
938 return;
939
940 drv = drv_to_hv_drv(child_device->driver);
941
942 if (drv->shutdown)
943 drv->shutdown(dev);
944 }
945
#ifdef CONFIG_PM_SLEEP
/*
 * vmbus_suspend - Suspend a vmbus device
 *
 * Returns 0 if no driver is bound, -EOPNOTSUPP if the bound driver has no
 * suspend callback, otherwise the result of that callback.
 */
static int vmbus_suspend(struct device *child_device)
{
	struct hv_device *hv_dev = device_to_hv_device(child_device);
	struct hv_driver *hv_drv;

	/* The device may not be attached yet */
	if (child_device->driver == NULL)
		return 0;

	hv_drv = drv_to_hv_drv(child_device->driver);

	return hv_drv->suspend ? hv_drv->suspend(hv_dev) : -EOPNOTSUPP;
}

/*
 * vmbus_resume - Resume a vmbus device
 *
 * Returns 0 if no driver is bound, -EOPNOTSUPP if the bound driver has no
 * resume callback, otherwise the result of that callback.
 */
static int vmbus_resume(struct device *child_device)
{
	struct hv_device *hv_dev = device_to_hv_device(child_device);
	struct hv_driver *hv_drv;

	/* The device may not be attached yet */
	if (child_device->driver == NULL)
		return 0;

	hv_drv = drv_to_hv_drv(child_device->driver);

	return hv_drv->resume ? hv_drv->resume(hv_dev) : -EOPNOTSUPP;
}
#else
#define vmbus_suspend NULL
#define vmbus_resume NULL
#endif /* CONFIG_PM_SLEEP */
988
989 /*
990 * vmbus_device_release - Final callback release of the vmbus child device
991 */
vmbus_device_release(struct device * device)992 static void vmbus_device_release(struct device *device)
993 {
994 struct hv_device *hv_dev = device_to_hv_device(device);
995 struct vmbus_channel *channel = hv_dev->channel;
996
997 hv_debug_rm_dev_dir(hv_dev);
998
999 mutex_lock(&vmbus_connection.channel_mutex);
1000 hv_process_channel_removal(channel);
1001 mutex_unlock(&vmbus_connection.channel_mutex);
1002 kfree(hv_dev);
1003 }
1004
/*
 * Power-management callbacks installed on every vmbus child device through
 * hv_bus.pm.
 *
 * Note: we must use the "noirq" ops: see the comment before vmbus_bus_pm.
 *
 * suspend_noirq/resume_noirq are set to NULL to support Suspend-to-Idle: we
 * shouldn't suspend the vmbus devices upon Suspend-to-Idle, otherwise there
 * is no way to wake up a Generation-2 VM.
 *
 * The other 4 ops are for hibernation; they map onto vmbus_suspend() and
 * vmbus_resume(), which are defined as NULL when CONFIG_PM_SLEEP is off.
 */

static const struct dev_pm_ops vmbus_pm = {
	.suspend_noirq	= NULL,
	.resume_noirq	= NULL,
	.freeze_noirq	= vmbus_suspend,
	.thaw_noirq	= vmbus_resume,
	.poweroff_noirq	= vmbus_suspend,
	.restore_noirq	= vmbus_resume,
};
1023
/*
 * The one and only vmbus bus type. It wires the bus-level callbacks
 * (match/probe/remove/shutdown/uevent/dma_configure), the sysfs attribute
 * groups and the power-management ops defined in this file.
 */
static const struct bus_type hv_bus = {
	.name =		"vmbus",
	.match =		vmbus_match,
	.shutdown =		vmbus_shutdown,
	.remove =		vmbus_remove,
	.probe =		vmbus_probe,
	.uevent =		vmbus_uevent,
	.dma_configure =	vmbus_dma_configure,
	.dev_groups =		vmbus_dev_groups,
	.drv_groups =		vmbus_drv_groups,
	.bus_groups =		vmbus_bus_groups,
	.pm =			&vmbus_pm,
};
1038
/*
 * Context used to defer the handling of a channel message to a workqueue.
 * It carries the work item plus a private copy of the message header and
 * of its variable-length payload.
 */
struct onmessage_work_context {
	/* Work item; initialized with vmbus_onmessage_work() as its handler */
	struct work_struct work;
	struct {
		struct hv_message_header header;
		/* Flexible array: its size is chosen when the context is allocated */
		u8 payload[];
	} msg;
};
1046
vmbus_onmessage_work(struct work_struct * work)1047 static void vmbus_onmessage_work(struct work_struct *work)
1048 {
1049 struct onmessage_work_context *ctx;
1050
1051 /* Do not process messages if we're in DISCONNECTED state */
1052 if (vmbus_connection.conn_state == DISCONNECTED)
1053 return;
1054
1055 ctx = container_of(work, struct onmessage_work_context,
1056 work);
1057 vmbus_onmessage((struct vmbus_channel_message_header *)
1058 &ctx->msg.payload);
1059 kfree(ctx);
1060 }
1061
__vmbus_on_msg_dpc(void * message_page_addr)1062 static void __vmbus_on_msg_dpc(void *message_page_addr)
1063 {
1064 struct hv_message msg_copy, *msg;
1065 struct vmbus_channel_message_header *hdr;
1066 enum vmbus_channel_message_type msgtype;
1067 const struct vmbus_channel_message_table_entry *entry;
1068 struct onmessage_work_context *ctx;
1069 __u8 payload_size;
1070 u32 message_type;
1071
1072 if (!message_page_addr)
1073 return;
1074 msg = (struct hv_message *)message_page_addr + VMBUS_MESSAGE_SINT;
1075
1076 /*
1077 * 'enum vmbus_channel_message_type' is supposed to always be 'u32' as
1078 * it is being used in 'struct vmbus_channel_message_header' definition
1079 * which is supposed to match hypervisor ABI.
1080 */
1081 BUILD_BUG_ON(sizeof(enum vmbus_channel_message_type) != sizeof(u32));
1082
1083 /*
1084 * Since the message is in memory shared with the host, an erroneous or
1085 * malicious Hyper-V could modify the message while vmbus_on_msg_dpc()
1086 * or individual message handlers are executing; to prevent this, copy
1087 * the message into private memory.
1088 */
1089 memcpy(&msg_copy, msg, sizeof(struct hv_message));
1090
1091 message_type = msg_copy.header.message_type;
1092 if (message_type == HVMSG_NONE)
1093 /* no msg */
1094 return;
1095
1096 hdr = (struct vmbus_channel_message_header *)msg_copy.u.payload;
1097 msgtype = hdr->msgtype;
1098
1099 trace_vmbus_on_msg_dpc(hdr);
1100
1101 if (msgtype >= CHANNELMSG_COUNT) {
1102 WARN_ONCE(1, "unknown msgtype=%d\n", msgtype);
1103 goto msg_handled;
1104 }
1105
1106 payload_size = msg_copy.header.payload_size;
1107 if (payload_size > HV_MESSAGE_PAYLOAD_BYTE_COUNT) {
1108 WARN_ONCE(1, "payload size is too large (%d)\n", payload_size);
1109 goto msg_handled;
1110 }
1111
1112 entry = &channel_message_table[msgtype];
1113
1114 if (!entry->message_handler)
1115 goto msg_handled;
1116
1117 if (payload_size < entry->min_payload_len) {
1118 WARN_ONCE(1, "message too short: msgtype=%d len=%d\n", msgtype, payload_size);
1119 goto msg_handled;
1120 }
1121
1122 if (entry->handler_type == VMHT_BLOCKING) {
1123 ctx = kmalloc_flex(*ctx, msg.payload, payload_size, GFP_ATOMIC);
1124 if (ctx == NULL)
1125 return;
1126
1127 INIT_WORK(&ctx->work, vmbus_onmessage_work);
1128 ctx->msg.header = msg_copy.header;
1129 memcpy(&ctx->msg.payload, msg_copy.u.payload, payload_size);
1130
1131 /*
1132 * The host can generate a rescind message while we
1133 * may still be handling the original offer. We deal with
1134 * this condition by relying on the synchronization provided
1135 * by offer_in_progress and by channel_mutex. See also the
1136 * inline comments in vmbus_onoffer_rescind().
1137 */
1138 switch (msgtype) {
1139 case CHANNELMSG_RESCIND_CHANNELOFFER:
1140 /*
1141 * If we are handling the rescind message;
1142 * schedule the work on the global work queue.
1143 *
1144 * The OFFER message and the RESCIND message should
1145 * not be handled by the same serialized work queue,
1146 * because the OFFER handler may call vmbus_open(),
1147 * which tries to open the channel by sending an
1148 * OPEN_CHANNEL message to the host and waits for
1149 * the host's response; however, if the host has
1150 * rescinded the channel before it receives the
1151 * OPEN_CHANNEL message, the host just silently
1152 * ignores the OPEN_CHANNEL message; as a result,
1153 * the guest's OFFER handler hangs for ever, if we
1154 * handle the RESCIND message in the same serialized
1155 * work queue: the RESCIND handler can not start to
1156 * run before the OFFER handler finishes.
1157 */
1158 if (vmbus_connection.ignore_any_offer_msg)
1159 break;
1160 queue_work(vmbus_connection.rescind_work_queue, &ctx->work);
1161 break;
1162
1163 case CHANNELMSG_OFFERCHANNEL:
1164 /*
1165 * The host sends the offer message of a given channel
1166 * before sending the rescind message of the same
1167 * channel. These messages are sent to the guest's
1168 * connect CPU; the guest then starts processing them
1169 * in the tasklet handler on this CPU:
1170 *
1171 * VMBUS_CONNECT_CPU
1172 *
1173 * [vmbus_on_msg_dpc()]
1174 * atomic_inc() // CHANNELMSG_OFFERCHANNEL
1175 * queue_work()
1176 * ...
1177 * [vmbus_on_msg_dpc()]
1178 * schedule_work() // CHANNELMSG_RESCIND_CHANNELOFFER
1179 *
1180 * We rely on the memory-ordering properties of the
1181 * queue_work() and schedule_work() primitives, which
1182 * guarantee that the atomic increment will be visible
1183 * to the CPUs which will execute the offer & rescind
1184 * works by the time these works will start execution.
1185 */
1186 if (vmbus_connection.ignore_any_offer_msg)
1187 break;
1188 atomic_inc(&vmbus_connection.offer_in_progress);
1189 fallthrough;
1190
1191 default:
1192 queue_work(vmbus_connection.work_queue, &ctx->work);
1193 }
1194 } else
1195 entry->message_handler(hdr);
1196
1197 msg_handled:
1198 vmbus_signal_eom(msg, message_type);
1199 }
1200
vmbus_on_msg_dpc(unsigned long data)1201 void vmbus_on_msg_dpc(unsigned long data)
1202 {
1203 struct hv_per_cpu_context *hv_cpu = (void *)data;
1204
1205 __vmbus_on_msg_dpc(hv_cpu->hyp_synic_message_page);
1206 __vmbus_on_msg_dpc(hv_cpu->para_synic_message_page);
1207 }
1208
1209 #ifdef CONFIG_PM_SLEEP
1210 /*
1211 * Fake RESCIND_CHANNEL messages to clean up hv_sock channels by force for
1212 * hibernation, because hv_sock connections can not persist across hibernation.
1213 */
vmbus_force_channel_rescinded(struct vmbus_channel * channel)1214 static void vmbus_force_channel_rescinded(struct vmbus_channel *channel)
1215 {
1216 struct onmessage_work_context *ctx;
1217 struct vmbus_channel_rescind_offer *rescind;
1218
1219 WARN_ON(!is_hvsock_channel(channel));
1220
1221 /*
1222 * Allocation size is small and the allocation should really not fail,
1223 * otherwise the state of the hv_sock connections ends up in limbo.
1224 */
1225 ctx = kzalloc(sizeof(*ctx) + sizeof(*rescind),
1226 GFP_KERNEL | __GFP_NOFAIL);
1227
1228 /*
1229 * So far, these are not really used by Linux. Just set them to the
1230 * reasonable values conforming to the definitions of the fields.
1231 */
1232 ctx->msg.header.message_type = 1;
1233 ctx->msg.header.payload_size = sizeof(*rescind);
1234
1235 /* These values are actually used by Linux. */
1236 rescind = (struct vmbus_channel_rescind_offer *)ctx->msg.payload;
1237 rescind->header.msgtype = CHANNELMSG_RESCIND_CHANNELOFFER;
1238 rescind->child_relid = channel->offermsg.child_relid;
1239
1240 INIT_WORK(&ctx->work, vmbus_onmessage_work);
1241
1242 queue_work(vmbus_connection.work_queue, &ctx->work);
1243 }
1244 #endif /* CONFIG_PM_SLEEP */
1245
/*
 * Schedule all channels with events pending.
 * The event page can be directly checked to get the id of
 * the channel that has the interrupt pending.
 *
 * @event_page_addr: base address of a SynIC event page; when it is NULL the
 *                   function returns without doing anything.
 */
static void vmbus_chan_sched(void *event_page_addr)
{
	unsigned long *recv_int_page;
	u32 maxbits, relid;
	union hv_synic_event_flags *event;

	if (!event_page_addr)
		return;
	/* Use the event flags entry at index VMBUS_MESSAGE_SINT of the page */
	event = (union hv_synic_event_flags *)event_page_addr + VMBUS_MESSAGE_SINT;

	maxbits = HV_EVENT_FLAGS_COUNT;
	recv_int_page = event->flags;

	if (unlikely(!recv_int_page))
		return;

	/*
	 * Suggested-by: Michael Kelley <mhklinux@outlook.com>
	 * One possible optimization would be to keep track of the largest relID that's in use,
	 * and only scan up to that relID.
	 */
	for_each_set_bit(relid, recv_int_page, maxbits) {
		void (*callback_fn)(void *context);
		struct vmbus_channel *channel;

		/* Clear the bit; skip it if it turns out to be clear already */
		if (!sync_test_and_clear_bit(relid, recv_int_page))
			continue;

		/* Special case - vmbus channel protocol msg */
		if (relid == 0)
			continue;

		/*
		 * Pairs with the kfree_rcu() in vmbus_chan_release().
		 * Guarantees that the channel data structure doesn't
		 * get freed while the channel pointer below is being
		 * dereferenced.
		 */
		rcu_read_lock();

		/* Find channel based on relid */
		channel = relid2channel(relid);
		if (channel == NULL)
			goto sched_unlock_rcu;

		/* Events for a channel marked as rescinded are ignored */
		if (channel->rescind)
			goto sched_unlock_rcu;

		/*
		 * Make sure that the ring buffer data structure doesn't get
		 * freed while we dereference the ring buffer pointer.  Test
		 * for the channel's onchannel_callback being NULL within a
		 * sched_lock critical section.  See also the inline comments
		 * in vmbus_reset_channel_cb().
		 */
		spin_lock(&channel->sched_lock);

		callback_fn = channel->onchannel_callback;
		if (unlikely(callback_fn == NULL))
			goto sched_unlock;

		trace_vmbus_chan_sched(channel);

		/* Per-channel interrupt counter */
		++channel->interrupts;

		/*
		 * HV_CALL_ISR:     run the callback right here.
		 * HV_CALL_BATCHED: call hv_begin_read() on the inbound ring,
		 *                  then schedule the channel tasklet.
		 * HV_CALL_DIRECT:  schedule the channel tasklet.
		 */
		switch (channel->callback_mode) {
		case HV_CALL_ISR:
			(*callback_fn)(channel->channel_callback_context);
			break;

		case HV_CALL_BATCHED:
			hv_begin_read(&channel->inbound);
			fallthrough;
		case HV_CALL_DIRECT:
			tasklet_schedule(&channel->callback_event);
		}

sched_unlock:
		spin_unlock(&channel->sched_lock);
sched_unlock_rcu:
		rcu_read_unlock();
	}
}
1334
vmbus_message_sched(struct hv_per_cpu_context * hv_cpu,void * message_page_addr)1335 static void vmbus_message_sched(struct hv_per_cpu_context *hv_cpu, void *message_page_addr)
1336 {
1337 struct hv_message *msg;
1338
1339 if (!message_page_addr)
1340 return;
1341 msg = (struct hv_message *)message_page_addr + VMBUS_MESSAGE_SINT;
1342
1343 /* Check if there are actual msgs to be processed */
1344 if (msg->header.message_type != HVMSG_NONE) {
1345 if (msg->header.message_type == HVMSG_TIMER_EXPIRED) {
1346 hv_stimer0_isr();
1347 vmbus_signal_eom(msg, HVMSG_TIMER_EXPIRED);
1348 } else {
1349 tasklet_schedule(&hv_cpu->msg_dpc);
1350 }
1351 }
1352 }
1353
__vmbus_isr(void)1354 static void __vmbus_isr(void)
1355 {
1356 struct hv_per_cpu_context *hv_cpu
1357 = this_cpu_ptr(hv_context.cpu_context);
1358
1359 vmbus_chan_sched(hv_cpu->hyp_synic_event_page);
1360 vmbus_chan_sched(hv_cpu->para_synic_event_page);
1361
1362 vmbus_message_sched(hv_cpu, hv_cpu->hyp_synic_message_page);
1363 vmbus_message_sched(hv_cpu, hv_cpu->para_synic_message_page);
1364
1365 add_interrupt_randomness(vmbus_interrupt);
1366 }
1367
/*
 * Per-CPU state of the VMbus interrupt threads:
 * vmbus_irq_pending is set by vmbus_irqd_wake() and cleared by
 * run_vmbus_irqd(); vmbus_irqd is the thread storage used by
 * vmbus_irq_threads and holds the task woken by vmbus_irqd_wake().
 */
static DEFINE_PER_CPU(bool, vmbus_irq_pending);
static DEFINE_PER_CPU(struct task_struct *, vmbus_irqd);
1370
vmbus_irqd_wake(void)1371 static void vmbus_irqd_wake(void)
1372 {
1373 struct task_struct *tsk = __this_cpu_read(vmbus_irqd);
1374
1375 __this_cpu_write(vmbus_irq_pending, true);
1376 wake_up_process(tsk);
1377 }
1378
/*
 * Setup callback of vmbus_irq_threads: applies sched_set_fifo() to the
 * calling task. @cpu is not used.
 */
static void vmbus_irqd_setup(unsigned int cpu)
{
	sched_set_fifo(current);
}
1383
vmbus_irqd_should_run(unsigned int cpu)1384 static int vmbus_irqd_should_run(unsigned int cpu)
1385 {
1386 return __this_cpu_read(vmbus_irq_pending);
1387 }
1388
/*
 * thread_fn callback of vmbus_irq_threads: runs the VMbus interrupt
 * handling for this CPU. @cpu is not used.
 */
static void run_vmbus_irqd(unsigned int cpu)
{
	/* The pending flag is cleared before the handler body runs */
	__this_cpu_write(vmbus_irq_pending, false);
	__vmbus_isr();
}
1394
/* True while vmbus_irq_threads is registered; see vmbus_bus_init() */
static bool vmbus_irq_initialized;

/*
 * Descriptor of the per-CPU "vmbus_irq/%u" threads that vmbus_isr() wakes
 * up when CONFIG_PREEMPT_RT is enabled.
 */
static struct smp_hotplug_thread vmbus_irq_threads = {
	.store                  = &vmbus_irqd,
	.setup			= vmbus_irqd_setup,
	.thread_should_run      = vmbus_irqd_should_run,
	.thread_fn              = run_vmbus_irqd,
	.thread_comm            = "vmbus_irq/%u",
};
1404
vmbus_isr(void)1405 void vmbus_isr(void)
1406 {
1407 if (IS_ENABLED(CONFIG_PREEMPT_RT)) {
1408 vmbus_irqd_wake();
1409 } else {
1410 lockdep_hardirq_threaded();
1411 __vmbus_isr();
1412 }
1413 }
1414 EXPORT_SYMBOL_FOR_MODULES(vmbus_isr, "mshv_vtl");
1415
/*
 * Handler installed with request_percpu_irq() in vmbus_bus_init(): runs
 * vmbus_isr() and always reports the interrupt as handled. Neither @irq
 * nor @dev_id is used.
 */
static irqreturn_t vmbus_percpu_isr(int irq, void *dev_id)
{
	vmbus_isr();
	return IRQ_HANDLED;
}
1421
/*
 * Work handler scheduled on each online CPU by
 * vmbus_alloc_synic_and_connect(): initializes the SynIC of the CPU it
 * runs on. @work is not used.
 */
static void vmbus_percpu_work(struct work_struct *work)
{
	hv_synic_init(smp_processor_id());
}
1428
vmbus_alloc_synic_and_connect(void)1429 static int vmbus_alloc_synic_and_connect(void)
1430 {
1431 int ret, cpu;
1432 struct work_struct __percpu *works;
1433 int hyperv_cpuhp_online;
1434
1435 ret = hv_synic_alloc();
1436 if (ret < 0)
1437 goto err_alloc;
1438
1439 works = alloc_percpu(struct work_struct);
1440 if (!works) {
1441 ret = -ENOMEM;
1442 goto err_alloc;
1443 }
1444
1445 /*
1446 * Initialize the per-cpu interrupt state and stimer state.
1447 * Then connect to the host.
1448 */
1449 cpus_read_lock();
1450 for_each_online_cpu(cpu) {
1451 struct work_struct *work = per_cpu_ptr(works, cpu);
1452
1453 INIT_WORK(work, vmbus_percpu_work);
1454 schedule_work_on(cpu, work);
1455 }
1456
1457 for_each_online_cpu(cpu)
1458 flush_work(per_cpu_ptr(works, cpu));
1459
1460 /* Register the callbacks for possible CPU online/offline'ing */
1461 ret = cpuhp_setup_state_nocalls_cpuslocked(CPUHP_AP_ONLINE_DYN, "hyperv/vmbus:online",
1462 hv_synic_init, hv_synic_cleanup);
1463 cpus_read_unlock();
1464 free_percpu(works);
1465 if (ret < 0)
1466 goto err_alloc;
1467 hyperv_cpuhp_online = ret;
1468
1469 ret = vmbus_connect();
1470 if (ret)
1471 goto err_connect;
1472 return 0;
1473
1474 err_connect:
1475 cpuhp_remove_state(hyperv_cpuhp_online);
1476 return -ENODEV;
1477 err_alloc:
1478 hv_synic_free();
1479 return -ENOMEM;
1480 }
1481
1482 /*
1483 * vmbus_bus_init -Main vmbus driver initialization routine.
1484 *
1485 * Here, we
1486 * - initialize the vmbus driver context
1487 * - invoke the vmbus hv main init routine
1488 * - retrieve the channel offers
1489 */
vmbus_bus_init(void)1490 static int vmbus_bus_init(void)
1491 {
1492 int ret;
1493
1494 ret = hv_init();
1495 if (ret != 0) {
1496 pr_err("Unable to initialize the hypervisor - 0x%x\n", ret);
1497 return ret;
1498 }
1499
1500 ret = bus_register(&hv_bus);
1501 if (ret)
1502 return ret;
1503
1504 /*
1505 * VMbus interrupts are best modeled as per-cpu interrupts. If
1506 * on an architecture with support for per-cpu IRQs (e.g. ARM64),
1507 * allocate a per-cpu IRQ using standard Linux kernel functionality.
1508 * If not on such an architecture (e.g., x86/x64), then rely on
1509 * code in the arch-specific portion of the code tree to connect
1510 * the VMbus interrupt handler.
1511 */
1512
1513 if (IS_ENABLED(CONFIG_PREEMPT_RT) && !vmbus_irq_initialized) {
1514 ret = smpboot_register_percpu_thread(&vmbus_irq_threads);
1515 if (ret)
1516 goto err_kthread;
1517 vmbus_irq_initialized = true;
1518 }
1519
1520 if (vmbus_irq == -1) {
1521 hv_setup_vmbus_handler(vmbus_isr);
1522 } else {
1523 ret = request_percpu_irq(vmbus_irq, vmbus_percpu_isr,
1524 "Hyper-V VMbus", &vmbus_evt);
1525 if (ret) {
1526 pr_err("Can't request Hyper-V VMbus IRQ %d, Err %d",
1527 vmbus_irq, ret);
1528 goto err_setup;
1529 }
1530 }
1531
1532 /*
1533 * Cache the value as getting it involves a VM exit on x86(_64), and
1534 * doing that on each VP while initializing SynIC's wastes time.
1535 */
1536 is_confidential = ms_hyperv.confidential_vmbus_available;
1537 if (is_confidential)
1538 pr_info("Establishing connection to the confidential VMBus\n");
1539 hv_para_set_sint_proxy(!is_confidential);
1540 ret = vmbus_alloc_synic_and_connect();
1541 if (ret)
1542 goto err_connect;
1543
1544 /*
1545 * Always register the vmbus unload panic notifier because we
1546 * need to shut the VMbus channel connection on panic.
1547 */
1548 atomic_notifier_chain_register(&panic_notifier_list,
1549 &hyperv_panic_vmbus_unload_block);
1550
1551 vmbus_request_offers();
1552
1553 return 0;
1554
1555 err_connect:
1556 if (vmbus_irq == -1)
1557 hv_remove_vmbus_handler();
1558 else
1559 free_percpu_irq(vmbus_irq, &vmbus_evt);
1560 err_setup:
1561 if (IS_ENABLED(CONFIG_PREEMPT_RT) && vmbus_irq_initialized) {
1562 smpboot_unregister_percpu_thread(&vmbus_irq_threads);
1563 vmbus_irq_initialized = false;
1564 }
1565 err_kthread:
1566 bus_unregister(&hv_bus);
1567 return ret;
1568 }
1569
1570 /**
1571 * __vmbus_driver_register() - Register a vmbus's driver
1572 * @hv_driver: Pointer to driver structure you want to register
1573 * @owner: owner module of the drv
1574 * @mod_name: module name string
1575 *
1576 * Registers the given driver with Linux through the 'driver_register()' call
1577 * and sets up the hyper-v vmbus handling for this driver.
1578 * It will return the state of the 'driver_register()' call.
1579 *
1580 */
__vmbus_driver_register(struct hv_driver * hv_driver,struct module * owner,const char * mod_name)1581 int __vmbus_driver_register(struct hv_driver *hv_driver, struct module *owner, const char *mod_name)
1582 {
1583 int ret;
1584
1585 pr_info("registering driver %s\n", hv_driver->name);
1586
1587 ret = vmbus_exists();
1588 if (ret < 0)
1589 return ret;
1590
1591 hv_driver->driver.name = hv_driver->name;
1592 hv_driver->driver.owner = owner;
1593 hv_driver->driver.mod_name = mod_name;
1594 hv_driver->driver.bus = &hv_bus;
1595
1596 spin_lock_init(&hv_driver->dynids.lock);
1597 INIT_LIST_HEAD(&hv_driver->dynids.list);
1598
1599 ret = driver_register(&hv_driver->driver);
1600
1601 return ret;
1602 }
1603 EXPORT_SYMBOL_GPL(__vmbus_driver_register);
1604
1605 /**
1606 * vmbus_driver_unregister() - Unregister a vmbus's driver
1607 * @hv_driver: Pointer to driver structure you want to
1608 * un-register
1609 *
1610 * Un-register the given driver that was previous registered with a call to
1611 * vmbus_driver_register()
1612 */
vmbus_driver_unregister(struct hv_driver * hv_driver)1613 void vmbus_driver_unregister(struct hv_driver *hv_driver)
1614 {
1615 pr_info("unregistering driver %s\n", hv_driver->name);
1616
1617 if (!vmbus_exists()) {
1618 driver_unregister(&hv_driver->driver);
1619 vmbus_free_dynids(hv_driver);
1620 }
1621 }
1622 EXPORT_SYMBOL_GPL(vmbus_driver_unregister);
1623
1624
1625 /*
1626 * Called when last reference to channel is gone.
1627 */
vmbus_chan_release(struct kobject * kobj)1628 static void vmbus_chan_release(struct kobject *kobj)
1629 {
1630 struct vmbus_channel *channel
1631 = container_of(kobj, struct vmbus_channel, kobj);
1632
1633 kfree_rcu(channel, rcu);
1634 }
1635
/*
 * A sysfs attribute of a channel. The show/store callbacks take the channel
 * itself; vmbus_chan_attr_show()/vmbus_chan_attr_store() derive it from the
 * kobject before calling them.
 */
struct vmbus_chan_attribute {
	struct attribute attr;
	/* Formats the value into @buf; may be NULL */
	ssize_t (*show)(struct vmbus_channel *chan, char *buf);
	/* Parses @count bytes from @buf; may be NULL */
	ssize_t (*store)(struct vmbus_channel *chan,
			 const char *buf, size_t count);
};
/*
 * Helpers that define a "chan_attr_<name>" attribute, either with explicit
 * mode and callbacks or through the __ATTR_RW/__ATTR_RO/__ATTR_WO variants.
 */
#define VMBUS_CHAN_ATTR(_name, _mode, _show, _store) \
	struct vmbus_chan_attribute chan_attr_##_name \
		= __ATTR(_name, _mode, _show, _store)
#define VMBUS_CHAN_ATTR_RW(_name) \
	struct vmbus_chan_attribute chan_attr_##_name = __ATTR_RW(_name)
#define VMBUS_CHAN_ATTR_RO(_name) \
	struct vmbus_chan_attribute chan_attr_##_name = __ATTR_RO(_name)
#define VMBUS_CHAN_ATTR_WO(_name) \
	struct vmbus_chan_attribute chan_attr_##_name = __ATTR_WO(_name)
1651
vmbus_chan_attr_show(struct kobject * kobj,struct attribute * attr,char * buf)1652 static ssize_t vmbus_chan_attr_show(struct kobject *kobj,
1653 struct attribute *attr, char *buf)
1654 {
1655 const struct vmbus_chan_attribute *attribute
1656 = container_of(attr, struct vmbus_chan_attribute, attr);
1657 struct vmbus_channel *chan
1658 = container_of(kobj, struct vmbus_channel, kobj);
1659
1660 if (!attribute->show)
1661 return -EIO;
1662
1663 return attribute->show(chan, buf);
1664 }
1665
vmbus_chan_attr_store(struct kobject * kobj,struct attribute * attr,const char * buf,size_t count)1666 static ssize_t vmbus_chan_attr_store(struct kobject *kobj,
1667 struct attribute *attr, const char *buf,
1668 size_t count)
1669 {
1670 const struct vmbus_chan_attribute *attribute
1671 = container_of(attr, struct vmbus_chan_attribute, attr);
1672 struct vmbus_channel *chan
1673 = container_of(kobj, struct vmbus_channel, kobj);
1674
1675 if (!attribute->store)
1676 return -EIO;
1677
1678 return attribute->store(chan, buf, count);
1679 }
1680
/* sysfs operations for channel kobjects; both route to the dispatchers above */
static const struct sysfs_ops vmbus_chan_sysfs_ops = {
	.show = vmbus_chan_attr_show,
	.store = vmbus_chan_attr_store,
};
1685
out_mask_show(struct vmbus_channel * channel,char * buf)1686 static ssize_t out_mask_show(struct vmbus_channel *channel, char *buf)
1687 {
1688 struct hv_ring_buffer_info *rbi = &channel->outbound;
1689 ssize_t ret;
1690
1691 mutex_lock(&rbi->ring_buffer_mutex);
1692 if (!rbi->ring_buffer) {
1693 mutex_unlock(&rbi->ring_buffer_mutex);
1694 return -EINVAL;
1695 }
1696
1697 ret = sprintf(buf, "%u\n", rbi->ring_buffer->interrupt_mask);
1698 mutex_unlock(&rbi->ring_buffer_mutex);
1699 return ret;
1700 }
1701 static VMBUS_CHAN_ATTR_RO(out_mask);
1702
in_mask_show(struct vmbus_channel * channel,char * buf)1703 static ssize_t in_mask_show(struct vmbus_channel *channel, char *buf)
1704 {
1705 struct hv_ring_buffer_info *rbi = &channel->inbound;
1706 ssize_t ret;
1707
1708 mutex_lock(&rbi->ring_buffer_mutex);
1709 if (!rbi->ring_buffer) {
1710 mutex_unlock(&rbi->ring_buffer_mutex);
1711 return -EINVAL;
1712 }
1713
1714 ret = sprintf(buf, "%u\n", rbi->ring_buffer->interrupt_mask);
1715 mutex_unlock(&rbi->ring_buffer_mutex);
1716 return ret;
1717 }
1718 static VMBUS_CHAN_ATTR_RO(in_mask);
1719
read_avail_show(struct vmbus_channel * channel,char * buf)1720 static ssize_t read_avail_show(struct vmbus_channel *channel, char *buf)
1721 {
1722 struct hv_ring_buffer_info *rbi = &channel->inbound;
1723 ssize_t ret;
1724
1725 mutex_lock(&rbi->ring_buffer_mutex);
1726 if (!rbi->ring_buffer) {
1727 mutex_unlock(&rbi->ring_buffer_mutex);
1728 return -EINVAL;
1729 }
1730
1731 ret = sprintf(buf, "%u\n", hv_get_bytes_to_read(rbi));
1732 mutex_unlock(&rbi->ring_buffer_mutex);
1733 return ret;
1734 }
1735 static VMBUS_CHAN_ATTR_RO(read_avail);
1736
write_avail_show(struct vmbus_channel * channel,char * buf)1737 static ssize_t write_avail_show(struct vmbus_channel *channel, char *buf)
1738 {
1739 struct hv_ring_buffer_info *rbi = &channel->outbound;
1740 ssize_t ret;
1741
1742 mutex_lock(&rbi->ring_buffer_mutex);
1743 if (!rbi->ring_buffer) {
1744 mutex_unlock(&rbi->ring_buffer_mutex);
1745 return -EINVAL;
1746 }
1747
1748 ret = sprintf(buf, "%u\n", hv_get_bytes_to_write(rbi));
1749 mutex_unlock(&rbi->ring_buffer_mutex);
1750 return ret;
1751 }
1752 static VMBUS_CHAN_ATTR_RO(write_avail);
1753
target_cpu_show(struct vmbus_channel * channel,char * buf)1754 static ssize_t target_cpu_show(struct vmbus_channel *channel, char *buf)
1755 {
1756 return sprintf(buf, "%u\n", channel->target_cpu);
1757 }
1758
/*
 * vmbus_channel_set_cpu - Change the target CPU of an open channel
 * @channel: the channel to retarget
 * @target_cpu: the CPU the channel should be targeted at
 *
 * Must be called with the CPU hotplug read lock and
 * vmbus_connection.channel_mutex held.
 *
 * Returns 0 on success (including when @target_cpu already is the target),
 * -EINVAL when @target_cpu is out of range, not a managed-IRQ housekeeping
 * CPU or not online, and -EIO when the protocol version is older than
 * VERSION_WIN10_V4_1, the channel is not in the opened state or the
 * MODIFYCHANNEL message could not be sent.
 */
int vmbus_channel_set_cpu(struct vmbus_channel *channel, u32 target_cpu)
{
	u32 old_cpu;

	lockdep_assert_cpus_held();
	lockdep_assert_held(&vmbus_connection.channel_mutex);

	if (vmbus_proto_version < VERSION_WIN10_V4_1)
		return -EIO;

	/* Range check needed before the cpumask_test_cpu() call below. */
	if (target_cpu >= nr_cpumask_bits)
		return -EINVAL;

	if (!cpumask_test_cpu(target_cpu, housekeeping_cpumask(HK_TYPE_MANAGED_IRQ)) ||
	    !cpu_online(target_cpu))
		return -EINVAL;

	/*
	 * Synchronizes vmbus_channel_set_cpu() and channel closure:
	 *
	 * { Initially: state = CHANNEL_OPENED }
	 *
	 * CPU1				CPU2
	 *
	 * [vmbus_channel_set_cpu()]	[vmbus_disconnect_ring()]
	 *
	 * LOCK channel_mutex		LOCK channel_mutex
	 * LOAD r1 = state		LOAD r2 = state
	 * IF (r1 == CHANNEL_OPENED)	IF (r2 == CHANNEL_OPENED)
	 *   SEND MODIFYCHANNEL		  STORE state = CHANNEL_OPEN
	 *   [...]			  SEND CLOSECHANNEL
	 * UNLOCK channel_mutex		UNLOCK channel_mutex
	 *
	 * Forbids: r1 == r2 == CHANNEL_OPENED (i.e., CPU1's LOCK precedes
	 *		CPU2's LOCK) && CPU2's SEND precedes CPU1's SEND
	 *
	 * Note.  The host processes the channel messages "sequentially", in
	 * the order in which they are received on a per-partition basis.
	 */

	/*
	 * MODIFYCHANNEL messages for "non-open" channels are ignored by
	 * Hyper-V, so do not send one and report the failure instead.
	 */
	if (channel->state != CHANNEL_OPENED_STATE)
		return -EIO;

	old_cpu = channel->target_cpu;
	if (old_cpu == target_cpu)
		return 0;

	if (vmbus_send_modifychannel(channel,
				     hv_cpu_number_to_vp_number(target_cpu)))
		return -EIO;

	/*
	 * For version before VERSION_WIN10_V5_3, the following warning holds:
	 *
	 * Warning.  At this point, there is *no* guarantee that the host will
	 * have successfully processed the vmbus_send_modifychannel() request.
	 * See the header comment of vmbus_send_modifychannel() for more info.
	 *
	 * Lags in the processing of the above vmbus_send_modifychannel() can
	 * result in missed interrupts if the "old" target CPU is taken offline
	 * before Hyper-V starts sending interrupts to the "new" target CPU.
	 * But apart from this offlining scenario, the code tolerates such
	 * lags.  It will function correctly even if a channel interrupt comes
	 * in on a CPU that is different from the channel target_cpu value.
	 */

	channel->target_cpu = target_cpu;

	/* See init_vp_index(). */
	if (hv_is_perf_channel(channel))
		hv_update_allocated_cpus(old_cpu, target_cpu);

	/* Currently set only for storvsc channels. */
	if (channel->change_target_cpu_callback)
		channel->change_target_cpu_callback(channel, old_cpu, target_cpu);

	return 0;
}
1852
target_cpu_store(struct vmbus_channel * channel,const char * buf,size_t count)1853 static ssize_t target_cpu_store(struct vmbus_channel *channel,
1854 const char *buf, size_t count)
1855 {
1856 u32 target_cpu;
1857 ssize_t ret;
1858
1859 if (sscanf(buf, "%u", &target_cpu) != 1)
1860 return -EIO;
1861
1862 cpus_read_lock();
1863 mutex_lock(&vmbus_connection.channel_mutex);
1864 ret = vmbus_channel_set_cpu(channel, target_cpu);
1865 mutex_unlock(&vmbus_connection.channel_mutex);
1866 cpus_read_unlock();
1867
1868 return ret ?: count;
1869 }
1870 static VMBUS_CHAN_ATTR(cpu, 0644, target_cpu_show, target_cpu_store);
1871
channel_pending_show(struct vmbus_channel * channel,char * buf)1872 static ssize_t channel_pending_show(struct vmbus_channel *channel,
1873 char *buf)
1874 {
1875 return sprintf(buf, "%d\n",
1876 channel_pending(channel,
1877 vmbus_connection.monitor_pages[1]));
1878 }
1879 static VMBUS_CHAN_ATTR(pending, 0444, channel_pending_show, NULL);
1880
channel_latency_show(struct vmbus_channel * channel,char * buf)1881 static ssize_t channel_latency_show(struct vmbus_channel *channel,
1882 char *buf)
1883 {
1884 return sprintf(buf, "%d\n",
1885 channel_latency(channel,
1886 vmbus_connection.monitor_pages[1]));
1887 }
1888 static VMBUS_CHAN_ATTR(latency, 0444, channel_latency_show, NULL);
1889
channel_interrupts_show(struct vmbus_channel * channel,char * buf)1890 static ssize_t channel_interrupts_show(struct vmbus_channel *channel, char *buf)
1891 {
1892 return sprintf(buf, "%llu\n", channel->interrupts);
1893 }
1894 static VMBUS_CHAN_ATTR(interrupts, 0444, channel_interrupts_show, NULL);
1895
channel_events_show(struct vmbus_channel * channel,char * buf)1896 static ssize_t channel_events_show(struct vmbus_channel *channel, char *buf)
1897 {
1898 return sprintf(buf, "%llu\n", channel->sig_events);
1899 }
1900 static VMBUS_CHAN_ATTR(events, 0444, channel_events_show, NULL);
1901
channel_intr_in_full_show(struct vmbus_channel * channel,char * buf)1902 static ssize_t channel_intr_in_full_show(struct vmbus_channel *channel,
1903 char *buf)
1904 {
1905 return sprintf(buf, "%llu\n",
1906 (unsigned long long)channel->intr_in_full);
1907 }
1908 static VMBUS_CHAN_ATTR(intr_in_full, 0444, channel_intr_in_full_show, NULL);
1909
channel_intr_out_empty_show(struct vmbus_channel * channel,char * buf)1910 static ssize_t channel_intr_out_empty_show(struct vmbus_channel *channel,
1911 char *buf)
1912 {
1913 return sprintf(buf, "%llu\n",
1914 (unsigned long long)channel->intr_out_empty);
1915 }
1916 static VMBUS_CHAN_ATTR(intr_out_empty, 0444, channel_intr_out_empty_show, NULL);
1917
channel_out_full_first_show(struct vmbus_channel * channel,char * buf)1918 static ssize_t channel_out_full_first_show(struct vmbus_channel *channel,
1919 char *buf)
1920 {
1921 return sprintf(buf, "%llu\n",
1922 (unsigned long long)channel->out_full_first);
1923 }
1924 static VMBUS_CHAN_ATTR(out_full_first, 0444, channel_out_full_first_show, NULL);
1925
channel_out_full_total_show(struct vmbus_channel * channel,char * buf)1926 static ssize_t channel_out_full_total_show(struct vmbus_channel *channel,
1927 char *buf)
1928 {
1929 return sprintf(buf, "%llu\n",
1930 (unsigned long long)channel->out_full_total);
1931 }
1932 static VMBUS_CHAN_ATTR(out_full_total, 0444, channel_out_full_total_show, NULL);
1933
subchannel_monitor_id_show(struct vmbus_channel * channel,char * buf)1934 static ssize_t subchannel_monitor_id_show(struct vmbus_channel *channel,
1935 char *buf)
1936 {
1937 return sprintf(buf, "%u\n", channel->offermsg.monitorid);
1938 }
1939 static VMBUS_CHAN_ATTR(monitor_id, 0444, subchannel_monitor_id_show, NULL);
1940
subchannel_id_show(struct vmbus_channel * channel,char * buf)1941 static ssize_t subchannel_id_show(struct vmbus_channel *channel,
1942 char *buf)
1943 {
1944 return sprintf(buf, "%u\n",
1945 channel->offermsg.offer.sub_channel_index);
1946 }
1947 static VMBUS_CHAN_ATTR_RO(subchannel_id);
1948
hv_mmap_ring_buffer_wrapper(struct file * filp,struct kobject * kobj,const struct bin_attribute * attr,struct vm_area_struct * vma)1949 static int hv_mmap_ring_buffer_wrapper(struct file *filp, struct kobject *kobj,
1950 const struct bin_attribute *attr,
1951 struct vm_area_struct *vma)
1952 {
1953 struct vmbus_channel *channel = container_of(kobj, struct vmbus_channel, kobj);
1954
1955 /*
1956 * hv_(create|remove)_ring_sysfs implementation ensures that mmap_ring_buffer
1957 * is not NULL.
1958 */
1959 return channel->mmap_ring_buffer(channel, vma);
1960 }
1961
/*
 * Binary sysfs attribute "ring" of a channel. It is accessible only to the
 * owner (mode 0600) and only provides an mmap operation, which is routed
 * through hv_mmap_ring_buffer_wrapper().
 */
static struct bin_attribute chan_attr_ring_buffer = {
	.attr = {
		.name = "ring",
		.mode = 0600,
	},
	.mmap = hv_mmap_ring_buffer_wrapper,
};
/*
 * NULL-terminated list of the regular (text) attributes that make up the
 * channel attribute group. Whether an individual entry is actually shown
 * for a given channel is decided by vmbus_chan_attr_is_visible().
 */
static struct attribute *vmbus_chan_attrs[] = {
	&chan_attr_out_mask.attr,
	&chan_attr_in_mask.attr,
	&chan_attr_read_avail.attr,
	&chan_attr_write_avail.attr,
	&chan_attr_cpu.attr,
	&chan_attr_pending.attr,
	&chan_attr_latency.attr,
	&chan_attr_interrupts.attr,
	&chan_attr_events.attr,
	&chan_attr_intr_in_full.attr,
	&chan_attr_intr_out_empty.attr,
	&chan_attr_out_full_first.attr,
	&chan_attr_out_full_total.attr,
	&chan_attr_monitor_id.attr,
	&chan_attr_subchannel_id.attr,
	NULL
};
1987
/*
 * NULL-terminated list of the binary attributes of the channel attribute
 * group; currently only the "ring" attribute.
 */
static const struct bin_attribute *vmbus_chan_bin_attrs[] = {
	&chan_attr_ring_buffer,
	NULL
};
1992
1993 /*
1994 * Channel-level attribute_group callback function. Returns the permission for
1995 * each attribute, and returns 0 if an attribute is not visible.
1996 */
vmbus_chan_attr_is_visible(struct kobject * kobj,struct attribute * attr,int idx)1997 static umode_t vmbus_chan_attr_is_visible(struct kobject *kobj,
1998 struct attribute *attr, int idx)
1999 {
2000 const struct vmbus_channel *channel =
2001 container_of(kobj, struct vmbus_channel, kobj);
2002
2003 /* Hide the monitor attributes if the monitor mechanism is not used. */
2004 if (!channel->offermsg.monitor_allocated &&
2005 (attr == &chan_attr_pending.attr ||
2006 attr == &chan_attr_latency.attr ||
2007 attr == &chan_attr_monitor_id.attr))
2008 return 0;
2009
2010 return attr->mode;
2011 }
2012
vmbus_chan_bin_attr_is_visible(struct kobject * kobj,const struct bin_attribute * attr,int idx)2013 static umode_t vmbus_chan_bin_attr_is_visible(struct kobject *kobj,
2014 const struct bin_attribute *attr, int idx)
2015 {
2016 const struct vmbus_channel *channel =
2017 container_of(kobj, struct vmbus_channel, kobj);
2018
2019 /* Hide ring attribute if channel's ring_sysfs_visible is set to false */
2020 if (attr == &chan_attr_ring_buffer && !channel->ring_sysfs_visible)
2021 return 0;
2022
2023 return attr->attr.mode;
2024 }
2025
vmbus_chan_bin_size(struct kobject * kobj,const struct bin_attribute * bin_attr,int a)2026 static size_t vmbus_chan_bin_size(struct kobject *kobj,
2027 const struct bin_attribute *bin_attr, int a)
2028 {
2029 const struct vmbus_channel *channel =
2030 container_of(kobj, struct vmbus_channel, kobj);
2031
2032 return channel->ringbuffer_pagecount << PAGE_SHIFT;
2033 }
2034
/*
 * Attribute group attached to every channel kobject. It bundles the text
 * attributes, the binary "ring" attribute, the two visibility callbacks
 * and the callback that reports the size of the binary attribute.
 */
static const struct attribute_group vmbus_chan_group = {
	.attrs = vmbus_chan_attrs,
	.bin_attrs = vmbus_chan_bin_attrs,
	.is_visible = vmbus_chan_attr_is_visible,
	.is_bin_visible = vmbus_chan_bin_attr_is_visible,
	.bin_size = vmbus_chan_bin_size,
};
2042
/*
 * kobject type of a channel kobject: attribute accesses go through
 * vmbus_chan_sysfs_ops and vmbus_chan_release is the release callback.
 */
static const struct kobj_type vmbus_chan_ktype = {
	.sysfs_ops = &vmbus_chan_sysfs_ops,
	.release = vmbus_chan_release,
};
2047
2048 /**
2049 * hv_create_ring_sysfs() - create "ring" sysfs entry corresponding to ring buffers for a channel.
2050 * @channel: Pointer to vmbus_channel structure
2051 * @hv_mmap_ring_buffer: function pointer for initializing the function to be called on mmap of
2052 * channel's "ring" sysfs node, which is for the ring buffer of that channel.
2053 * Function pointer is of below type:
2054 * int (*hv_mmap_ring_buffer)(struct vmbus_channel *channel,
2055 * struct vm_area_struct *vma))
2056 * This has a pointer to the channel and a pointer to vm_area_struct,
2057 * used for mmap, as arguments.
2058 *
2059 * Sysfs node for ring buffer of a channel is created along with other fields, however its
2060 * visibility is disabled by default. Sysfs creation needs to be controlled when the use-case
2061 * is running.
2062 * For example, HV_NIC device is used either by uio_hv_generic or hv_netvsc at any given point of
2063 * time, and "ring" sysfs is needed only when uio_hv_generic is bound to that device. To avoid
2064 * exposing the ring buffer by default, this function is responsible to enable visibility of
2065 * ring for userspace to use.
2066 * Note: Race conditions can happen with userspace and it is not encouraged to create new
2067 * use-cases for this. This was added to maintain backward compatibility, while solving
2068 * one of the race conditions in uio_hv_generic while creating sysfs. See comments with
2069 * vmbus_add_dynid() and vmbus_device_register().
2070 *
2071 * Returns 0 on success or error code on failure.
2072 */
hv_create_ring_sysfs(struct vmbus_channel * channel,int (* hv_mmap_ring_buffer)(struct vmbus_channel * channel,struct vm_area_struct * vma))2073 int hv_create_ring_sysfs(struct vmbus_channel *channel,
2074 int (*hv_mmap_ring_buffer)(struct vmbus_channel *channel,
2075 struct vm_area_struct *vma))
2076 {
2077 struct kobject *kobj = &channel->kobj;
2078
2079 channel->mmap_ring_buffer = hv_mmap_ring_buffer;
2080 channel->ring_sysfs_visible = true;
2081
2082 return sysfs_update_group(kobj, &vmbus_chan_group);
2083 }
2084 EXPORT_SYMBOL_GPL(hv_create_ring_sysfs);
2085
2086 /**
2087 * hv_remove_ring_sysfs() - remove ring sysfs entry corresponding to ring buffers for a channel.
2088 * @channel: Pointer to vmbus_channel structure
2089 *
2090 * Hide "ring" sysfs for a channel by changing its is_visible attribute and updating sysfs group.
2091 *
2092 * Returns 0 on success or error code on failure.
2093 */
hv_remove_ring_sysfs(struct vmbus_channel * channel)2094 int hv_remove_ring_sysfs(struct vmbus_channel *channel)
2095 {
2096 struct kobject *kobj = &channel->kobj;
2097 int ret;
2098
2099 channel->ring_sysfs_visible = false;
2100 ret = sysfs_update_group(kobj, &vmbus_chan_group);
2101 channel->mmap_ring_buffer = NULL;
2102 return ret;
2103 }
2104 EXPORT_SYMBOL_GPL(hv_remove_ring_sysfs);
2105
2106 /*
2107 * vmbus_add_channel_kobj - setup a sub-directory under device/channels
2108 */
vmbus_add_channel_kobj(struct hv_device * dev,struct vmbus_channel * channel)2109 int vmbus_add_channel_kobj(struct hv_device *dev, struct vmbus_channel *channel)
2110 {
2111 const struct device *device = &dev->device;
2112 struct kobject *kobj = &channel->kobj;
2113 u32 relid = channel->offermsg.child_relid;
2114 int ret;
2115
2116 kobj->kset = dev->channels_kset;
2117 ret = kobject_init_and_add(kobj, &vmbus_chan_ktype, NULL,
2118 "%u", relid);
2119 if (ret) {
2120 kobject_put(kobj);
2121 return ret;
2122 }
2123
2124 ret = sysfs_create_group(kobj, &vmbus_chan_group);
2125
2126 if (ret) {
2127 /*
2128 * The calling functions' error handling paths will cleanup the
2129 * empty channel directory.
2130 */
2131 kobject_put(kobj);
2132 dev_err(device, "Unable to set up channel sysfs files\n");
2133 return ret;
2134 }
2135
2136 kobject_uevent(kobj, KOBJ_ADD);
2137
2138 return 0;
2139 }
2140
2141 /*
2142 * vmbus_remove_channel_attr_group - remove the channel's attribute group
2143 */
vmbus_remove_channel_attr_group(struct vmbus_channel * channel)2144 void vmbus_remove_channel_attr_group(struct vmbus_channel *channel)
2145 {
2146 sysfs_remove_group(&channel->kobj, &vmbus_chan_group);
2147 }
2148
2149 /*
2150 * vmbus_device_create - Creates and registers a new child device
2151 * on the vmbus.
2152 */
vmbus_device_create(const guid_t * type,const guid_t * instance,struct vmbus_channel * channel)2153 struct hv_device *vmbus_device_create(const guid_t *type,
2154 const guid_t *instance,
2155 struct vmbus_channel *channel)
2156 {
2157 struct hv_device *child_device_obj;
2158
2159 child_device_obj = kzalloc_obj(struct hv_device);
2160 if (!child_device_obj) {
2161 pr_err("Unable to allocate device object for child device\n");
2162 return NULL;
2163 }
2164
2165 child_device_obj->channel = channel;
2166 guid_copy(&child_device_obj->dev_type, type);
2167 guid_copy(&child_device_obj->dev_instance, instance);
2168 child_device_obj->vendor_id = PCI_VENDOR_ID_MICROSOFT;
2169
2170 return child_device_obj;
2171 }
2172
2173 /*
2174 * vmbus_device_register - Register the child device
2175 */
vmbus_device_register(struct hv_device * child_device_obj)2176 int vmbus_device_register(struct hv_device *child_device_obj)
2177 {
2178 struct kobject *kobj = &child_device_obj->device.kobj;
2179 int ret;
2180
2181 dev_set_name(&child_device_obj->device, "%pUl",
2182 &child_device_obj->channel->offermsg.offer.if_instance);
2183
2184 child_device_obj->device.bus = &hv_bus;
2185 child_device_obj->device.parent = vmbus_root_device;
2186 child_device_obj->device.release = vmbus_device_release;
2187
2188 child_device_obj->device.dma_parms = &child_device_obj->dma_parms;
2189 child_device_obj->device.dma_mask = &child_device_obj->dma_mask;
2190 dma_set_mask(&child_device_obj->device, DMA_BIT_MASK(64));
2191
2192 /*
2193 * Register with the LDM. This will kick off the driver/device
2194 * binding...which will eventually call vmbus_match() and vmbus_probe()
2195 */
2196 ret = device_register(&child_device_obj->device);
2197 if (ret) {
2198 pr_err("Unable to register child device\n");
2199 put_device(&child_device_obj->device);
2200 return ret;
2201 }
2202
2203 /*
2204 * If device_register() found a driver to assign to the device, the
2205 * driver's probe function has already run at this point. If that
2206 * probe function accesses or operates on the "channels" subdirectory
2207 * in sysfs, those operations will have failed because the "channels"
2208 * subdirectory doesn't exist until the code below runs. Or if the
2209 * probe function creates a /dev entry, a user space program could
2210 * find and open the /dev entry, and then create a race by accessing
2211 * the "channels" subdirectory while the creation steps are in progress
2212 * here. The race can't result in a kernel failure, but the user space
2213 * program may get an error in accessing "channels" or its
2214 * subdirectories. See also comments with vmbus_add_dynid() about a
2215 * related race condition.
2216 */
2217 child_device_obj->channels_kset = kset_create_and_add("channels",
2218 NULL, kobj);
2219 if (!child_device_obj->channels_kset) {
2220 ret = -ENOMEM;
2221 goto err_dev_unregister;
2222 }
2223
2224 ret = vmbus_add_channel_kobj(child_device_obj,
2225 child_device_obj->channel);
2226 if (ret) {
2227 pr_err("Unable to register primary channel\n");
2228 goto err_kset_unregister;
2229 }
2230 hv_debug_add_dev_dir(child_device_obj);
2231
2232 return 0;
2233
2234 err_kset_unregister:
2235 kset_unregister(child_device_obj->channels_kset);
2236
2237 err_dev_unregister:
2238 device_unregister(&child_device_obj->device);
2239 return ret;
2240 }
2241
2242 /*
2243 * vmbus_device_unregister - Remove the specified child device
2244 * from the vmbus.
2245 */
vmbus_device_unregister(struct hv_device * device_obj)2246 void vmbus_device_unregister(struct hv_device *device_obj)
2247 {
2248 pr_debug("child device %s unregistered\n",
2249 dev_name(&device_obj->device));
2250
2251 kset_unregister(device_obj->channels_kset);
2252
2253 /*
2254 * Kick off the process of unregistering the device.
2255 * This will call vmbus_remove() and eventually vmbus_device_release()
2256 */
2257 device_unregister(&device_obj->device);
2258 }
2259 EXPORT_SYMBOL_GPL(vmbus_device_unregister);
2260
2261 #ifdef CONFIG_ACPI
/*
 * VMBUS is an acpi enumerated device. Get the information we
 * need from DSDT.
 *
 * Callback for acpi_walk_resources(), invoked once per resource descriptor.
 * 32-bit and 64-bit address descriptors are turned into "hyperv mmio"
 * resources that are linked into the hyperv_mmio sibling list; an extended
 * IRQ descriptor provides vmbus_interrupt and vmbus_irq. All other
 * descriptor types are ignored. @ctx is unused.
 *
 * Returns AE_OK on success (including ignored descriptors), AE_ERROR if the
 * interrupt descriptor cannot be parsed and AE_NO_MEMORY if the resource
 * allocation fails.
 */
static acpi_status vmbus_walk_resources(struct acpi_resource *res, void *ctx)
{
	resource_size_t start = 0;
	resource_size_t end = 0;
	struct resource *new_res;
	/* Cursor into the list: points at the link that holds the current node. */
	struct resource **old_res = &hyperv_mmio;
	/* Link that holds the node visited before the current one, if any. */
	struct resource **prev_res = NULL;
	struct resource r;

	switch (res->type) {

	/*
	 * "Address" descriptors are for bus windows. Ignore
	 * "memory" descriptors, which are for registers on
	 * devices.
	 */
	case ACPI_RESOURCE_TYPE_ADDRESS32:
		start = res->data.address32.address.minimum;
		end = res->data.address32.address.maximum;
		break;

	case ACPI_RESOURCE_TYPE_ADDRESS64:
		start = res->data.address64.address.minimum;
		end = res->data.address64.address.maximum;
		break;

	/*
	 * The IRQ information is needed only on ARM64, which Hyper-V
	 * sets up in the extended format. IRQ information is present
	 * on x86/x64 in the non-extended format but it is not used by
	 * Linux. So don't bother checking for the non-extended format.
	 */
	case ACPI_RESOURCE_TYPE_EXTENDED_IRQ:
		if (!acpi_dev_resource_interrupt(res, 0, &r)) {
			pr_err("Unable to parse Hyper-V ACPI interrupt\n");
			return AE_ERROR;
		}
		/* ARM64 INTID for VMbus */
		vmbus_interrupt = res->data.extended_irq.interrupts[0];
		/* Linux IRQ number */
		vmbus_irq = r.start;
		return AE_OK;

	default:
		/* Unused resource type */
		return AE_OK;

	}
	/*
	 * Ignore ranges that are below 1MB, as they're not
	 * necessary or useful here.
	 */
	if (end < 0x100000)
		return AE_OK;

	/* GFP_ATOMIC: the allocation must not sleep here. */
	new_res = kzalloc_obj(*new_res, GFP_ATOMIC);
	if (!new_res)
		return AE_NO_MEMORY;

	/* If this range overlaps the virtual TPM, truncate it. */
	if (end > VTPM_BASE_ADDRESS && start < VTPM_BASE_ADDRESS)
		end = VTPM_BASE_ADDRESS;

	new_res->name = "hyperv mmio";
	new_res->flags = IORESOURCE_MEM;
	new_res->start = start;
	new_res->end = end;

	/*
	 * If two ranges are adjacent, merge them.
	 *
	 * The loop walks the sibling list and stops at the first case that
	 * applies: end of list (append), the new range directly follows or
	 * directly precedes an existing one (extend that one and free the
	 * new node), or the existing node starts above the new range (insert
	 * the new node in front of it).
	 */
	do {
		if (!*old_res) {
			*old_res = new_res;
			break;
		}

		if (((*old_res)->end + 1) == new_res->start) {
			(*old_res)->end = new_res->end;
			kfree(new_res);
			break;
		}

		if ((*old_res)->start == new_res->end + 1) {
			(*old_res)->start = new_res->start;
			kfree(new_res);
			break;
		}

		if ((*old_res)->start > new_res->end) {
			new_res->sibling = *old_res;
			if (prev_res)
				(*prev_res)->sibling = new_res;
			*old_res = new_res;
			break;
		}

		prev_res = old_res;
		old_res = &(*old_res)->sibling;

	} while (1);

	return AE_OK;
}
2370 #endif
2371
vmbus_mmio_remove(void)2372 static void vmbus_mmio_remove(void)
2373 {
2374 struct resource *cur_res;
2375 struct resource *next_res;
2376
2377 if (hyperv_mmio) {
2378 if (fb_mmio) {
2379 __release_region(hyperv_mmio, fb_mmio->start,
2380 resource_size(fb_mmio));
2381 fb_mmio = NULL;
2382 }
2383
2384 for (cur_res = hyperv_mmio; cur_res; cur_res = next_res) {
2385 next_res = cur_res->sibling;
2386 kfree(cur_res);
2387 }
2388 }
2389 }
2390
vmbus_reserve_fb(void)2391 static void __maybe_unused vmbus_reserve_fb(void)
2392 {
2393 resource_size_t start = 0, size;
2394 struct pci_dev *pdev;
2395
2396 if (efi_enabled(EFI_BOOT)) {
2397 /* Gen2 VM: get FB base from EFI framebuffer */
2398 if (IS_ENABLED(CONFIG_SYSFB)) {
2399 start = sysfb_primary_display.screen.lfb_base;
2400 size = max_t(__u32, sysfb_primary_display.screen.lfb_size, 0x800000);
2401 }
2402 } else {
2403 /* Gen1 VM: get FB base from PCI */
2404 pdev = pci_get_device(PCI_VENDOR_ID_MICROSOFT,
2405 PCI_DEVICE_ID_HYPERV_VIDEO, NULL);
2406 if (!pdev)
2407 return;
2408
2409 if (pdev->resource[0].flags & IORESOURCE_MEM) {
2410 start = pci_resource_start(pdev, 0);
2411 size = pci_resource_len(pdev, 0);
2412 }
2413
2414 /*
2415 * Release the PCI device so hyperv_drm driver can grab it
2416 * later.
2417 */
2418 pci_dev_put(pdev);
2419 }
2420
2421 if (!start)
2422 return;
2423
2424 /*
2425 * Make a claim for the frame buffer in the resource tree under the
2426 * first node, which will be the one below 4GB. The length seems to
2427 * be underreported, particularly in a Generation 1 VM. So start out
2428 * reserving a larger area and make it smaller until it succeeds.
2429 */
2430 for (; !fb_mmio && (size >= 0x100000); size >>= 1)
2431 fb_mmio = __request_region(hyperv_mmio, start, size, fb_mmio_name, 0);
2432 }
2433
/**
 * vmbus_allocate_mmio() - Pick a memory-mapped I/O range.
 * @new:		If successful, supplied a pointer to the
 *			allocated MMIO space.
 * @device_obj:		Identifies the caller
 * @min:		Minimum guest physical address of the
 *			allocation
 * @max:		Maximum guest physical address
 * @size:		Size of the range to be allocated
 * @align:		Alignment of the range to be allocated
 * @fb_overlap_ok:	Whether this allocation can be allowed
 *			to overlap the video frame buffer.
 *
 * This function walks the resources granted to VMBus by the
 * _CRS object in the ACPI namespace underneath the parent
 * "bridge" whether that's a root PCI bus in the Generation 1
 * case or a Module Device in the Generation 2 case.  It then
 * attempts to allocate from the global MMIO pool in a way that
 * matches the constraints supplied in these parameters and by
 * that _CRS.
 *
 * The whole search runs with hyperv_mmio_lock held.
 *
 * Return: 0 on success, -errno on failure (-ENXIO if no suitable range
 * could be claimed)
 */
int vmbus_allocate_mmio(struct resource **new, struct hv_device *device_obj,
			resource_size_t min, resource_size_t max,
			resource_size_t size, resource_size_t align,
			bool fb_overlap_ok)
{
	struct resource *iter, *shadow;
	resource_size_t range_min, range_max, start, end;
	const char *dev_n = dev_name(&device_obj->device);
	int retval;

	retval = -ENXIO;
	mutex_lock(&hyperv_mmio_lock);

	/*
	 * If overlaps with frame buffers are allowed, then first attempt to
	 * make the allocation from within the reserved region.  Because it
	 * is already reserved, no shadow allocation is necessary.
	 */
	if (fb_overlap_ok && fb_mmio && !(min > fb_mmio->end) &&
	    !(max < fb_mmio->start)) {

		range_min = fb_mmio->start;
		range_max = fb_mmio->end;
		/* Round the first candidate up to the requested alignment. */
		start = (range_min + align - 1) & ~(align - 1);
		for (; start + size - 1 <= range_max; start += align) {
			*new = request_mem_region_exclusive(start, size, dev_n);
			if (*new) {
				retval = 0;
				goto exit;
			}
		}
	}

	for (iter = hyperv_mmio; iter; iter = iter->sibling) {
		/*
		 * NOTE(review): min and max only decide whether this range is
		 * considered at all; the candidate addresses tried below are
		 * not clamped to [min, max] - confirm this is intended.
		 */
		if ((iter->start >= max) || (iter->end <= min))
			continue;

		range_min = iter->start;
		range_max = iter->end;
		start = (range_min + align - 1) & ~(align - 1);
		for (; start + size - 1 <= range_max; start += align) {
			end = start + size - 1;

			/* Skip the whole fb_mmio region if not fb_overlap_ok */
			if (!fb_overlap_ok && fb_mmio &&
			    (((start >= fb_mmio->start) && (start <= fb_mmio->end)) ||
			     ((end >= fb_mmio->start) && (end <= fb_mmio->end))))
				continue;

			/*
			 * First claim the candidate in the hyperv_mmio tree
			 * (the "shadow"), then claim it as a memory region.
			 */
			shadow = __request_region(iter, start, size, NULL,
						  IORESOURCE_BUSY);
			if (!shadow)
				continue;

			*new = request_mem_region_exclusive(start, size, dev_n);
			if (*new) {
				/*
				 * The name field of the shadow is used to
				 * store the pointer to the real resource; it
				 * is not a string.
				 */
				shadow->name = (char *)*new;
				retval = 0;
				goto exit;
			}

			/* Second claim failed: give the shadow back. */
			__release_region(iter, start, size);
		}
	}

exit:
	mutex_unlock(&hyperv_mmio_lock);
	return retval;
}
EXPORT_SYMBOL_GPL(vmbus_allocate_mmio);
2527
2528 /**
2529 * vmbus_free_mmio() - Free a memory-mapped I/O range.
2530 * @start: Base address of region to release.
2531 * @size: Size of the range to be allocated
2532 *
2533 * This function releases anything requested by
2534 * vmbus_mmio_allocate().
2535 */
vmbus_free_mmio(resource_size_t start,resource_size_t size)2536 void vmbus_free_mmio(resource_size_t start, resource_size_t size)
2537 {
2538 struct resource *iter;
2539
2540 mutex_lock(&hyperv_mmio_lock);
2541
2542 /*
2543 * If all bytes of the MMIO range to be released are within the
2544 * special case fb_mmio shadow region, skip releasing the shadow
2545 * region since no corresponding __request_region() was done
2546 * in vmbus_allocate_mmio().
2547 */
2548 if (fb_mmio && start >= fb_mmio->start &&
2549 (start + size - 1 <= fb_mmio->end))
2550 goto skip_shadow_release;
2551
2552 for (iter = hyperv_mmio; iter; iter = iter->sibling) {
2553 if ((iter->start >= start + size) || (iter->end <= start))
2554 continue;
2555
2556 __release_region(iter, start, size);
2557 }
2558
2559 skip_shadow_release:
2560 release_mem_region(start, size);
2561 mutex_unlock(&hyperv_mmio_lock);
2562
2563 }
2564 EXPORT_SYMBOL_GPL(vmbus_free_mmio);
2565
2566 #ifdef CONFIG_ACPI
vmbus_acpi_add(struct platform_device * pdev)2567 static int vmbus_acpi_add(struct platform_device *pdev)
2568 {
2569 acpi_status result;
2570 int ret_val = -ENODEV;
2571 struct acpi_device *ancestor;
2572 struct acpi_device *device = ACPI_COMPANION(&pdev->dev);
2573
2574 vmbus_root_device = &device->dev;
2575
2576 /*
2577 * Older versions of Hyper-V for ARM64 fail to include the _CCA
2578 * method on the top level VMbus device in the DSDT. But devices
2579 * are hardware coherent in all current Hyper-V use cases, so fix
2580 * up the ACPI device to behave as if _CCA is present and indicates
2581 * hardware coherence.
2582 */
2583 ACPI_COMPANION_SET(&device->dev, device);
2584 if (IS_ENABLED(CONFIG_ACPI_CCA_REQUIRED) &&
2585 device_get_dma_attr(&device->dev) == DEV_DMA_NOT_SUPPORTED) {
2586 pr_info("No ACPI _CCA found; assuming coherent device I/O\n");
2587 device->flags.cca_seen = true;
2588 device->flags.coherent_dma = true;
2589 }
2590
2591 result = acpi_walk_resources(device->handle, METHOD_NAME__CRS,
2592 vmbus_walk_resources, NULL);
2593
2594 if (ACPI_FAILURE(result))
2595 goto acpi_walk_err;
2596 /*
2597 * Some ancestor of the vmbus acpi device (Gen1 or Gen2
2598 * firmware) is the VMOD that has the mmio ranges. Get that.
2599 */
2600 for (ancestor = acpi_dev_parent(device);
2601 ancestor && ancestor->handle != ACPI_ROOT_OBJECT;
2602 ancestor = acpi_dev_parent(ancestor)) {
2603 result = acpi_walk_resources(ancestor->handle, METHOD_NAME__CRS,
2604 vmbus_walk_resources, NULL);
2605
2606 if (ACPI_FAILURE(result))
2607 continue;
2608 if (hyperv_mmio) {
2609 vmbus_reserve_fb();
2610 break;
2611 }
2612 }
2613 ret_val = 0;
2614
2615 acpi_walk_err:
2616 if (ret_val)
2617 vmbus_mmio_remove();
2618 return ret_val;
2619 }
2620 #else
/* Stub used when CONFIG_ACPI is not set: does nothing and reports success. */
static int vmbus_acpi_add(struct platform_device *pdev)
{
	return 0;
}
2625 #endif
2626 #ifndef HYPERVISOR_CALLBACK_VECTOR
vmbus_set_irq(struct platform_device * pdev)2627 static int vmbus_set_irq(struct platform_device *pdev)
2628 {
2629 struct irq_data *data;
2630 int irq;
2631 irq_hw_number_t hwirq;
2632
2633 irq = platform_get_irq(pdev, 0);
2634 /* platform_get_irq() may not return 0. */
2635 if (irq < 0)
2636 return irq;
2637
2638 data = irq_get_irq_data(irq);
2639 if (!data) {
2640 pr_err("No interrupt data for VMBus virq %d\n", irq);
2641 return -ENODEV;
2642 }
2643 hwirq = irqd_to_hwirq(data);
2644
2645 vmbus_irq = irq;
2646 vmbus_interrupt = hwirq;
2647 pr_debug("VMBus virq %d, hwirq %d\n", vmbus_irq, vmbus_interrupt);
2648
2649 return 0;
2650 }
2651 #endif
2652
vmbus_device_add(struct platform_device * pdev)2653 static int vmbus_device_add(struct platform_device *pdev)
2654 {
2655 struct resource **cur_res = &hyperv_mmio;
2656 struct of_range range;
2657 struct of_range_parser parser;
2658 struct device_node *np = pdev->dev.of_node;
2659 int ret;
2660
2661 vmbus_root_device = &pdev->dev;
2662
2663 ret = of_range_parser_init(&parser, np);
2664 if (ret)
2665 return ret;
2666
2667 #ifndef HYPERVISOR_CALLBACK_VECTOR
2668 ret = vmbus_set_irq(pdev);
2669 if (ret)
2670 return ret;
2671 #endif
2672 for_each_of_range(&parser, &range) {
2673 struct resource *res;
2674
2675 res = kzalloc_obj(*res);
2676 if (!res) {
2677 vmbus_mmio_remove();
2678 return -ENOMEM;
2679 }
2680
2681 res->name = "hyperv mmio";
2682 res->flags = range.flags;
2683 res->start = range.cpu_addr;
2684 res->end = range.cpu_addr + range.size;
2685
2686 *cur_res = res;
2687 cur_res = &res->sibling;
2688 }
2689
2690 return ret;
2691 }
2692
vmbus_platform_driver_probe(struct platform_device * pdev)2693 static int vmbus_platform_driver_probe(struct platform_device *pdev)
2694 {
2695 if (acpi_disabled)
2696 return vmbus_device_add(pdev);
2697 else
2698 return vmbus_acpi_add(pdev);
2699 }
2700
/*
 * Platform driver remove callback: drops the MMIO resources that were
 * collected for VMBus. @pdev is unused.
 */
static void vmbus_platform_driver_remove(struct platform_device *pdev)
{
	vmbus_mmio_remove();
}
2705
2706 #ifdef CONFIG_PM_SLEEP
/*
 * Suspend callback of the VMBus platform device (used for the freeze and
 * poweroff "noirq" phases, see vmbus_bus_pm).
 *
 * Stops accepting offer messages, drains the VMBus workqueues, forces all
 * hv_sock channels to be rescinded, waits for sub-channels and hv_sock
 * channels to be closed, invalidates the relid of every remaining channel
 * and finally initiates the unload of the VMBus connection. @dev is unused.
 * Always returns 0.
 */
static int vmbus_bus_suspend(struct device *dev)
{
	struct hv_per_cpu_context *hv_cpu = per_cpu_ptr(
			hv_context.cpu_context, VMBUS_CONNECT_CPU);
	struct vmbus_channel *channel, *sc;

	/*
	 * Set the flag while the message tasklet of the connect CPU is
	 * disabled.
	 */
	tasklet_disable(&hv_cpu->msg_dpc);
	vmbus_connection.ignore_any_offer_msg = true;
	/* The tasklet_enable() takes care of providing a memory barrier */
	tasklet_enable(&hv_cpu->msg_dpc);

	/* Drain all the workqueues as we are in suspend */
	drain_workqueue(vmbus_connection.rescind_work_queue);
	drain_workqueue(vmbus_connection.work_queue);
	drain_workqueue(vmbus_connection.handle_primary_chan_wq);
	drain_workqueue(vmbus_connection.handle_sub_chan_wq);

	/* First pass over the channel list: rescind every hv_sock channel. */
	mutex_lock(&vmbus_connection.channel_mutex);
	list_for_each_entry(channel, &vmbus_connection.chn_list, listentry) {
		if (!is_hvsock_channel(channel))
			continue;

		vmbus_force_channel_rescinded(channel);
	}
	mutex_unlock(&vmbus_connection.channel_mutex);

	/*
	 * Wait until all the sub-channels and hv_sock channels have been
	 * cleaned up. Sub-channels should be destroyed upon suspend, otherwise
	 * they would conflict with the new sub-channels that will be created
	 * in the resume path. hv_sock channels should also be destroyed, but
	 * a hv_sock channel of an established hv_sock connection can not be
	 * really destroyed since it may still be referenced by the userspace
	 * application, so we just force the hv_sock channel to be rescinded
	 * by vmbus_force_channel_rescinded(), and the userspace application
	 * will thoroughly destroy the channel after hibernation.
	 *
	 * Note: the counter nr_chan_close_on_suspend may never go above 0 if
	 * the VM has no sub-channel and hv_sock channel, e.g. a 1-vCPU VM.
	 */
	if (atomic_read(&vmbus_connection.nr_chan_close_on_suspend) > 0)
		wait_for_completion(&vmbus_connection.ready_for_suspend_event);

	mutex_lock(&vmbus_connection.channel_mutex);

	/* Second pass: detach every channel from its relid. */
	list_for_each_entry(channel, &vmbus_connection.chn_list, listentry) {
		/*
		 * Remove the channel from the array of channels and invalidate
		 * the channel's relid.  Upon resume, vmbus_onoffer() will fix
		 * up the relid (and other fields, if necessary) and add the
		 * channel back to the array.
		 */
		vmbus_channel_unmap_relid(channel);
		channel->offermsg.child_relid = INVALID_RELID;

		if (is_hvsock_channel(channel)) {
			if (!channel->rescind) {
				pr_err("hv_sock channel not rescinded!\n");
				WARN_ON_ONCE(1);
			}
			continue;
		}

		/* Any sub-channel still on the list at this point is reported. */
		list_for_each_entry(sc, &channel->sc_list, sc_list) {
			pr_err("Sub-channel not deleted!\n");
			WARN_ON_ONCE(1);
		}
	}

	mutex_unlock(&vmbus_connection.channel_mutex);

	vmbus_initiate_unload(false);

	return 0;
}
2782
vmbus_bus_resume(struct device * dev)2783 static int vmbus_bus_resume(struct device *dev)
2784 {
2785 struct vmbus_channel *channel;
2786 struct vmbus_channel_msginfo *msginfo;
2787 size_t msgsize;
2788 int ret;
2789
2790 vmbus_connection.ignore_any_offer_msg = false;
2791
2792 /*
2793 * We only use the 'vmbus_proto_version', which was in use before
2794 * hibernation, to re-negotiate with the host.
2795 */
2796 if (!vmbus_proto_version) {
2797 pr_err("Invalid proto version = 0x%x\n", vmbus_proto_version);
2798 return -EINVAL;
2799 }
2800
2801 msgsize = sizeof(*msginfo) +
2802 sizeof(struct vmbus_channel_initiate_contact);
2803
2804 msginfo = kzalloc(msgsize, GFP_KERNEL);
2805
2806 if (msginfo == NULL)
2807 return -ENOMEM;
2808
2809 ret = vmbus_negotiate_version(msginfo, vmbus_proto_version);
2810
2811 kfree(msginfo);
2812
2813 if (ret != 0)
2814 return ret;
2815
2816 vmbus_request_offers();
2817
2818 mutex_lock(&vmbus_connection.channel_mutex);
2819 list_for_each_entry(channel, &vmbus_connection.chn_list, listentry) {
2820 if (channel->offermsg.child_relid != INVALID_RELID)
2821 continue;
2822
2823 /* hvsock channels are not expected to be present. */
2824 if (is_hvsock_channel(channel))
2825 continue;
2826
2827 pr_err("channel %pUl/%pUl not present after resume.\n",
2828 &channel->offermsg.offer.if_type,
2829 &channel->offermsg.offer.if_instance);
2830 /* ToDo: Cleanup these channels here */
2831 }
2832 mutex_unlock(&vmbus_connection.channel_mutex);
2833
2834 /* Reset the event for the next suspend. */
2835 reinit_completion(&vmbus_connection.ready_for_suspend_event);
2836
2837 return 0;
2838 }
2839 #else
2840 #define vmbus_bus_suspend NULL
2841 #define vmbus_bus_resume NULL
2842 #endif /* CONFIG_PM_SLEEP */
2843
/*
 * Device tree match table: the driver binds to nodes that are compatible
 * with "microsoft,vmbus". The empty entry terminates the table.
 */
static const __maybe_unused struct of_device_id vmbus_of_match[] = {
	{
		.compatible = "microsoft,vmbus",
	},
	{
		/* sentinel */
	},
};
MODULE_DEVICE_TABLE(of, vmbus_of_match);
2853
/*
 * ACPI match table: both spellings of the id, "VMBUS" and "VMBus", are
 * listed. The entry with the empty id terminates the table.
 */
static const __maybe_unused struct acpi_device_id vmbus_acpi_device_ids[] = {
	{"VMBUS", 0},
	{"VMBus", 0},
	{"", 0},
};
MODULE_DEVICE_TABLE(acpi, vmbus_acpi_device_ids);
2860
/*
 * Note: we must use the "noirq" ops, otherwise hibernation can not work with
 * PCI device assignment, because "pci_dev_pm_ops" uses the "noirq" ops: in
 * the resume path, the pci "noirq" restore op runs before "non-noirq" op (see
 * resume_target_kernel() -> dpm_resume_start(), and hibernation_restore() ->
 * dpm_resume_end()). This means vmbus_bus_resume() and the pci-hyperv's
 * resume callback must also run via the "noirq" ops.
 *
 * Set suspend_noirq/resume_noirq to NULL for Suspend-to-Idle: see the comment
 * earlier in this file before vmbus_pm.
 *
 * The hibernation phases freeze/poweroff use vmbus_bus_suspend() and the
 * phases thaw/restore use vmbus_bus_resume().
 */

static const struct dev_pm_ops vmbus_bus_pm = {
	.suspend_noirq	= NULL,
	.resume_noirq	= NULL,
	.freeze_noirq	= vmbus_bus_suspend,
	.thaw_noirq	= vmbus_bus_resume,
	.poweroff_noirq	= vmbus_bus_suspend,
	.restore_noirq	= vmbus_bus_resume
};
2881
/*
 * The VMBus platform driver. It matches through the ACPI and device tree
 * tables defined in this file, uses vmbus_bus_pm for power management and
 * requests synchronous probing.
 */
static struct platform_driver vmbus_platform_driver = {
	.probe = vmbus_platform_driver_probe,
	.remove = vmbus_platform_driver_remove,
	.driver = {
		.name = "vmbus",
		.acpi_match_table = ACPI_PTR(vmbus_acpi_device_ids),
		.of_match_table = of_match_ptr(vmbus_of_match),
		.pm = &vmbus_bus_pm,
		.probe_type = PROBE_FORCE_SYNCHRONOUS,
	}
};
2893
hv_kexec_handler(void)2894 static void hv_kexec_handler(void)
2895 {
2896 hv_stimer_global_cleanup();
2897 vmbus_initiate_unload(false);
2898 /* Make sure conn_state is set as hv_synic_cleanup checks for it */
2899 mb();
2900 cpuhp_remove_state(hyperv_cpuhp_online);
2901 };
2902
hv_crash_handler(struct pt_regs * regs)2903 static void hv_crash_handler(struct pt_regs *regs)
2904 {
2905 int cpu;
2906
2907 vmbus_initiate_unload(true);
2908 /*
2909 * In crash handler we can't schedule synic cleanup for all CPUs,
2910 * doing the cleanup for current CPU only. This should be sufficient
2911 * for kdump.
2912 */
2913 cpu = smp_processor_id();
2914 hv_stimer_cleanup(cpu);
2915 hv_hyp_synic_disable_regs(cpu);
2916 };
2917
/*
 * hv_synic_suspend - syscore suspend callback.
 * @data: unused.
 *
 * Disables the synic registers on CPU0 and always returns 0.
 */
static int hv_synic_suspend(void *data)
{
	/*
	 * By the time this runs every non-boot CPU is already offline. In a
	 * legacy configuration without stimer Direct Mode, their stimers were
	 * unbound on the way down via hv_synic_cleanup() ->
	 * hv_stimer_legacy_cleanup() -> hv_stimer_cleanup() ->
	 * clockevents_unbind_device().
	 *
	 * This callback runs on CPU0 only, with interrupts disabled.
	 * hv_stimer_legacy_cleanup() is intentionally NOT called for CPU0:
	 *
	 *  1) It is not needed: interrupts stay disabled from
	 *     syscore_suspend() until syscore_resume(); see create_image()
	 *     and resume_target_kernel().
	 *  2) CPU0's stimer gets disabled later anyway through
	 *     syscore_suspend() -> timekeeping_suspend() -> tick_suspend()
	 *     -> ... -> clockevents_shutdown() -> ... -> hv_ce_shutdown().
	 *  3) clockevents_unbind_device() may sleep, so calling it here, in
	 *     an interrupts-disabled context, would trigger a warning.
	 */
	hv_hyp_synic_disable_regs(0);

	return 0;
}
2944
/*
 * hv_synic_resume - syscore resume callback.
 * @data: unused.
 *
 * Re-enables the synic registers on CPU0.
 */
static void hv_synic_resume(void *data)
{
	/*
	 * No hv_stimer_init(0) is required afterwards: hv_synic_suspend()
	 * never unbinds CPU0's timer, and timekeeping_resume() re-enables it
	 * automatically.
	 */
	hv_hyp_synic_enable_regs(0);
}
2955
2956 /* The callbacks run only on CPU0, with irqs_disabled. */
2957 static const struct syscore_ops hv_synic_syscore_ops = {
2958 .suspend = hv_synic_suspend,
2959 .resume = hv_synic_resume,
2960 };
2961
2962 static struct syscore hv_synic_syscore = {
2963 .ops = &hv_synic_syscore_ops,
2964 };
2965
hv_acpi_init(void)2966 static int __init hv_acpi_init(void)
2967 {
2968 int ret;
2969
2970 if (!hv_is_hyperv_initialized())
2971 return -ENODEV;
2972
2973 if (hv_root_partition() && !hv_nested)
2974 return 0;
2975
2976 /*
2977 * Get ACPI resources first.
2978 */
2979 ret = platform_driver_register(&vmbus_platform_driver);
2980 if (ret)
2981 return ret;
2982
2983 if (!vmbus_root_device) {
2984 ret = -ENODEV;
2985 goto cleanup;
2986 }
2987
2988 /*
2989 * If we're on an architecture with a hardcoded hypervisor
2990 * vector (i.e. x86/x64), override the VMbus interrupt found
2991 * in the ACPI tables. Ensure vmbus_irq is not set since the
2992 * normal Linux IRQ mechanism is not used in this case.
2993 */
2994 #ifdef HYPERVISOR_CALLBACK_VECTOR
2995 vmbus_interrupt = HYPERVISOR_CALLBACK_VECTOR;
2996 vmbus_irq = -1;
2997 #endif
2998
2999 hv_debug_init();
3000
3001 ret = vmbus_bus_init();
3002 if (ret)
3003 goto cleanup;
3004
3005 hv_setup_kexec_handler(hv_kexec_handler);
3006 hv_setup_crash_handler(hv_crash_handler);
3007
3008 register_syscore(&hv_synic_syscore);
3009
3010 return 0;
3011
3012 cleanup:
3013 platform_driver_unregister(&vmbus_platform_driver);
3014 vmbus_root_device = NULL;
3015 return ret;
3016 }
3017
vmbus_exit(void)3018 static void __exit vmbus_exit(void)
3019 {
3020 int cpu;
3021
3022 unregister_syscore(&hv_synic_syscore);
3023
3024 hv_remove_kexec_handler();
3025 hv_remove_crash_handler();
3026 vmbus_connection.conn_state = DISCONNECTED;
3027 hv_stimer_global_cleanup();
3028 vmbus_disconnect();
3029 if (vmbus_irq == -1)
3030 hv_remove_vmbus_handler();
3031 else
3032 free_percpu_irq(vmbus_irq, &vmbus_evt);
3033 if (IS_ENABLED(CONFIG_PREEMPT_RT) && vmbus_irq_initialized) {
3034 smpboot_unregister_percpu_thread(&vmbus_irq_threads);
3035 vmbus_irq_initialized = false;
3036 }
3037 for_each_online_cpu(cpu) {
3038 struct hv_per_cpu_context *hv_cpu
3039 = per_cpu_ptr(hv_context.cpu_context, cpu);
3040
3041 tasklet_kill(&hv_cpu->msg_dpc);
3042 }
3043 hv_debug_rm_all_dir();
3044
3045 vmbus_free_channels();
3046 kfree(vmbus_connection.channels);
3047
3048 /*
3049 * The vmbus panic notifier is always registered, hence we should
3050 * also unconditionally unregister it here as well.
3051 */
3052 atomic_notifier_chain_unregister(&panic_notifier_list,
3053 &hyperv_panic_vmbus_unload_block);
3054
3055 bus_unregister(&hv_bus);
3056
3057 cpuhp_remove_state(hyperv_cpuhp_online);
3058 hv_synic_free();
3059 platform_driver_unregister(&vmbus_platform_driver);
3060 }
3061
3062
3063 MODULE_LICENSE("GPL");
3064 MODULE_DESCRIPTION("Microsoft Hyper-V VMBus Driver");
3065
3066 subsys_initcall(hv_acpi_init);
3067 module_exit(vmbus_exit);
3068