xref: /freebsd/sys/dev/hyperv/utilities/hv_snapshot.c (revision 2008043f386721d58158e37e0d7e50df8095942d)
1 /*-
2  * Copyright (c) 2016 Microsoft Corp.
3  * All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions
7  * are met:
8  * 1. Redistributions of source code must retain the above copyright
9  *    notice unmodified, this list of conditions, and the following
10  *    disclaimer.
11  * 2. Redistributions in binary form must reproduce the above copyright
12  *    notice, this list of conditions and the following disclaimer in the
13  *    documentation and/or other materials provided with the distribution.
14  *
15  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
16  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
17  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
18  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
19  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
20  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
21  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
22  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
23  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
24  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
25  */
26 
27 #include <sys/cdefs.h>
28 #include <sys/param.h>
29 #include <sys/kernel.h>
30 #include <sys/conf.h>
31 #include <sys/uio.h>
32 #include <sys/bus.h>
33 #include <sys/malloc.h>
34 #include <sys/mbuf.h>
35 #include <sys/module.h>
36 #include <sys/lock.h>
37 #include <sys/taskqueue.h>
38 #include <sys/selinfo.h>
39 #include <sys/sysctl.h>
40 #include <sys/poll.h>
41 #include <sys/proc.h>
42 #include <sys/queue.h>
43 #include <sys/kthread.h>
44 #include <sys/syscallsubr.h>
45 #include <sys/sysproto.h>
46 #include <sys/un.h>
47 #include <sys/endian.h>
48 #include <sys/sema.h>
49 #include <sys/signal.h>
50 #include <sys/syslog.h>
51 #include <sys/systm.h>
52 #include <sys/mutex.h>
53 #include <sys/callout.h>
54 
55 #include <dev/hyperv/include/hyperv.h>
56 #include <dev/hyperv/utilities/hv_utilreg.h>
57 #include <dev/hyperv/utilities/vmbus_icreg.h>
58 #include <dev/hyperv/utilities/vmbus_icvar.h>
59 
60 #include "hv_snapshot.h"
61 #include "vmbus_if.h"
62 
63 #define VSS_MAJOR		5
64 #define VSS_MINOR		0
65 #define VSS_MSGVER		VMBUS_IC_VERSION(VSS_MAJOR, VSS_MINOR)
66 
67 #define VSS_FWVER_MAJOR		3
68 #define VSS_FWVER		VMBUS_IC_VERSION(VSS_FWVER_MAJOR, 0)
69 
/* Seconds to wait for a daemon/app acknowledgment before failing the host. */
#define TIMEOUT_LIMIT		(15)	// seconds
/*
 * VSS operation codes carried in hv_vss_hdr.operation.  Of these, this
 * driver actively handles FREEZE, THAW, HOT_BACKUP and GET_DM_INFO (see
 * hv_vss_process_request()); the rest are defined for protocol coverage.
 */
enum hv_vss_op {
	VSS_OP_CREATE = 0,
	VSS_OP_DELETE,
	VSS_OP_HOT_BACKUP,
	VSS_OP_GET_DM_INFO,
	VSS_OP_BU_COMPLETE,
	/*
	 * Following operations are only supported with IC version >= 5.0
	 */
	VSS_OP_FREEZE, /* Freeze the file systems in the VM */
	VSS_OP_THAW, /* Unfreeze the file systems */
	VSS_OP_AUTO_RECOVER,
	VSS_OP_COUNT /* Number of operations, must be last */
};
85 
/*
 * Header for all VSS messages.
 */
struct hv_vss_hdr {
	struct vmbus_icmsg_hdr	ic_hdr;		/* common IC message header */
	uint8_t			operation;	/* an enum hv_vss_op value */
	uint8_t			reserved[7];	/* padding/reserved */
} __packed;


/*
 * Flag values for the hv_vss_check_feature. Here supports only
 * one value.
 */
#define VSS_HBU_NO_AUTO_RECOVERY		0x00000005

/* Body of a VSS_OP_HOT_BACKUP (capability check) reply. */
struct hv_vss_check_feature {
	uint32_t flags;			/* only VSS_HBU_NO_AUTO_RECOVERY is reported */
} __packed;

/* Body of a VSS_OP_GET_DM_INFO reply; this driver always replies flags == 0. */
struct hv_vss_check_dm_info {
	uint32_t flags;
} __packed;

/* On-the-wire VSS message exchanged with the host. */
struct hv_vss_msg {
	union {
		struct hv_vss_hdr vss_hdr;
	} hdr;
	union {
		struct hv_vss_check_feature vss_cf;
		struct hv_vss_check_dm_info dm_info;
	} body;
} __packed;

/* One VSS transaction: the user-space view paired with the host message. */
struct hv_vss_req {
	struct hv_vss_opt_msg	opt_msg;	/* used to communicate with daemon */
	struct hv_vss_msg	msg;		/* used to communicate with host */
} __packed;
124 
/* hv_vss debug control: 0 = silent, 1 = errors, >1 = errors + info */
static int hv_vss_log = 0;

#define	hv_vss_log_error(...)	do {				\
	if (hv_vss_log > 0)					\
		log(LOG_ERR, "hv_vss: " __VA_ARGS__);		\
} while (0)

#define	hv_vss_log_info(...) do {				\
	if (hv_vss_log > 1)					\
		log(LOG_INFO, "hv_vss: " __VA_ARGS__);		\
} while (0)

/* VSS IC service GUID; used at probe time to match the vmbus channel. */
static const struct vmbus_ic_desc vmbus_vss_descs[] = {
	{
		.ic_guid = { .hv_guid = {
		    0x29, 0x2e, 0xfa, 0x35, 0x23, 0xea, 0x36, 0x42,
		    0x96, 0xae, 0x3a, 0x6e, 0xba, 0xcb, 0xa4,  0x40} },
		.ic_desc = "Hyper-V VSS"
	},
	VMBUS_IC_DESC_END
};

/* Printable names indexed by the HV_VSS_* opt values; logging only. */
static const char * vss_opt_name[] = {"None", "VSSCheck", "Freeze", "Thaw"};
149 
/* character device prototypes */
static d_open_t		hv_vss_dev_open;
static d_close_t	hv_vss_dev_close;
static d_poll_t		hv_vss_dev_daemon_poll;
static d_ioctl_t	hv_vss_dev_daemon_ioctl;

static d_open_t		hv_appvss_dev_open;
static d_close_t	hv_appvss_dev_close;
static d_poll_t		hv_appvss_dev_poll;
static d_ioctl_t	hv_appvss_dev_ioctl;

/* hv_vss character device structure (file-system freeze/thaw daemon) */
static struct cdevsw hv_vss_cdevsw =
{
	.d_version	= D_VERSION,
	.d_open		= hv_vss_dev_open,
	.d_close	= hv_vss_dev_close,
	.d_poll		= hv_vss_dev_daemon_poll,
	.d_ioctl	= hv_vss_dev_daemon_ioctl,
	.d_name		= FS_VSS_DEV_NAME,
};

/* character device for the application-level freeze/thaw agent */
static struct cdevsw hv_appvss_cdevsw =
{
	.d_version	= D_VERSION,
	.d_open		= hv_appvss_dev_open,
	.d_close	= hv_appvss_dev_close,
	.d_poll		= hv_appvss_dev_poll,
	.d_ioctl	= hv_appvss_dev_ioctl,
	.d_name		= APP_VSS_DEV_NAME,
};
181 
struct hv_vss_sc;
/*
 * Global state to track cdev.  One instance exists for the fs-freeze
 * daemon and one for the application agent, both embedded in hv_vss_sc.
 */
struct hv_vss_dev_sc {
	/*
	 * msg was transferred from host to notify queue, and
	 * ack queue. Finally, it was recycled to free list.
	 */
	STAILQ_HEAD(, hv_vss_req_internal) 	to_notify_queue;	/* pending IOCHVVSSREAD */
	STAILQ_HEAD(, hv_vss_req_internal) 	to_ack_queue;		/* pending IOCHVVSSWRITE */
	struct hv_vss_sc			*sc;		/* back-pointer to driver softc */
	struct proc				*proc_task;	/* process that opened this cdev */
	struct selinfo				hv_vss_selinfo;	/* poll()/select() support */
};
/*
 * Global state to track and synchronize the transaction requests from the host.
 * The VSS allows user to register their function to do freeze/thaw for application.
 * VSS kernel will notify both vss daemon and user application if it is registered.
 * The implementation state transition is illustrated by:
 * https://clovertrail.github.io/assets/vssdot.png
 */
typedef struct hv_vss_sc {
	struct vmbus_ic_softc			util_sc;
	device_t				dev;

	/* deferred host-message processing (hv_vss_process_request) */
	struct task				task;

	/*
	 * mutex is used to protect access of list/queue,
	 * callout in request is also used this mutex.
	 */
	struct mtx				pending_mutex;
	/*
	 * req_free_list contains all free items
	 */
	LIST_HEAD(, hv_vss_req_internal)	req_free_list;

	/* Indicates if daemon registered with driver */
	boolean_t				register_done;

	boolean_t				app_register_done;

	/* cdev for file system freeze/thaw */
	struct cdev				*hv_vss_dev;
	/* cdev for application freeze/thaw */
	struct cdev				*hv_appvss_dev;

	/* sc for app */
	struct hv_vss_dev_sc			app_sc;
	/* sc for daemon */
	struct hv_vss_dev_sc			daemon_sc;
} hv_vss_sc;

/* One in-flight (or free) VSS transaction. */
typedef struct hv_vss_req_internal {
	LIST_ENTRY(hv_vss_req_internal)		link;	/* linkage on req_free_list */
	STAILQ_ENTRY(hv_vss_req_internal)	slink;	/* linkage on notify/ack queues */
	struct hv_vss_req			vss_req;

	/* Rcv buffer for communicating with the host*/
	uint8_t					*rcv_buf;
	/* Length of host message */
	uint32_t				host_msg_len;
	/* Host message id */
	uint64_t				host_msg_id;

	hv_vss_sc				*sc;

	/* ack timeout; initialized on pending_mutex, see hv_vss_timeout() */
	struct callout				callout;
} hv_vss_req_internal;
252 
/*
 * Scan `queue` for the request whose user-visible msgid equals `id` and,
 * if found, unlink it.  On exit `reqp` points at the removed request, or
 * is NULL when nothing matched (including an empty queue).  Caller must
 * hold sc->pending_mutex.
 */
#define SEARCH_REMOVE_REQ_LOCKED(reqp, queue, link, tmp, id)		\
	do {								\
		STAILQ_FOREACH_SAFE(reqp, queue, link, tmp) {		\
			if (reqp->vss_req.opt_msg.msgid == id) {	\
				STAILQ_REMOVE(queue,			\
				    reqp, hv_vss_req_internal, link);	\
				break;					\
			}						\
		}							\
	} while (0)
263 
264 static bool
265 hv_vss_is_daemon_killed_after_launch(hv_vss_sc *sc)
266 {
267 	return (!sc->register_done && sc->daemon_sc.proc_task);
268 }
269 
270 /*
271  * Callback routine that gets called whenever there is a message from host
272  */
273 static void
274 hv_vss_callback(struct vmbus_channel *chan __unused, void *context)
275 {
276 	hv_vss_sc *sc = (hv_vss_sc*)context;
277 	if (hv_vss_is_daemon_killed_after_launch(sc))
278 		hv_vss_log_info("%s: daemon was killed!\n", __func__);
279 	if (sc->register_done || sc->daemon_sc.proc_task) {
280 		hv_vss_log_info("%s: Queuing work item\n", __func__);
281 		if (hv_vss_is_daemon_killed_after_launch(sc))
282 			hv_vss_log_info("%s: daemon was killed!\n", __func__);
283 		taskqueue_enqueue(taskqueue_thread, &sc->task);
284 	} else {
285 		hv_vss_log_info("%s: daemon has never been registered\n", __func__);
286 	}
287 	hv_vss_log_info("%s: received msg from host\n", __func__);
288 }
289 /*
290  * Send the response back to the host.
291  */
292 static void
293 hv_vss_respond_host(uint8_t *rcv_buf, struct vmbus_channel *ch,
294     uint32_t recvlen, uint64_t requestid, uint32_t error)
295 {
296 	struct vmbus_icmsg_hdr *hv_icmsg_hdrp;
297 
298 	hv_icmsg_hdrp = (struct vmbus_icmsg_hdr *)rcv_buf;
299 
300 	hv_icmsg_hdrp->ic_status = error;
301 	hv_icmsg_hdrp->ic_flags = HV_ICMSGHDRFLAG_TRANSACTION | HV_ICMSGHDRFLAG_RESPONSE;
302 
303 	error = vmbus_chan_send(ch, VMBUS_CHANPKT_TYPE_INBAND, 0,
304 	    rcv_buf, recvlen, requestid);
305 	if (error)
306 		hv_vss_log_info("%s: hv_vss_respond_host: sendpacket error:%d\n",
307 		    __func__, error);
308 }
309 
/*
 * Report the final status of a transaction to the host and recycle the
 * request onto the free list.  Caller must hold sc->pending_mutex.
 */
static void
hv_vss_notify_host_result_locked(struct hv_vss_req_internal *reqp, uint32_t status)
{
	struct hv_vss_msg* msg = (struct hv_vss_msg *)reqp->rcv_buf;
	hv_vss_sc *sc = reqp->sc;
	/* A capability-check reply must also carry the feature flags. */
	if (reqp->vss_req.opt_msg.opt == HV_VSS_CHECK) {
		msg->body.vss_cf.flags = VSS_HBU_NO_AUTO_RECOVERY;
	}
	hv_vss_log_info("%s, %s response %s to host\n", __func__,
	    vss_opt_name[reqp->vss_req.opt_msg.opt],
	    status == HV_S_OK ? "Success" : "Fail");
	hv_vss_respond_host(reqp->rcv_buf, vmbus_get_channel(reqp->sc->dev),
	    reqp->host_msg_len, reqp->host_msg_id, status);
	/* recycle the request */
	LIST_INSERT_HEAD(&sc->req_free_list, reqp, link);
}
326 
327 static void
328 hv_vss_notify_host_result(struct hv_vss_req_internal *reqp, uint32_t status)
329 {
330 	mtx_lock(&reqp->sc->pending_mutex);
331 	hv_vss_notify_host_result_locked(reqp, status);
332 	mtx_unlock(&reqp->sc->pending_mutex);
333 }
334 
335 static void
336 hv_vss_cp_vssreq_to_user(struct hv_vss_req_internal *reqp,
337     struct hv_vss_opt_msg *userdata)
338 {
339 	struct hv_vss_req *hv_vss_dev_buf;
340 	hv_vss_dev_buf = &reqp->vss_req;
341 	hv_vss_dev_buf->opt_msg.opt = HV_VSS_NONE;
342 	switch (reqp->vss_req.msg.hdr.vss_hdr.operation) {
343 	case VSS_OP_FREEZE:
344 		hv_vss_dev_buf->opt_msg.opt = HV_VSS_FREEZE;
345 		break;
346 	case VSS_OP_THAW:
347 		hv_vss_dev_buf->opt_msg.opt = HV_VSS_THAW;
348 		break;
349 	case VSS_OP_HOT_BACKUP:
350 		hv_vss_dev_buf->opt_msg.opt = HV_VSS_CHECK;
351 		break;
352 	}
353 	*userdata = hv_vss_dev_buf->opt_msg;
354 	hv_vss_log_info("%s, read data from user for "
355 	    "%s (%ju) \n", __func__, vss_opt_name[userdata->opt],
356 	    (uintmax_t)userdata->msgid);
357 }
358 
359 /**
360  * Remove the request id from app notifiy or ack queue,
361  * and recyle the request by inserting it to free list.
362  *
363  * When app was notified but not yet sending ack, the request
364  * should locate in either notify queue or ack queue.
365  */
static struct hv_vss_req_internal*
hv_vss_drain_req_queue_locked(hv_vss_sc *sc, uint64_t req_id)
{
	struct hv_vss_req_internal *reqp, *tmp;
	/*
	 * Try the four possible queues in turn; at most one of them can
	 * hold the request.  Caller must hold sc->pending_mutex.
	 */
	SEARCH_REMOVE_REQ_LOCKED(reqp, &sc->daemon_sc.to_notify_queue,
	    slink, tmp, req_id);
	if (reqp == NULL)
		SEARCH_REMOVE_REQ_LOCKED(reqp, &sc->daemon_sc.to_ack_queue,
		    slink, tmp, req_id);
	if (reqp == NULL)
		SEARCH_REMOVE_REQ_LOCKED(reqp, &sc->app_sc.to_notify_queue,
		    slink, tmp, req_id);
	if (reqp == NULL)
		SEARCH_REMOVE_REQ_LOCKED(reqp, &sc->app_sc.to_ack_queue, slink,
		    tmp, req_id);
	/* NULL when the request was not found on any queue. */
	return (reqp);
}
383 /**
384  * Actions for daemon who has been notified.
385  */
386 static void
387 hv_vss_notified(struct hv_vss_dev_sc *dev_sc, struct hv_vss_opt_msg *userdata)
388 {
389 	struct hv_vss_req_internal *reqp;
390 	mtx_lock(&dev_sc->sc->pending_mutex);
391 	if (!STAILQ_EMPTY(&dev_sc->to_notify_queue)) {
392 		reqp = STAILQ_FIRST(&dev_sc->to_notify_queue);
393 		hv_vss_cp_vssreq_to_user(reqp, userdata);
394 		STAILQ_REMOVE_HEAD(&dev_sc->to_notify_queue, slink);
395 		/* insert the msg to queue for write */
396 		STAILQ_INSERT_TAIL(&dev_sc->to_ack_queue, reqp, slink);
397 		userdata->status = VSS_SUCCESS;
398 	} else {
399 		/* Timeout occur, thus request was removed from queue. */
400 		hv_vss_log_info("%s: notify queue is empty!\n", __func__);
401 		userdata->status = VSS_FAIL;
402 	}
403 	mtx_unlock(&dev_sc->sc->pending_mutex);
404 }
405 
/*
 * Queue `reqp` on dev_sc's notify queue and wake up any thread sleeping
 * in poll() on that cdev.  selwakeup() is issued after the mutex is
 * dropped.
 */
static void
hv_vss_notify(struct hv_vss_dev_sc *dev_sc, struct hv_vss_req_internal *reqp)
{
	uint32_t opt = reqp->vss_req.opt_msg.opt;
	mtx_lock(&dev_sc->sc->pending_mutex);
	STAILQ_INSERT_TAIL(&dev_sc->to_notify_queue, reqp, slink);
	hv_vss_log_info("%s: issuing query %s (%ju) to %s\n", __func__,
	    vss_opt_name[opt], (uintmax_t)reqp->vss_req.opt_msg.msgid,
	    &dev_sc->sc->app_sc == dev_sc ? "app" : "daemon");
	mtx_unlock(&dev_sc->sc->pending_mutex);
	selwakeup(&dev_sc->hv_vss_selinfo);
}
418 
419 /**
420  * Actions for daemon who has acknowledged.
421  */
/**
 * Actions for daemon who has acknowledged.
 *
 * Looks up the acknowledged request on the daemon's ack queue by msgid.
 * For FREEZE/CHECK the daemon is the last hop, so the result goes to the
 * host.  For THAW, a successful ack is forwarded to the app agent (if one
 * registered); otherwise the result goes to the host directly.  When the
 * host is answered, the timeout callout is drained first.
 */
static void
hv_vss_daemon_acked(struct hv_vss_dev_sc *dev_sc, struct hv_vss_opt_msg *userdata)
{
	struct hv_vss_req_internal	*reqp, *tmp;
	uint64_t			req_id;
	int				opt;
	uint32_t			status;

	opt = userdata->opt;
	req_id = userdata->msgid;
	status = userdata->status;
	/* make sure the reserved fields are all zeros. */
	memset(&userdata->reserved, 0, sizeof(struct hv_vss_opt_msg) -
	    __offsetof(struct hv_vss_opt_msg, reserved));
	mtx_lock(&dev_sc->sc->pending_mutex);
	SEARCH_REMOVE_REQ_LOCKED(reqp, &dev_sc->to_ack_queue, slink, tmp, req_id);
	mtx_unlock(&dev_sc->sc->pending_mutex);
	if (reqp == NULL) {
		/* the timeout handler already drained this request */
		hv_vss_log_info("%s Timeout: fail to find daemon ack request\n",
		    __func__);
		userdata->status = VSS_FAIL;
		return;
	}
	KASSERT(opt == reqp->vss_req.opt_msg.opt, ("Mismatched VSS operation!"));
	hv_vss_log_info("%s, get response %d from daemon for %s (%ju) \n", __func__,
	    status, vss_opt_name[opt], (uintmax_t)req_id);
	switch (opt) {
	case HV_VSS_CHECK:
	case HV_VSS_FREEZE:
		callout_drain(&reqp->callout);
		hv_vss_notify_host_result(reqp,
		    status == VSS_SUCCESS ? HV_S_OK : HV_E_FAIL);
		break;
	case HV_VSS_THAW:
		/* thaw sequence is kernel -> daemon -> app, so forward */
		if (dev_sc->sc->app_register_done) {
			if (status == VSS_SUCCESS) {
				hv_vss_notify(&dev_sc->sc->app_sc, reqp);
			} else {
				/* handle error */
				callout_drain(&reqp->callout);
				hv_vss_notify_host_result(reqp, HV_E_FAIL);
			}
		} else {
			callout_drain(&reqp->callout);
			hv_vss_notify_host_result(reqp,
			    status == VSS_SUCCESS ? HV_S_OK : HV_E_FAIL);
		}
		break;
	}
}
472 
473 /**
474  * Actions for app who has acknowledged.
475  */
476 static void
477 hv_vss_app_acked(struct hv_vss_dev_sc *dev_sc, struct hv_vss_opt_msg *userdata)
478 {
479 	struct hv_vss_req_internal	*reqp, *tmp;
480 	uint64_t			req_id;
481 	int				opt;
482 	uint8_t				status;
483 
484 	opt = userdata->opt;
485 	req_id = userdata->msgid;
486 	status = userdata->status;
487 	/* make sure the reserved fields are all zeros. */
488 	memset(&userdata->reserved, 0, sizeof(struct hv_vss_opt_msg) -
489 	    __offsetof(struct hv_vss_opt_msg, reserved));
490 	mtx_lock(&dev_sc->sc->pending_mutex);
491 	SEARCH_REMOVE_REQ_LOCKED(reqp, &dev_sc->to_ack_queue, slink, tmp, req_id);
492 	mtx_unlock(&dev_sc->sc->pending_mutex);
493 	if (reqp == NULL) {
494 		hv_vss_log_info("%s Timeout: fail to find app ack request\n",
495 		    __func__);
496 		userdata->status = VSS_FAIL;
497 		return;
498 	}
499 	KASSERT(opt == reqp->vss_req.opt_msg.opt, ("Mismatched VSS operation!"));
500 	hv_vss_log_info("%s, get response %d from app for %s (%ju) \n",
501 	    __func__, status, vss_opt_name[opt], (uintmax_t)req_id);
502 	if (dev_sc->sc->register_done) {
503 		switch (opt) {
504 		case HV_VSS_CHECK:
505 		case HV_VSS_FREEZE:
506 			if (status == VSS_SUCCESS) {
507 				hv_vss_notify(&dev_sc->sc->daemon_sc, reqp);
508 			} else {
509 				/* handle error */
510 				callout_drain(&reqp->callout);
511 				hv_vss_notify_host_result(reqp, HV_E_FAIL);
512 			}
513 			break;
514 		case HV_VSS_THAW:
515 			callout_drain(&reqp->callout);
516 			hv_vss_notify_host_result(reqp,
517 			    status == VSS_SUCCESS ? HV_S_OK : HV_E_FAIL);
518 			break;
519 		}
520 	} else {
521 		hv_vss_log_info("%s, Fatal: vss daemon was killed\n", __func__);
522 	}
523 }
524 
525 static int
526 hv_vss_dev_open(struct cdev *dev, int oflags, int devtype, struct thread *td)
527 {
528 	struct proc     *td_proc;
529 	td_proc = td->td_proc;
530 
531 	struct hv_vss_dev_sc *dev_sc = (struct hv_vss_dev_sc*)dev->si_drv1;
532 	hv_vss_log_info("%s: %s opens device \"%s\" successfully.\n",
533 	    __func__, td_proc->p_comm, FS_VSS_DEV_NAME);
534 
535 	if (dev_sc->sc->register_done)
536 		return (EBUSY);
537 
538 	dev_sc->sc->register_done = true;
539 	hv_vss_callback(vmbus_get_channel(dev_sc->sc->dev), dev_sc->sc);
540 
541 	dev_sc->proc_task = curproc;
542 	return (0);
543 }
544 
545 static int
546 hv_vss_dev_close(struct cdev *dev, int fflag __unused, int devtype __unused,
547 				 struct thread *td)
548 {
549 	struct proc     *td_proc;
550 	td_proc = td->td_proc;
551 
552 	struct hv_vss_dev_sc *dev_sc = (struct hv_vss_dev_sc*)dev->si_drv1;
553 
554 	hv_vss_log_info("%s: %s closes device \"%s\"\n",
555 	    __func__, td_proc->p_comm, FS_VSS_DEV_NAME);
556 	dev_sc->sc->register_done = false;
557 	return (0);
558 }
559 
560 static int
561 hv_vss_dev_daemon_ioctl(struct cdev *dev, u_long cmd, caddr_t data, int flag,
562     struct thread *td)
563 {
564 	struct proc			*td_proc;
565 	struct hv_vss_dev_sc		*sc;
566 
567 	td_proc = td->td_proc;
568 	sc = (struct hv_vss_dev_sc*)dev->si_drv1;
569 
570 	hv_vss_log_info("%s: %s invoked vss ioctl\n", __func__, td_proc->p_comm);
571 
572 	struct hv_vss_opt_msg* userdata = (struct hv_vss_opt_msg*)data;
573 	switch(cmd) {
574 	case IOCHVVSSREAD:
575 		hv_vss_notified(sc, userdata);
576 		break;
577 	case IOCHVVSSWRITE:
578 		hv_vss_daemon_acked(sc, userdata);
579 		break;
580 	}
581 	return (0);
582 }
583 
584 /*
585  * hv_vss_daemon poll invokes this function to check if data is available
586  * for daemon to read.
587  */
588 static int
589 hv_vss_dev_daemon_poll(struct cdev *dev, int events, struct thread *td)
590 {
591 	int revent = 0;
592 	struct hv_vss_dev_sc *dev_sc = (struct hv_vss_dev_sc*)dev->si_drv1;
593 
594 	mtx_lock(&dev_sc->sc->pending_mutex);
595 	/**
596 	 * if there is data ready, inform daemon's poll
597 	 */
598 	if (!STAILQ_EMPTY(&dev_sc->to_notify_queue))
599 		revent = POLLIN;
600 	if (revent == 0)
601 		selrecord(td, &dev_sc->hv_vss_selinfo);
602 	hv_vss_log_info("%s return 0x%x\n", __func__, revent);
603 	mtx_unlock(&dev_sc->sc->pending_mutex);
604 	return (revent);
605 }
606 
607 static int
608 hv_appvss_dev_open(struct cdev *dev, int oflags, int devtype, struct thread *td)
609 {
610 	struct proc     *td_proc;
611 	td_proc = td->td_proc;
612 
613 	struct hv_vss_dev_sc *dev_sc = (struct hv_vss_dev_sc*)dev->si_drv1;
614 	hv_vss_log_info("%s: %s opens device \"%s\" successfully.\n",
615 	    __func__, td_proc->p_comm, APP_VSS_DEV_NAME);
616 
617 	if (dev_sc->sc->app_register_done)
618 		return (EBUSY);
619 
620 	dev_sc->sc->app_register_done = true;
621 	dev_sc->proc_task = curproc;
622 	return (0);
623 }
624 
625 static int
626 hv_appvss_dev_close(struct cdev *dev, int fflag __unused, int devtype __unused,
627 				 struct thread *td)
628 {
629 	struct proc     *td_proc;
630 	td_proc = td->td_proc;
631 
632 	struct hv_vss_dev_sc *dev_sc = (struct hv_vss_dev_sc*)dev->si_drv1;
633 
634 	hv_vss_log_info("%s: %s closes device \"%s\".\n",
635 	    __func__, td_proc->p_comm, APP_VSS_DEV_NAME);
636 	dev_sc->sc->app_register_done = false;
637 	return (0);
638 }
639 
640 static int
641 hv_appvss_dev_ioctl(struct cdev *dev, u_long cmd, caddr_t data, int flag,
642     struct thread *td)
643 {
644 	struct proc			*td_proc;
645 	struct hv_vss_dev_sc		*dev_sc;
646 
647 	td_proc = td->td_proc;
648 	dev_sc = (struct hv_vss_dev_sc*)dev->si_drv1;
649 
650 	hv_vss_log_info("%s: %s invoked vss ioctl\n", __func__, td_proc->p_comm);
651 
652 	struct hv_vss_opt_msg* userdata = (struct hv_vss_opt_msg*)data;
653 	switch(cmd) {
654 	case IOCHVVSSREAD:
655 		hv_vss_notified(dev_sc, userdata);
656 		break;
657 	case IOCHVVSSWRITE:
658 		hv_vss_app_acked(dev_sc, userdata);
659 		break;
660 	}
661 	return (0);
662 }
663 
664 /*
665  * hv_vss_daemon poll invokes this function to check if data is available
666  * for daemon to read.
667  */
668 static int
669 hv_appvss_dev_poll(struct cdev *dev, int events, struct thread *td)
670 {
671 	int revent = 0;
672 	struct hv_vss_dev_sc *dev_sc = (struct hv_vss_dev_sc*)dev->si_drv1;
673 
674 	mtx_lock(&dev_sc->sc->pending_mutex);
675 	/**
676 	 * if there is data ready, inform daemon's poll
677 	 */
678 	if (!STAILQ_EMPTY(&dev_sc->to_notify_queue))
679 		revent = POLLIN;
680 	if (revent == 0)
681 		selrecord(td, &dev_sc->hv_vss_selinfo);
682 	hv_vss_log_info("%s return 0x%x\n", __func__, revent);
683 	mtx_unlock(&dev_sc->sc->pending_mutex);
684 	return (revent);
685 }
686 
/*
 * Callout handler fired when neither daemon nor app acknowledged a
 * request within TIMEOUT_LIMIT seconds.  The callout was initialized
 * with callout_init_mtx() on sc->pending_mutex, so it runs with that
 * mutex held.  The request is pulled off whichever notify/ack queue it
 * sits on and HV_E_FAIL is reported to the host.
 */
static void
hv_vss_timeout(void *arg)
{
	hv_vss_req_internal *reqp = arg;
	hv_vss_req_internal *request __diagused;
	hv_vss_sc* sc = reqp->sc;
	uint64_t req_id = reqp->vss_req.opt_msg.msgid;
	/* This thread is locked */
	KASSERT(mtx_owned(&sc->pending_mutex), ("mutex lock is not owned!"));
	request = hv_vss_drain_req_queue_locked(sc, req_id);
	KASSERT(request != NULL, ("timeout but fail to find request"));
	hv_vss_notify_host_result_locked(reqp, HV_E_FAIL);
}
700 
701 /*
702  * This routine is called whenever a message is received from the host
703  */
/*
 * This routine is called whenever a message is received from the host.
 * Initialize a recycled request for a new host transaction: record the
 * host packet parameters, snapshot the host message, translate the
 * operation into an HV_VSS_* opt, and stamp a fresh msgid.  The trailing
 * callout member is deliberately NOT zeroed - it stays initialized from
 * hv_vss_init_send_receive_queue().
 */
static void
hv_vss_init_req(hv_vss_req_internal *reqp,
    uint32_t recvlen, uint64_t requestid, uint8_t *vss_buf, hv_vss_sc *sc)
{
	struct timespec vm_ts;
	struct hv_vss_msg* msg = (struct hv_vss_msg *)vss_buf;

	/* clear everything up to, but not including, the callout */
	memset(reqp, 0, __offsetof(hv_vss_req_internal, callout));
	reqp->host_msg_len = recvlen;
	reqp->host_msg_id = requestid;
	reqp->rcv_buf = vss_buf;
	reqp->sc = sc;
	memcpy(&reqp->vss_req.msg,
	    (struct hv_vss_msg *)vss_buf, sizeof(struct hv_vss_msg));
	/* set the opt for users */
	switch (msg->hdr.vss_hdr.operation) {
	case VSS_OP_FREEZE:
		reqp->vss_req.opt_msg.opt = HV_VSS_FREEZE;
		break;
	case VSS_OP_THAW:
		reqp->vss_req.opt_msg.opt = HV_VSS_THAW;
		break;
	case VSS_OP_HOT_BACKUP:
		reqp->vss_req.opt_msg.opt = HV_VSS_CHECK;
		break;
	}
	/* Use a timestamp as msg request ID */
	nanotime(&vm_ts);
	reqp->vss_req.opt_msg.msgid = (vm_ts.tv_sec * NANOSEC) + vm_ts.tv_nsec;
}
734 
735 static hv_vss_req_internal*
736 hv_vss_get_new_req_locked(hv_vss_sc *sc)
737 {
738 	hv_vss_req_internal *reqp;
739 	if (!STAILQ_EMPTY(&sc->daemon_sc.to_notify_queue) ||
740 	    !STAILQ_EMPTY(&sc->daemon_sc.to_ack_queue) ||
741 	    !STAILQ_EMPTY(&sc->app_sc.to_notify_queue) ||
742 	    !STAILQ_EMPTY(&sc->app_sc.to_ack_queue)) {
743 		/*
744 		 * There is request coming from host before
745 		 * finishing previous requests
746 		 */
747 		hv_vss_log_info("%s: Warning: there is new request "
748 		    "coming before finishing previous requests\n", __func__);
749 		return (NULL);
750 	}
751 	if (LIST_EMPTY(&sc->req_free_list)) {
752 		/* TODO Error: no buffer */
753 		hv_vss_log_info("Error: No buffer\n");
754 		return (NULL);
755 	}
756 	reqp = LIST_FIRST(&sc->req_free_list);
757 	LIST_REMOVE(reqp, link);
758 	return (reqp);
759 }
760 
static void
hv_vss_start_notify(hv_vss_req_internal *reqp, uint32_t opt)
{
	hv_vss_sc *sc = reqp->sc;
	/*
	 * Freeze/Check notification sequence: kernel -> app -> daemon(fs)
	 * Thaw notification sequence:         kernel -> daemon(fs) -> app
	 *
	 * We should wake up the daemon, in case it's doing poll().
	 * The response should be received within TIMEOUT_LIMIT (15) seconds,
	 * otherwise hv_vss_timeout() fails the transaction.
	 */
	switch (opt) {
	case VSS_OP_FREEZE:
	case VSS_OP_HOT_BACKUP:
		/* app first (when registered), then the fs daemon */
		if (sc->app_register_done)
			hv_vss_notify(&sc->app_sc, reqp);
		else
			hv_vss_notify(&sc->daemon_sc, reqp);
		callout_reset(&reqp->callout, TIMEOUT_LIMIT * hz,
		    hv_vss_timeout, reqp);
		break;
	case VSS_OP_THAW:
		/* fs daemon first; the app is notified on its ack */
		hv_vss_notify(&sc->daemon_sc, reqp);
		callout_reset(&reqp->callout, TIMEOUT_LIMIT * hz,
		    hv_vss_timeout, reqp);
		break;
	}
}
789 
790 /*
791  * Function to read the vss request buffer from host
792  * and interact with daemon
793  */
794 static void
795 hv_vss_process_request(void *context, int pending __unused)
796 {
797 	uint8_t *vss_buf;
798 	struct vmbus_channel *channel;
799 	uint32_t recvlen = 0;
800 	uint64_t requestid;
801 	struct vmbus_icmsg_hdr *icmsghdrp;
802 	int ret = 0;
803 	hv_vss_sc *sc;
804 	hv_vss_req_internal *reqp;
805 
806 	hv_vss_log_info("%s: entering hv_vss_process_request\n", __func__);
807 
808 	sc = (hv_vss_sc*)context;
809 	vss_buf = sc->util_sc.ic_buf;
810 	channel = vmbus_get_channel(sc->dev);
811 
812 	recvlen = sc->util_sc.ic_buflen;
813 	ret = vmbus_chan_recv(channel, vss_buf, &recvlen, &requestid);
814 	KASSERT(ret != ENOBUFS, ("hvvss recvbuf is not large enough"));
815 	/* XXX check recvlen to make sure that it contains enough data */
816 
817 	while ((ret == 0) && (recvlen > 0)) {
818 		icmsghdrp = (struct vmbus_icmsg_hdr *)vss_buf;
819 
820 		if (icmsghdrp->ic_type == HV_ICMSGTYPE_NEGOTIATE) {
821 			ret = vmbus_ic_negomsg(&sc->util_sc, vss_buf,
822 			    &recvlen, VSS_FWVER, VSS_MSGVER);
823 			hv_vss_respond_host(vss_buf, vmbus_get_channel(sc->dev),
824 			    recvlen, requestid, ret);
825 			hv_vss_log_info("%s: version negotiated\n", __func__);
826 		} else if (!hv_vss_is_daemon_killed_after_launch(sc)) {
827 			struct hv_vss_msg* msg = (struct hv_vss_msg *)vss_buf;
828 			switch(msg->hdr.vss_hdr.operation) {
829 			case VSS_OP_FREEZE:
830 			case VSS_OP_THAW:
831 			case VSS_OP_HOT_BACKUP:
832 				mtx_lock(&sc->pending_mutex);
833 				reqp = hv_vss_get_new_req_locked(sc);
834 				mtx_unlock(&sc->pending_mutex);
835 				if (reqp == NULL) {
836 					/* ignore this request from host */
837 					break;
838 				}
839 				hv_vss_init_req(reqp, recvlen, requestid, vss_buf, sc);
840 				hv_vss_log_info("%s: receive %s (%ju) from host\n",
841 				    __func__,
842 				    vss_opt_name[reqp->vss_req.opt_msg.opt],
843 				    (uintmax_t)reqp->vss_req.opt_msg.msgid);
844 				hv_vss_start_notify(reqp, msg->hdr.vss_hdr.operation);
845 				break;
846 			case VSS_OP_GET_DM_INFO:
847 				hv_vss_log_info("%s: receive GET_DM_INFO from host\n",
848 				    __func__);
849 				msg->body.dm_info.flags = 0;
850 				hv_vss_respond_host(vss_buf, vmbus_get_channel(sc->dev),
851 				    recvlen, requestid, HV_S_OK);
852 				break;
853 			default:
854 				device_printf(sc->dev, "Unknown opt from host: %d\n",
855 				    msg->hdr.vss_hdr.operation);
856 				break;
857 			}
858 		} else {
859 			/* daemon was killed for some reason after it was launched */
860 			struct hv_vss_msg* msg = (struct hv_vss_msg *)vss_buf;
861 			switch(msg->hdr.vss_hdr.operation) {
862 			case VSS_OP_FREEZE:
863 				hv_vss_log_info("%s: response fail for FREEZE\n",
864 				    __func__);
865 				break;
866 			case VSS_OP_THAW:
867 				hv_vss_log_info("%s: response fail for THAW\n",
868 				    __func__);
869 				break;
870 			case VSS_OP_HOT_BACKUP:
871 				hv_vss_log_info("%s: response fail for HOT_BACKUP\n",
872 				    __func__);
873 				msg->body.vss_cf.flags = VSS_HBU_NO_AUTO_RECOVERY;
874 				break;
875 			case VSS_OP_GET_DM_INFO:
876 				hv_vss_log_info("%s: response fail for GET_DM_INFO\n",
877 				    __func__);
878 				msg->body.dm_info.flags = 0;
879 				break;
880 			default:
881 				device_printf(sc->dev, "Unknown opt from host: %d\n",
882 				    msg->hdr.vss_hdr.operation);
883 				break;
884 			}
885 			hv_vss_respond_host(vss_buf, vmbus_get_channel(sc->dev),
886 			    recvlen, requestid, HV_E_FAIL);
887 		}
888 		/*
889 		 * Try reading next buffer
890 		 */
891 		recvlen = sc->util_sc.ic_buflen;
892 		ret = vmbus_chan_recv(channel, vss_buf, &recvlen, &requestid);
893 		KASSERT(ret != ENOBUFS, ("hvvss recvbuf is not large enough"));
894 		/* XXX check recvlen to make sure that it contains enough data */
895 
896 		hv_vss_log_info("%s: read: context %p, ret =%d, recvlen=%d\n",
897 		    __func__, context, ret, recvlen);
898 	}
899 }
900 
/* Device probe: match the vmbus channel against the VSS service GUID. */
static int
hv_vss_probe(device_t dev)
{
	return (vmbus_ic_probe(dev, vmbus_vss_descs));
}
906 
907 static int
908 hv_vss_init_send_receive_queue(device_t dev)
909 {
910 	hv_vss_sc *sc = (hv_vss_sc*)device_get_softc(dev);
911 	int i;
912 	const int max_list = 4; /* It is big enough for the list */
913 	struct hv_vss_req_internal* reqp;
914 
915 	LIST_INIT(&sc->req_free_list);
916 	STAILQ_INIT(&sc->daemon_sc.to_notify_queue);
917 	STAILQ_INIT(&sc->daemon_sc.to_ack_queue);
918 	STAILQ_INIT(&sc->app_sc.to_notify_queue);
919 	STAILQ_INIT(&sc->app_sc.to_ack_queue);
920 
921 	for (i = 0; i < max_list; i++) {
922 		reqp = malloc(sizeof(struct hv_vss_req_internal),
923 		    M_DEVBUF, M_WAITOK|M_ZERO);
924 		LIST_INSERT_HEAD(&sc->req_free_list, reqp, link);
925 		callout_init_mtx(&reqp->callout, &sc->pending_mutex, 0);
926 	}
927 	return (0);
928 }
929 
930 static int
931 hv_vss_destroy_send_receive_queue(device_t dev)
932 {
933 	hv_vss_sc *sc = (hv_vss_sc*)device_get_softc(dev);
934 	hv_vss_req_internal* reqp;
935 
936 	while (!LIST_EMPTY(&sc->req_free_list)) {
937 		reqp = LIST_FIRST(&sc->req_free_list);
938 		LIST_REMOVE(reqp, link);
939 		free(reqp, M_DEVBUF);
940 	}
941 
942 	while (!STAILQ_EMPTY(&sc->daemon_sc.to_notify_queue)) {
943 		reqp = STAILQ_FIRST(&sc->daemon_sc.to_notify_queue);
944 		STAILQ_REMOVE_HEAD(&sc->daemon_sc.to_notify_queue, slink);
945 		free(reqp, M_DEVBUF);
946 	}
947 
948 	while (!STAILQ_EMPTY(&sc->daemon_sc.to_ack_queue)) {
949 		reqp = STAILQ_FIRST(&sc->daemon_sc.to_ack_queue);
950 		STAILQ_REMOVE_HEAD(&sc->daemon_sc.to_ack_queue, slink);
951 		free(reqp, M_DEVBUF);
952 	}
953 
954 	while (!STAILQ_EMPTY(&sc->app_sc.to_notify_queue)) {
955 		reqp = STAILQ_FIRST(&sc->app_sc.to_notify_queue);
956 		STAILQ_REMOVE_HEAD(&sc->app_sc.to_notify_queue, slink);
957 		free(reqp, M_DEVBUF);
958 	}
959 
960 	while (!STAILQ_EMPTY(&sc->app_sc.to_ack_queue)) {
961 		reqp = STAILQ_FIRST(&sc->app_sc.to_ack_queue);
962 		STAILQ_REMOVE_HEAD(&sc->app_sc.to_ack_queue, slink);
963 		free(reqp, M_DEVBUF);
964 	}
965 	return (0);
966 }
967 
968 static int
969 hv_vss_attach(device_t dev)
970 {
971 	int error;
972 	struct sysctl_oid_list *child;
973 	struct sysctl_ctx_list *ctx;
974 
975 	hv_vss_sc *sc = (hv_vss_sc*)device_get_softc(dev);
976 
977 	sc->dev = dev;
978 	mtx_init(&sc->pending_mutex, "hv_vss pending mutex", NULL, MTX_DEF);
979 
980 	ctx = device_get_sysctl_ctx(dev);
981 	child = SYSCTL_CHILDREN(device_get_sysctl_tree(dev));
982 
983 	SYSCTL_ADD_INT(ctx, child, OID_AUTO, "hv_vss_log",
984 	    CTLFLAG_RWTUN, &hv_vss_log, 0, "Hyperv VSS service log level");
985 
986 	TASK_INIT(&sc->task, 0, hv_vss_process_request, sc);
987 	hv_vss_init_send_receive_queue(dev);
988 	/* create character device for file system freeze/thaw */
989 	error = make_dev_p(MAKEDEV_CHECKNAME | MAKEDEV_WAITOK,
990 		    &sc->hv_vss_dev,
991 		    &hv_vss_cdevsw,
992 		    0,
993 		    UID_ROOT,
994 		    GID_WHEEL,
995 		    0640,
996 		    FS_VSS_DEV_NAME);
997 
998 	if (error != 0) {
999 		hv_vss_log_info("Fail to create '%s': %d\n", FS_VSS_DEV_NAME, error);
1000 		return (error);
1001 	}
1002 	sc->hv_vss_dev->si_drv1 = &sc->daemon_sc;
1003 	sc->daemon_sc.sc = sc;
1004 	/* create character device for application freeze/thaw */
1005 	error = make_dev_p(MAKEDEV_CHECKNAME | MAKEDEV_WAITOK,
1006 		    &sc->hv_appvss_dev,
1007 		    &hv_appvss_cdevsw,
1008 		    0,
1009 		    UID_ROOT,
1010 		    GID_WHEEL,
1011 		    0640,
1012 		    APP_VSS_DEV_NAME);
1013 
1014 	if (error != 0) {
1015 		hv_vss_log_info("Fail to create '%s': %d\n", APP_VSS_DEV_NAME, error);
1016 		return (error);
1017 	}
1018 	sc->hv_appvss_dev->si_drv1 = &sc->app_sc;
1019 	sc->app_sc.sc = sc;
1020 
1021 	return (vmbus_ic_attach(dev, hv_vss_callback));
1022 }
1023 
1024 static int
1025 hv_vss_detach(device_t dev)
1026 {
1027 	hv_vss_sc *sc = (hv_vss_sc*)device_get_softc(dev);
1028 	mtx_destroy(&sc->pending_mutex);
1029 	if (sc->daemon_sc.proc_task != NULL) {
1030 		PROC_LOCK(sc->daemon_sc.proc_task);
1031 		kern_psignal(sc->daemon_sc.proc_task, SIGKILL);
1032 		PROC_UNLOCK(sc->daemon_sc.proc_task);
1033 	}
1034 	if (sc->app_sc.proc_task != NULL) {
1035 		PROC_LOCK(sc->app_sc.proc_task);
1036 		kern_psignal(sc->app_sc.proc_task, SIGKILL);
1037 		PROC_UNLOCK(sc->app_sc.proc_task);
1038 	}
1039 	hv_vss_destroy_send_receive_queue(dev);
1040 	destroy_dev(sc->hv_vss_dev);
1041 	destroy_dev(sc->hv_appvss_dev);
1042 	return (vmbus_ic_detach(dev));
1043 }
1044 
/* newbus glue: probe/attach/detach entry points for the hvvss driver */
static device_method_t vss_methods[] = {
	/* Device interface */
	DEVMETHOD(device_probe, hv_vss_probe),
	DEVMETHOD(device_attach, hv_vss_attach),
	DEVMETHOD(device_detach, hv_vss_detach),
	{ 0, 0 }
};

static driver_t vss_driver = { "hvvss", vss_methods, sizeof(hv_vss_sc)};

/* attach under the vmbus bus driver; depends on the vmbus module */
DRIVER_MODULE(hv_vss, vmbus, vss_driver, NULL, NULL);
MODULE_VERSION(hv_vss, 1);
MODULE_DEPEND(hv_vss, vmbus, 1, 1, 1);
1058