xref: /freebsd/sys/dev/hyperv/utilities/hv_snapshot.c (revision 63d1fd5970ec814904aa0f4580b10a0d302d08b2)
1 /*-
2  * Copyright (c) 2016 Microsoft Corp.
3  * All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions
7  * are met:
8  * 1. Redistributions of source code must retain the above copyright
9  *    notice unmodified, this list of conditions, and the following
10  *    disclaimer.
11  * 2. Redistributions in binary form must reproduce the above copyright
12  *    notice, this list of conditions and the following disclaimer in the
13  *    documentation and/or other materials provided with the distribution.
14  *
15  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
16  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
17  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
18  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
19  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
20  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
21  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
22  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
23  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
24  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
25  */
26 
27 #include <sys/cdefs.h>
28 __FBSDID("$FreeBSD$");
29 
30 #include <sys/param.h>
31 #include <sys/kernel.h>
32 #include <sys/conf.h>
33 #include <sys/uio.h>
34 #include <sys/bus.h>
35 #include <sys/malloc.h>
36 #include <sys/mbuf.h>
37 #include <sys/module.h>
38 #include <sys/lock.h>
39 #include <sys/taskqueue.h>
40 #include <sys/selinfo.h>
41 #include <sys/sysctl.h>
42 #include <sys/poll.h>
43 #include <sys/proc.h>
44 #include <sys/queue.h>
45 #include <sys/kthread.h>
46 #include <sys/syscallsubr.h>
47 #include <sys/sysproto.h>
48 #include <sys/un.h>
49 #include <sys/endian.h>
50 #include <sys/sema.h>
51 #include <sys/signal.h>
52 #include <sys/syslog.h>
53 #include <sys/systm.h>
54 #include <sys/mutex.h>
55 #include <sys/callout.h>
56 
57 #include <dev/hyperv/include/hyperv.h>
58 #include <dev/hyperv/utilities/hv_utilreg.h>
59 #include <dev/hyperv/utilities/vmbus_icreg.h>
60 #include <dev/hyperv/utilities/vmbus_icvar.h>
61 
62 #include "hv_snapshot.h"
63 #include "vmbus_if.h"
64 
/* VSS message version handed to vmbus_ic_negomsg() during negotiation. */
#define VSS_MAJOR		5
#define VSS_MINOR		0
#define VSS_MSGVER		VMBUS_IC_VERSION(VSS_MAJOR, VSS_MINOR)

/* Framework version handed to vmbus_ic_negomsg() alongside VSS_MSGVER. */
#define VSS_FWVER_MAJOR		3
#define VSS_FWVER		VMBUS_IC_VERSION(VSS_FWVER_MAJOR, 0)

/* Seconds a request may stay pending before hv_vss_timeout() fails it. */
#define TIMEOUT_LIMIT		(15)	// seconds
/*
 * Operation codes found in hv_vss_hdr.operation of messages from the host.
 * Only FREEZE, THAW, HOT_BACKUP and GET_DM_INFO are handled in this file.
 */
enum hv_vss_op {
	VSS_OP_CREATE = 0,
	VSS_OP_DELETE,
	VSS_OP_HOT_BACKUP,
	VSS_OP_GET_DM_INFO,
	VSS_OP_BU_COMPLETE,
	/*
	 * Following operations are only supported with IC version >= 5.0
	 */
	VSS_OP_FREEZE, /* Freeze the file systems in the VM */
	VSS_OP_THAW, /* Unfreeze the file systems */
	VSS_OP_AUTO_RECOVER,
	VSS_OP_COUNT /* Number of operations, must be last */
};
87 
/*
 * Header for all VSS messages: the generic IC message header followed by
 * the VSS operation code (compared against enum hv_vss_op values).
 */
struct hv_vss_hdr {
	struct vmbus_icmsg_hdr	ic_hdr;
	uint8_t			operation;
	uint8_t			reserved[7];
} __packed;
96 
97 
/*
 * Flag value for hv_vss_check_feature. Only this one value is supported.
 */
#define VSS_HBU_NO_AUTO_RECOVERY		0x00000005

/* Message body filled in when answering VSS_OP_HOT_BACKUP. */
struct hv_vss_check_feature {
	uint32_t flags;
} __packed;

/* Message body filled in when answering VSS_OP_GET_DM_INFO. */
struct hv_vss_check_dm_info {
	uint32_t flags;
} __packed;
111 
/* A VSS message as exchanged with the host: header plus per-operation body. */
struct hv_vss_msg {
	union {
		struct hv_vss_hdr vss_hdr;
	} hdr;
	union {
		struct hv_vss_check_feature vss_cf;
		struct hv_vss_check_dm_info dm_info;
	} body;
} __packed;

/* One request: the userland-facing view paired with a copy of the host message. */
struct hv_vss_req {
	struct hv_vss_opt_msg	opt_msg;	/* used to communicate with daemon */
	struct hv_vss_msg	msg;		/* used to communicate with host */
} __packed;
126 
/*
 * hv_vss debug control: 0 is silent, > 0 enables error messages and
 * > 1 additionally enables informational messages. Exposed through the
 * "hv_vss_log" sysctl created in hv_vss_attach().
 */
static int hv_vss_log = 0;

/* Log at LOG_ERR with the "hv_vss: " prefix when hv_vss_log > 0. */
#define	hv_vss_log_error(...)	do {				\
	if (hv_vss_log > 0)					\
		log(LOG_ERR, "hv_vss: " __VA_ARGS__);		\
} while (0)

/*
 * Log at LOG_INFO with the "hv_vss: " prefix when hv_vss_log > 1.
 * The arguments are only evaluated when logging is enabled.
 */
#define	hv_vss_log_info(...) do {				\
	if (hv_vss_log > 1)					\
		log(LOG_INFO, "hv_vss: " __VA_ARGS__);		\
} while (0)
139 
/* IC descriptor table handed to vmbus_ic_probe() to match the VSS channel. */
static const struct vmbus_ic_desc vmbus_vss_descs[] = {
	{
		.ic_guid = { .hv_guid = {
		    0x29, 0x2e, 0xfa, 0x35, 0x23, 0xea, 0x36, 0x42,
		    0x96, 0xae, 0x3a, 0x6e, 0xba, 0xcb, 0xa4,  0x40} },
		.ic_desc = "Hyper-V VSS"
	},
	VMBUS_IC_DESC_END
};

/*
 * Printable names indexed by hv_vss_opt_msg.opt in log messages.
 * NOTE(review): presumably HV_VSS_NONE/CHECK/FREEZE/THAW are 0..3 in this
 * order -- confirm against hv_snapshot.h.
 */
static const char * vss_opt_name[] = {"None", "VSSCheck", "Freeze", "Thaw"};
151 
/* character device prototypes: daemon (file system) device */
static d_open_t		hv_vss_dev_open;
static d_close_t	hv_vss_dev_close;
static d_poll_t		hv_vss_dev_daemon_poll;
static d_ioctl_t	hv_vss_dev_daemon_ioctl;

/* character device prototypes: application device */
static d_open_t		hv_appvss_dev_open;
static d_close_t	hv_appvss_dev_close;
static d_poll_t		hv_appvss_dev_poll;
static d_ioctl_t	hv_appvss_dev_ioctl;

/* hv_vss character device structure (used by the daemon) */
static struct cdevsw hv_vss_cdevsw =
{
	.d_version	= D_VERSION,
	.d_open		= hv_vss_dev_open,
	.d_close	= hv_vss_dev_close,
	.d_poll		= hv_vss_dev_daemon_poll,
	.d_ioctl	= hv_vss_dev_daemon_ioctl,
	.d_name		= FS_VSS_DEV_NAME,
};

/* hv_appvss character device structure (used by a registered application) */
static struct cdevsw hv_appvss_cdevsw =
{
	.d_version	= D_VERSION,
	.d_open		= hv_appvss_dev_open,
	.d_close	= hv_appvss_dev_close,
	.d_poll		= hv_appvss_dev_poll,
	.d_ioctl	= hv_appvss_dev_ioctl,
	.d_name		= APP_VSS_DEV_NAME,
};
183 
struct hv_vss_sc;
/*
 * Per-cdev state. The driver softc embeds one instance for the daemon
 * device and one for the application device.
 */
struct hv_vss_dev_sc {
	/*
	 * A request from the host is first put on the notify queue, moves
	 * to the ack queue once userland has read it, and is finally
	 * recycled to the free list.
	 */
	STAILQ_HEAD(, hv_vss_req_internal) 	to_notify_queue;
	STAILQ_HEAD(, hv_vss_req_internal) 	to_ack_queue;
	/* back pointer to the owning driver softc */
	struct hv_vss_sc			*sc;
	/* process that last opened this device; never cleared on close */
	struct proc				*proc_task;
	/* poll()/select() bookkeeping for this device */
	struct selinfo				hv_vss_selinfo;
};
/*
 * Global state to track and synchronize the transaction requests from the host.
 * VSS lets a user application register itself to perform application-level
 * freeze/thaw. The kernel notifies both the vss daemon and the application,
 * if one is registered.
 * The implementation state transition is illustrated by:
 * https://clovertrail.github.io/assets/vssdot.png
 */
typedef struct hv_vss_sc {
	struct vmbus_ic_softc			util_sc;
	device_t				dev;

	/* deferred processing of host messages, see hv_vss_process_request() */
	struct task				task;

	/*
	 * This mutex protects access to the lists/queues; the callout in
	 * each request is also associated with this mutex.
	 */
	struct mtx				pending_mutex;
	/*
	 * req_free_list contains all free items
	 */
	LIST_HEAD(, hv_vss_req_internal)	req_free_list;

	/* Indicates if daemon registered with driver */
	boolean_t				register_done;

	/* Indicates if an application registered with driver */
	boolean_t				app_register_done;

	/* cdev for file system freeze/thaw */
	struct cdev				*hv_vss_dev;
	/* cdev for application freeze/thaw */
	struct cdev				*hv_appvss_dev;

	/* sc for app */
	struct hv_vss_dev_sc			app_sc;
	/* sc for daemon */
	struct hv_vss_dev_sc			daemon_sc;
} hv_vss_sc;
237 
/*
 * Driver-internal bookkeeping for one host request. hv_vss_init_req()
 * zeroes every field that precedes 'callout', so 'callout' must stay last.
 */
typedef struct hv_vss_req_internal {
	/* linkage on hv_vss_sc.req_free_list */
	LIST_ENTRY(hv_vss_req_internal)		link;
	/* linkage on one of the notify/ack queues */
	STAILQ_ENTRY(hv_vss_req_internal)	slink;
	struct hv_vss_req			vss_req;

	/* Rcv buffer for communicating with the host*/
	uint8_t					*rcv_buf;
	/* Length of host message */
	uint32_t				host_msg_len;
	/* Host message id */
	uint64_t				host_msg_id;

	hv_vss_sc				*sc;

	/* per-request timeout, armed in hv_vss_start_notify() */
	struct callout				callout;
} hv_vss_req_internal;
254 
/*
 * Search 'queue' for the request whose opt_msg.msgid equals 'id' and unlink
 * it. On return 'reqp' points at the removed request; callers test it
 * against NULL to detect that nothing matched. 'tmp' is scratch storage for
 * the iteration. The name indicates the queue lock is expected to be held.
 */
#define SEARCH_REMOVE_REQ_LOCKED(reqp, queue, link, tmp, id)		\
	do {								\
		STAILQ_FOREACH_SAFE(reqp, queue, link, tmp) {		\
			if (reqp->vss_req.opt_msg.msgid == id) {	\
				STAILQ_REMOVE(queue,			\
				    reqp, hv_vss_req_internal, link);	\
				break;					\
			}						\
		}							\
	} while (0)
265 
266 static bool
267 hv_vss_is_daemon_killed_after_launch(hv_vss_sc *sc)
268 {
269 	return (!sc->register_done && sc->daemon_sc.proc_task);
270 }
271 
272 /*
273  * Callback routine that gets called whenever there is a message from host
274  */
275 static void
276 hv_vss_callback(struct vmbus_channel *chan __unused, void *context)
277 {
278 	hv_vss_sc *sc = (hv_vss_sc*)context;
279 	if (hv_vss_is_daemon_killed_after_launch(sc))
280 		hv_vss_log_info("%s: daemon was killed!\n", __func__);
281 	if (sc->register_done || sc->daemon_sc.proc_task) {
282 		hv_vss_log_info("%s: Queuing work item\n", __func__);
283 		if (hv_vss_is_daemon_killed_after_launch(sc))
284 			hv_vss_log_info("%s: daemon was killed!\n", __func__);
285 		taskqueue_enqueue(taskqueue_thread, &sc->task);
286 	} else {
287 		hv_vss_log_info("%s: daemon has never been registered\n", __func__);
288 	}
289 	hv_vss_log_info("%s: received msg from host\n", __func__);
290 }
291 /*
292  * Send the response back to the host.
293  */
294 static void
295 hv_vss_respond_host(uint8_t *rcv_buf, struct vmbus_channel *ch,
296     uint32_t recvlen, uint64_t requestid, uint32_t error)
297 {
298 	struct vmbus_icmsg_hdr *hv_icmsg_hdrp;
299 
300 	hv_icmsg_hdrp = (struct vmbus_icmsg_hdr *)rcv_buf;
301 
302 	hv_icmsg_hdrp->ic_status = error;
303 	hv_icmsg_hdrp->ic_flags = HV_ICMSGHDRFLAG_TRANSACTION | HV_ICMSGHDRFLAG_RESPONSE;
304 
305 	error = vmbus_chan_send(ch, VMBUS_CHANPKT_TYPE_INBAND, 0,
306 	    rcv_buf, recvlen, requestid);
307 	if (error)
308 		hv_vss_log_info("%s: hv_vss_respond_host: sendpacket error:%d\n",
309 		    __func__, error);
310 }
311 
312 static void
313 hv_vss_notify_host_result_locked(struct hv_vss_req_internal *reqp, uint32_t status)
314 {
315 	struct hv_vss_msg* msg = (struct hv_vss_msg *)reqp->rcv_buf;
316 	hv_vss_sc *sc = reqp->sc;
317 	if (reqp->vss_req.opt_msg.opt == HV_VSS_CHECK) {
318 		msg->body.vss_cf.flags = VSS_HBU_NO_AUTO_RECOVERY;
319 	}
320 	hv_vss_log_info("%s, %s response %s to host\n", __func__,
321 	    vss_opt_name[reqp->vss_req.opt_msg.opt],
322 	    status == HV_S_OK ? "Success" : "Fail");
323 	hv_vss_respond_host(reqp->rcv_buf, vmbus_get_channel(reqp->sc->dev),
324 	    reqp->host_msg_len, reqp->host_msg_id, status);
325 	/* recycle the request */
326 	LIST_INSERT_HEAD(&sc->req_free_list, reqp, link);
327 }
328 
329 static void
330 hv_vss_notify_host_result(struct hv_vss_req_internal *reqp, uint32_t status)
331 {
332 	mtx_lock(&reqp->sc->pending_mutex);
333 	hv_vss_notify_host_result_locked(reqp, status);
334 	mtx_unlock(&reqp->sc->pending_mutex);
335 }
336 
337 static void
338 hv_vss_cp_vssreq_to_user(struct hv_vss_req_internal *reqp,
339     struct hv_vss_opt_msg *userdata)
340 {
341 	struct hv_vss_req *hv_vss_dev_buf;
342 	hv_vss_dev_buf = &reqp->vss_req;
343 	hv_vss_dev_buf->opt_msg.opt = HV_VSS_NONE;
344 	switch (reqp->vss_req.msg.hdr.vss_hdr.operation) {
345 	case VSS_OP_FREEZE:
346 		hv_vss_dev_buf->opt_msg.opt = HV_VSS_FREEZE;
347 		break;
348 	case VSS_OP_THAW:
349 		hv_vss_dev_buf->opt_msg.opt = HV_VSS_THAW;
350 		break;
351 	case VSS_OP_HOT_BACKUP:
352 		hv_vss_dev_buf->opt_msg.opt = HV_VSS_CHECK;
353 		break;
354 	}
355 	*userdata = hv_vss_dev_buf->opt_msg;
356 	hv_vss_log_info("%s, read data from user for "
357 	    "%s (%ju) \n", __func__, vss_opt_name[userdata->opt],
358 	    (uintmax_t)userdata->msgid);
359 }
360 
361 /**
362  * Remove the request id from app notifiy or ack queue,
363  * and recyle the request by inserting it to free list.
364  *
365  * When app was notified but not yet sending ack, the request
366  * should locate in either notify queue or ack queue.
367  */
368 static struct hv_vss_req_internal*
369 hv_vss_drain_req_queue_locked(hv_vss_sc *sc, uint64_t req_id)
370 {
371 	struct hv_vss_req_internal *reqp, *tmp;
372 	SEARCH_REMOVE_REQ_LOCKED(reqp, &sc->daemon_sc.to_notify_queue,
373 	    slink, tmp, req_id);
374 	if (reqp == NULL)
375 		SEARCH_REMOVE_REQ_LOCKED(reqp, &sc->daemon_sc.to_ack_queue,
376 		    slink, tmp, req_id);
377 	if (reqp == NULL)
378 		SEARCH_REMOVE_REQ_LOCKED(reqp, &sc->app_sc.to_notify_queue,
379 		    slink, tmp, req_id);
380 	if (reqp == NULL)
381 		SEARCH_REMOVE_REQ_LOCKED(reqp, &sc->app_sc.to_ack_queue, slink,
382 		    tmp, req_id);
383 	return (reqp);
384 }
385 /**
386  * Actions for daemon who has been notified.
387  */
388 static void
389 hv_vss_notified(struct hv_vss_dev_sc *dev_sc, struct hv_vss_opt_msg *userdata)
390 {
391 	struct hv_vss_req_internal *reqp;
392 	mtx_lock(&dev_sc->sc->pending_mutex);
393 	if (!STAILQ_EMPTY(&dev_sc->to_notify_queue)) {
394 		reqp = STAILQ_FIRST(&dev_sc->to_notify_queue);
395 		hv_vss_cp_vssreq_to_user(reqp, userdata);
396 		STAILQ_REMOVE_HEAD(&dev_sc->to_notify_queue, slink);
397 		/* insert the msg to queue for write */
398 		STAILQ_INSERT_TAIL(&dev_sc->to_ack_queue, reqp, slink);
399 		userdata->status = VSS_SUCCESS;
400 	} else {
401 		/* Timeout occur, thus request was removed from queue. */
402 		hv_vss_log_info("%s: notify queue is empty!\n", __func__);
403 		userdata->status = VSS_FAIL;
404 	}
405 	mtx_unlock(&dev_sc->sc->pending_mutex);
406 }
407 
408 static void
409 hv_vss_notify(struct hv_vss_dev_sc *dev_sc, struct hv_vss_req_internal *reqp)
410 {
411 	uint32_t opt = reqp->vss_req.opt_msg.opt;
412 	mtx_lock(&dev_sc->sc->pending_mutex);
413 	STAILQ_INSERT_TAIL(&dev_sc->to_notify_queue, reqp, slink);
414 	hv_vss_log_info("%s: issuing query %s (%ju) to %s\n", __func__,
415 	    vss_opt_name[opt], (uintmax_t)reqp->vss_req.opt_msg.msgid,
416 	    &dev_sc->sc->app_sc == dev_sc ? "app" : "daemon");
417 	mtx_unlock(&dev_sc->sc->pending_mutex);
418 	selwakeup(&dev_sc->hv_vss_selinfo);
419 }
420 
421 /**
422  * Actions for daemon who has acknowledged.
423  */
424 static void
425 hv_vss_daemon_acked(struct hv_vss_dev_sc *dev_sc, struct hv_vss_opt_msg *userdata)
426 {
427 	struct hv_vss_req_internal	*reqp, *tmp;
428 	uint64_t			req_id;
429 	int				opt;
430 	uint32_t			status;
431 
432 	opt = userdata->opt;
433 	req_id = userdata->msgid;
434 	status = userdata->status;
435 	/* make sure the reserved fields are all zeros. */
436 	memset(&userdata->reserved, 0, sizeof(struct hv_vss_opt_msg) -
437 	    __offsetof(struct hv_vss_opt_msg, reserved));
438 	mtx_lock(&dev_sc->sc->pending_mutex);
439 	SEARCH_REMOVE_REQ_LOCKED(reqp, &dev_sc->to_ack_queue, slink, tmp, req_id);
440 	mtx_unlock(&dev_sc->sc->pending_mutex);
441 	if (reqp == NULL) {
442 		hv_vss_log_info("%s Timeout: fail to find daemon ack request\n",
443 		    __func__);
444 		userdata->status = VSS_FAIL;
445 		return;
446 	}
447 	KASSERT(opt == reqp->vss_req.opt_msg.opt, ("Mismatched VSS operation!"));
448 	hv_vss_log_info("%s, get response %d from daemon for %s (%ju) \n", __func__,
449 	    status, vss_opt_name[opt], (uintmax_t)req_id);
450 	switch (opt) {
451 	case HV_VSS_CHECK:
452 	case HV_VSS_FREEZE:
453 		callout_drain(&reqp->callout);
454 		hv_vss_notify_host_result(reqp,
455 		    status == VSS_SUCCESS ? HV_S_OK : HV_E_FAIL);
456 		break;
457 	case HV_VSS_THAW:
458 		if (dev_sc->sc->app_register_done) {
459 			if (status == VSS_SUCCESS) {
460 				hv_vss_notify(&dev_sc->sc->app_sc, reqp);
461 			} else {
462 				/* handle error */
463 				callout_drain(&reqp->callout);
464 				hv_vss_notify_host_result(reqp, HV_E_FAIL);
465 			}
466 		} else {
467 			callout_drain(&reqp->callout);
468 			hv_vss_notify_host_result(reqp,
469 			    status == VSS_SUCCESS ? HV_S_OK : HV_E_FAIL);
470 		}
471 		break;
472 	}
473 }
474 
475 /**
476  * Actions for app who has acknowledged.
477  */
478 static void
479 hv_vss_app_acked(struct hv_vss_dev_sc *dev_sc, struct hv_vss_opt_msg *userdata)
480 {
481 	struct hv_vss_req_internal	*reqp, *tmp;
482 	uint64_t			req_id;
483 	int				opt;
484 	uint8_t				status;
485 
486 	opt = userdata->opt;
487 	req_id = userdata->msgid;
488 	status = userdata->status;
489 	/* make sure the reserved fields are all zeros. */
490 	memset(&userdata->reserved, 0, sizeof(struct hv_vss_opt_msg) -
491 	    __offsetof(struct hv_vss_opt_msg, reserved));
492 	mtx_lock(&dev_sc->sc->pending_mutex);
493 	SEARCH_REMOVE_REQ_LOCKED(reqp, &dev_sc->to_ack_queue, slink, tmp, req_id);
494 	mtx_unlock(&dev_sc->sc->pending_mutex);
495 	if (reqp == NULL) {
496 		hv_vss_log_info("%s Timeout: fail to find app ack request\n",
497 		    __func__);
498 		userdata->status = VSS_FAIL;
499 		return;
500 	}
501 	KASSERT(opt == reqp->vss_req.opt_msg.opt, ("Mismatched VSS operation!"));
502 	hv_vss_log_info("%s, get response %d from app for %s (%ju) \n",
503 	    __func__, status, vss_opt_name[opt], (uintmax_t)req_id);
504 	if (dev_sc->sc->register_done) {
505 		switch (opt) {
506 		case HV_VSS_CHECK:
507 		case HV_VSS_FREEZE:
508 			if (status == VSS_SUCCESS) {
509 				hv_vss_notify(&dev_sc->sc->daemon_sc, reqp);
510 			} else {
511 				/* handle error */
512 				callout_drain(&reqp->callout);
513 				hv_vss_notify_host_result(reqp, HV_E_FAIL);
514 			}
515 			break;
516 		case HV_VSS_THAW:
517 			callout_drain(&reqp->callout);
518 			hv_vss_notify_host_result(reqp,
519 			    status == VSS_SUCCESS ? HV_S_OK : HV_E_FAIL);
520 			break;
521 		}
522 	} else {
523 		hv_vss_log_info("%s, Fatal: vss daemon was killed\n", __func__);
524 	}
525 }
526 
527 static int
528 hv_vss_dev_open(struct cdev *dev, int oflags, int devtype, struct thread *td)
529 {
530 	struct proc     *td_proc;
531 	td_proc = td->td_proc;
532 
533 	struct hv_vss_dev_sc *dev_sc = (struct hv_vss_dev_sc*)dev->si_drv1;
534 	hv_vss_log_info("%s: %s opens device \"%s\" successfully.\n",
535 	    __func__, td_proc->p_comm, FS_VSS_DEV_NAME);
536 
537 	if (dev_sc->sc->register_done)
538 		return (EBUSY);
539 
540 	dev_sc->sc->register_done = true;
541 	hv_vss_callback(vmbus_get_channel(dev_sc->sc->dev), dev_sc->sc);
542 
543 	dev_sc->proc_task = curproc;
544 	return (0);
545 }
546 
547 static int
548 hv_vss_dev_close(struct cdev *dev, int fflag __unused, int devtype __unused,
549 				 struct thread *td)
550 {
551 	struct proc     *td_proc;
552 	td_proc = td->td_proc;
553 
554 	struct hv_vss_dev_sc *dev_sc = (struct hv_vss_dev_sc*)dev->si_drv1;
555 
556 	hv_vss_log_info("%s: %s closes device \"%s\"\n",
557 	    __func__, td_proc->p_comm, FS_VSS_DEV_NAME);
558 	dev_sc->sc->register_done = false;
559 	return (0);
560 }
561 
562 static int
563 hv_vss_dev_daemon_ioctl(struct cdev *dev, u_long cmd, caddr_t data, int flag,
564     struct thread *td)
565 {
566 	struct proc			*td_proc;
567 	struct hv_vss_dev_sc		*sc;
568 
569 	td_proc = td->td_proc;
570 	sc = (struct hv_vss_dev_sc*)dev->si_drv1;
571 
572 	hv_vss_log_info("%s: %s invoked vss ioctl\n", __func__, td_proc->p_comm);
573 
574 	struct hv_vss_opt_msg* userdata = (struct hv_vss_opt_msg*)data;
575 	switch(cmd) {
576 	case IOCHVVSSREAD:
577 		hv_vss_notified(sc, userdata);
578 		break;
579 	case IOCHVVSSWRITE:
580 		hv_vss_daemon_acked(sc, userdata);
581 		break;
582 	}
583 	return (0);
584 }
585 
586 /*
587  * hv_vss_daemon poll invokes this function to check if data is available
588  * for daemon to read.
589  */
590 static int
591 hv_vss_dev_daemon_poll(struct cdev *dev, int events, struct thread *td)
592 {
593 	int revent = 0;
594 	struct hv_vss_dev_sc *dev_sc = (struct hv_vss_dev_sc*)dev->si_drv1;
595 
596 	mtx_lock(&dev_sc->sc->pending_mutex);
597 	/**
598 	 * if there is data ready, inform daemon's poll
599 	 */
600 	if (!STAILQ_EMPTY(&dev_sc->to_notify_queue))
601 		revent = POLLIN;
602 	if (revent == 0)
603 		selrecord(td, &dev_sc->hv_vss_selinfo);
604 	hv_vss_log_info("%s return 0x%x\n", __func__, revent);
605 	mtx_unlock(&dev_sc->sc->pending_mutex);
606 	return (revent);
607 }
608 
609 static int
610 hv_appvss_dev_open(struct cdev *dev, int oflags, int devtype, struct thread *td)
611 {
612 	struct proc     *td_proc;
613 	td_proc = td->td_proc;
614 
615 	struct hv_vss_dev_sc *dev_sc = (struct hv_vss_dev_sc*)dev->si_drv1;
616 	hv_vss_log_info("%s: %s opens device \"%s\" successfully.\n",
617 	    __func__, td_proc->p_comm, APP_VSS_DEV_NAME);
618 
619 	if (dev_sc->sc->app_register_done)
620 		return (EBUSY);
621 
622 	dev_sc->sc->app_register_done = true;
623 	dev_sc->proc_task = curproc;
624 	return (0);
625 }
626 
627 static int
628 hv_appvss_dev_close(struct cdev *dev, int fflag __unused, int devtype __unused,
629 				 struct thread *td)
630 {
631 	struct proc     *td_proc;
632 	td_proc = td->td_proc;
633 
634 	struct hv_vss_dev_sc *dev_sc = (struct hv_vss_dev_sc*)dev->si_drv1;
635 
636 	hv_vss_log_info("%s: %s closes device \"%s\".\n",
637 	    __func__, td_proc->p_comm, APP_VSS_DEV_NAME);
638 	dev_sc->sc->app_register_done = false;
639 	return (0);
640 }
641 
642 static int
643 hv_appvss_dev_ioctl(struct cdev *dev, u_long cmd, caddr_t data, int flag,
644     struct thread *td)
645 {
646 	struct proc			*td_proc;
647 	struct hv_vss_dev_sc		*dev_sc;
648 
649 	td_proc = td->td_proc;
650 	dev_sc = (struct hv_vss_dev_sc*)dev->si_drv1;
651 
652 	hv_vss_log_info("%s: %s invoked vss ioctl\n", __func__, td_proc->p_comm);
653 
654 	struct hv_vss_opt_msg* userdata = (struct hv_vss_opt_msg*)data;
655 	switch(cmd) {
656 	case IOCHVVSSREAD:
657 		hv_vss_notified(dev_sc, userdata);
658 		break;
659 	case IOCHVVSSWRITE:
660 		hv_vss_app_acked(dev_sc, userdata);
661 		break;
662 	}
663 	return (0);
664 }
665 
666 /*
667  * hv_vss_daemon poll invokes this function to check if data is available
668  * for daemon to read.
669  */
670 static int
671 hv_appvss_dev_poll(struct cdev *dev, int events, struct thread *td)
672 {
673 	int revent = 0;
674 	struct hv_vss_dev_sc *dev_sc = (struct hv_vss_dev_sc*)dev->si_drv1;
675 
676 	mtx_lock(&dev_sc->sc->pending_mutex);
677 	/**
678 	 * if there is data ready, inform daemon's poll
679 	 */
680 	if (!STAILQ_EMPTY(&dev_sc->to_notify_queue))
681 		revent = POLLIN;
682 	if (revent == 0)
683 		selrecord(td, &dev_sc->hv_vss_selinfo);
684 	hv_vss_log_info("%s return 0x%x\n", __func__, revent);
685 	mtx_unlock(&dev_sc->sc->pending_mutex);
686 	return (revent);
687 }
688 
689 static void
690 hv_vss_timeout(void *arg)
691 {
692 	hv_vss_req_internal *reqp = arg;
693 	hv_vss_req_internal *request;
694 	hv_vss_sc* sc = reqp->sc;
695 	uint64_t req_id = reqp->vss_req.opt_msg.msgid;
696 	/* This thread is locked */
697 	KASSERT(mtx_owned(&sc->pending_mutex), ("mutex lock is not owned!"));
698 	request = hv_vss_drain_req_queue_locked(sc, req_id);
699 	KASSERT(request != NULL, ("timeout but fail to find request"));
700 	hv_vss_notify_host_result_locked(reqp, HV_E_FAIL);
701 }
702 
703 /*
704  * This routine is called whenever a message is received from the host
705  */
706 static void
707 hv_vss_init_req(hv_vss_req_internal *reqp,
708     uint32_t recvlen, uint64_t requestid, uint8_t *vss_buf, hv_vss_sc *sc)
709 {
710 	struct timespec vm_ts;
711 	struct hv_vss_msg* msg = (struct hv_vss_msg *)vss_buf;
712 
713 	memset(reqp, 0, __offsetof(hv_vss_req_internal, callout));
714 	reqp->host_msg_len = recvlen;
715 	reqp->host_msg_id = requestid;
716 	reqp->rcv_buf = vss_buf;
717 	reqp->sc = sc;
718 	memcpy(&reqp->vss_req.msg,
719 	    (struct hv_vss_msg *)vss_buf, sizeof(struct hv_vss_msg));
720 	/* set the opt for users */
721 	switch (msg->hdr.vss_hdr.operation) {
722 	case VSS_OP_FREEZE:
723 		reqp->vss_req.opt_msg.opt = HV_VSS_FREEZE;
724 		break;
725 	case VSS_OP_THAW:
726 		reqp->vss_req.opt_msg.opt = HV_VSS_THAW;
727 		break;
728 	case VSS_OP_HOT_BACKUP:
729 		reqp->vss_req.opt_msg.opt = HV_VSS_CHECK;
730 		break;
731 	}
732 	/* Use a timestamp as msg request ID */
733 	nanotime(&vm_ts);
734 	reqp->vss_req.opt_msg.msgid = (vm_ts.tv_sec * NANOSEC) + vm_ts.tv_nsec;
735 }
736 
737 static hv_vss_req_internal*
738 hv_vss_get_new_req_locked(hv_vss_sc *sc)
739 {
740 	hv_vss_req_internal *reqp;
741 	if (!STAILQ_EMPTY(&sc->daemon_sc.to_notify_queue) ||
742 	    !STAILQ_EMPTY(&sc->daemon_sc.to_ack_queue) ||
743 	    !STAILQ_EMPTY(&sc->app_sc.to_notify_queue) ||
744 	    !STAILQ_EMPTY(&sc->app_sc.to_ack_queue)) {
745 		/*
746 		 * There is request coming from host before
747 		 * finishing previous requests
748 		 */
749 		hv_vss_log_info("%s: Warning: there is new request "
750 		    "coming before finishing previous requests\n", __func__);
751 		return (NULL);
752 	}
753 	if (LIST_EMPTY(&sc->req_free_list)) {
754 		/* TODO Error: no buffer */
755 		hv_vss_log_info("Error: No buffer\n");
756 		return (NULL);
757 	}
758 	reqp = LIST_FIRST(&sc->req_free_list);
759 	LIST_REMOVE(reqp, link);
760 	return (reqp);
761 }
762 
763 static void
764 hv_vss_start_notify(hv_vss_req_internal *reqp, uint32_t opt)
765 {
766 	hv_vss_sc *sc = reqp->sc;
767 	/*
768 	 * Freeze/Check notification sequence: kernel -> app -> daemon(fs)
769 	 * Thaw notification sequence:         kernel -> daemon(fs) -> app
770 	 *
771 	 * We should wake up the daemon, in case it's doing poll().
772 	 * The response should be received after 5s, otherwise, trigger timeout.
773 	 */
774 	switch (opt) {
775 	case VSS_OP_FREEZE:
776 	case VSS_OP_HOT_BACKUP:
777 		if (sc->app_register_done)
778 			hv_vss_notify(&sc->app_sc, reqp);
779 		else
780 			hv_vss_notify(&sc->daemon_sc, reqp);
781 		callout_reset(&reqp->callout, TIMEOUT_LIMIT * hz,
782 		    hv_vss_timeout, reqp);
783 		break;
784 	case VSS_OP_THAW:
785 		hv_vss_notify(&sc->daemon_sc, reqp);
786 		callout_reset(&reqp->callout, TIMEOUT_LIMIT * hz,
787 		    hv_vss_timeout, reqp);
788 		break;
789 	}
790 }
791 
/*
 * Task handler (queued on taskqueue_thread by hv_vss_callback()): read
 * messages from the host until the channel has nothing more to deliver,
 * and either answer them directly or hand them to the daemon/app.
 *
 * Every message is received into the single buffer sc->util_sc.ic_buf; a
 * request handed to userland keeps a pointer into that same buffer
 * (see hv_vss_init_req()).
 */
static void
hv_vss_process_request(void *context, int pending __unused)
{
	uint8_t *vss_buf;
	struct vmbus_channel *channel;
	uint32_t recvlen = 0;
	uint64_t requestid;
	struct vmbus_icmsg_hdr *icmsghdrp;
	int ret = 0;
	hv_vss_sc *sc;
	hv_vss_req_internal *reqp;

	hv_vss_log_info("%s: entering hv_vss_process_request\n", __func__);

	sc = (hv_vss_sc*)context;
	vss_buf = sc->util_sc.ic_buf;
	channel = vmbus_get_channel(sc->dev);

	/* recvlen is in/out: buffer capacity in, received length out. */
	recvlen = sc->util_sc.ic_buflen;
	ret = vmbus_chan_recv(channel, vss_buf, &recvlen, &requestid);
	KASSERT(ret != ENOBUFS, ("hvvss recvbuf is not large enough"));
	/* XXX check recvlen to make sure that it contains enough data */

	while ((ret == 0) && (recvlen > 0)) {
		icmsghdrp = (struct vmbus_icmsg_hdr *)vss_buf;

		if (icmsghdrp->ic_type == HV_ICMSGTYPE_NEGOTIATE) {
			/* Version negotiation is answered right away. */
			ret = vmbus_ic_negomsg(&sc->util_sc, vss_buf,
			    &recvlen, VSS_FWVER, VSS_MSGVER);
			hv_vss_respond_host(vss_buf, vmbus_get_channel(sc->dev),
			    recvlen, requestid, ret);
			hv_vss_log_info("%s: version negotiated\n", __func__);
		} else if (!hv_vss_is_daemon_killed_after_launch(sc)) {
			struct hv_vss_msg* msg = (struct hv_vss_msg *)vss_buf;
			switch(msg->hdr.vss_hdr.operation) {
			case VSS_OP_FREEZE:
			case VSS_OP_THAW:
			case VSS_OP_HOT_BACKUP:
				/*
				 * These need userland: grab a free request,
				 * fill it in and notify the daemon/app.
				 */
				mtx_lock(&sc->pending_mutex);
				reqp = hv_vss_get_new_req_locked(sc);
				mtx_unlock(&sc->pending_mutex);
				if (reqp == NULL) {
					/* ignore this request from host */
					break;
				}
				hv_vss_init_req(reqp, recvlen, requestid, vss_buf, sc);
				hv_vss_log_info("%s: receive %s (%ju) from host\n",
				    __func__,
				    vss_opt_name[reqp->vss_req.opt_msg.opt],
				    (uintmax_t)reqp->vss_req.opt_msg.msgid);
				hv_vss_start_notify(reqp, msg->hdr.vss_hdr.operation);
				break;
			case VSS_OP_GET_DM_INFO:
				/* Answered by the kernel itself, flags cleared. */
				hv_vss_log_info("%s: receive GET_DM_INFO from host\n",
				    __func__);
				msg->body.dm_info.flags = 0;
				hv_vss_respond_host(vss_buf, vmbus_get_channel(sc->dev),
				    recvlen, requestid, HV_S_OK);
				break;
			default:
				/* Unknown operations are logged and not answered. */
				device_printf(sc->dev, "Unknown opt from host: %d\n",
				    msg->hdr.vss_hdr.operation);
				break;
			}
		} else {
			/* daemon was killed for some reason after it was launched */
			struct hv_vss_msg* msg = (struct hv_vss_msg *)vss_buf;
			switch(msg->hdr.vss_hdr.operation) {
			case VSS_OP_FREEZE:
				hv_vss_log_info("%s: response fail for FREEZE\n",
				    __func__);
				break;
			case VSS_OP_THAW:
				hv_vss_log_info("%s: response fail for THAW\n",
				    __func__);
				break;
			case VSS_OP_HOT_BACKUP:
				hv_vss_log_info("%s: response fail for HOT_BACKUP\n",
				    __func__);
				msg->body.vss_cf.flags = VSS_HBU_NO_AUTO_RECOVERY;
				break;
			case VSS_OP_GET_DM_INFO:
				hv_vss_log_info("%s: response fail for GET_DM_INFO\n",
				    __func__);
				msg->body.dm_info.flags = 0;
				break;
			default:
				device_printf(sc->dev, "Unknown opt from host: %d\n",
				    msg->hdr.vss_hdr.operation);
				break;
			}
			/* Without a daemon every operation is failed immediately. */
			hv_vss_respond_host(vss_buf, vmbus_get_channel(sc->dev),
			    recvlen, requestid, HV_E_FAIL);
		}
		/*
		 * Try reading next buffer
		 */
		recvlen = sc->util_sc.ic_buflen;
		ret = vmbus_chan_recv(channel, vss_buf, &recvlen, &requestid);
		KASSERT(ret != ENOBUFS, ("hvvss recvbuf is not large enough"));
		/* XXX check recvlen to make sure that it contains enough data */

		hv_vss_log_info("%s: read: context %p, ret =%d, recvlen=%d\n",
		    __func__, context, ret, recvlen);
	}
}
902 
903 static int
904 hv_vss_probe(device_t dev)
905 {
906 	return (vmbus_ic_probe(dev, vmbus_vss_descs));
907 }
908 
909 static int
910 hv_vss_init_send_receive_queue(device_t dev)
911 {
912 	hv_vss_sc *sc = (hv_vss_sc*)device_get_softc(dev);
913 	int i;
914 	const int max_list = 4; /* It is big enough for the list */
915 	struct hv_vss_req_internal* reqp;
916 
917 	LIST_INIT(&sc->req_free_list);
918 	STAILQ_INIT(&sc->daemon_sc.to_notify_queue);
919 	STAILQ_INIT(&sc->daemon_sc.to_ack_queue);
920 	STAILQ_INIT(&sc->app_sc.to_notify_queue);
921 	STAILQ_INIT(&sc->app_sc.to_ack_queue);
922 
923 	for (i = 0; i < max_list; i++) {
924 		reqp = malloc(sizeof(struct hv_vss_req_internal),
925 		    M_DEVBUF, M_WAITOK|M_ZERO);
926 		LIST_INSERT_HEAD(&sc->req_free_list, reqp, link);
927 		callout_init_mtx(&reqp->callout, &sc->pending_mutex, 0);
928 	}
929 	return (0);
930 }
931 
932 static int
933 hv_vss_destroy_send_receive_queue(device_t dev)
934 {
935 	hv_vss_sc *sc = (hv_vss_sc*)device_get_softc(dev);
936 	hv_vss_req_internal* reqp;
937 
938 	while (!LIST_EMPTY(&sc->req_free_list)) {
939 		reqp = LIST_FIRST(&sc->req_free_list);
940 		LIST_REMOVE(reqp, link);
941 		free(reqp, M_DEVBUF);
942 	}
943 
944 	while (!STAILQ_EMPTY(&sc->daemon_sc.to_notify_queue)) {
945 		reqp = STAILQ_FIRST(&sc->daemon_sc.to_notify_queue);
946 		STAILQ_REMOVE_HEAD(&sc->daemon_sc.to_notify_queue, slink);
947 		free(reqp, M_DEVBUF);
948 	}
949 
950 	while (!STAILQ_EMPTY(&sc->daemon_sc.to_ack_queue)) {
951 		reqp = STAILQ_FIRST(&sc->daemon_sc.to_ack_queue);
952 		STAILQ_REMOVE_HEAD(&sc->daemon_sc.to_ack_queue, slink);
953 		free(reqp, M_DEVBUF);
954 	}
955 
956 	while (!STAILQ_EMPTY(&sc->app_sc.to_notify_queue)) {
957 		reqp = STAILQ_FIRST(&sc->app_sc.to_notify_queue);
958 		STAILQ_REMOVE_HEAD(&sc->app_sc.to_notify_queue, slink);
959 		free(reqp, M_DEVBUF);
960 	}
961 
962 	while (!STAILQ_EMPTY(&sc->app_sc.to_ack_queue)) {
963 		reqp = STAILQ_FIRST(&sc->app_sc.to_ack_queue);
964 		STAILQ_REMOVE_HEAD(&sc->app_sc.to_ack_queue, slink);
965 		free(reqp, M_DEVBUF);
966 	}
967 	return (0);
968 }
969 
970 static int
971 hv_vss_attach(device_t dev)
972 {
973 	int error;
974 	struct sysctl_oid_list *child;
975 	struct sysctl_ctx_list *ctx;
976 
977 	hv_vss_sc *sc = (hv_vss_sc*)device_get_softc(dev);
978 
979 	sc->dev = dev;
980 	mtx_init(&sc->pending_mutex, "hv_vss pending mutex", NULL, MTX_DEF);
981 
982 	ctx = device_get_sysctl_ctx(dev);
983 	child = SYSCTL_CHILDREN(device_get_sysctl_tree(dev));
984 
985 	SYSCTL_ADD_INT(ctx, child, OID_AUTO, "hv_vss_log",
986 	    CTLFLAG_RWTUN, &hv_vss_log, 0, "Hyperv VSS service log level");
987 
988 	TASK_INIT(&sc->task, 0, hv_vss_process_request, sc);
989 	hv_vss_init_send_receive_queue(dev);
990 	/* create character device for file system freeze/thaw */
991 	error = make_dev_p(MAKEDEV_CHECKNAME | MAKEDEV_WAITOK,
992 		    &sc->hv_vss_dev,
993 		    &hv_vss_cdevsw,
994 		    0,
995 		    UID_ROOT,
996 		    GID_WHEEL,
997 		    0640,
998 		    FS_VSS_DEV_NAME);
999 
1000 	if (error != 0) {
1001 		hv_vss_log_info("Fail to create '%s': %d\n", FS_VSS_DEV_NAME, error);
1002 		return (error);
1003 	}
1004 	sc->hv_vss_dev->si_drv1 = &sc->daemon_sc;
1005 	sc->daemon_sc.sc = sc;
1006 	/* create character device for application freeze/thaw */
1007 	error = make_dev_p(MAKEDEV_CHECKNAME | MAKEDEV_WAITOK,
1008 		    &sc->hv_appvss_dev,
1009 		    &hv_appvss_cdevsw,
1010 		    0,
1011 		    UID_ROOT,
1012 		    GID_WHEEL,
1013 		    0640,
1014 		    APP_VSS_DEV_NAME);
1015 
1016 	if (error != 0) {
1017 		hv_vss_log_info("Fail to create '%s': %d\n", APP_VSS_DEV_NAME, error);
1018 		return (error);
1019 	}
1020 	sc->hv_appvss_dev->si_drv1 = &sc->app_sc;
1021 	sc->app_sc.sc = sc;
1022 
1023 	return (vmbus_ic_attach(dev, hv_vss_callback));
1024 }
1025 
1026 static int
1027 hv_vss_detach(device_t dev)
1028 {
1029 	hv_vss_sc *sc = (hv_vss_sc*)device_get_softc(dev);
1030 	mtx_destroy(&sc->pending_mutex);
1031 	if (sc->daemon_sc.proc_task != NULL) {
1032 		PROC_LOCK(sc->daemon_sc.proc_task);
1033 		kern_psignal(sc->daemon_sc.proc_task, SIGKILL);
1034 		PROC_UNLOCK(sc->daemon_sc.proc_task);
1035 	}
1036 	if (sc->app_sc.proc_task != NULL) {
1037 		PROC_LOCK(sc->app_sc.proc_task);
1038 		kern_psignal(sc->app_sc.proc_task, SIGKILL);
1039 		PROC_UNLOCK(sc->app_sc.proc_task);
1040 	}
1041 	hv_vss_destroy_send_receive_queue(dev);
1042 	destroy_dev(sc->hv_vss_dev);
1043 	destroy_dev(sc->hv_appvss_dev);
1044 	return (vmbus_ic_detach(dev));
1045 }
1046 
/* newbus method table for the VSS device */
static device_method_t vss_methods[] = {
	/* Device interface */
	DEVMETHOD(device_probe, hv_vss_probe),
	DEVMETHOD(device_attach, hv_vss_attach),
	DEVMETHOD(device_detach, hv_vss_detach),
	{ 0, 0 }
};

/* Driver "hvvss"; the softc is a hv_vss_sc. */
static driver_t vss_driver = { "hvvss", vss_methods, sizeof(hv_vss_sc)};

static devclass_t vss_devclass;

/* Register the driver on the vmbus bus and declare the module dependency. */
DRIVER_MODULE(hv_vss, vmbus, vss_driver, vss_devclass, NULL, NULL);
MODULE_VERSION(hv_vss, 1);
MODULE_DEPEND(hv_vss, vmbus, 1, 1, 1);
1061 MODULE_DEPEND(hv_vss, vmbus, 1, 1, 1);
1062