xref: /linux/drivers/accel/habanalabs/common/habanalabs_drv.c (revision 3e0bc2855b573bcffa2a52955a878f537f5ac0cd)
1 // SPDX-License-Identifier: GPL-2.0
2 
3 /*
4  * Copyright 2016-2021 HabanaLabs, Ltd.
5  * All Rights Reserved.
6  *
7  */
8 
9 #define pr_fmt(fmt)		"habanalabs: " fmt
10 
11 #include "habanalabs.h"
12 #include "../include/hw_ip/pci/pci_general.h"
13 
14 #include <linux/pci.h>
15 #include <linux/module.h>
16 #include <linux/vmalloc.h>
17 #include <linux/version.h>
18 
19 #include <drm/drm_accel.h>
20 #include <drm/drm_drv.h>
21 #include <drm/drm_ioctl.h>
22 
23 #define CREATE_TRACE_POINTS
24 #include <trace/events/habanalabs.h>
25 
26 #define HL_DRIVER_AUTHOR	"HabanaLabs Kernel Driver Team"
27 
28 #define HL_DRIVER_DESC		"Driver for HabanaLabs's AI Accelerators"
29 
30 MODULE_AUTHOR(HL_DRIVER_AUTHOR);
31 MODULE_DESCRIPTION(HL_DRIVER_DESC);
32 MODULE_LICENSE("GPL v2");
33 
34 static int hl_major;
35 static DEFINE_IDR(hl_devs_idr);
36 static DEFINE_MUTEX(hl_devs_idr_lock);
37 
38 #define HL_DEFAULT_TIMEOUT_LOCKED	30	/* 30 seconds */
39 #define GAUDI_DEFAULT_TIMEOUT_LOCKED	600	/* 10 minutes */
40 
41 static int timeout_locked = HL_DEFAULT_TIMEOUT_LOCKED;
42 static int reset_on_lockup = 1;
43 static int memory_scrub;
44 static ulong boot_error_status_mask = ULONG_MAX;
45 
46 module_param(timeout_locked, int, 0444);
47 MODULE_PARM_DESC(timeout_locked,
48 	"Device lockup timeout in seconds (0 = disabled, default 30s)");
49 
50 module_param(reset_on_lockup, int, 0444);
51 MODULE_PARM_DESC(reset_on_lockup,
52 	"Do device reset on lockup (0 = no, 1 = yes, default yes)");
53 
54 module_param(memory_scrub, int, 0444);
55 MODULE_PARM_DESC(memory_scrub,
56 	"Scrub device memory in various states (0 = no, 1 = yes, default no)");
57 
58 module_param(boot_error_status_mask, ulong, 0444);
59 MODULE_PARM_DESC(boot_error_status_mask,
60 	"Mask of the error status during device CPU boot (If bitX is cleared then error X is masked. Default all 1's)");
61 
62 #define PCI_IDS_GOYA			0x0001
63 #define PCI_IDS_GAUDI			0x1000
64 #define PCI_IDS_GAUDI_SEC		0x1010
65 
66 #define PCI_IDS_GAUDI2			0x1020
67 
68 static const struct pci_device_id ids[] = {
69 	{ PCI_DEVICE(PCI_VENDOR_ID_HABANALABS, PCI_IDS_GOYA), },
70 	{ PCI_DEVICE(PCI_VENDOR_ID_HABANALABS, PCI_IDS_GAUDI), },
71 	{ PCI_DEVICE(PCI_VENDOR_ID_HABANALABS, PCI_IDS_GAUDI_SEC), },
72 	{ PCI_DEVICE(PCI_VENDOR_ID_HABANALABS, PCI_IDS_GAUDI2), },
73 	{ 0, }
74 };
75 MODULE_DEVICE_TABLE(pci, ids);
76 
77 static const struct drm_ioctl_desc hl_drm_ioctls[] = {
78 	DRM_IOCTL_DEF_DRV(HL_INFO, hl_info_ioctl, 0),
79 	DRM_IOCTL_DEF_DRV(HL_CB, hl_cb_ioctl, 0),
80 	DRM_IOCTL_DEF_DRV(HL_CS, hl_cs_ioctl, 0),
81 	DRM_IOCTL_DEF_DRV(HL_WAIT_CS, hl_wait_ioctl, 0),
82 	DRM_IOCTL_DEF_DRV(HL_MEMORY, hl_mem_ioctl, 0),
83 	DRM_IOCTL_DEF_DRV(HL_DEBUG, hl_debug_ioctl, 0),
84 };
85 
86 static const struct file_operations hl_fops = {
87 	.owner = THIS_MODULE,
88 	.open = accel_open,
89 	.release = drm_release,
90 	.unlocked_ioctl = drm_ioctl,
91 	.compat_ioctl = drm_compat_ioctl,
92 	.llseek = noop_llseek,
93 	.mmap = hl_mmap
94 };
95 
96 static const struct drm_driver hl_driver = {
97 	.driver_features = DRIVER_COMPUTE_ACCEL,
98 
99 	.name = HL_NAME,
100 	.desc = HL_DRIVER_DESC,
101 	.major = LINUX_VERSION_MAJOR,
102 	.minor = LINUX_VERSION_PATCHLEVEL,
103 	.patchlevel = LINUX_VERSION_SUBLEVEL,
104 	.date = "20190505",
105 
106 	.fops = &hl_fops,
107 	.open = hl_device_open,
108 	.postclose = hl_device_release,
109 	.ioctls = hl_drm_ioctls,
110 	.num_ioctls = ARRAY_SIZE(hl_drm_ioctls)
111 };
112 
113 /*
114  * get_asic_type - translate device id to asic type
115  *
116  * @hdev: pointer to habanalabs device structure.
117  *
118  * Translate device id and revision id to asic type.
119  * In case of unidentified device, return -1
120  */
121 static enum hl_asic_type get_asic_type(struct hl_device *hdev)
122 {
123 	struct pci_dev *pdev = hdev->pdev;
124 	enum hl_asic_type asic_type = ASIC_INVALID;
125 
126 	switch (pdev->device) {
127 	case PCI_IDS_GOYA:
128 		asic_type = ASIC_GOYA;
129 		break;
130 	case PCI_IDS_GAUDI:
131 		asic_type = ASIC_GAUDI;
132 		break;
133 	case PCI_IDS_GAUDI_SEC:
134 		asic_type = ASIC_GAUDI_SEC;
135 		break;
136 	case PCI_IDS_GAUDI2:
137 		switch (pdev->revision) {
138 		case REV_ID_A:
139 			asic_type = ASIC_GAUDI2;
140 			break;
141 		case REV_ID_B:
142 			asic_type = ASIC_GAUDI2B;
143 			break;
144 		case REV_ID_C:
145 			asic_type = ASIC_GAUDI2C;
146 			break;
147 		default:
148 			break;
149 		}
150 		break;
151 	default:
152 		break;
153 	}
154 
155 	return asic_type;
156 }
157 
158 static bool is_asic_secured(enum hl_asic_type asic_type)
159 {
160 	switch (asic_type) {
161 	case ASIC_GAUDI_SEC:
162 		return true;
163 	default:
164 		return false;
165 	}
166 }
167 
168 /*
169  * hl_device_open() - open function for habanalabs device.
170  * @ddev: pointer to DRM device structure.
171  * @file: pointer to DRM file private data structure.
172  *
173  * Called when process opens an habanalabs device.
174  */
175 int hl_device_open(struct drm_device *ddev, struct drm_file *file_priv)
176 {
177 	struct hl_device *hdev = to_hl_device(ddev);
178 	enum hl_device_status status;
179 	struct hl_fpriv *hpriv;
180 	int rc;
181 
182 	hpriv = kzalloc(sizeof(*hpriv), GFP_KERNEL);
183 	if (!hpriv)
184 		return -ENOMEM;
185 
186 	hpriv->hdev = hdev;
187 	mutex_init(&hpriv->notifier_event.lock);
188 	mutex_init(&hpriv->restore_phase_mutex);
189 	mutex_init(&hpriv->ctx_lock);
190 	kref_init(&hpriv->refcount);
191 
192 	hl_ctx_mgr_init(&hpriv->ctx_mgr);
193 	hl_mem_mgr_init(hpriv->hdev->dev, &hpriv->mem_mgr);
194 
195 	hpriv->taskpid = get_task_pid(current, PIDTYPE_PID);
196 
197 	mutex_lock(&hdev->fpriv_list_lock);
198 
199 	if (!hl_device_operational(hdev, &status)) {
200 		dev_dbg_ratelimited(hdev->dev,
201 			"Can't open %s because it is %s\n",
202 			dev_name(hdev->dev), hdev->status[status]);
203 
204 		if (status == HL_DEVICE_STATUS_IN_RESET ||
205 					status == HL_DEVICE_STATUS_IN_RESET_AFTER_DEVICE_RELEASE)
206 			rc = -EAGAIN;
207 		else
208 			rc = -EPERM;
209 
210 		goto out_err;
211 	}
212 
213 	if (hdev->is_in_dram_scrub) {
214 		dev_dbg_ratelimited(hdev->dev,
215 			"Can't open %s during dram scrub\n",
216 			dev_name(hdev->dev));
217 		rc = -EAGAIN;
218 		goto out_err;
219 	}
220 
221 	if (hdev->compute_ctx_in_release) {
222 		dev_dbg_ratelimited(hdev->dev,
223 			"Can't open %s because another user is still releasing it\n",
224 			dev_name(hdev->dev));
225 		rc = -EAGAIN;
226 		goto out_err;
227 	}
228 
229 	if (hdev->is_compute_ctx_active) {
230 		dev_dbg_ratelimited(hdev->dev,
231 			"Can't open %s because another user is working on it\n",
232 			dev_name(hdev->dev));
233 		rc = -EBUSY;
234 		goto out_err;
235 	}
236 
237 	rc = hl_ctx_create(hdev, hpriv);
238 	if (rc) {
239 		dev_err(hdev->dev, "Failed to create context %d\n", rc);
240 		goto out_err;
241 	}
242 
243 	list_add(&hpriv->dev_node, &hdev->fpriv_list);
244 	mutex_unlock(&hdev->fpriv_list_lock);
245 
246 	hdev->asic_funcs->send_device_activity(hdev, true);
247 
248 	hl_debugfs_add_file(hpriv);
249 
250 	hl_enable_err_info_capture(&hdev->captured_err_info);
251 
252 	hdev->open_counter++;
253 	hdev->last_successful_open_jif = jiffies;
254 	hdev->last_successful_open_ktime = ktime_get();
255 
256 	file_priv->driver_priv = hpriv;
257 	hpriv->file_priv = file_priv;
258 
259 	return 0;
260 
261 out_err:
262 	mutex_unlock(&hdev->fpriv_list_lock);
263 	hl_mem_mgr_fini(&hpriv->mem_mgr);
264 	hl_mem_mgr_idr_destroy(&hpriv->mem_mgr);
265 	hl_ctx_mgr_fini(hpriv->hdev, &hpriv->ctx_mgr);
266 	mutex_destroy(&hpriv->ctx_lock);
267 	mutex_destroy(&hpriv->restore_phase_mutex);
268 	mutex_destroy(&hpriv->notifier_event.lock);
269 	put_pid(hpriv->taskpid);
270 
271 	kfree(hpriv);
272 
273 	return rc;
274 }
275 
276 int hl_device_open_ctrl(struct inode *inode, struct file *filp)
277 {
278 	struct hl_device *hdev;
279 	struct hl_fpriv *hpriv;
280 	int rc;
281 
282 	mutex_lock(&hl_devs_idr_lock);
283 	hdev = idr_find(&hl_devs_idr, iminor(inode));
284 	mutex_unlock(&hl_devs_idr_lock);
285 
286 	if (!hdev) {
287 		pr_err("Couldn't find device %d:%d\n",
288 			imajor(inode), iminor(inode));
289 		return -ENXIO;
290 	}
291 
292 	hpriv = kzalloc(sizeof(*hpriv), GFP_KERNEL);
293 	if (!hpriv)
294 		return -ENOMEM;
295 
296 	/* Prevent other routines from reading partial hpriv data by
297 	 * initializing hpriv fields before inserting it to the list
298 	 */
299 	hpriv->hdev = hdev;
300 	filp->private_data = hpriv;
301 
302 	nonseekable_open(inode, filp);
303 
304 	hpriv->taskpid = get_task_pid(current, PIDTYPE_PID);
305 
306 	mutex_lock(&hdev->fpriv_ctrl_list_lock);
307 
308 	if (!hl_ctrl_device_operational(hdev, NULL)) {
309 		dev_dbg_ratelimited(hdev->dev_ctrl,
310 			"Can't open %s because it is disabled\n",
311 			dev_name(hdev->dev_ctrl));
312 		rc = -EPERM;
313 		goto out_err;
314 	}
315 
316 	list_add(&hpriv->dev_node, &hdev->fpriv_ctrl_list);
317 	mutex_unlock(&hdev->fpriv_ctrl_list_lock);
318 
319 	return 0;
320 
321 out_err:
322 	mutex_unlock(&hdev->fpriv_ctrl_list_lock);
323 	filp->private_data = NULL;
324 	put_pid(hpriv->taskpid);
325 
326 	kfree(hpriv);
327 
328 	return rc;
329 }
330 
331 static void set_driver_behavior_per_device(struct hl_device *hdev)
332 {
333 	hdev->nic_ports_mask = 0;
334 	hdev->fw_components = FW_TYPE_ALL_TYPES;
335 	hdev->cpu_queues_enable = 1;
336 	hdev->pldm = 0;
337 	hdev->hard_reset_on_fw_events = 1;
338 	hdev->bmc_enable = 1;
339 	hdev->reset_on_preboot_fail = 1;
340 	hdev->heartbeat = 1;
341 }
342 
343 static void copy_kernel_module_params_to_device(struct hl_device *hdev)
344 {
345 	hdev->asic_prop.fw_security_enabled = is_asic_secured(hdev->asic_type);
346 
347 	hdev->major = hl_major;
348 	hdev->memory_scrub = memory_scrub;
349 	hdev->reset_on_lockup = reset_on_lockup;
350 	hdev->boot_error_status_mask = boot_error_status_mask;
351 }
352 
353 static void fixup_device_params_per_asic(struct hl_device *hdev, int timeout)
354 {
355 	switch (hdev->asic_type) {
356 	case ASIC_GAUDI:
357 	case ASIC_GAUDI_SEC:
358 		/* If user didn't request a different timeout than the default one, we have
359 		 * a different default timeout for Gaudi
360 		 */
361 		if (timeout == HL_DEFAULT_TIMEOUT_LOCKED)
362 			hdev->timeout_jiffies = msecs_to_jiffies(GAUDI_DEFAULT_TIMEOUT_LOCKED *
363 										MSEC_PER_SEC);
364 
365 		hdev->reset_upon_device_release = 0;
366 		break;
367 
368 	case ASIC_GOYA:
369 		hdev->reset_upon_device_release = 0;
370 		break;
371 
372 	default:
373 		hdev->reset_upon_device_release = 1;
374 		break;
375 	}
376 }
377 
378 static int fixup_device_params(struct hl_device *hdev)
379 {
380 	int tmp_timeout;
381 
382 	tmp_timeout = timeout_locked;
383 
384 	hdev->fw_poll_interval_usec = HL_FW_STATUS_POLL_INTERVAL_USEC;
385 	hdev->fw_comms_poll_interval_usec = HL_FW_STATUS_POLL_INTERVAL_USEC;
386 
387 	if (tmp_timeout)
388 		hdev->timeout_jiffies = msecs_to_jiffies(tmp_timeout * MSEC_PER_SEC);
389 	else
390 		hdev->timeout_jiffies = MAX_SCHEDULE_TIMEOUT;
391 
392 	hdev->stop_on_err = true;
393 	hdev->reset_info.curr_reset_cause = HL_RESET_CAUSE_UNKNOWN;
394 	hdev->reset_info.prev_reset_trigger = HL_RESET_TRIGGER_DEFAULT;
395 
396 	/* Enable only after the initialization of the device */
397 	hdev->disabled = true;
398 
399 	if (!(hdev->fw_components & FW_TYPE_PREBOOT_CPU) &&
400 			(hdev->fw_components & ~FW_TYPE_PREBOOT_CPU)) {
401 		pr_err("Preboot must be set along with other components");
402 		return -EINVAL;
403 	}
404 
405 	/* If CPU queues not enabled, no way to do heartbeat */
406 	if (!hdev->cpu_queues_enable)
407 		hdev->heartbeat = 0;
408 	fixup_device_params_per_asic(hdev, tmp_timeout);
409 
410 	return 0;
411 }
412 
413 static int allocate_device_id(struct hl_device *hdev)
414 {
415 	int id;
416 
417 	mutex_lock(&hl_devs_idr_lock);
418 	id = idr_alloc(&hl_devs_idr, hdev, 0, HL_MAX_MINORS, GFP_KERNEL);
419 	mutex_unlock(&hl_devs_idr_lock);
420 
421 	if (id < 0) {
422 		if (id == -ENOSPC)
423 			pr_err("too many devices in the system\n");
424 		return -EBUSY;
425 	}
426 
427 	hdev->id = id;
428 
429 	/*
430 	 * Firstly initialized with the internal device ID.
431 	 * Will be updated later after the DRM device registration to hold the minor ID.
432 	 */
433 	hdev->cdev_idx = hdev->id;
434 
435 	return 0;
436 }
437 
438 /**
439  * create_hdev - create habanalabs device instance
440  *
441  * @dev: will hold the pointer to the new habanalabs device structure
442  * @pdev: pointer to the pci device
443  *
444  * Allocate memory for habanalabs device and initialize basic fields
445  * Identify the ASIC type
446  * Allocate ID (minor) for the device (only for real devices)
447  */
448 static int create_hdev(struct hl_device **dev, struct pci_dev *pdev)
449 {
450 	struct hl_device *hdev;
451 	int rc;
452 
453 	*dev = NULL;
454 
455 	hdev = devm_drm_dev_alloc(&pdev->dev, &hl_driver, struct hl_device, drm);
456 	if (IS_ERR(hdev))
457 		return PTR_ERR(hdev);
458 
459 	hdev->dev = hdev->drm.dev;
460 
461 	/* Will be NULL in case of simulator device */
462 	hdev->pdev = pdev;
463 
464 	/* Assign status description string */
465 	strscpy(hdev->status[HL_DEVICE_STATUS_OPERATIONAL], "operational", HL_STR_MAX);
466 	strscpy(hdev->status[HL_DEVICE_STATUS_IN_RESET], "in reset", HL_STR_MAX);
467 	strscpy(hdev->status[HL_DEVICE_STATUS_MALFUNCTION], "disabled", HL_STR_MAX);
468 	strscpy(hdev->status[HL_DEVICE_STATUS_NEEDS_RESET], "needs reset", HL_STR_MAX);
469 	strscpy(hdev->status[HL_DEVICE_STATUS_IN_DEVICE_CREATION],
470 				"in device creation", HL_STR_MAX);
471 	strscpy(hdev->status[HL_DEVICE_STATUS_IN_RESET_AFTER_DEVICE_RELEASE],
472 				"in reset after device release", HL_STR_MAX);
473 
474 
475 	/* First, we must find out which ASIC are we handling. This is needed
476 	 * to configure the behavior of the driver (kernel parameters)
477 	 */
478 	hdev->asic_type = get_asic_type(hdev);
479 	if (hdev->asic_type == ASIC_INVALID) {
480 		dev_err(&pdev->dev, "Unsupported ASIC\n");
481 		rc = -ENODEV;
482 		goto out_err;
483 	}
484 
485 	copy_kernel_module_params_to_device(hdev);
486 
487 	set_driver_behavior_per_device(hdev);
488 
489 	fixup_device_params(hdev);
490 
491 	rc = allocate_device_id(hdev);
492 	if (rc)
493 		goto out_err;
494 
495 	*dev = hdev;
496 
497 	return 0;
498 
499 out_err:
500 	return rc;
501 }
502 
503 /*
504  * destroy_hdev - destroy habanalabs device instance
505  *
506  * @dev: pointer to the habanalabs device structure
507  *
508  */
509 static void destroy_hdev(struct hl_device *hdev)
510 {
511 	/* Remove device from the device list */
512 	mutex_lock(&hl_devs_idr_lock);
513 	idr_remove(&hl_devs_idr, hdev->id);
514 	mutex_unlock(&hl_devs_idr_lock);
515 
516 }
517 
/*
 * hl_pmops_suspend - power-management suspend callback
 *
 * @dev: the generic device whose driver data holds the habanalabs device.
 *
 * Return: the result of hl_device_suspend(), or 0 if no habanalabs device is
 * attached to @dev.
 */
static int hl_pmops_suspend(struct device *dev)
{
	struct hl_device *hdev = dev_get_drvdata(dev);

	pr_debug("Going to suspend PCI device\n");

	if (hdev)
		return hl_device_suspend(hdev);

	/* Nothing to suspend; report it but don't fail the system suspend */
	pr_err("device pointer is NULL in suspend\n");

	return 0;
}
531 
/*
 * hl_pmops_resume - power-management resume callback
 *
 * @dev: the generic device whose driver data holds the habanalabs device.
 *
 * Return: the result of hl_device_resume(), or 0 if no habanalabs device is
 * attached to @dev.
 */
static int hl_pmops_resume(struct device *dev)
{
	struct hl_device *hdev = dev_get_drvdata(dev);

	pr_debug("Going to resume PCI device\n");

	if (hdev)
		return hl_device_resume(hdev);

	/* Nothing to resume; report it but don't fail the system resume */
	pr_err("device pointer is NULL in resume\n");

	return 0;
}
545 
546 /**
547  * hl_pci_probe - probe PCI habanalabs devices
548  *
549  * @pdev: pointer to pci device
550  * @id: pointer to pci device id structure
551  *
552  * Standard PCI probe function for habanalabs device.
553  * Create a new habanalabs device and initialize it according to the
554  * device's type
555  */
556 static int hl_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id)
557 {
558 	struct hl_device *hdev;
559 	int rc;
560 
561 	dev_info(&pdev->dev, HL_NAME
562 		 " device found [%04x:%04x] (rev %x)\n",
563 		 (int)pdev->vendor, (int)pdev->device, (int)pdev->revision);
564 
565 	rc = create_hdev(&hdev, pdev);
566 	if (rc)
567 		return rc;
568 
569 	pci_set_drvdata(pdev, hdev);
570 
571 	rc = hl_device_init(hdev);
572 	if (rc) {
573 		dev_err(&pdev->dev, "Fatal error during habanalabs device init\n");
574 		rc = -ENODEV;
575 		goto disable_device;
576 	}
577 
578 	return 0;
579 
580 disable_device:
581 	pci_set_drvdata(pdev, NULL);
582 	destroy_hdev(hdev);
583 
584 	return rc;
585 }
586 
587 /*
588  * hl_pci_remove - remove PCI habanalabs devices
589  *
590  * @pdev: pointer to pci device
591  *
592  * Standard PCI remove function for habanalabs device
593  */
594 static void hl_pci_remove(struct pci_dev *pdev)
595 {
596 	struct hl_device *hdev;
597 
598 	hdev = pci_get_drvdata(pdev);
599 	if (!hdev)
600 		return;
601 
602 	hl_device_fini(hdev);
603 	pci_set_drvdata(pdev, NULL);
604 	destroy_hdev(hdev);
605 }
606 
607 /**
608  * hl_pci_err_detected - a PCI bus error detected on this device
609  *
610  * @pdev: pointer to pci device
611  * @state: PCI error type
612  *
613  * Called by the PCI subsystem whenever a non-correctable
614  * PCI bus error is detected
615  */
616 static pci_ers_result_t
617 hl_pci_err_detected(struct pci_dev *pdev, pci_channel_state_t state)
618 {
619 	struct hl_device *hdev = pci_get_drvdata(pdev);
620 	enum pci_ers_result result;
621 
622 	switch (state) {
623 	case pci_channel_io_normal:
624 		dev_warn(hdev->dev, "PCI normal state error detected\n");
625 		return PCI_ERS_RESULT_CAN_RECOVER;
626 
627 	case pci_channel_io_frozen:
628 		dev_warn(hdev->dev, "PCI frozen state error detected\n");
629 		result = PCI_ERS_RESULT_NEED_RESET;
630 		break;
631 
632 	case pci_channel_io_perm_failure:
633 		dev_warn(hdev->dev, "PCI failure state error detected\n");
634 		result = PCI_ERS_RESULT_DISCONNECT;
635 		break;
636 
637 	default:
638 		result = PCI_ERS_RESULT_NONE;
639 	}
640 
641 	hdev->asic_funcs->halt_engines(hdev, true, false);
642 
643 	return result;
644 }
645 
646 /**
647  * hl_pci_err_resume - resume after a PCI slot reset
648  *
649  * @pdev: pointer to pci device
650  *
651  */
652 static void hl_pci_err_resume(struct pci_dev *pdev)
653 {
654 	struct hl_device *hdev = pci_get_drvdata(pdev);
655 
656 	dev_warn(hdev->dev, "Resuming device after PCI slot reset\n");
657 	hl_device_resume(hdev);
658 }
659 
660 /**
661  * hl_pci_err_slot_reset - a PCI slot reset has just happened
662  *
663  * @pdev: pointer to pci device
664  *
665  * Determine if the driver can recover from the PCI slot reset
666  */
667 static pci_ers_result_t hl_pci_err_slot_reset(struct pci_dev *pdev)
668 {
669 	struct hl_device *hdev = pci_get_drvdata(pdev);
670 
671 	dev_warn(hdev->dev, "PCI slot reset detected\n");
672 
673 	return PCI_ERS_RESULT_RECOVERED;
674 }
675 
676 static void hl_pci_reset_prepare(struct pci_dev *pdev)
677 {
678 	struct hl_device *hdev;
679 
680 	hdev = pci_get_drvdata(pdev);
681 	if (!hdev)
682 		return;
683 
684 	hdev->disabled = true;
685 }
686 
687 static void hl_pci_reset_done(struct pci_dev *pdev)
688 {
689 	struct hl_device *hdev;
690 	u32 flags;
691 
692 	hdev = pci_get_drvdata(pdev);
693 	if (!hdev)
694 		return;
695 
696 	/*
697 	 * Schedule a thread to trigger hard reset.
698 	 * The reason for this handler, is for rare cases where the driver is up
699 	 * and FLR occurs. This is valid only when working with no VM, so FW handles FLR
700 	 * and resets the device. FW will go back preboot stage, so driver needs to perform
701 	 * hard reset in order to load FW fit again.
702 	 */
703 	flags = HL_DRV_RESET_HARD | HL_DRV_RESET_BYPASS_REQ_TO_FW;
704 
705 	hl_device_reset(hdev, flags);
706 }
707 
708 static const struct dev_pm_ops hl_pm_ops = {
709 	.suspend = hl_pmops_suspend,
710 	.resume = hl_pmops_resume,
711 };
712 
713 static const struct pci_error_handlers hl_pci_err_handler = {
714 	.error_detected = hl_pci_err_detected,
715 	.slot_reset = hl_pci_err_slot_reset,
716 	.resume = hl_pci_err_resume,
717 	.reset_prepare = hl_pci_reset_prepare,
718 	.reset_done = hl_pci_reset_done,
719 };
720 
721 static struct pci_driver hl_pci_driver = {
722 	.name = HL_NAME,
723 	.id_table = ids,
724 	.probe = hl_pci_probe,
725 	.remove = hl_pci_remove,
726 	.shutdown = hl_pci_remove,
727 	.driver = {
728 		.name = HL_NAME,
729 		.pm = &hl_pm_ops,
730 		.probe_type = PROBE_PREFER_ASYNCHRONOUS,
731 	},
732 	.err_handler = &hl_pci_err_handler,
733 };
734 
735 /*
736  * hl_init - Initialize the habanalabs kernel driver
737  */
738 static int __init hl_init(void)
739 {
740 	int rc;
741 	dev_t dev;
742 
743 	pr_info("loading driver\n");
744 
745 	rc = alloc_chrdev_region(&dev, 0, HL_MAX_MINORS, HL_NAME);
746 	if (rc < 0) {
747 		pr_err("unable to get major\n");
748 		return rc;
749 	}
750 
751 	hl_major = MAJOR(dev);
752 
753 	rc = pci_register_driver(&hl_pci_driver);
754 	if (rc) {
755 		pr_err("failed to register pci device\n");
756 		goto remove_major;
757 	}
758 
759 	pr_debug("driver loaded\n");
760 
761 	return 0;
762 
763 remove_major:
764 	unregister_chrdev_region(MKDEV(hl_major, 0), HL_MAX_MINORS);
765 	return rc;
766 }
767 
768 /*
769  * hl_exit - Release all resources of the habanalabs kernel driver
770  */
771 static void __exit hl_exit(void)
772 {
773 	pci_unregister_driver(&hl_pci_driver);
774 
775 	unregister_chrdev_region(MKDEV(hl_major, 0), HL_MAX_MINORS);
776 
777 	idr_destroy(&hl_devs_idr);
778 
779 	pr_debug("driver removed\n");
780 }
781 
782 module_init(hl_init);
783 module_exit(hl_exit);
784