xref: /linux/drivers/accel/habanalabs/common/habanalabs_drv.c (revision 335bbdf01d25517ae832ac1807fd8323c1f4f3b9)
1 // SPDX-License-Identifier: GPL-2.0
2 
3 /*
4  * Copyright 2016-2021 HabanaLabs, Ltd.
5  * All Rights Reserved.
6  *
7  */
8 
9 #define pr_fmt(fmt)		"habanalabs: " fmt
10 
11 #include "habanalabs.h"
12 #include "../include/hw_ip/pci/pci_general.h"
13 
14 #include <linux/pci.h>
15 #include <linux/module.h>
16 #include <linux/vmalloc.h>
17 #include <linux/version.h>
18 
19 #include <drm/drm_accel.h>
20 #include <drm/drm_drv.h>
21 #include <drm/drm_ioctl.h>
22 
23 #define CREATE_TRACE_POINTS
24 #include <trace/events/habanalabs.h>
25 
26 #define HL_DRIVER_AUTHOR	"HabanaLabs Kernel Driver Team"
27 
28 #define HL_DRIVER_DESC		"Driver for HabanaLabs's AI Accelerators"
29 
30 MODULE_AUTHOR(HL_DRIVER_AUTHOR);
31 MODULE_DESCRIPTION(HL_DRIVER_DESC);
32 MODULE_LICENSE("GPL v2");
33 
34 static int hl_major;
35 static DEFINE_IDR(hl_devs_idr);
36 static DEFINE_MUTEX(hl_devs_idr_lock);
37 
38 #define HL_DEFAULT_TIMEOUT_LOCKED	30	/* 30 seconds */
39 #define GAUDI_DEFAULT_TIMEOUT_LOCKED	600	/* 10 minutes */
40 
41 static int timeout_locked = HL_DEFAULT_TIMEOUT_LOCKED;
42 static int reset_on_lockup = 1;
43 static int memory_scrub;
44 static ulong boot_error_status_mask = ULONG_MAX;
45 
46 module_param(timeout_locked, int, 0444);
47 MODULE_PARM_DESC(timeout_locked,
48 	"Device lockup timeout in seconds (0 = disabled, default 30s)");
49 
50 module_param(reset_on_lockup, int, 0444);
51 MODULE_PARM_DESC(reset_on_lockup,
52 	"Do device reset on lockup (0 = no, 1 = yes, default yes)");
53 
54 module_param(memory_scrub, int, 0444);
55 MODULE_PARM_DESC(memory_scrub,
56 	"Scrub device memory in various states (0 = no, 1 = yes, default no)");
57 
58 module_param(boot_error_status_mask, ulong, 0444);
59 MODULE_PARM_DESC(boot_error_status_mask,
60 	"Mask of the error status during device CPU boot (If bitX is cleared then error X is masked. Default all 1's)");
61 
62 #define PCI_IDS_GOYA			0x0001
63 #define PCI_IDS_GAUDI			0x1000
64 #define PCI_IDS_GAUDI_SEC		0x1010
65 
66 #define PCI_IDS_GAUDI2			0x1020
67 
68 static const struct pci_device_id ids[] = {
69 	{ PCI_DEVICE(PCI_VENDOR_ID_HABANALABS, PCI_IDS_GOYA), },
70 	{ PCI_DEVICE(PCI_VENDOR_ID_HABANALABS, PCI_IDS_GAUDI), },
71 	{ PCI_DEVICE(PCI_VENDOR_ID_HABANALABS, PCI_IDS_GAUDI_SEC), },
72 	{ PCI_DEVICE(PCI_VENDOR_ID_HABANALABS, PCI_IDS_GAUDI2), },
73 	{ 0, }
74 };
75 MODULE_DEVICE_TABLE(pci, ids);
76 
77 static const struct drm_ioctl_desc hl_drm_ioctls[] = {
78 	DRM_IOCTL_DEF_DRV(HL_INFO, hl_info_ioctl, 0),
79 	DRM_IOCTL_DEF_DRV(HL_CB, hl_cb_ioctl, 0),
80 	DRM_IOCTL_DEF_DRV(HL_CS, hl_cs_ioctl, 0),
81 	DRM_IOCTL_DEF_DRV(HL_WAIT_CS, hl_wait_ioctl, 0),
82 	DRM_IOCTL_DEF_DRV(HL_MEMORY, hl_mem_ioctl, 0),
83 	DRM_IOCTL_DEF_DRV(HL_DEBUG, hl_debug_ioctl, 0),
84 };
85 
86 static const struct file_operations hl_fops = {
87 	.owner = THIS_MODULE,
88 	.open = accel_open,
89 	.release = drm_release,
90 	.unlocked_ioctl = drm_ioctl,
91 	.compat_ioctl = drm_compat_ioctl,
92 	.llseek = noop_llseek,
93 	.mmap = hl_mmap
94 };
95 
96 static const struct drm_driver hl_driver = {
97 	.driver_features = DRIVER_COMPUTE_ACCEL,
98 
99 	.name = HL_NAME,
100 	.desc = HL_DRIVER_DESC,
101 	.major = LINUX_VERSION_MAJOR,
102 	.minor = LINUX_VERSION_PATCHLEVEL,
103 	.patchlevel = LINUX_VERSION_SUBLEVEL,
104 	.date = "20190505",
105 
106 	.fops = &hl_fops,
107 	.open = hl_device_open,
108 	.postclose = hl_device_release,
109 	.ioctls = hl_drm_ioctls,
110 	.num_ioctls = ARRAY_SIZE(hl_drm_ioctls)
111 };
112 
113 /*
114  * get_asic_type - translate device id to asic type
115  *
116  * @hdev: pointer to habanalabs device structure.
117  *
118  * Translate device id and revision id to asic type.
119  * In case of unidentified device, return -1
120  */
121 static enum hl_asic_type get_asic_type(struct hl_device *hdev)
122 {
123 	struct pci_dev *pdev = hdev->pdev;
124 	enum hl_asic_type asic_type = ASIC_INVALID;
125 
126 	switch (pdev->device) {
127 	case PCI_IDS_GOYA:
128 		asic_type = ASIC_GOYA;
129 		break;
130 	case PCI_IDS_GAUDI:
131 		asic_type = ASIC_GAUDI;
132 		break;
133 	case PCI_IDS_GAUDI_SEC:
134 		asic_type = ASIC_GAUDI_SEC;
135 		break;
136 	case PCI_IDS_GAUDI2:
137 		switch (pdev->revision) {
138 		case REV_ID_A:
139 			asic_type = ASIC_GAUDI2;
140 			break;
141 		case REV_ID_B:
142 			asic_type = ASIC_GAUDI2B;
143 			break;
144 		default:
145 			break;
146 		}
147 		break;
148 	default:
149 		break;
150 	}
151 
152 	return asic_type;
153 }
154 
155 static bool is_asic_secured(enum hl_asic_type asic_type)
156 {
157 	switch (asic_type) {
158 	case ASIC_GAUDI_SEC:
159 		return true;
160 	default:
161 		return false;
162 	}
163 }
164 
165 /*
166  * hl_device_open() - open function for habanalabs device.
167  * @ddev: pointer to DRM device structure.
168  * @file: pointer to DRM file private data structure.
169  *
170  * Called when process opens an habanalabs device.
171  */
172 int hl_device_open(struct drm_device *ddev, struct drm_file *file_priv)
173 {
174 	struct hl_device *hdev = to_hl_device(ddev);
175 	enum hl_device_status status;
176 	struct hl_fpriv *hpriv;
177 	int rc;
178 
179 	hpriv = kzalloc(sizeof(*hpriv), GFP_KERNEL);
180 	if (!hpriv)
181 		return -ENOMEM;
182 
183 	hpriv->hdev = hdev;
184 	mutex_init(&hpriv->notifier_event.lock);
185 	mutex_init(&hpriv->restore_phase_mutex);
186 	mutex_init(&hpriv->ctx_lock);
187 	kref_init(&hpriv->refcount);
188 
189 	hl_ctx_mgr_init(&hpriv->ctx_mgr);
190 	hl_mem_mgr_init(hpriv->hdev->dev, &hpriv->mem_mgr);
191 
192 	hpriv->taskpid = get_task_pid(current, PIDTYPE_PID);
193 
194 	mutex_lock(&hdev->fpriv_list_lock);
195 
196 	if (!hl_device_operational(hdev, &status)) {
197 		dev_dbg_ratelimited(hdev->dev,
198 			"Can't open %s because it is %s\n",
199 			dev_name(hdev->dev), hdev->status[status]);
200 
201 		if (status == HL_DEVICE_STATUS_IN_RESET ||
202 					status == HL_DEVICE_STATUS_IN_RESET_AFTER_DEVICE_RELEASE)
203 			rc = -EAGAIN;
204 		else
205 			rc = -EPERM;
206 
207 		goto out_err;
208 	}
209 
210 	if (hdev->is_in_dram_scrub) {
211 		dev_dbg_ratelimited(hdev->dev,
212 			"Can't open %s during dram scrub\n",
213 			dev_name(hdev->dev));
214 		rc = -EAGAIN;
215 		goto out_err;
216 	}
217 
218 	if (hdev->compute_ctx_in_release) {
219 		dev_dbg_ratelimited(hdev->dev,
220 			"Can't open %s because another user is still releasing it\n",
221 			dev_name(hdev->dev));
222 		rc = -EAGAIN;
223 		goto out_err;
224 	}
225 
226 	if (hdev->is_compute_ctx_active) {
227 		dev_dbg_ratelimited(hdev->dev,
228 			"Can't open %s because another user is working on it\n",
229 			dev_name(hdev->dev));
230 		rc = -EBUSY;
231 		goto out_err;
232 	}
233 
234 	rc = hl_ctx_create(hdev, hpriv);
235 	if (rc) {
236 		dev_err(hdev->dev, "Failed to create context %d\n", rc);
237 		goto out_err;
238 	}
239 
240 	list_add(&hpriv->dev_node, &hdev->fpriv_list);
241 	mutex_unlock(&hdev->fpriv_list_lock);
242 
243 	hdev->asic_funcs->send_device_activity(hdev, true);
244 
245 	hl_debugfs_add_file(hpriv);
246 
247 	hl_enable_err_info_capture(&hdev->captured_err_info);
248 
249 	hdev->open_counter++;
250 	hdev->last_successful_open_jif = jiffies;
251 	hdev->last_successful_open_ktime = ktime_get();
252 
253 	file_priv->driver_priv = hpriv;
254 	hpriv->file_priv = file_priv;
255 
256 	return 0;
257 
258 out_err:
259 	mutex_unlock(&hdev->fpriv_list_lock);
260 	hl_mem_mgr_fini(&hpriv->mem_mgr);
261 	hl_mem_mgr_idr_destroy(&hpriv->mem_mgr);
262 	hl_ctx_mgr_fini(hpriv->hdev, &hpriv->ctx_mgr);
263 	mutex_destroy(&hpriv->ctx_lock);
264 	mutex_destroy(&hpriv->restore_phase_mutex);
265 	mutex_destroy(&hpriv->notifier_event.lock);
266 	put_pid(hpriv->taskpid);
267 
268 	kfree(hpriv);
269 
270 	return rc;
271 }
272 
273 int hl_device_open_ctrl(struct inode *inode, struct file *filp)
274 {
275 	struct hl_device *hdev;
276 	struct hl_fpriv *hpriv;
277 	int rc;
278 
279 	mutex_lock(&hl_devs_idr_lock);
280 	hdev = idr_find(&hl_devs_idr, iminor(inode));
281 	mutex_unlock(&hl_devs_idr_lock);
282 
283 	if (!hdev) {
284 		pr_err("Couldn't find device %d:%d\n",
285 			imajor(inode), iminor(inode));
286 		return -ENXIO;
287 	}
288 
289 	hpriv = kzalloc(sizeof(*hpriv), GFP_KERNEL);
290 	if (!hpriv)
291 		return -ENOMEM;
292 
293 	/* Prevent other routines from reading partial hpriv data by
294 	 * initializing hpriv fields before inserting it to the list
295 	 */
296 	hpriv->hdev = hdev;
297 	filp->private_data = hpriv;
298 
299 	nonseekable_open(inode, filp);
300 
301 	hpriv->taskpid = get_task_pid(current, PIDTYPE_PID);
302 
303 	mutex_lock(&hdev->fpriv_ctrl_list_lock);
304 
305 	if (!hl_ctrl_device_operational(hdev, NULL)) {
306 		dev_dbg_ratelimited(hdev->dev_ctrl,
307 			"Can't open %s because it is disabled\n",
308 			dev_name(hdev->dev_ctrl));
309 		rc = -EPERM;
310 		goto out_err;
311 	}
312 
313 	list_add(&hpriv->dev_node, &hdev->fpriv_ctrl_list);
314 	mutex_unlock(&hdev->fpriv_ctrl_list_lock);
315 
316 	return 0;
317 
318 out_err:
319 	mutex_unlock(&hdev->fpriv_ctrl_list_lock);
320 	filp->private_data = NULL;
321 	put_pid(hpriv->taskpid);
322 
323 	kfree(hpriv);
324 
325 	return rc;
326 }
327 
328 static void set_driver_behavior_per_device(struct hl_device *hdev)
329 {
330 	hdev->nic_ports_mask = 0;
331 	hdev->fw_components = FW_TYPE_ALL_TYPES;
332 	hdev->cpu_queues_enable = 1;
333 	hdev->pldm = 0;
334 	hdev->hard_reset_on_fw_events = 1;
335 	hdev->bmc_enable = 1;
336 	hdev->reset_on_preboot_fail = 1;
337 	hdev->heartbeat = 1;
338 }
339 
340 static void copy_kernel_module_params_to_device(struct hl_device *hdev)
341 {
342 	hdev->asic_prop.fw_security_enabled = is_asic_secured(hdev->asic_type);
343 
344 	hdev->major = hl_major;
345 	hdev->memory_scrub = memory_scrub;
346 	hdev->reset_on_lockup = reset_on_lockup;
347 	hdev->boot_error_status_mask = boot_error_status_mask;
348 }
349 
350 static void fixup_device_params_per_asic(struct hl_device *hdev, int timeout)
351 {
352 	switch (hdev->asic_type) {
353 	case ASIC_GAUDI:
354 	case ASIC_GAUDI_SEC:
355 		/* If user didn't request a different timeout than the default one, we have
356 		 * a different default timeout for Gaudi
357 		 */
358 		if (timeout == HL_DEFAULT_TIMEOUT_LOCKED)
359 			hdev->timeout_jiffies = msecs_to_jiffies(GAUDI_DEFAULT_TIMEOUT_LOCKED *
360 										MSEC_PER_SEC);
361 
362 		hdev->reset_upon_device_release = 0;
363 		break;
364 
365 	case ASIC_GOYA:
366 		hdev->reset_upon_device_release = 0;
367 		break;
368 
369 	default:
370 		hdev->reset_upon_device_release = 1;
371 		break;
372 	}
373 }
374 
375 static int fixup_device_params(struct hl_device *hdev)
376 {
377 	int tmp_timeout;
378 
379 	tmp_timeout = timeout_locked;
380 
381 	hdev->fw_poll_interval_usec = HL_FW_STATUS_POLL_INTERVAL_USEC;
382 	hdev->fw_comms_poll_interval_usec = HL_FW_STATUS_POLL_INTERVAL_USEC;
383 
384 	if (tmp_timeout)
385 		hdev->timeout_jiffies = msecs_to_jiffies(tmp_timeout * MSEC_PER_SEC);
386 	else
387 		hdev->timeout_jiffies = MAX_SCHEDULE_TIMEOUT;
388 
389 	hdev->stop_on_err = true;
390 	hdev->reset_info.curr_reset_cause = HL_RESET_CAUSE_UNKNOWN;
391 	hdev->reset_info.prev_reset_trigger = HL_RESET_TRIGGER_DEFAULT;
392 
393 	/* Enable only after the initialization of the device */
394 	hdev->disabled = true;
395 
396 	if (!(hdev->fw_components & FW_TYPE_PREBOOT_CPU) &&
397 			(hdev->fw_components & ~FW_TYPE_PREBOOT_CPU)) {
398 		pr_err("Preboot must be set along with other components");
399 		return -EINVAL;
400 	}
401 
402 	/* If CPU queues not enabled, no way to do heartbeat */
403 	if (!hdev->cpu_queues_enable)
404 		hdev->heartbeat = 0;
405 	fixup_device_params_per_asic(hdev, tmp_timeout);
406 
407 	return 0;
408 }
409 
410 static int allocate_device_id(struct hl_device *hdev)
411 {
412 	int id;
413 
414 	mutex_lock(&hl_devs_idr_lock);
415 	id = idr_alloc(&hl_devs_idr, hdev, 0, HL_MAX_MINORS, GFP_KERNEL);
416 	mutex_unlock(&hl_devs_idr_lock);
417 
418 	if (id < 0) {
419 		if (id == -ENOSPC)
420 			pr_err("too many devices in the system\n");
421 		return -EBUSY;
422 	}
423 
424 	hdev->id = id;
425 
426 	/*
427 	 * Firstly initialized with the internal device ID.
428 	 * Will be updated later after the DRM device registration to hold the minor ID.
429 	 */
430 	hdev->cdev_idx = hdev->id;
431 
432 	return 0;
433 }
434 
435 /**
436  * create_hdev - create habanalabs device instance
437  *
438  * @dev: will hold the pointer to the new habanalabs device structure
439  * @pdev: pointer to the pci device
440  *
441  * Allocate memory for habanalabs device and initialize basic fields
442  * Identify the ASIC type
443  * Allocate ID (minor) for the device (only for real devices)
444  */
445 static int create_hdev(struct hl_device **dev, struct pci_dev *pdev)
446 {
447 	struct hl_device *hdev;
448 	int rc;
449 
450 	*dev = NULL;
451 
452 	hdev = devm_drm_dev_alloc(&pdev->dev, &hl_driver, struct hl_device, drm);
453 	if (IS_ERR(hdev))
454 		return PTR_ERR(hdev);
455 
456 	hdev->dev = hdev->drm.dev;
457 
458 	/* Will be NULL in case of simulator device */
459 	hdev->pdev = pdev;
460 
461 	/* Assign status description string */
462 	strscpy(hdev->status[HL_DEVICE_STATUS_OPERATIONAL], "operational", HL_STR_MAX);
463 	strscpy(hdev->status[HL_DEVICE_STATUS_IN_RESET], "in reset", HL_STR_MAX);
464 	strscpy(hdev->status[HL_DEVICE_STATUS_MALFUNCTION], "disabled", HL_STR_MAX);
465 	strscpy(hdev->status[HL_DEVICE_STATUS_NEEDS_RESET], "needs reset", HL_STR_MAX);
466 	strscpy(hdev->status[HL_DEVICE_STATUS_IN_DEVICE_CREATION],
467 				"in device creation", HL_STR_MAX);
468 	strscpy(hdev->status[HL_DEVICE_STATUS_IN_RESET_AFTER_DEVICE_RELEASE],
469 				"in reset after device release", HL_STR_MAX);
470 
471 
472 	/* First, we must find out which ASIC are we handling. This is needed
473 	 * to configure the behavior of the driver (kernel parameters)
474 	 */
475 	hdev->asic_type = get_asic_type(hdev);
476 	if (hdev->asic_type == ASIC_INVALID) {
477 		dev_err(&pdev->dev, "Unsupported ASIC\n");
478 		rc = -ENODEV;
479 		goto out_err;
480 	}
481 
482 	copy_kernel_module_params_to_device(hdev);
483 
484 	set_driver_behavior_per_device(hdev);
485 
486 	fixup_device_params(hdev);
487 
488 	rc = allocate_device_id(hdev);
489 	if (rc)
490 		goto out_err;
491 
492 	*dev = hdev;
493 
494 	return 0;
495 
496 out_err:
497 	return rc;
498 }
499 
500 /*
501  * destroy_hdev - destroy habanalabs device instance
502  *
503  * @dev: pointer to the habanalabs device structure
504  *
505  */
506 static void destroy_hdev(struct hl_device *hdev)
507 {
508 	/* Remove device from the device list */
509 	mutex_lock(&hl_devs_idr_lock);
510 	idr_remove(&hl_devs_idr, hdev->id);
511 	mutex_unlock(&hl_devs_idr_lock);
512 
513 }
514 
/*
 * hl_pmops_suspend - power management suspend callback
 *
 * @dev: generic device whose driver data is the habanalabs device structure
 *
 * Return: the result of hl_device_suspend(), or 0 if no habanalabs device is
 * attached to @dev.
 */
static int hl_pmops_suspend(struct device *dev)
{
	struct hl_device *hdev = dev_get_drvdata(dev);

	pr_debug("Going to suspend PCI device\n");

	if (hdev)
		return hl_device_suspend(hdev);

	/* Nothing to suspend; report it but don't fail the transition */
	pr_err("device pointer is NULL in suspend\n");

	return 0;
}
528 
/*
 * hl_pmops_resume - power management resume callback
 *
 * @dev: generic device whose driver data is the habanalabs device structure
 *
 * Return: the result of hl_device_resume(), or 0 if no habanalabs device is
 * attached to @dev.
 */
static int hl_pmops_resume(struct device *dev)
{
	struct hl_device *hdev = dev_get_drvdata(dev);

	pr_debug("Going to resume PCI device\n");

	if (hdev)
		return hl_device_resume(hdev);

	/* Nothing to resume; report it but don't fail the transition */
	pr_err("device pointer is NULL in resume\n");

	return 0;
}
542 
543 /**
544  * hl_pci_probe - probe PCI habanalabs devices
545  *
546  * @pdev: pointer to pci device
547  * @id: pointer to pci device id structure
548  *
549  * Standard PCI probe function for habanalabs device.
550  * Create a new habanalabs device and initialize it according to the
551  * device's type
552  */
553 static int hl_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id)
554 {
555 	struct hl_device *hdev;
556 	int rc;
557 
558 	dev_info(&pdev->dev, HL_NAME
559 		 " device found [%04x:%04x] (rev %x)\n",
560 		 (int)pdev->vendor, (int)pdev->device, (int)pdev->revision);
561 
562 	rc = create_hdev(&hdev, pdev);
563 	if (rc)
564 		return rc;
565 
566 	pci_set_drvdata(pdev, hdev);
567 
568 	rc = hl_device_init(hdev);
569 	if (rc) {
570 		dev_err(&pdev->dev, "Fatal error during habanalabs device init\n");
571 		rc = -ENODEV;
572 		goto disable_device;
573 	}
574 
575 	return 0;
576 
577 disable_device:
578 	pci_set_drvdata(pdev, NULL);
579 	destroy_hdev(hdev);
580 
581 	return rc;
582 }
583 
584 /*
585  * hl_pci_remove - remove PCI habanalabs devices
586  *
587  * @pdev: pointer to pci device
588  *
589  * Standard PCI remove function for habanalabs device
590  */
591 static void hl_pci_remove(struct pci_dev *pdev)
592 {
593 	struct hl_device *hdev;
594 
595 	hdev = pci_get_drvdata(pdev);
596 	if (!hdev)
597 		return;
598 
599 	hl_device_fini(hdev);
600 	pci_set_drvdata(pdev, NULL);
601 	destroy_hdev(hdev);
602 }
603 
604 /**
605  * hl_pci_err_detected - a PCI bus error detected on this device
606  *
607  * @pdev: pointer to pci device
608  * @state: PCI error type
609  *
610  * Called by the PCI subsystem whenever a non-correctable
611  * PCI bus error is detected
612  */
613 static pci_ers_result_t
614 hl_pci_err_detected(struct pci_dev *pdev, pci_channel_state_t state)
615 {
616 	struct hl_device *hdev = pci_get_drvdata(pdev);
617 	enum pci_ers_result result;
618 
619 	switch (state) {
620 	case pci_channel_io_normal:
621 		dev_warn(hdev->dev, "PCI normal state error detected\n");
622 		return PCI_ERS_RESULT_CAN_RECOVER;
623 
624 	case pci_channel_io_frozen:
625 		dev_warn(hdev->dev, "PCI frozen state error detected\n");
626 		result = PCI_ERS_RESULT_NEED_RESET;
627 		break;
628 
629 	case pci_channel_io_perm_failure:
630 		dev_warn(hdev->dev, "PCI failure state error detected\n");
631 		result = PCI_ERS_RESULT_DISCONNECT;
632 		break;
633 
634 	default:
635 		result = PCI_ERS_RESULT_NONE;
636 	}
637 
638 	hdev->asic_funcs->halt_engines(hdev, true, false);
639 
640 	return result;
641 }
642 
643 /**
644  * hl_pci_err_resume - resume after a PCI slot reset
645  *
646  * @pdev: pointer to pci device
647  *
648  */
649 static void hl_pci_err_resume(struct pci_dev *pdev)
650 {
651 	struct hl_device *hdev = pci_get_drvdata(pdev);
652 
653 	dev_warn(hdev->dev, "Resuming device after PCI slot reset\n");
654 	hl_device_resume(hdev);
655 }
656 
657 /**
658  * hl_pci_err_slot_reset - a PCI slot reset has just happened
659  *
660  * @pdev: pointer to pci device
661  *
662  * Determine if the driver can recover from the PCI slot reset
663  */
664 static pci_ers_result_t hl_pci_err_slot_reset(struct pci_dev *pdev)
665 {
666 	struct hl_device *hdev = pci_get_drvdata(pdev);
667 
668 	dev_warn(hdev->dev, "PCI slot reset detected\n");
669 
670 	return PCI_ERS_RESULT_RECOVERED;
671 }
672 
673 static const struct dev_pm_ops hl_pm_ops = {
674 	.suspend = hl_pmops_suspend,
675 	.resume = hl_pmops_resume,
676 };
677 
678 static const struct pci_error_handlers hl_pci_err_handler = {
679 	.error_detected = hl_pci_err_detected,
680 	.slot_reset = hl_pci_err_slot_reset,
681 	.resume = hl_pci_err_resume,
682 };
683 
684 static struct pci_driver hl_pci_driver = {
685 	.name = HL_NAME,
686 	.id_table = ids,
687 	.probe = hl_pci_probe,
688 	.remove = hl_pci_remove,
689 	.shutdown = hl_pci_remove,
690 	.driver = {
691 		.name = HL_NAME,
692 		.pm = &hl_pm_ops,
693 		.probe_type = PROBE_PREFER_ASYNCHRONOUS,
694 	},
695 	.err_handler = &hl_pci_err_handler,
696 };
697 
698 /*
699  * hl_init - Initialize the habanalabs kernel driver
700  */
701 static int __init hl_init(void)
702 {
703 	int rc;
704 	dev_t dev;
705 
706 	pr_info("loading driver\n");
707 
708 	rc = alloc_chrdev_region(&dev, 0, HL_MAX_MINORS, HL_NAME);
709 	if (rc < 0) {
710 		pr_err("unable to get major\n");
711 		return rc;
712 	}
713 
714 	hl_major = MAJOR(dev);
715 
716 	rc = pci_register_driver(&hl_pci_driver);
717 	if (rc) {
718 		pr_err("failed to register pci device\n");
719 		goto remove_major;
720 	}
721 
722 	pr_debug("driver loaded\n");
723 
724 	return 0;
725 
726 remove_major:
727 	unregister_chrdev_region(MKDEV(hl_major, 0), HL_MAX_MINORS);
728 	return rc;
729 }
730 
731 /*
732  * hl_exit - Release all resources of the habanalabs kernel driver
733  */
734 static void __exit hl_exit(void)
735 {
736 	pci_unregister_driver(&hl_pci_driver);
737 
738 	unregister_chrdev_region(MKDEV(hl_major, 0), HL_MAX_MINORS);
739 
740 	idr_destroy(&hl_devs_idr);
741 
742 	pr_debug("driver removed\n");
743 }
744 
745 module_init(hl_init);
746 module_exit(hl_exit);
747