1 // SPDX-License-Identifier: GPL-2.0
2
3 /*
4 * Copyright 2016-2021 HabanaLabs, Ltd.
5 * All Rights Reserved.
6 *
7 */
8
9 #define pr_fmt(fmt) "habanalabs: " fmt
10
11 #include "habanalabs.h"
12 #include "../include/hw_ip/pci/pci_general.h"
13
14 #include <linux/pci.h>
15 #include <linux/module.h>
16 #include <linux/vmalloc.h>
17 #include <linux/version.h>
18
19 #include <drm/drm_accel.h>
20 #include <drm/drm_drv.h>
21 #include <drm/drm_ioctl.h>
22
23 #define CREATE_TRACE_POINTS
24 #include <trace/events/habanalabs.h>
25
26 #define HL_DRIVER_AUTHOR "HabanaLabs Kernel Driver Team"
27
28 #define HL_DRIVER_DESC "Driver for HabanaLabs's AI Accelerators"
29
30 MODULE_AUTHOR(HL_DRIVER_AUTHOR);
31 MODULE_DESCRIPTION(HL_DRIVER_DESC);
32 MODULE_LICENSE("GPL v2");
33
34 static int hl_major;
35 static DEFINE_IDR(hl_devs_idr);
36 static DEFINE_MUTEX(hl_devs_idr_lock);
37
38 #define HL_DEFAULT_TIMEOUT_LOCKED 30 /* 30 seconds */
39 #define GAUDI_DEFAULT_TIMEOUT_LOCKED 600 /* 10 minutes */
40
41 static int timeout_locked = HL_DEFAULT_TIMEOUT_LOCKED;
42 static int reset_on_lockup = 1;
43 static int memory_scrub;
44 static ulong boot_error_status_mask = ULONG_MAX;
45
46 module_param(timeout_locked, int, 0444);
47 MODULE_PARM_DESC(timeout_locked,
48 "Device lockup timeout in seconds (0 = disabled, default 30s)");
49
50 module_param(reset_on_lockup, int, 0444);
51 MODULE_PARM_DESC(reset_on_lockup,
52 "Do device reset on lockup (0 = no, 1 = yes, default yes)");
53
54 module_param(memory_scrub, int, 0444);
55 MODULE_PARM_DESC(memory_scrub,
56 "Scrub device memory in various states (0 = no, 1 = yes, default no)");
57
58 module_param(boot_error_status_mask, ulong, 0444);
59 MODULE_PARM_DESC(boot_error_status_mask,
60 "Mask of the error status during device CPU boot (If bitX is cleared then error X is masked. Default all 1's)");
61
62 #define PCI_IDS_GOYA 0x0001
63 #define PCI_IDS_GAUDI 0x1000
64 #define PCI_IDS_GAUDI_SEC 0x1010
65
66 #define PCI_IDS_GAUDI2 0x1020
67
68 static const struct pci_device_id ids[] = {
69 { PCI_DEVICE(PCI_VENDOR_ID_HABANALABS, PCI_IDS_GOYA), },
70 { PCI_DEVICE(PCI_VENDOR_ID_HABANALABS, PCI_IDS_GAUDI), },
71 { PCI_DEVICE(PCI_VENDOR_ID_HABANALABS, PCI_IDS_GAUDI_SEC), },
72 { PCI_DEVICE(PCI_VENDOR_ID_HABANALABS, PCI_IDS_GAUDI2), },
73 { 0, }
74 };
75 MODULE_DEVICE_TABLE(pci, ids);
76
77 static const struct drm_ioctl_desc hl_drm_ioctls[] = {
78 DRM_IOCTL_DEF_DRV(HL_INFO, hl_info_ioctl, 0),
79 DRM_IOCTL_DEF_DRV(HL_CB, hl_cb_ioctl, 0),
80 DRM_IOCTL_DEF_DRV(HL_CS, hl_cs_ioctl, 0),
81 DRM_IOCTL_DEF_DRV(HL_WAIT_CS, hl_wait_ioctl, 0),
82 DRM_IOCTL_DEF_DRV(HL_MEMORY, hl_mem_ioctl, 0),
83 DRM_IOCTL_DEF_DRV(HL_DEBUG, hl_debug_ioctl, 0),
84 };
85
86 static const struct file_operations hl_fops = {
87 .owner = THIS_MODULE,
88 .open = accel_open,
89 .release = drm_release,
90 .unlocked_ioctl = drm_ioctl,
91 .compat_ioctl = drm_compat_ioctl,
92 .llseek = noop_llseek,
93 .mmap = hl_mmap
94 };
95
96 static const struct drm_driver hl_driver = {
97 .driver_features = DRIVER_COMPUTE_ACCEL,
98
99 .name = HL_NAME,
100 .desc = HL_DRIVER_DESC,
101 .major = LINUX_VERSION_MAJOR,
102 .minor = LINUX_VERSION_PATCHLEVEL,
103 .patchlevel = LINUX_VERSION_SUBLEVEL,
104
105 .fops = &hl_fops,
106 .open = hl_device_open,
107 .postclose = hl_device_release,
108 .ioctls = hl_drm_ioctls,
109 .num_ioctls = ARRAY_SIZE(hl_drm_ioctls)
110 };
111
112 /*
113 * get_asic_type - translate device id to asic type
114 *
115 * @hdev: pointer to habanalabs device structure.
116 *
117 * Translate device id and revision id to asic type.
118 * In case of unidentified device, return -1
119 */
get_asic_type(struct hl_device * hdev)120 static enum hl_asic_type get_asic_type(struct hl_device *hdev)
121 {
122 struct pci_dev *pdev = hdev->pdev;
123 enum hl_asic_type asic_type = ASIC_INVALID;
124
125 switch (pdev->device) {
126 case PCI_IDS_GOYA:
127 asic_type = ASIC_GOYA;
128 break;
129 case PCI_IDS_GAUDI:
130 asic_type = ASIC_GAUDI;
131 break;
132 case PCI_IDS_GAUDI_SEC:
133 asic_type = ASIC_GAUDI_SEC;
134 break;
135 case PCI_IDS_GAUDI2:
136 switch (pdev->revision) {
137 case REV_ID_A:
138 asic_type = ASIC_GAUDI2;
139 break;
140 case REV_ID_B:
141 asic_type = ASIC_GAUDI2B;
142 break;
143 case REV_ID_C:
144 asic_type = ASIC_GAUDI2C;
145 break;
146 case REV_ID_D:
147 asic_type = ASIC_GAUDI2D;
148 break;
149 default:
150 break;
151 }
152 break;
153 default:
154 break;
155 }
156
157 return asic_type;
158 }
159
is_asic_secured(enum hl_asic_type asic_type)160 static bool is_asic_secured(enum hl_asic_type asic_type)
161 {
162 switch (asic_type) {
163 case ASIC_GAUDI_SEC:
164 return true;
165 default:
166 return false;
167 }
168 }
169
170 /*
171 * hl_device_open() - open function for habanalabs device.
172 * @ddev: pointer to DRM device structure.
173 * @file: pointer to DRM file private data structure.
174 *
175 * Called when process opens an habanalabs device.
176 */
hl_device_open(struct drm_device * ddev,struct drm_file * file_priv)177 int hl_device_open(struct drm_device *ddev, struct drm_file *file_priv)
178 {
179 struct hl_device *hdev = to_hl_device(ddev);
180 enum hl_device_status status;
181 struct hl_fpriv *hpriv;
182 int rc;
183
184 hpriv = kzalloc(sizeof(*hpriv), GFP_KERNEL);
185 if (!hpriv)
186 return -ENOMEM;
187
188 hpriv->hdev = hdev;
189 mutex_init(&hpriv->notifier_event.lock);
190 mutex_init(&hpriv->restore_phase_mutex);
191 mutex_init(&hpriv->ctx_lock);
192 kref_init(&hpriv->refcount);
193
194 hl_ctx_mgr_init(&hpriv->ctx_mgr);
195 hl_mem_mgr_init(hpriv->hdev->dev, &hpriv->mem_mgr);
196
197 hpriv->taskpid = get_task_pid(current, PIDTYPE_PID);
198
199 mutex_lock(&hdev->fpriv_list_lock);
200
201 if (!hl_device_operational(hdev, &status)) {
202 dev_dbg_ratelimited(hdev->dev,
203 "Can't open %s because it is %s\n",
204 dev_name(hdev->dev), hdev->status[status]);
205
206 if (status == HL_DEVICE_STATUS_IN_RESET ||
207 status == HL_DEVICE_STATUS_IN_RESET_AFTER_DEVICE_RELEASE)
208 rc = -EAGAIN;
209 else
210 rc = -EPERM;
211
212 goto out_err;
213 }
214
215 if (hdev->is_in_dram_scrub) {
216 dev_dbg_ratelimited(hdev->dev,
217 "Can't open %s during dram scrub\n",
218 dev_name(hdev->dev));
219 rc = -EAGAIN;
220 goto out_err;
221 }
222
223 if (hdev->compute_ctx_in_release) {
224 dev_dbg_ratelimited(hdev->dev,
225 "Can't open %s because another user is still releasing it\n",
226 dev_name(hdev->dev));
227 rc = -EAGAIN;
228 goto out_err;
229 }
230
231 if (hdev->is_compute_ctx_active) {
232 dev_dbg_ratelimited(hdev->dev,
233 "Can't open %s because another user is working on it\n",
234 dev_name(hdev->dev));
235 rc = -EBUSY;
236 goto out_err;
237 }
238
239 rc = hl_ctx_create(hdev, hpriv);
240 if (rc) {
241 dev_err(hdev->dev, "Failed to create context %d\n", rc);
242 goto out_err;
243 }
244
245 list_add(&hpriv->dev_node, &hdev->fpriv_list);
246 mutex_unlock(&hdev->fpriv_list_lock);
247
248 hdev->asic_funcs->send_device_activity(hdev, true);
249
250 hl_debugfs_add_file(hpriv);
251
252 hl_enable_err_info_capture(&hdev->captured_err_info);
253
254 hdev->open_counter++;
255 hdev->last_successful_open_jif = jiffies;
256 hdev->last_successful_open_ktime = ktime_get();
257
258 file_priv->driver_priv = hpriv;
259 hpriv->file_priv = file_priv;
260
261 return 0;
262
263 out_err:
264 mutex_unlock(&hdev->fpriv_list_lock);
265 hl_mem_mgr_fini(&hpriv->mem_mgr, NULL);
266 hl_mem_mgr_idr_destroy(&hpriv->mem_mgr);
267 hl_ctx_mgr_fini(hpriv->hdev, &hpriv->ctx_mgr);
268 mutex_destroy(&hpriv->ctx_lock);
269 mutex_destroy(&hpriv->restore_phase_mutex);
270 mutex_destroy(&hpriv->notifier_event.lock);
271 put_pid(hpriv->taskpid);
272
273 kfree(hpriv);
274
275 return rc;
276 }
277
hl_device_open_ctrl(struct inode * inode,struct file * filp)278 int hl_device_open_ctrl(struct inode *inode, struct file *filp)
279 {
280 struct hl_device *hdev;
281 struct hl_fpriv *hpriv;
282 int rc;
283
284 mutex_lock(&hl_devs_idr_lock);
285 hdev = idr_find(&hl_devs_idr, iminor(inode));
286 mutex_unlock(&hl_devs_idr_lock);
287
288 if (!hdev) {
289 pr_err("Couldn't find device %d:%d\n",
290 imajor(inode), iminor(inode));
291 return -ENXIO;
292 }
293
294 hpriv = kzalloc(sizeof(*hpriv), GFP_KERNEL);
295 if (!hpriv)
296 return -ENOMEM;
297
298 /* Prevent other routines from reading partial hpriv data by
299 * initializing hpriv fields before inserting it to the list
300 */
301 hpriv->hdev = hdev;
302 filp->private_data = hpriv;
303
304 nonseekable_open(inode, filp);
305
306 hpriv->taskpid = get_task_pid(current, PIDTYPE_PID);
307
308 mutex_lock(&hdev->fpriv_ctrl_list_lock);
309
310 if (!hl_ctrl_device_operational(hdev, NULL)) {
311 dev_dbg_ratelimited(hdev->dev_ctrl,
312 "Can't open %s because it is disabled\n",
313 dev_name(hdev->dev_ctrl));
314 rc = -EPERM;
315 goto out_err;
316 }
317
318 list_add(&hpriv->dev_node, &hdev->fpriv_ctrl_list);
319 mutex_unlock(&hdev->fpriv_ctrl_list_lock);
320
321 return 0;
322
323 out_err:
324 mutex_unlock(&hdev->fpriv_ctrl_list_lock);
325 filp->private_data = NULL;
326 put_pid(hpriv->taskpid);
327
328 kfree(hpriv);
329
330 return rc;
331 }
332
set_driver_behavior_per_device(struct hl_device * hdev)333 static void set_driver_behavior_per_device(struct hl_device *hdev)
334 {
335 hdev->nic_ports_mask = 0;
336 hdev->fw_components = FW_TYPE_ALL_TYPES;
337 hdev->cpu_queues_enable = 1;
338 hdev->pldm = 0;
339 hdev->hard_reset_on_fw_events = 1;
340 hdev->bmc_enable = 1;
341 hdev->reset_on_preboot_fail = 1;
342 hdev->heartbeat = 1;
343 }
344
copy_kernel_module_params_to_device(struct hl_device * hdev)345 static void copy_kernel_module_params_to_device(struct hl_device *hdev)
346 {
347 hdev->asic_prop.fw_security_enabled = is_asic_secured(hdev->asic_type);
348
349 hdev->major = hl_major;
350 hdev->memory_scrub = memory_scrub;
351 hdev->reset_on_lockup = reset_on_lockup;
352 hdev->boot_error_status_mask = boot_error_status_mask;
353 }
354
fixup_device_params_per_asic(struct hl_device * hdev,int timeout)355 static void fixup_device_params_per_asic(struct hl_device *hdev, int timeout)
356 {
357 switch (hdev->asic_type) {
358 case ASIC_GAUDI:
359 case ASIC_GAUDI_SEC:
360 /* If user didn't request a different timeout than the default one, we have
361 * a different default timeout for Gaudi
362 */
363 if (timeout == HL_DEFAULT_TIMEOUT_LOCKED)
364 hdev->timeout_jiffies = secs_to_jiffies(GAUDI_DEFAULT_TIMEOUT_LOCKED);
365
366 hdev->reset_upon_device_release = 0;
367 break;
368
369 case ASIC_GOYA:
370 hdev->reset_upon_device_release = 0;
371 break;
372
373 default:
374 hdev->reset_upon_device_release = 1;
375 break;
376 }
377 }
378
fixup_device_params(struct hl_device * hdev)379 static int fixup_device_params(struct hl_device *hdev)
380 {
381 int tmp_timeout;
382
383 tmp_timeout = timeout_locked;
384
385 hdev->fw_poll_interval_usec = HL_FW_STATUS_POLL_INTERVAL_USEC;
386 hdev->fw_comms_poll_interval_usec = HL_FW_STATUS_POLL_INTERVAL_USEC;
387
388 if (tmp_timeout)
389 hdev->timeout_jiffies = msecs_to_jiffies(tmp_timeout * MSEC_PER_SEC);
390 else
391 hdev->timeout_jiffies = MAX_SCHEDULE_TIMEOUT;
392
393 hdev->stop_on_err = true;
394 hdev->reset_info.curr_reset_cause = HL_RESET_CAUSE_UNKNOWN;
395 hdev->reset_info.prev_reset_trigger = HL_RESET_TRIGGER_DEFAULT;
396
397 /* Enable only after the initialization of the device */
398 hdev->disabled = true;
399
400 if (!(hdev->fw_components & FW_TYPE_PREBOOT_CPU) &&
401 (hdev->fw_components & ~FW_TYPE_PREBOOT_CPU)) {
402 pr_err("Preboot must be set along with other components");
403 return -EINVAL;
404 }
405
406 /* If CPU queues not enabled, no way to do heartbeat */
407 if (!hdev->cpu_queues_enable)
408 hdev->heartbeat = 0;
409 fixup_device_params_per_asic(hdev, tmp_timeout);
410
411 return 0;
412 }
413
allocate_device_id(struct hl_device * hdev)414 static int allocate_device_id(struct hl_device *hdev)
415 {
416 int id;
417
418 mutex_lock(&hl_devs_idr_lock);
419 id = idr_alloc(&hl_devs_idr, hdev, 0, HL_MAX_MINORS, GFP_KERNEL);
420 mutex_unlock(&hl_devs_idr_lock);
421
422 if (id < 0) {
423 if (id == -ENOSPC)
424 pr_err("too many devices in the system\n");
425 return -EBUSY;
426 }
427
428 hdev->id = id;
429
430 /*
431 * Firstly initialized with the internal device ID.
432 * Will be updated later after the DRM device registration to hold the minor ID.
433 */
434 hdev->cdev_idx = hdev->id;
435
436 return 0;
437 }
438
439 /**
440 * create_hdev - create habanalabs device instance
441 *
442 * @dev: will hold the pointer to the new habanalabs device structure
443 * @pdev: pointer to the pci device
444 *
445 * Allocate memory for habanalabs device and initialize basic fields
446 * Identify the ASIC type
447 * Allocate ID (minor) for the device (only for real devices)
448 */
create_hdev(struct hl_device ** dev,struct pci_dev * pdev)449 static int create_hdev(struct hl_device **dev, struct pci_dev *pdev)
450 {
451 struct hl_device *hdev;
452 int rc;
453
454 *dev = NULL;
455
456 hdev = devm_drm_dev_alloc(&pdev->dev, &hl_driver, struct hl_device, drm);
457 if (IS_ERR(hdev))
458 return PTR_ERR(hdev);
459
460 hdev->dev = hdev->drm.dev;
461
462 /* Will be NULL in case of simulator device */
463 hdev->pdev = pdev;
464
465 /* Assign status description string */
466 strscpy(hdev->status[HL_DEVICE_STATUS_OPERATIONAL], "operational", HL_STR_MAX);
467 strscpy(hdev->status[HL_DEVICE_STATUS_IN_RESET], "in reset", HL_STR_MAX);
468 strscpy(hdev->status[HL_DEVICE_STATUS_MALFUNCTION], "disabled", HL_STR_MAX);
469 strscpy(hdev->status[HL_DEVICE_STATUS_NEEDS_RESET], "needs reset", HL_STR_MAX);
470 strscpy(hdev->status[HL_DEVICE_STATUS_IN_DEVICE_CREATION],
471 "in device creation", HL_STR_MAX);
472 strscpy(hdev->status[HL_DEVICE_STATUS_IN_RESET_AFTER_DEVICE_RELEASE],
473 "in reset after device release", HL_STR_MAX);
474
475
476 /* First, we must find out which ASIC are we handling. This is needed
477 * to configure the behavior of the driver (kernel parameters)
478 */
479 hdev->asic_type = get_asic_type(hdev);
480 if (hdev->asic_type == ASIC_INVALID) {
481 dev_err(&pdev->dev, "Unsupported ASIC\n");
482 rc = -ENODEV;
483 goto out_err;
484 }
485
486 copy_kernel_module_params_to_device(hdev);
487
488 set_driver_behavior_per_device(hdev);
489
490 fixup_device_params(hdev);
491
492 rc = allocate_device_id(hdev);
493 if (rc)
494 goto out_err;
495
496 *dev = hdev;
497
498 return 0;
499
500 out_err:
501 return rc;
502 }
503
504 /*
505 * destroy_hdev - destroy habanalabs device instance
506 *
507 * @dev: pointer to the habanalabs device structure
508 *
509 */
destroy_hdev(struct hl_device * hdev)510 static void destroy_hdev(struct hl_device *hdev)
511 {
512 /* Remove device from the device list */
513 mutex_lock(&hl_devs_idr_lock);
514 idr_remove(&hl_devs_idr, hdev->id);
515 mutex_unlock(&hl_devs_idr_lock);
516
517 }
518
/*
 * hl_pmops_suspend - PM suspend callback
 *
 * @dev: the generic device being suspended
 *
 * Return: the result of hl_device_suspend(), or 0 if no habanalabs device is
 * attached to @dev.
 */
static int hl_pmops_suspend(struct device *dev)
{
	struct hl_device *hdev = dev_get_drvdata(dev);

	pr_debug("Going to suspend PCI device\n");

	if (hdev)
		return hl_device_suspend(hdev);

	/* Nothing to suspend; report it but don't fail the callback */
	pr_err("device pointer is NULL in suspend\n");

	return 0;
}
532
/*
 * hl_pmops_resume - PM resume callback
 *
 * @dev: the generic device being resumed
 *
 * Return: the result of hl_device_resume(), or 0 if no habanalabs device is
 * attached to @dev.
 */
static int hl_pmops_resume(struct device *dev)
{
	struct hl_device *hdev = dev_get_drvdata(dev);

	pr_debug("Going to resume PCI device\n");

	if (hdev)
		return hl_device_resume(hdev);

	/* Nothing to resume; report it but don't fail the callback */
	pr_err("device pointer is NULL in resume\n");

	return 0;
}
546
547 /**
548 * hl_pci_probe - probe PCI habanalabs devices
549 *
550 * @pdev: pointer to pci device
551 * @id: pointer to pci device id structure
552 *
553 * Standard PCI probe function for habanalabs device.
554 * Create a new habanalabs device and initialize it according to the
555 * device's type
556 */
hl_pci_probe(struct pci_dev * pdev,const struct pci_device_id * id)557 static int hl_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id)
558 {
559 struct hl_device *hdev;
560 int rc;
561
562 dev_info(&pdev->dev, HL_NAME
563 " device found [%04x:%04x] (rev %x)\n",
564 (int)pdev->vendor, (int)pdev->device, (int)pdev->revision);
565
566 rc = create_hdev(&hdev, pdev);
567 if (rc)
568 return rc;
569
570 pci_set_drvdata(pdev, hdev);
571
572 rc = hl_device_init(hdev);
573 if (rc) {
574 dev_err(&pdev->dev, "Fatal error during habanalabs device init\n");
575 rc = -ENODEV;
576 goto disable_device;
577 }
578
579 return 0;
580
581 disable_device:
582 pci_set_drvdata(pdev, NULL);
583 destroy_hdev(hdev);
584
585 return rc;
586 }
587
588 /*
589 * hl_pci_remove - remove PCI habanalabs devices
590 *
591 * @pdev: pointer to pci device
592 *
593 * Standard PCI remove function for habanalabs device
594 */
hl_pci_remove(struct pci_dev * pdev)595 static void hl_pci_remove(struct pci_dev *pdev)
596 {
597 struct hl_device *hdev;
598
599 hdev = pci_get_drvdata(pdev);
600 if (!hdev)
601 return;
602
603 hl_device_fini(hdev);
604 pci_set_drvdata(pdev, NULL);
605 destroy_hdev(hdev);
606 }
607
608 /**
609 * hl_pci_err_detected - a PCI bus error detected on this device
610 *
611 * @pdev: pointer to pci device
612 * @state: PCI error type
613 *
614 * Called by the PCI subsystem whenever a non-correctable
615 * PCI bus error is detected
616 */
617 static pci_ers_result_t
hl_pci_err_detected(struct pci_dev * pdev,pci_channel_state_t state)618 hl_pci_err_detected(struct pci_dev *pdev, pci_channel_state_t state)
619 {
620 struct hl_device *hdev = pci_get_drvdata(pdev);
621 enum pci_ers_result result;
622
623 switch (state) {
624 case pci_channel_io_normal:
625 dev_warn(hdev->dev, "PCI normal state error detected\n");
626 return PCI_ERS_RESULT_CAN_RECOVER;
627
628 case pci_channel_io_frozen:
629 dev_warn(hdev->dev, "PCI frozen state error detected\n");
630 result = PCI_ERS_RESULT_NEED_RESET;
631 break;
632
633 case pci_channel_io_perm_failure:
634 dev_warn(hdev->dev, "PCI failure state error detected\n");
635 result = PCI_ERS_RESULT_DISCONNECT;
636 break;
637
638 default:
639 result = PCI_ERS_RESULT_NONE;
640 }
641
642 hdev->asic_funcs->halt_engines(hdev, true, false);
643
644 return result;
645 }
646
647 /**
648 * hl_pci_err_resume - resume after a PCI slot reset
649 *
650 * @pdev: pointer to pci device
651 *
652 */
hl_pci_err_resume(struct pci_dev * pdev)653 static void hl_pci_err_resume(struct pci_dev *pdev)
654 {
655 struct hl_device *hdev = pci_get_drvdata(pdev);
656
657 dev_warn(hdev->dev, "Resuming device after PCI slot reset\n");
658 hl_device_resume(hdev);
659 }
660
661 /**
662 * hl_pci_err_slot_reset - a PCI slot reset has just happened
663 *
664 * @pdev: pointer to pci device
665 *
666 * Determine if the driver can recover from the PCI slot reset
667 */
hl_pci_err_slot_reset(struct pci_dev * pdev)668 static pci_ers_result_t hl_pci_err_slot_reset(struct pci_dev *pdev)
669 {
670 struct hl_device *hdev = pci_get_drvdata(pdev);
671
672 dev_warn(hdev->dev, "PCI slot reset detected\n");
673
674 return PCI_ERS_RESULT_RECOVERED;
675 }
676
hl_pci_reset_prepare(struct pci_dev * pdev)677 static void hl_pci_reset_prepare(struct pci_dev *pdev)
678 {
679 struct hl_device *hdev;
680
681 hdev = pci_get_drvdata(pdev);
682 if (!hdev)
683 return;
684
685 hdev->disabled = true;
686 }
687
hl_pci_reset_done(struct pci_dev * pdev)688 static void hl_pci_reset_done(struct pci_dev *pdev)
689 {
690 struct hl_device *hdev;
691 u32 flags;
692
693 hdev = pci_get_drvdata(pdev);
694 if (!hdev)
695 return;
696
697 /*
698 * Schedule a thread to trigger hard reset.
699 * The reason for this handler, is for rare cases where the driver is up
700 * and FLR occurs. This is valid only when working with no VM, so FW handles FLR
701 * and resets the device. FW will go back preboot stage, so driver needs to perform
702 * hard reset in order to load FW fit again.
703 */
704 flags = HL_DRV_RESET_HARD | HL_DRV_RESET_BYPASS_REQ_TO_FW;
705
706 hl_device_reset(hdev, flags);
707 }
708
709 static const struct dev_pm_ops hl_pm_ops = {
710 .suspend = hl_pmops_suspend,
711 .resume = hl_pmops_resume,
712 };
713
714 static const struct pci_error_handlers hl_pci_err_handler = {
715 .error_detected = hl_pci_err_detected,
716 .slot_reset = hl_pci_err_slot_reset,
717 .resume = hl_pci_err_resume,
718 .reset_prepare = hl_pci_reset_prepare,
719 .reset_done = hl_pci_reset_done,
720 };
721
722 static struct pci_driver hl_pci_driver = {
723 .name = HL_NAME,
724 .id_table = ids,
725 .probe = hl_pci_probe,
726 .remove = hl_pci_remove,
727 .shutdown = hl_pci_remove,
728 .driver = {
729 .name = HL_NAME,
730 .pm = &hl_pm_ops,
731 .probe_type = PROBE_PREFER_ASYNCHRONOUS,
732 },
733 .err_handler = &hl_pci_err_handler,
734 };
735
736 /*
737 * hl_init - Initialize the habanalabs kernel driver
738 */
hl_init(void)739 static int __init hl_init(void)
740 {
741 int rc;
742 dev_t dev;
743
744 pr_info("loading driver\n");
745
746 rc = alloc_chrdev_region(&dev, 0, HL_MAX_MINORS, HL_NAME);
747 if (rc < 0) {
748 pr_err("unable to get major\n");
749 return rc;
750 }
751
752 hl_major = MAJOR(dev);
753
754 rc = pci_register_driver(&hl_pci_driver);
755 if (rc) {
756 pr_err("failed to register pci device\n");
757 goto remove_major;
758 }
759
760 pr_debug("driver loaded\n");
761
762 return 0;
763
764 remove_major:
765 unregister_chrdev_region(MKDEV(hl_major, 0), HL_MAX_MINORS);
766 return rc;
767 }
768
769 /*
770 * hl_exit - Release all resources of the habanalabs kernel driver
771 */
hl_exit(void)772 static void __exit hl_exit(void)
773 {
774 pci_unregister_driver(&hl_pci_driver);
775
776 unregister_chrdev_region(MKDEV(hl_major, 0), HL_MAX_MINORS);
777
778 idr_destroy(&hl_devs_idr);
779
780 pr_debug("driver removed\n");
781 }
782
783 module_init(hl_init);
784 module_exit(hl_exit);
785