Lines matching full-text search terms: disable, mmu, reset
1 // SPDX-License-Identifier: GPL-2.0
4 * Copyright 2016-2022 HabanaLabs, Ltd.
36 * hl_set_dram_bar - sets the bar to allow later access to address
54 struct asic_fixed_properties *prop = &hdev->asic_prop;
57 if (is_power_of_2(prop->dram_pci_bar_size))
58 bar_base_addr = addr & ~(prop->dram_pci_bar_size - 0x1ull);
60 bar_base_addr = region->region_base +
61 div64_u64((addr - region->region_base), prop->dram_pci_bar_size) *
62 prop->dram_pci_bar_size;
64 old_base = hdev->asic_funcs->set_dram_bar_base(hdev, bar_base_addr);
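As an aside for readers tracing the math: below is a minimal user-space sketch (not driver code) of the two BAR-base strategies above. A power-of-2 BAR size allows a plain mask; otherwise the base snaps to a whole multiple of the BAR size counted from the region base. All values and names are illustrative.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

static bool is_power_of_2(uint64_t n) { return n && !(n & (n - 1)); }

static uint64_t bar_base(uint64_t addr, uint64_t region_base, uint64_t bar_size)
{
	if (is_power_of_2(bar_size))
		return addr & ~(bar_size - 1);	/* mask off the offset bits */
	/* snap to a whole multiple of bar_size counted from the region base */
	return region_base + ((addr - region_base) / bar_size) * bar_size;
}

int main(void)
{
	/* 256 MB BAR (power of 2): plain masking */
	printf("%#llx\n", (unsigned long long)bar_base(0x1234567890ULL, 0, 256ULL << 20));
	/* 0x30000000-byte BAR (not a power of 2): division path */
	printf("%#llx\n",
	       (unsigned long long)bar_base(0x1234567890ULL, 0x1000000000ULL, 0x30000000ULL));
	return 0;
}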
76 struct pci_mem_region *region = &hdev->pci_mem_region[region_type];
77 u64 old_base = 0, rc, bar_region_base = region->region_base;
83 return -EIO;
86 acc_addr = hdev->pcie_bar[region->bar_id] + region->offset_in_bar +
87 (addr - bar_region_base);
113 return -EIO;
127 ptr = hdev->asic_funcs->asic_dma_alloc_coherent(hdev, size, dma_handle, flag);
130 ptr = hdev->asic_funcs->asic_dma_pool_zalloc(hdev, size, flag, dma_handle);
135 trace_habanalabs_dma_alloc(&(hdev)->pdev->dev, (u64) (uintptr_t) ptr, *dma_handle,
150 hdev->asic_funcs->asic_dma_free_coherent(hdev, size, cpu_addr, dma_handle);
153 hdev->asic_funcs->asic_dma_pool_free(hdev, cpu_addr, dma_handle);
157 trace_habanalabs_dma_free(&(hdev)->pdev->dev, store_cpu_addr, dma_handle, size, caller);
186 return hdev->asic_funcs->cpu_accessible_dma_pool_alloc(hdev, size, dma_handle);
191 hdev->asic_funcs->cpu_accessible_dma_pool_free(hdev, size, vaddr);
197 struct asic_fixed_properties *prop = &hdev->asic_prop;
201 rc = hdev->asic_funcs->dma_map_sgtable(hdev, sgt, dir);
209 trace_habanalabs_dma_map_page(&(hdev)->pdev->dev,
211 sg->dma_address - prop->device_dma_offset_for_host_access,
213 sg->dma_length,
215 sg->length,
225 struct asic_fixed_properties *prop = &hdev->asic_prop;
229 rc = dma_map_sgtable(&hdev->pdev->dev, sgt, dir, 0);
234 if (prop->device_dma_offset_for_host_access)
236 sg->dma_address += prop->device_dma_offset_for_host_access;
244 struct asic_fixed_properties *prop = &hdev->asic_prop;
248 hdev->asic_funcs->dma_unmap_sgtable(hdev, sgt, dir);
252 trace_habanalabs_dma_unmap_page(&(hdev)->pdev->dev,
254 sg->dma_address - prop->device_dma_offset_for_host_access,
256 sg->dma_length,
258 sg->length,
267 struct asic_fixed_properties *prop = &hdev->asic_prop;
272 if (prop->device_dma_offset_for_host_access)
274 sg->dma_address -= prop->device_dma_offset_for_host_access;
276 dma_unmap_sgtable(&hdev->pdev->dev, sgt, dir, 0);
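A small sketch of the map/unmap symmetry visible above: mapping adds device_dma_offset_for_host_access to every segment's DMA address and unmapping subtracts it again, so device-visible addresses round-trip. The struct and helper names below are stand-ins, not driver types.

#include <stdint.h>
#include <stdio.h>

struct seg { uint64_t dma_address; };

static void apply_offset(struct seg *segs, int n, uint64_t off)
{
	for (int i = 0; i < n; i++)
		segs[i].dma_address += off;	/* done at map time */
}

static void remove_offset(struct seg *segs, int n, uint64_t off)
{
	for (int i = 0; i < n; i++)
		segs[i].dma_address -= off;	/* done at unmap time */
}

int main(void)
{
	struct seg s[2] = { { 0x1000 }, { 0x8000 } };
	uint64_t off = 0x8000000000ULL;	/* hypothetical host-access window base */

	apply_offset(s, 2, off);
	printf("device sees %#llx\n", (unsigned long long)s[0].dma_address);
	remove_offset(s, 2, off);
	printf("host sees   %#llx\n", (unsigned long long)s[0].dma_address);
	return 0;
}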
280 * hl_access_cfg_region - access the config region
290 struct pci_mem_region *cfg_region = &hdev->pci_mem_region[PCI_REGION_CFG];
294 dev_err(hdev->dev, "address %#llx not a multiple of %zu\n", addr, sizeof(u32));
295 return -EINVAL;
300 *val = RREG32(addr - cfg_region->region_base);
303 WREG32(addr - cfg_region->region_base, *val);
306 val_l = RREG32(addr - cfg_region->region_base);
307 val_h = RREG32(addr + sizeof(u32) - cfg_region->region_base);
312 WREG32(addr - cfg_region->region_base, lower_32_bits(*val));
313 WREG32(addr + sizeof(u32) - cfg_region->region_base, upper_32_bits(*val));
316 dev_err(hdev->dev, "access type %d is not supported\n", acc_type);
317 return -EOPNOTSUPP;
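For the 64-bit branches above, here is a minimal sketch of composing one 64-bit access from two 32-bit ones, with a plain array standing in for MMIO; lower_32_bits()/upper_32_bits() behavior is modeled with casts and shifts.

#include <stdint.h>
#include <stdio.h>

static uint32_t regs[4];	/* toy register file in place of MMIO */

static uint32_t rreg32(uint64_t off) { return regs[off / 4]; }
static void wreg32(uint64_t off, uint32_t v) { regs[off / 4] = v; }

static uint64_t rreg64(uint64_t off)
{
	uint32_t lo = rreg32(off);
	uint32_t hi = rreg32(off + sizeof(uint32_t));	/* high word at addr + 4 */
	return ((uint64_t)hi << 32) | lo;
}

static void wreg64(uint64_t off, uint64_t v)
{
	wreg32(off, (uint32_t)v);		  /* lower_32_bits() */
	wreg32(off + sizeof(uint32_t), v >> 32);  /* upper_32_bits() */
}

int main(void)
{
	wreg64(0, 0x1122334455667788ULL);
	printf("%#llx\n", (unsigned long long)rreg64(0));
	return 0;
}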
324 * hl_access_dev_mem - access device memory
343 return -EFAULT;
361 if ((e->actual_size + str_size) < e->allocated_buf_size) {
363 vsnprintf(e->buf + e->actual_size, str_size, fmt, args);
370 e->actual_size += str_size;
377 if (hdev->device_fini_pending) {
379 } else if (hdev->reset_info.in_reset) {
380 if (hdev->reset_info.in_compute_reset)
384 } else if (hdev->reset_info.needs_reset) {
386 } else if (hdev->disabled) {
388 } else if (!hdev->init_done) {
445 dev_err(hdev->dev, "%s %s (mask %#llx_%016llx_%016llx_%016llx)\n",
446 dev_name(&hdev->pdev->dev), message,
449 dev_err(hdev->dev, "%s %s (mask %#llx_%016llx_%016llx)\n",
450 dev_name(&hdev->pdev->dev), message,
453 dev_err(hdev->dev, "%s %s (mask %#llx_%016llx)\n",
454 dev_name(&hdev->pdev->dev), message, idle_mask[1], idle_mask[0]);
456 dev_err(hdev->dev, "%s %s (mask %#llx)\n", dev_name(&hdev->pdev->dev), message,
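The tiered dev_err calls above pick the widest format whose top mask word is non-zero, keeping short masks readable. A compact sketch of that selection, assuming word 0 is the least significant:

#include <stdint.h>
#include <stdio.h>

static void print_idle_mask(const uint64_t m[4])
{
	if (m[3])
		printf("mask %#llx_%016llx_%016llx_%016llx\n",
		       (unsigned long long)m[3], (unsigned long long)m[2],
		       (unsigned long long)m[1], (unsigned long long)m[0]);
	else if (m[2])
		printf("mask %#llx_%016llx_%016llx\n",
		       (unsigned long long)m[2], (unsigned long long)m[1],
		       (unsigned long long)m[0]);
	else if (m[1])
		printf("mask %#llx_%016llx\n",
		       (unsigned long long)m[1], (unsigned long long)m[0]);
	else
		printf("mask %#llx\n", (unsigned long long)m[0]);
}

int main(void)
{
	uint64_t m[4] = { 0x3, 0x1, 0, 0 };

	print_idle_mask(m);	/* prints the two-word form */
	return 0;
}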
469 hdev = hpriv->hdev;
471 hdev->asic_funcs->send_device_activity(hdev, false);
475 mutex_destroy(&hpriv->ctx_lock);
476 mutex_destroy(&hpriv->restore_phase_mutex);
479 hl_mem_mgr_idr_destroy(&hpriv->mem_mgr);
481 /* Device should be reset if reset-upon-device-release is enabled, or if there is a pending
482 * reset that waits for device release.
484 reset_device = hdev->reset_upon_device_release || hdev->reset_info.watchdog_active;
486 /* Check the device idle status and reset if not idle.
487 * Skip it if already in reset, or if device is going to be reset in any case.
489 if (!hdev->reset_info.in_reset && !reset_device && !hdev->pldm)
490 device_is_idle = hdev->asic_funcs->is_device_idle(hdev, idle_mask,
498 /* We need to remove the user from the list to make sure the reset process won't
505 * a user open the device while a reset is about to happen.
507 mutex_lock(&hdev->fpriv_list_lock);
508 list_del(&hpriv->dev_node);
509 mutex_unlock(&hdev->fpriv_list_lock);
511 put_pid(hpriv->taskpid);
517 int rc = hdev->asic_funcs->scrub_device_mem(hdev);
520 dev_err(hdev->dev, "failed to scrub memory from hpriv release (%d)\n", rc);
525 /* Now we can mark the compute_ctx as not active. Even if a reset is running in a different
529 mutex_lock(&hdev->fpriv_list_lock);
530 hdev->is_compute_ctx_active = false;
531 mutex_unlock(&hdev->fpriv_list_lock);
533 hdev->compute_ctx_in_release = 0;
536 if (hpriv->notifier_event.eventfd)
537 eventfd_ctx_put(hpriv->notifier_event.eventfd);
539 mutex_destroy(&hpriv->notifier_event.lock);
546 kref_get(&hpriv->refcount);
551 return kref_put(&hpriv->refcount, hpriv_release);
569 offset += scnprintf(buf + offset, size - offset, " [%u active CS]", active_cs_num);
572 dmabuf_export_cnt = atomic_read(&hdev->dmabuf_export_cnt);
575 offset += scnprintf(buf + offset, size - offset, " [%u exported dma-buf]",
579 if (mm_fini_stats->n_busy_cb) {
581 offset += scnprintf(buf + offset, size - offset, " [%u live CB handles]",
582 mm_fini_stats->n_busy_cb);
586 scnprintf(buf + offset, size - offset, " [unknown reason]");
588 dev_notice(hdev->dev, "%s%s\n", message, buf);
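The `buf + offset, size - offset` pattern above appends each in-use reason into one line. A user-space sketch with snprintf follows; note that the kernel's scnprintf returns only the characters actually stored (never more), which is what makes the idiom safe there.

#include <stdio.h>

int main(void)
{
	char buf[128] = "";
	size_t size = sizeof(buf), offset = 0;

	/* each append lands at the current end, with the remaining space */
	offset += snprintf(buf + offset, size - offset, " [%u active CS]", 2u);
	offset += snprintf(buf + offset, size - offset, " [%u exported dma-buf]", 1u);
	printf("device is still in use%s\n", buf);
	return 0;
}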
592 * hl_device_release() - release function for habanalabs device.
600 struct hl_fpriv *hpriv = file_priv->driver_priv;
606 put_pid(hpriv->taskpid);
609 hl_ctx_mgr_fini(hdev, &hpriv->ctx_mgr);
614 hl_mem_mgr_fini(&hpriv->mem_mgr, &mm_fini_stats);
616 hdev->compute_ctx_in_release = 1;
624 hdev->last_open_session_duration_jif = jiffies - hdev->last_successful_open_jif;
629 struct hl_fpriv *hpriv = filp->private_data;
630 struct hl_device *hdev = hpriv->hdev;
632 filp->private_data = NULL;
639 mutex_lock(&hdev->fpriv_ctrl_list_lock);
640 list_del(&hpriv->dev_node);
641 mutex_unlock(&hdev->fpriv_ctrl_list_lock);
643 put_pid(hpriv->taskpid);
652 struct hl_device *hdev = hpriv->hdev;
657 return -ENODEV;
660 vm_pgoff = vma->vm_pgoff;
664 vma->vm_pgoff = HL_MMAP_OFFSET_VALUE_GET(vm_pgoff);
669 return hl_mem_mgr_mmap(&hpriv->mem_mgr, vma, NULL);
671 return -EINVAL;
675 * hl_mmap - mmap function for habanalabs device
685 struct drm_file *file_priv = filp->private_data;
686 struct hl_fpriv *hpriv = file_priv->driver_priv;
705 * device_init_cdev - Initialize cdev and device for habanalabs device
723 cdev->owner = THIS_MODULE;
727 return -ENOMEM;
730 (*dev)->devt = MKDEV(hdev->major, minor);
731 (*dev)->class = class;
732 (*dev)->release = device_release_func;
741 const struct class *accel_class = hdev->drm.accel->kdev->class;
745 hdev->cdev_idx = hdev->drm.accel->index;
748 snprintf(name, sizeof(name), "accel_controlD%d", hdev->cdev_idx);
749 rc = device_init_cdev(hdev, accel_class, hdev->cdev_idx, &hl_ctrl_ops, name,
750 &hdev->cdev_ctrl, &hdev->dev_ctrl);
754 rc = cdev_device_add(&hdev->cdev_ctrl, hdev->dev_ctrl);
756 dev_err(hdev->dev_ctrl,
763 dev_err(hdev->dev, "failed to initialize sysfs\n");
769 hdev->cdev_sysfs_debugfs_created = true;
774 cdev_device_del(&hdev->cdev_ctrl, hdev->dev_ctrl);
776 put_device(hdev->dev_ctrl);
782 if (!hdev->cdev_sysfs_debugfs_created)
787 cdev_device_del(&hdev->cdev_ctrl, hdev->dev_ctrl);
788 put_device(hdev->dev_ctrl);
795 struct hl_device *hdev = device_reset_work->hdev;
799 flags = device_reset_work->flags | HL_DRV_RESET_FROM_RESET_THR;
803 if ((rc == -EBUSY) && !hdev->device_fini_pending) {
810 dev_info(hdev->dev,
811 "Could not reset device (compute_ctx refcount %u). will try again in %u seconds",
812 kref_read(&ctx->refcount) - 1, HL_PENDING_RESET_PER_SEC);
815 dev_info(hdev->dev, "Could not reset device. Will try again in %u seconds",
819 queue_delayed_work(hdev->reset_wq, &device_reset_work->reset_work,
828 struct hl_device *hdev = watchdog_work->hdev;
831 dev_dbg(hdev->dev, "Device wasn't released in time. Initiating hard-reset.\n");
833 flags = watchdog_work->flags | HL_DRV_RESET_HARD | HL_DRV_RESET_FROM_WD_THR;
839 * device_early_init - do some early initialization for the habanalabs device
851 switch (hdev->asic_type) {
854 strscpy(hdev->asic_name, "GOYA", sizeof(hdev->asic_name));
858 strscpy(hdev->asic_name, "GAUDI", sizeof(hdev->asic_name));
862 strscpy(hdev->asic_name, "GAUDI SEC", sizeof(hdev->asic_name));
866 strscpy(hdev->asic_name, "GAUDI2", sizeof(hdev->asic_name));
870 strscpy(hdev->asic_name, "GAUDI2B", sizeof(hdev->asic_name));
874 strscpy(hdev->asic_name, "GAUDI2C", sizeof(hdev->asic_name));
878 strscpy(hdev->asic_name, "GAUDI2D", sizeof(hdev->asic_name));
881 dev_err(hdev->dev, "Unrecognized ASIC type %d\n",
882 hdev->asic_type);
883 return -EINVAL;
886 rc = hdev->asic_funcs->early_init(hdev);
894 if (hdev->asic_prop.completion_queues_count) {
895 hdev->cq_wq = kcalloc(hdev->asic_prop.completion_queues_count,
898 if (!hdev->cq_wq) {
899 rc = -ENOMEM;
904 for (i = 0 ; i < hdev->asic_prop.completion_queues_count ; i++) {
905 snprintf(workq_name, 32, "hl%u-free-jobs-%u", hdev->cdev_idx, (u32) i);
906 hdev->cq_wq[i] = create_singlethread_workqueue(workq_name);
907 if (hdev->cq_wq[i] == NULL) {
908 dev_err(hdev->dev, "Failed to allocate CQ workqueue\n");
909 rc = -ENOMEM;
914 snprintf(workq_name, 32, "hl%u-events", hdev->cdev_idx);
915 hdev->eq_wq = create_singlethread_workqueue(workq_name);
916 if (hdev->eq_wq == NULL) {
917 dev_err(hdev->dev, "Failed to allocate EQ workqueue\n");
918 rc = -ENOMEM;
922 snprintf(workq_name, 32, "hl%u-cs-completions", hdev->cdev_idx);
923 hdev->cs_cmplt_wq = alloc_workqueue(workq_name, WQ_UNBOUND, 0);
924 if (!hdev->cs_cmplt_wq) {
925 dev_err(hdev->dev,
927 rc = -ENOMEM;
931 snprintf(workq_name, 32, "hl%u-ts-free-obj", hdev->cdev_idx);
932 hdev->ts_free_obj_wq = alloc_workqueue(workq_name, WQ_UNBOUND, 0);
933 if (!hdev->ts_free_obj_wq) {
934 dev_err(hdev->dev,
936 rc = -ENOMEM;
940 snprintf(workq_name, 32, "hl%u-prefetch", hdev->cdev_idx);
941 hdev->prefetch_wq = alloc_workqueue(workq_name, WQ_UNBOUND, 0);
942 if (!hdev->prefetch_wq) {
943 dev_err(hdev->dev, "Failed to allocate MMU prefetch workqueue\n");
944 rc = -ENOMEM;
948 hdev->hl_chip_info = kzalloc(sizeof(struct hwmon_chip_info), GFP_KERNEL);
949 if (!hdev->hl_chip_info) {
950 rc = -ENOMEM;
958 hl_mem_mgr_init(hdev->dev, &hdev->kernel_mem_mgr);
960 snprintf(workq_name, 32, "hl%u_device_reset", hdev->cdev_idx);
961 hdev->reset_wq = create_singlethread_workqueue(workq_name);
962 if (!hdev->reset_wq) {
963 rc = -ENOMEM;
964 dev_err(hdev->dev, "Failed to create device reset WQ\n");
968 INIT_DELAYED_WORK(&hdev->work_heartbeat, hl_device_heartbeat);
970 INIT_DELAYED_WORK(&hdev->device_reset_work.reset_work, device_hard_reset_pending);
971 hdev->device_reset_work.hdev = hdev;
972 hdev->device_fini_pending = 0;
974 INIT_DELAYED_WORK(&hdev->device_release_watchdog_work.reset_work,
976 hdev->device_release_watchdog_work.hdev = hdev;
978 mutex_init(&hdev->send_cpu_message_lock);
979 mutex_init(&hdev->debug_lock);
980 INIT_LIST_HEAD(&hdev->cs_mirror_list);
981 spin_lock_init(&hdev->cs_mirror_lock);
982 spin_lock_init(&hdev->reset_info.lock);
983 INIT_LIST_HEAD(&hdev->fpriv_list);
984 INIT_LIST_HEAD(&hdev->fpriv_ctrl_list);
985 mutex_init(&hdev->fpriv_list_lock);
986 mutex_init(&hdev->fpriv_ctrl_list_lock);
987 mutex_init(&hdev->clk_throttling.lock);
992 hl_mem_mgr_fini(&hdev->kernel_mem_mgr, NULL);
993 hl_mem_mgr_idr_destroy(&hdev->kernel_mem_mgr);
995 kfree(hdev->hl_chip_info);
997 destroy_workqueue(hdev->prefetch_wq);
999 destroy_workqueue(hdev->ts_free_obj_wq);
1001 destroy_workqueue(hdev->cs_cmplt_wq);
1003 destroy_workqueue(hdev->eq_wq);
1005 for (i = 0 ; i < hdev->asic_prop.completion_queues_count ; i++)
1006 if (hdev->cq_wq[i])
1007 destroy_workqueue(hdev->cq_wq[i]);
1008 kfree(hdev->cq_wq);
1012 if (hdev->asic_funcs->early_fini)
1013 hdev->asic_funcs->early_fini(hdev);
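The free_* labels above form the kernel's usual goto unwind ladder: a failure jumps to the label that releases everything acquired so far, in reverse order of acquisition. A minimal user-space sketch with malloc standing in for workqueues and pools:

#include <stdio.h>
#include <stdlib.h>

static int early_init_like(void)
{
	void *a, *b, *c;
	int rc;

	a = malloc(16);
	if (!a)
		return -1;
	b = malloc(16);
	if (!b) {
		rc = -1;
		goto free_a;
	}
	c = malloc(16);
	if (!c) {
		rc = -1;
		goto free_b;
	}

	/* success: a real init would hand a, b and c to the caller */
	free(c);
	free(b);
	free(a);
	return 0;

free_b:
	free(b);	/* unwind in reverse order of acquisition */
free_a:
	free(a);
	return rc;
}

int main(void)
{
	printf("rc=%d\n", early_init_like());
	return 0;
}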
1019 * device_early_fini - finalize all that was done in device_early_init
1028 mutex_destroy(&hdev->debug_lock);
1029 mutex_destroy(&hdev->send_cpu_message_lock);
1031 mutex_destroy(&hdev->fpriv_list_lock);
1032 mutex_destroy(&hdev->fpriv_ctrl_list_lock);
1034 mutex_destroy(&hdev->clk_throttling.lock);
1036 hl_mem_mgr_fini(&hdev->kernel_mem_mgr, NULL);
1037 hl_mem_mgr_idr_destroy(&hdev->kernel_mem_mgr);
1039 kfree(hdev->hl_chip_info);
1041 destroy_workqueue(hdev->prefetch_wq);
1042 destroy_workqueue(hdev->ts_free_obj_wq);
1043 destroy_workqueue(hdev->cs_cmplt_wq);
1044 destroy_workqueue(hdev->eq_wq);
1045 destroy_workqueue(hdev->reset_wq);
1047 for (i = 0 ; i < hdev->asic_prop.completion_queues_count ; i++)
1048 destroy_workqueue(hdev->cq_wq[i]);
1049 kfree(hdev->cq_wq);
1053 if (hdev->asic_funcs->early_fini)
1054 hdev->asic_funcs->early_fini(hdev);
1061 if (!hdev->pdev)
1064 pci_read_config_word(hdev->pdev, PCI_DEVICE_ID, &device_id);
1066 return (device_id == hdev->pdev->device);
1072 time64_t seconds = is_pq_hb ? hdev->heartbeat_debug_info.last_pq_heartbeat_ts
1073 : hdev->heartbeat_debug_info.last_eq_heartbeat_ts;
1081 snprintf(time_str, size, "%ld-%02d-%02d %02d:%02d:%02d (UTC)",
1087 struct eq_heartbeat_debug_info *heartbeat_debug_info = &hdev->heartbeat_debug_info;
1088 u32 cpu_q_id = heartbeat_debug_info->cpu_queue_id, pq_pi_mask = (HL_QUEUE_LENGTH << 1) - 1;
1089 struct asic_fixed_properties *prop = &hdev->asic_prop;
1092 if (!prop->cpucp_info.eq_health_check_supported)
1095 if (!hdev->eq_heartbeat_received) {
1096 dev_err(hdev->dev, "EQ heartbeat event was not received!\n");
1100 dev_err(hdev->dev,
1102 hdev->event_queue.ci,
1103 heartbeat_debug_info->heartbeat_event_counter,
1105 hdev->kernel_queues[cpu_q_id].pi,
1106 atomic_read(&hdev->kernel_queues[cpu_q_id].ci),
1107 atomic_read(&hdev->kernel_queues[cpu_q_id].ci) & pq_pi_mask,
1110 hl_eq_dump(hdev, &hdev->event_queue);
1115 hdev->eq_heartbeat_received = false;
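The `(HL_QUEUE_LENGTH << 1) - 1` mask above implies PI/CI counters that run modulo twice the queue length, the classic trick that keeps "full" and "empty" distinguishable while the slot index uses only the low bits. A sketch, with LEN as an illustrative stand-in for HL_QUEUE_LENGTH:

#include <stdio.h>

#define LEN 8u
#define PQ_MASK ((LEN << 1) - 1)	/* counters wrap at 2 * LEN */

int main(void)
{
	unsigned int pi = 0, ci = 0;

	for (unsigned int i = 0; i < LEN; i++)
		pi = (pi + 1) & PQ_MASK;	/* produce LEN entries */

	printf("full: %d\n", ((pi - ci) & PQ_MASK) == LEN);
	printf("slot of next ci: %u\n", ci & (LEN - 1));

	ci = (ci + 1) & PQ_MASK;		/* consume one entry */
	printf("full: %d, empty: %d\n",
	       ((pi - ci) & PQ_MASK) == LEN, pi == ci);
	return 0;
}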
1128 if (!hl_device_operational(hdev, NULL) || !hdev->init_done)
1136 if (hl_device_eq_heartbeat_received(hdev) && (!hdev->asic_funcs->send_heartbeat(hdev)))
1140 dev_err(hdev->dev, "Device heartbeat failed! PCI link is %s\n",
1153 * heartbeat immediately post reset.
1155 * scheduled since last reset/init cycle.
1156 * So if the device is not already in a reset cycle, reset the flag
1157 * prev_reset_trigger as no reset occurred with HL_DRV_RESET_FW_FATAL_ERR
1161 if (!hdev->reset_info.in_reset)
1162 hdev->reset_info.prev_reset_trigger = HL_RESET_TRIGGER_DEFAULT;
1164 schedule_delayed_work(&hdev->work_heartbeat,
1169 * device_late_init - do late stuff initialization for the habanalabs device
1180 if (hdev->asic_funcs->late_init) {
1181 rc = hdev->asic_funcs->late_init(hdev);
1183 dev_err(hdev->dev,
1189 hdev->high_pll = hdev->asic_prop.high_pll;
1190 hdev->late_init_done = true;
1196 * device_late_fini - finalize all that was done in device_late_init
1203 if (!hdev->late_init_done)
1206 if (hdev->asic_funcs->late_fini)
1207 hdev->asic_funcs->late_fini(hdev);
1209 hdev->late_init_done = false;
1217 max_power = hdev->max_power;
1218 dc_power = hdev->asic_prop.dc_power_default;
1219 divisor = max_power - dc_power;
1221 dev_warn(hdev->dev, "device utilization is not supported\n");
1222 return -EOPNOTSUPP;
1231 dividend = (curr_power - dc_power) * 100;
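The utilization math above maps current draw onto the dynamic range between idle (dc) power and maximum power. A sketch of the same computation, including the non-positive-divisor guard that makes the metric unsupported:

#include <stdint.h>
#include <stdio.h>

static int utilization(int64_t curr, int64_t dc, int64_t max, uint32_t *pct)
{
	int64_t divisor = max - dc;	/* dynamic power range */

	if (divisor <= 0)
		return -1;		/* metric not supported */
	*pct = (uint32_t)(((curr - dc) * 100) / divisor);
	return 0;
}

int main(void)
{
	uint32_t pct;

	if (!utilization(350, 100, 600, &pct))
		printf("utilization: %u%%\n", pct);	/* (250 * 100) / 500 = 50 */
	return 0;
}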
1241 mutex_lock(&hdev->debug_lock);
1244 if (!hdev->in_debug) {
1245 dev_err(hdev->dev,
1246 "Failed to disable debug mode because device was not in debug mode\n");
1247 rc = -EFAULT;
1251 if (!hdev->reset_info.hard_reset_pending)
1252 hdev->asic_funcs->halt_coresight(hdev, ctx);
1254 hdev->in_debug = 0;
1259 if (hdev->in_debug) {
1260 dev_err(hdev->dev,
1262 rc = -EFAULT;
1266 hdev->in_debug = 1;
1269 mutex_unlock(&hdev->debug_lock);
1279 hdev->asic_funcs->hw_queues_lock(hdev);
1280 hdev->asic_funcs->hw_queues_unlock(hdev);
1283 mutex_lock(&hdev->send_cpu_message_lock);
1284 mutex_unlock(&hdev->send_cpu_message_lock);
1287 mutex_lock(&hdev->fpriv_list_lock);
1288 mutex_unlock(&hdev->fpriv_list_lock);
1289 mutex_lock(&hdev->fpriv_ctrl_list_lock);
1290 mutex_unlock(&hdev->fpriv_ctrl_list_lock);
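The lock/unlock pairs above are empty critical sections: taking and immediately dropping each lock guarantees that whoever held it when the reset began has finished, without keeping the lock afterwards. A pthread sketch of the idiom:

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;

static void flush_lock(pthread_mutex_t *m)
{
	pthread_mutex_lock(m);	 /* blocks until the current holder is done */
	pthread_mutex_unlock(m); /* the lock itself was never needed */
}

int main(void)
{
	flush_lock(&lock);
	printf("all prior critical sections have drained\n");
	return 0;
}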
1307 if (hdev->heartbeat)
1308 cancel_delayed_work_sync(&hdev->work_heartbeat);
1314 * Halt the engines and disable interrupts so we won't get any more
1318 hdev->asic_funcs->halt_engines(hdev, hard_reset, fw_reset);
1323 /* flush the MMU prefetch workqueue */
1324 flush_workqueue(hdev->prefetch_wq);
1330 * hl_device_suspend - initiate device suspend
1342 pci_save_state(hdev->pdev);
1345 spin_lock(&hdev->reset_info.lock);
1346 if (hdev->reset_info.in_reset) {
1347 spin_unlock(&hdev->reset_info.lock);
1348 dev_err(hdev->dev, "Can't suspend while in reset\n");
1349 return -EIO;
1351 hdev->reset_info.in_reset = 1;
1352 spin_unlock(&hdev->reset_info.lock);
1355 hdev->disabled = true;
1359 rc = hdev->asic_funcs->suspend(hdev);
1361 dev_err(hdev->dev,
1362 "Failed to disable PCI access of device CPU\n");
1365 pci_disable_device(hdev->pdev);
1366 pci_set_power_state(hdev->pdev, PCI_D3hot);
1372 * hl_device_resume - initiate device resume
1384 pci_set_power_state(hdev->pdev, PCI_D0);
1385 pci_restore_state(hdev->pdev);
1386 rc = pci_enable_device_mem(hdev->pdev);
1388 dev_err(hdev->dev,
1393 pci_set_master(hdev->pdev);
1395 rc = hdev->asic_funcs->resume(hdev);
1397 dev_err(hdev->dev, "Failed to resume device after suspend\n");
1403 * for hard reset to be performed
1405 spin_lock(&hdev->reset_info.lock);
1406 hdev->reset_info.in_reset = 0;
1407 spin_unlock(&hdev->reset_info.lock);
1411 dev_err(hdev->dev, "Failed to reset device during resume\n");
1418 pci_disable_device(hdev->pdev);
1431 hpriv_lock = control_dev ? &hdev->fpriv_ctrl_list_lock : &hdev->fpriv_list_lock;
1432 hpriv_list = control_dev ? &hdev->fpriv_ctrl_list : &hdev->fpriv_list;
1443 if (hdev->process_kill_trial_cnt) {
1459 task = get_pid_task(hpriv->taskpid, PIDTYPE_PID);
1461 dev_info(hdev->dev, "Killing user process pid=%d\n",
1468 dev_dbg(hdev->dev,
1470 pid_nr(hpriv->taskpid));
1479 * e.g. MMU unmappings, or running other long teardown flows even before
1482 * continuing with the reset.
1487 dev_dbg(hdev->dev,
1488 "Waiting for all unmap operations to finish before hard reset\n");
1490 pending_cnt--;
1500 if (hdev->process_kill_trial_cnt == HL_PENDING_RESET_MAX_TRIALS)
1501 return -ETIME;
1503 hdev->process_kill_trial_cnt++;
1505 return -EBUSY;
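The -EBUSY/-ETIME pair above implements a bounded retry: the reset thread requeues itself on -EBUSY and gives up after a fixed trial count. A sketch of that contract, with MAX_TRIALS standing in for HL_PENDING_RESET_MAX_TRIALS:

#include <errno.h>
#include <stdio.h>

#define MAX_TRIALS 10

static int kill_pending(unsigned int *trial_cnt, int still_busy)
{
	if (!still_busy)
		return 0;
	if (*trial_cnt == MAX_TRIALS)
		return -ETIME;	/* give up: processes would not die */
	(*trial_cnt)++;
	return -EBUSY;		/* caller requeues the reset work */
}

int main(void)
{
	unsigned int trials = 0;
	int rc;

	while ((rc = kill_pending(&trials, 1)) == -EBUSY)
		;	/* a real caller would sleep/requeue here */
	printf("rc=%d after %u trials\n", rc, trials);
	return 0;
}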
1514 hpriv_lock = control_dev ? &hdev->fpriv_ctrl_list_lock : &hdev->fpriv_list_lock;
1515 hpriv_list = control_dev ? &hdev->fpriv_ctrl_list : &hdev->fpriv_list;
1519 hpriv->hdev = NULL;
1525 /* If reset is due to heartbeat, device CPU is not responsive, in
1526 * which case there is no point sending a PCI disable message to it.
1530 /* Disable PCI access from device F/W so it won't send
1531 * us additional interrupts. We disable MSI/MSI-X at
1533 * sending us interrupts after that. We need to disable
1535 * disable, the message won't be sent. Also, in case
1536 * of heartbeat, the device CPU is marked as disabled
1545 if (hdev->cpu_queues_enable)
1546 disable_irq(pci_irq_vector(hdev->pdev, hdev->asic_prop.eq_interrupt_id));
1555 if (hdev->is_compute_ctx_active)
1559 * 'reset cause' is being updated here, because getting here
1565 hdev->reset_info.curr_reset_cause = HL_RESET_CAUSE_HEARTBEAT;
1568 hdev->reset_info.curr_reset_cause = HL_RESET_CAUSE_TDR;
1571 hdev->reset_info.curr_reset_cause = HL_RESET_CAUSE_UNKNOWN;
1574 hdev->reset_info.curr_reset_cause = HL_RESET_CAUSE_UNKNOWN;
1578 * If the reset cause is the same twice in a row, then reset_trigger_repeated
1579 * is set and if this reset is due to a fatal FW error
1582 if (hdev->reset_info.prev_reset_trigger != cur_reset_trigger) {
1583 hdev->reset_info.prev_reset_trigger = cur_reset_trigger;
1584 hdev->reset_info.reset_trigger_repeated = 0;
1586 hdev->reset_info.reset_trigger_repeated = 1;
1592 hdev->heartbeat_debug_info.last_pq_heartbeat_ts = 0;
1593 hdev->heartbeat_debug_info.last_eq_heartbeat_ts = 0;
1594 hdev->heartbeat_debug_info.heartbeat_event_counter = 0;
1599 if (!hdev->heartbeat)
1609 hdev->eq_heartbeat_received = true;
1611 schedule_delayed_work(&hdev->work_heartbeat,
1616 * hl_device_reset - reset the device
1619 * @flags: reset flags.
1624 * Re-initialize all internal data structures
1639 if (!hdev->init_done) {
1640 dev_err(hdev->dev, "Can't reset before initialization is done\n");
1650 reset_upon_device_release = hdev->reset_upon_device_release && from_dev_release;
1653 dev_dbg(hdev->dev, "soft-reset isn't supported on a malfunctioning device\n");
1657 if (!hard_reset && !hdev->asic_prop.supports_compute_reset) {
1658 dev_dbg(hdev->dev, "asic doesn't support compute reset - do hard-reset instead\n");
1664 dev_crit(hdev->dev,
1665 "Aborting reset because hard-reset is mutually exclusive with reset-on-device-release\n");
1666 return -EINVAL;
1672 if (!hard_reset && !hdev->asic_prop.allow_inference_soft_reset) {
1673 dev_dbg(hdev->dev,
1674 "asic doesn't allow inference soft reset - do hard-reset instead\n");
1679 /* Re-entry of reset thread */
1680 if (from_hard_reset_thread && hdev->process_kill_trial_cnt)
1684 * Prevent concurrency in this function - only one reset should be
1686 * get here from a dedicated hard reset thread.
1690 spin_lock(&hdev->reset_info.lock);
1691 if (hdev->reset_info.in_reset) {
1692 /* We allow scheduling of a hard reset only during a compute reset */
1693 if (hard_reset && hdev->reset_info.in_compute_reset)
1694 hdev->reset_info.hard_reset_schedule_flags = flags;
1695 spin_unlock(&hdev->reset_info.lock);
1700 * Update this before in_reset because in_compute_reset implies we are in reset
1702 hdev->reset_info.in_compute_reset = !hard_reset;
1704 hdev->reset_info.in_reset = 1;
1706 spin_unlock(&hdev->reset_info.lock);
1709 * In case of reset-upon-device-release while the release watchdog work is
1710 * scheduled due to a hard-reset, do hard-reset instead of compute-reset.
1712 if ((hard_reset || from_dev_release) && hdev->reset_info.watchdog_active) {
1714 &hdev->device_release_watchdog_work;
1716 hdev->reset_info.watchdog_active = 0;
1718 cancel_delayed_work_sync(&watchdog_work->reset_work);
1720 if (from_dev_release && (watchdog_work->flags & HL_DRV_RESET_HARD)) {
1721 hdev->reset_info.in_compute_reset = 0;
1736 hdev->disabled = true;
1741 dev_info(hdev->dev, "Going to reset device\n");
1743 dev_dbg(hdev->dev, "Going to reset device after release by user\n");
1745 dev_dbg(hdev->dev, "Going to reset engines of inference device\n");
1749 hdev->reset_info.hard_reset_pending = true;
1751 hdev->process_kill_trial_cnt = 0;
1753 hdev->device_reset_work.flags = flags;
1756 * Because the reset function can't run from heartbeat work,
1757 * we need to call the reset function from a dedicated work item.
1759 queue_delayed_work(hdev->reset_wq, &hdev->device_reset_work.reset_work, 0);
1774 if (rc == -EBUSY) {
1775 if (hdev->device_fini_pending) {
1776 dev_crit(hdev->dev,
1777 "%s Failed to kill all open processes, stopping hard reset\n",
1778 dev_name(&(hdev)->pdev->dev));
1782 /* signal reset thread to reschedule */
1787 dev_crit(hdev->dev,
1788 "%s Failed to kill all open processes, stopping hard reset\n",
1789 dev_name(&(hdev)->pdev->dev));
1794 * reading or writing to registers during the reset
1796 flush_workqueue(hdev->eq_wq);
1799 /* Reset the H/W. It will be in idle state after this returns */
1800 hw_fini_rc = hdev->asic_funcs->hw_fini(hdev, hard_reset, fw_reset);
1803 hdev->fw_loader.fw_comp_loaded = FW_TYPE_NONE;
1806 if (hdev->kernel_ctx && hl_ctx_put(hdev->kernel_ctx) == 1)
1807 hdev->kernel_ctx = NULL;
1811 hl_eq_reset(hdev, &hdev->event_queue);
1814 /* Re-initialize PI/CI to 0 in all queues (hw queue, cq) */
1816 for (i = 0 ; i < hdev->asic_prop.completion_queues_count ; i++)
1817 hl_cq_reset(hdev, &hdev->completion_queue[i]);
1822 atomic_set(&ctx->thread_ctx_switch_token, 1);
1823 ctx->thread_ctx_switch_wait_token = 0;
1831 /* Finished tear-down, starting to re-initialize */
1834 hdev->device_cpu_disabled = false;
1835 hdev->reset_info.hard_reset_pending = false;
1841 if (hdev->reset_info.reset_trigger_repeated &&
1842 (hdev->reset_info.prev_reset_trigger == HL_DRV_RESET_FW_FATAL_ERR ||
1843 hdev->reset_info.prev_reset_trigger ==
1845 dev_crit(hdev->dev,
1846 "%s Consecutive fatal errors, stopping hard reset\n",
1847 dev_name(&(hdev)->pdev->dev));
1848 rc = -EIO;
1852 if (hdev->kernel_ctx) {
1853 dev_crit(hdev->dev,
1854 "%s kernel ctx was alive during hard reset, something is terribly wrong\n",
1855 dev_name(&(hdev)->pdev->dev));
1856 rc = -EBUSY;
1862 dev_err(hdev->dev,
1863 "Failed to initialize MMU S/W after hard reset\n");
1868 hdev->kernel_ctx = kzalloc(sizeof(*hdev->kernel_ctx),
1870 if (!hdev->kernel_ctx) {
1871 rc = -ENOMEM;
1876 hdev->is_compute_ctx_active = false;
1878 rc = hl_ctx_init(hdev, hdev->kernel_ctx, true);
1880 dev_err(hdev->dev,
1881 "failed to init kernel ctx in hard reset\n");
1882 kfree(hdev->kernel_ctx);
1883 hdev->kernel_ctx = NULL;
1893 hdev->disabled = false;
1895 /* F/W security enabled indication might be updated after hard-reset */
1902 rc = hdev->asic_funcs->hw_init(hdev);
1904 dev_err(hdev->dev, "failed to initialize the H/W after reset\n");
1908 /* If the device is not idle, fail the reset process */
1909 if (!hdev->asic_funcs->is_device_idle(hdev, idle_mask,
1911 print_idle_status_mask(hdev, "device is not idle after reset", idle_mask);
1912 rc = -EIO;
1917 rc = hdev->asic_funcs->test_queues(hdev);
1919 dev_err(hdev->dev, "Failed to detect if device is alive after reset\n");
1926 dev_err(hdev->dev, "Failed late init after hard reset\n");
1932 dev_err(hdev->dev, "Failed to init memory module after hard reset\n");
1936 if (!hdev->asic_prop.fw_security_enabled)
1939 rc = hdev->asic_funcs->compute_reset_late_init(hdev);
1942 dev_err(hdev->dev,
1943 "Failed late init in reset after device release\n");
1945 dev_err(hdev->dev, "Failed late init after compute reset\n");
1950 rc = hdev->asic_funcs->scrub_device_mem(hdev);
1952 dev_err(hdev->dev, "scrub mem failed from device reset (%d)\n", rc);
1956 spin_lock(&hdev->reset_info.lock);
1957 hdev->reset_info.in_compute_reset = 0;
1959 /* Schedule hard reset only if requested and if not already in hard reset.
1960 * We keep 'in_reset' enabled, so no other reset can go in during the hard
1961 * reset schedule
1963 if (!hard_reset && hdev->reset_info.hard_reset_schedule_flags)
1966 hdev->reset_info.in_reset = 0;
1968 spin_unlock(&hdev->reset_info.lock);
1970 hdev->reset_info.needs_reset = false;
1973 dev_info(hdev->dev,
1975 dev_name(&(hdev)->pdev->dev));
1977 dev_dbg(hdev->dev,
1979 dev_name(&(hdev)->pdev->dev));
1982 hdev->reset_info.hard_reset_cnt++;
1986 /* After reset is done, we are ready to receive events from
1991 hdev->asic_funcs->enable_events_from_fw(hdev);
1994 hdev->reset_info.compute_reset_cnt++;
1997 dev_info(hdev->dev, "Performing hard reset scheduled during compute reset\n");
1998 flags = hdev->reset_info.hard_reset_schedule_flags;
1999 hdev->reset_info.hard_reset_schedule_flags = 0;
2008 hdev->disabled = true;
2010 spin_lock(&hdev->reset_info.lock);
2011 hdev->reset_info.in_compute_reset = 0;
2014 dev_err(hdev->dev,
2015 "%s Failed to reset! Device is NOT usable\n",
2016 dev_name(&(hdev)->pdev->dev));
2017 hdev->reset_info.hard_reset_cnt++;
2020 dev_err(hdev->dev, "Failed to reset device after user release\n");
2023 dev_err(hdev->dev, "Failed to do compute reset\n");
2024 hdev->reset_info.compute_reset_cnt++;
2027 spin_unlock(&hdev->reset_info.lock);
2033 hdev->reset_info.in_reset = 0;
2035 spin_unlock(&hdev->reset_info.lock);
2041 * hl_device_cond_reset() - conditionally reset the device.
2043 * @reset_flags: reset flags.
2046 * Conditionally reset the device, or alternatively schedule a watchdog work to reset the device
2047 * unless another reset precedes it.
2053 /* F/W reset cannot be postponed */
2057 /* Device release watchdog is relevant only if a user exists and gets a reset notification */
2059 dev_err(hdev->dev, "Resetting device without a reset indication to user\n");
2068 * There is no point in postponing the reset if the user is not registered for events.
2071 * case an immediate reset is not required.
2073 if (!ctx->hpriv->notifier_event.eventfd && !hdev->reset_info.watchdog_active)
2076 /* Schedule the device release watchdog work unless reset is already in progress or if the
2079 spin_lock(&hdev->reset_info.lock);
2080 if (hdev->reset_info.in_reset) {
2081 spin_unlock(&hdev->reset_info.lock);
2085 if (hdev->reset_info.watchdog_active) {
2086 hdev->device_release_watchdog_work.flags |= flags;
2090 hdev->device_release_watchdog_work.flags = flags;
2091 dev_dbg(hdev->dev, "Device is going to be hard-reset in %u sec unless being released\n",
2092 hdev->device_release_watchdog_timeout_sec);
2093 schedule_delayed_work(&hdev->device_release_watchdog_work.reset_work,
2094 secs_to_jiffies(hdev->device_release_watchdog_timeout_sec));
2095 hdev->reset_info.watchdog_active = 1;
2097 spin_unlock(&hdev->reset_info.lock);
2118 mutex_lock(¬ifier_event->lock);
2119 notifier_event->events_mask |= event_mask;
2121 if (notifier_event->eventfd)
2122 eventfd_signal(notifier_event->eventfd);
2124 mutex_unlock(¬ifier_event->lock);
2128 * hl_notifier_event_send_all - notify all user processes via eventfd
2139 dev_warn(hdev->dev, "Skip sending zero event");
2143 mutex_lock(&hdev->fpriv_list_lock);
2145 list_for_each_entry(hpriv, &hdev->fpriv_list, dev_node)
2146 hl_notifier_event_send(&hpriv->notifier_event, event_mask);
2148 mutex_unlock(&hdev->fpriv_list_lock);
2152 * hl_device_init - main initialization function for habanalabs device
2172 user_interrupt_cnt = hdev->asic_prop.user_dec_intr_count +
2173 hdev->asic_prop.user_interrupt_count;
2176 hdev->user_interrupt = kcalloc(user_interrupt_cnt, sizeof(*hdev->user_interrupt),
2178 if (!hdev->user_interrupt) {
2179 rc = -ENOMEM;
2184 if (hdev->asic_prop.first_available_cq[0] != USHRT_MAX) {
2189 rc = -ENOMEM;
2192 free_jobs_data = &hdev->user_interrupt[i].ts_free_jobs_data;
2193 free_jobs_data->free_nodes_pool = p;
2194 free_jobs_data->free_nodes_length = TIMESTAMP_FREE_NODES_NUM;
2195 free_jobs_data->next_avail_free_node_idx = 0;
2200 free_jobs_data = &hdev->common_user_cq_interrupt.ts_free_jobs_data;
2204 rc = -ENOMEM;
2208 free_jobs_data->free_nodes_pool = p;
2209 free_jobs_data->free_nodes_length = TIMESTAMP_FREE_NODES_NUM;
2210 free_jobs_data->next_avail_free_node_idx = 0;
2216 rc = hdev->asic_funcs->sw_init(hdev);
2231 dev_err(hdev->dev, "failed to initialize kernel queues\n");
2235 cq_cnt = hdev->asic_prop.completion_queues_count;
2243 hdev->completion_queue = kcalloc(cq_cnt,
2244 sizeof(*hdev->completion_queue),
2247 if (!hdev->completion_queue) {
2248 dev_err(hdev->dev,
2250 rc = -ENOMEM;
2256 rc = hl_cq_init(hdev, &hdev->completion_queue[i],
2257 hdev->asic_funcs->get_queue_id_for_cq(hdev, i));
2259 dev_err(hdev->dev,
2263 hdev->completion_queue[i].cq_idx = i;
2266 hdev->shadow_cs_queue = kcalloc(hdev->asic_prop.max_pending_cs,
2268 if (!hdev->shadow_cs_queue) {
2269 rc = -ENOMEM;
2278 rc = hl_eq_init(hdev, &hdev->event_queue);
2280 dev_err(hdev->dev, "failed to initialize event queue\n");
2284 /* MMU S/W must be initialized before kernel context is created */
2287 dev_err(hdev->dev, "Failed to initialize MMU S/W structures\n");
2292 hdev->kernel_ctx = kzalloc(sizeof(*hdev->kernel_ctx), GFP_KERNEL);
2293 if (!hdev->kernel_ctx) {
2294 rc = -ENOMEM;
2298 hdev->is_compute_ctx_active = false;
2300 hdev->asic_funcs->state_dump_init(hdev);
2302 hdev->device_release_watchdog_timeout_sec = HL_DEVICE_RELEASE_WATCHDOG_TIMEOUT_SEC;
2304 hdev->memory_scrub_val = MEM_SCRUB_DEFAULT_VAL;
2308 dev_err(hdev->dev, "failed to initialize debugfs entry structure\n");
2309 kfree(hdev->kernel_ctx);
2316 rc = hl_ctx_init(hdev, hdev->kernel_ctx, true);
2318 dev_err(hdev->dev, "failed to initialize kernel context\n");
2319 kfree(hdev->kernel_ctx);
2325 dev_err(hdev->dev, "failed to initialize CB pool\n");
2331 dev_err(hdev->dev, "Failed to initialize the decoder module\n");
2345 hdev->disabled = false;
2347 rc = hdev->asic_funcs->hw_init(hdev);
2349 dev_err(hdev->dev, "failed to initialize the H/W\n");
2355 rc = hdev->asic_funcs->test_queues(hdev);
2357 dev_err(hdev->dev, "Failed to detect if device is alive\n");
2364 dev_err(hdev->dev, "Failed late initialization\n");
2369 dev_info(hdev->dev, "Found %s device with %lluGB DRAM\n",
2370 hdev->asic_name,
2371 hdev->asic_prop.dram_size / SZ_1G);
2375 dev_err(hdev->dev, "Failed to initialize memory module\n");
2386 rc = drm_dev_register(&hdev->drm, 0);
2388 dev_err(hdev->dev, "Failed to register DRM device, rc %d\n", rc);
2395 dev_err(hdev->dev, "Failed to add char devices and sysfs/debugfs files\n");
2403 if (hdev->asic_prop.set_max_power_on_device_init &&
2404 !hdev->asic_prop.fw_security_enabled)
2410 * hwmon-related sensors the device supports.
2415 dev_err(hdev->dev, "Failed to initialize hwmon\n");
2426 dev_notice(hdev->dev,
2428 dev_name(&(hdev)->pdev->dev));
2435 hdev->asic_funcs->enable_events_from_fw(hdev);
2437 hdev->init_done = true;
2444 if (hl_ctx_put(hdev->kernel_ctx) != 1)
2445 dev_err(hdev->dev,
2452 hl_eq_fini(hdev, &hdev->event_queue);
2454 kfree(hdev->shadow_cs_queue);
2457 hl_cq_fini(hdev, &hdev->completion_queue[i]);
2458 kfree(hdev->completion_queue);
2462 hdev->asic_funcs->sw_fini(hdev);
2464 vfree(hdev->common_user_cq_interrupt.ts_free_jobs_data.free_nodes_pool);
2468 if (!hdev->user_interrupt[i].ts_free_jobs_data.free_nodes_pool)
2470 vfree(hdev->user_interrupt[i].ts_free_jobs_data.free_nodes_pool);
2472 kfree(hdev->user_interrupt);
2477 hdev->disabled = true;
2479 drm_dev_register(&hdev->drm, 0);
2484 hdev->cdev_idx, dev_name(&hdev->pdev->dev));
2490 * hl_device_fini - main tear-down function for habanalabs device
2504 dev_info(hdev->dev, "Removing device %s\n", dev_name(&(hdev)->pdev->dev));
2506 hdev->device_fini_pending = 1;
2507 flush_delayed_work(&hdev->device_reset_work.reset_work);
2509 if (hdev->pldm)
2515 * This function is competing with the reset function, so try to
2516 * take the reset atomic, and if we are already in the middle of a reset,
2517 * wait until the reset function is finished. The reset function is designed
2519 * ports, the hard reset could take between 10 and 30 seconds
2524 spin_lock(&hdev->reset_info.lock);
2525 device_in_reset = !!hdev->reset_info.in_reset;
2527 hdev->reset_info.in_reset = 1;
2528 spin_unlock(&hdev->reset_info.lock);
2533 spin_lock(&hdev->reset_info.lock);
2534 device_in_reset = !!hdev->reset_info.in_reset;
2536 hdev->reset_info.in_reset = 1;
2537 spin_unlock(&hdev->reset_info.lock);
2540 dev_crit(hdev->dev,
2541 "%s Failed to remove device because reset function did not finish\n",
2542 dev_name(&(hdev)->pdev->dev));
2547 cancel_delayed_work_sync(&hdev->device_release_watchdog_work.reset_work);
2549 /* Disable PCI access from device F/W so it won't send us additional
2550 * interrupts. We disable MSI/MSI-X at the halt_engines function and we
2552 * disable the access here because if the device is marked disabled, the
2554 * marked as disabled so this message won't be sent
2559 hdev->disabled = true;
2563 hdev->reset_info.hard_reset_pending = true;
2573 dev_info(hdev->dev,
2577 hdev->process_kill_trial_cnt = 0;
2580 dev_crit(hdev->dev, "Failed to kill all open processes (%d)\n", rc);
2584 hdev->process_kill_trial_cnt = 0;
2587 dev_crit(hdev->dev, "Failed to kill all control device open processes (%d)\n", rc);
2593 /* Reset the H/W. It will be in idle state after this returns */
2594 rc = hdev->asic_funcs->hw_fini(hdev, true, false);
2596 dev_err(hdev->dev, "hw_fini failed in device fini while removing device %d\n", rc);
2598 hdev->fw_loader.fw_comp_loaded = FW_TYPE_NONE;
2601 if ((hdev->kernel_ctx) && (hl_ctx_put(hdev->kernel_ctx) != 1))
2602 dev_err(hdev->dev, "kernel ctx is still alive\n");
2610 vfree(hdev->captured_err_info.page_fault_info.user_mappings);
2612 hl_eq_fini(hdev, &hdev->event_queue);
2614 kfree(hdev->shadow_cs_queue);
2616 for (i = 0 ; i < hdev->asic_prop.completion_queues_count ; i++)
2617 hl_cq_fini(hdev, &hdev->completion_queue[i]);
2618 kfree(hdev->completion_queue);
2620 user_interrupt_cnt = hdev->asic_prop.user_dec_intr_count +
2621 hdev->asic_prop.user_interrupt_count;
2624 if (hdev->asic_prop.first_available_cq[0] != USHRT_MAX) {
2626 vfree(hdev->user_interrupt[i].ts_free_jobs_data.free_nodes_pool);
2629 kfree(hdev->user_interrupt);
2632 vfree(hdev->common_user_cq_interrupt.ts_free_jobs_data.free_nodes_pool);
2637 hdev->asic_funcs->sw_fini(hdev);
2643 drm_dev_unregister(&hdev->drm);
2655 * hl_rreg - Read an MMIO register
2665 u32 val = readl(hdev->rmmio + reg);
2668 trace_habanalabs_rreg32(&(hdev)->pdev->dev, reg, val);
2674 * hl_wreg - Write to an MMIO register
2678 * @val: 32-bit value
2680 * Writes the 32-bit value into the MMIO register
2686 trace_habanalabs_wreg32(&(hdev)->pdev->dev, reg, val);
2688 writel(val, hdev->rmmio + reg);
2694 struct razwi_info *razwi_info = &hdev->captured_err_info.razwi_info;
2697 dev_err(hdev->dev,
2704 if (atomic_cmpxchg(&hdev->captured_err_info.razwi_info.razwi_detected, 0, 1))
2707 razwi_info->razwi.timestamp = ktime_to_ns(ktime_get());
2708 razwi_info->razwi.addr = addr;
2709 razwi_info->razwi.num_of_possible_engines = num_of_engines;
2710 memcpy(&razwi_info->razwi.engine_id[0], &engine_id[0],
2712 razwi_info->razwi.flags = flags;
2714 razwi_info->razwi_info_available = true;
2728 struct page_fault_info *pgf_info = &hdev->captured_err_info.page_fault_info;
2737 /* Reset previous session count */
2738 pgf_info->num_of_user_mappings = 0;
2742 dev_err(hdev->dev, "Can't get user context for user mappings\n");
2746 mutex_lock(&ctx->mem_hash_lock);
2747 hash_for_each(ctx->mem_hash, i, hnode, node) {
2748 vm_type = hnode->ptr;
2751 pgf_info->num_of_user_mappings++;
2755 if (!pgf_info->num_of_user_mappings)
2761 vfree(pgf_info->user_mappings);
2762 pgf_info->user_mappings =
2763 vzalloc(pgf_info->num_of_user_mappings * sizeof(struct hl_user_mapping));
2764 if (!pgf_info->user_mappings) {
2765 pgf_info->num_of_user_mappings = 0;
2769 hash_for_each(ctx->mem_hash, i, hnode, node) {
2770 vm_type = hnode->ptr;
2772 userptr = hnode->ptr;
2773 pgf_info->user_mappings[map_idx].dev_va = hnode->vaddr;
2774 pgf_info->user_mappings[map_idx].size = userptr->size;
2777 phys_pg_pack = hnode->ptr;
2778 pgf_info->user_mappings[map_idx].dev_va = hnode->vaddr;
2779 pgf_info->user_mappings[map_idx].size = phys_pg_pack->total_size;
2784 mutex_unlock(&ctx->mem_hash_lock);
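The capture above is two passes under one lock: count the mappings, size a buffer from the count, then fill it. A sketch with a plain array standing in for the mem hash and calloc standing in for vzalloc:

#include <stdio.h>
#include <stdlib.h>

struct mapping { unsigned long long dev_va; size_t size; };

int main(void)
{
	struct mapping live[3] = { {0x1000, 4096}, {0x5000, 8192}, {0x9000, 4096} };
	size_t n = 0, i;
	struct mapping *snap;

	for (i = 0; i < 3; i++)			/* pass 1: count */
		n++;

	snap = calloc(n, sizeof(*snap));	/* vzalloc() in the driver */
	if (!snap)
		return 1;

	for (i = 0; i < n; i++)			/* pass 2: fill */
		snap[i] = live[i];

	printf("captured %zu mappings, first at %#llx\n", n, snap[0].dev_va);
	free(snap);
	return 0;
}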
2790 struct page_fault_info *pgf_info = &hdev->captured_err_info.page_fault_info;
2793 if (atomic_cmpxchg(&pgf_info->page_fault_detected, 0, 1))
2796 pgf_info->page_fault.timestamp = ktime_to_ns(ktime_get());
2797 pgf_info->page_fault.addr = addr;
2798 pgf_info->page_fault.engine_id = eng_id;
2801 pgf_info->page_fault_info_available = true;
2815 struct hw_err_info *info = &hdev->captured_err_info.hw_err;
2818 if (atomic_cmpxchg(&info->event_detected, 0, 1))
2821 info->event.timestamp = ktime_to_ns(ktime_get());
2822 info->event.event_id = event_id;
2824 info->event_info_available = true;
2837 struct fw_err_info *info = &hdev->captured_err_info.fw_err;
2840 if (atomic_cmpxchg(&info->event_detected, 0, 1))
2843 info->event.timestamp = ktime_to_ns(ktime_get());
2844 info->event.err_type = fw_info->err_type;
2845 if (fw_info->err_type == HL_INFO_FW_REPORTED_ERR)
2846 info->event.event_id = fw_info->event_id;
2848 info->event_info_available = true;
2855 if (info->event_mask)
2856 *info->event_mask |= HL_NOTIFIER_EVENT_CRITICL_FW_ERR;
2861 struct engine_err_info *info = &hdev->captured_err_info.engine_err;
2864 if (atomic_cmpxchg(&info->event_detected, 0, 1))
2867 info->event.timestamp = ktime_to_ns(ktime_get());
2868 info->event.engine_id = engine_id;
2869 info->event.error_count = error_count;
2870 info->event_info_available = true;
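Each capture above is gated by atomic_cmpxchg so only the first event of a kind is recorded until the flag is cleared. A C11-atomics sketch of the same capture-once gate:

#include <stdatomic.h>
#include <stdio.h>

static atomic_int event_detected;	/* zero-initialized at file scope */
static unsigned int captured_id;

static void capture_event(unsigned int id)
{
	int expected = 0;

	/* only the 0 -> 1 transition wins; later reporters bail out */
	if (!atomic_compare_exchange_strong(&event_detected, &expected, 1))
		return;
	captured_id = id;
}

int main(void)
{
	capture_event(17);
	capture_event(99);	/* ignored: info for event 17 is preserved */
	printf("captured event id %u\n", captured_id);
	return 0;
}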
2875 vfree(captured_err_info->page_fault_info.user_mappings);
2877 atomic_set(&captured_err_info->cs_timeout.write_enable, 1);
2878 captured_err_info->undef_opcode.write_enable = true;
2884 struct cpumask *available_mask = &hdev->irq_affinity_mask;
2885 int numa_node = hdev->pdev->dev.numa_node, i;
2892 dev_err(hdev->dev, "No available affinities in current numa node\n");
2904 if (cpumask_empty(&hdev->irq_affinity_mask)) {
2905 dev_dbg(hdev->dev, "affinity mask is empty\n");
2909 if (irq_set_affinity_and_hint(irq, &hdev->irq_affinity_mask))
2910 dev_err(hdev->dev, "Failed setting irq %d affinity\n", irq);
2915 hdev->heartbeat_debug_info.heartbeat_event_counter++;
2916 hdev->heartbeat_debug_info.last_eq_heartbeat_ts = ktime_get_real_seconds();
2917 hdev->eq_heartbeat_received = true;
2922 struct hl_clk_throttle *clk_throttle = &hdev->clk_throttling;
2925 mutex_lock(&clk_throttle->lock);
2929 clk_throttle->current_reason |= HL_CLK_THROTTLE_POWER;
2930 clk_throttle->aggregated_reason |= HL_CLK_THROTTLE_POWER;
2931 clk_throttle->timestamp[HL_CLK_THROTTLE_TYPE_POWER].start = ktime_get();
2932 clk_throttle->timestamp[HL_CLK_THROTTLE_TYPE_POWER].end = zero_time;
2933 dev_dbg_ratelimited(hdev->dev, "Clock throttling due to power consumption\n");
2937 clk_throttle->current_reason &= ~HL_CLK_THROTTLE_POWER;
2938 clk_throttle->timestamp[HL_CLK_THROTTLE_TYPE_POWER].end = ktime_get();
2939 dev_dbg_ratelimited(hdev->dev, "Power envelope is safe, back to optimal clock\n");
2943 clk_throttle->current_reason |= HL_CLK_THROTTLE_THERMAL;
2944 clk_throttle->aggregated_reason |= HL_CLK_THROTTLE_THERMAL;
2945 clk_throttle->timestamp[HL_CLK_THROTTLE_TYPE_THERMAL].start = ktime_get();
2946 clk_throttle->timestamp[HL_CLK_THROTTLE_TYPE_THERMAL].end = zero_time;
2948 dev_info_ratelimited(hdev->dev, "Clock throttling due to overheating\n");
2952 clk_throttle->current_reason &= ~HL_CLK_THROTTLE_THERMAL;
2953 clk_throttle->timestamp[HL_CLK_THROTTLE_TYPE_THERMAL].end = ktime_get();
2955 dev_info_ratelimited(hdev->dev, "Thermal envelope is safe, back to optimal clock\n");
2959 dev_err(hdev->dev, "Received invalid clock change event %d\n", event_type);
2963 mutex_unlock(&clk_throttle->lock);
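The throttling handler above keeps a live reason bitmask, an aggregated one, and per-reason start/end timestamps. A sketch of that bookkeeping with illustrative bit values and plain time_t in place of ktime:

#include <stdio.h>
#include <time.h>

#define THROTTLE_POWER   (1u << 0)
#define THROTTLE_THERMAL (1u << 1)

struct throttle {
	unsigned int current_reason, aggregated_reason;
	time_t start[2], end[2];
};

static void begin(struct throttle *t, unsigned int bit, int idx)
{
	t->current_reason |= bit;	/* active right now */
	t->aggregated_reason |= bit;	/* seen since last clear */
	t->start[idx] = time(NULL);
	t->end[idx] = 0;		/* open interval */
}

static void finish(struct throttle *t, unsigned int bit, int idx)
{
	t->current_reason &= ~bit;
	t->end[idx] = time(NULL);	/* close the interval */
}

int main(void)
{
	struct throttle t = { 0 };

	begin(&t, THROTTLE_POWER, 0);
	finish(&t, THROTTLE_POWER, 0);
	printf("current=%#x aggregated=%#x\n", t.current_reason, t.aggregated_reason);
	return 0;
}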