Lines matching full-text search terms: disable, mmu, reset
1 // SPDX-License-Identifier: GPL-2.0
4 * Copyright 2016-2022 HabanaLabs, Ltd.
36 * hl_set_dram_bar - sets the bar to allow later access to address
54 struct asic_fixed_properties *prop = &hdev->asic_prop;
57 if (is_power_of_2(prop->dram_pci_bar_size))
58 bar_base_addr = addr & ~(prop->dram_pci_bar_size - 0x1ull);
60 bar_base_addr = region->region_base +
61 div64_u64((addr - region->region_base), prop->dram_pci_bar_size) *
62 prop->dram_pci_bar_size;
64 old_base = hdev->asic_funcs->set_dram_bar_base(hdev, bar_base_addr);
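As an aside for readers tracing the math: below is a minimal user-space sketch (not driver code) of the two BAR-base strategies above. A power-of-2 BAR size allows a plain mask; otherwise the base snaps to a whole multiple of the BAR size counted from the region base. All values and names are illustrative.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

static bool is_power_of_2(uint64_t n) { return n && !(n & (n - 1)); }

static uint64_t bar_base(uint64_t addr, uint64_t region_base, uint64_t bar_size)
{
	if (is_power_of_2(bar_size))
		return addr & ~(bar_size - 1);	/* mask off the offset bits */
	/* snap to a whole multiple of bar_size counted from the region base */
	return region_base + ((addr - region_base) / bar_size) * bar_size;
}

int main(void)
{
	/* 256 MB BAR (power of 2): plain masking */
	printf("%#llx\n", (unsigned long long)bar_base(0x1234567890ULL, 0, 256ULL << 20));
	/* 0x30000000-byte BAR (not a power of 2): division path */
	printf("%#llx\n",
	       (unsigned long long)bar_base(0x1234567890ULL, 0x1000000000ULL, 0x30000000ULL));
	return 0;
}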
76 struct pci_mem_region *region = &hdev->pci_mem_region[region_type];
77 u64 old_base = 0, rc, bar_region_base = region->region_base;
83 return -EIO;
86 acc_addr = hdev->pcie_bar[region->bar_id] + region->offset_in_bar +
87 (addr - bar_region_base);
113 return -EIO;
127 ptr = hdev->asic_funcs->asic_dma_alloc_coherent(hdev, size, dma_handle, flag);
130 ptr = hdev->asic_funcs->asic_dma_pool_zalloc(hdev, size, flag, dma_handle);
135 trace_habanalabs_dma_alloc(&(hdev)->pdev->dev, (u64) (uintptr_t) ptr, *dma_handle,
150 hdev->asic_funcs->asic_dma_free_coherent(hdev, size, cpu_addr, dma_handle);
153 hdev->asic_funcs->asic_dma_pool_free(hdev, cpu_addr, dma_handle);
157 trace_habanalabs_dma_free(&(hdev)->pdev->dev, store_cpu_addr, dma_handle, size, caller);
186 return hdev->asic_funcs->cpu_accessible_dma_pool_alloc(hdev, size, dma_handle);
191 hdev->asic_funcs->cpu_accessible_dma_pool_free(hdev, size, vaddr);
197 struct asic_fixed_properties *prop = &hdev->asic_prop;
201 rc = hdev->asic_funcs->dma_map_sgtable(hdev, sgt, dir);
209 trace_habanalabs_dma_map_page(&(hdev)->pdev->dev,
211 sg->dma_address - prop->device_dma_offset_for_host_access,
213 sg->dma_length,
215 sg->length,
225 struct asic_fixed_properties *prop = &hdev->asic_prop;
229 rc = dma_map_sgtable(&hdev->pdev->dev, sgt, dir, 0);
234 if (prop->device_dma_offset_for_host_access)
236 sg->dma_address += prop->device_dma_offset_for_host_access;
244 struct asic_fixed_properties *prop = &hdev->asic_prop;
248 hdev->asic_funcs->dma_unmap_sgtable(hdev, sgt, dir);
252 trace_habanalabs_dma_unmap_page(&(hdev)->pdev->dev,
254 sg->dma_address - prop->device_dma_offset_for_host_access,
256 sg->dma_length,
258 sg->length,
267 struct asic_fixed_properties *prop = &hdev->asic_prop;
272 if (prop->device_dma_offset_for_host_access)
274 sg->dma_address -= prop->device_dma_offset_for_host_access;
276 dma_unmap_sgtable(&hdev->pdev->dev, sgt, dir, 0);
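A small sketch of the map/unmap symmetry visible above: mapping adds device_dma_offset_for_host_access to every segment's DMA address and unmapping subtracts it again, so device-visible addresses round-trip. The struct and helper names below are stand-ins, not driver types.

#include <stdint.h>
#include <stdio.h>

struct seg { uint64_t dma_address; };

static void apply_offset(struct seg *segs, int n, uint64_t off)
{
	for (int i = 0; i < n; i++)
		segs[i].dma_address += off;	/* done at map time */
}

static void remove_offset(struct seg *segs, int n, uint64_t off)
{
	for (int i = 0; i < n; i++)
		segs[i].dma_address -= off;	/* done at unmap time */
}

int main(void)
{
	struct seg s[2] = { { 0x1000 }, { 0x8000 } };
	uint64_t off = 0x8000000000ULL;	/* hypothetical host-access window base */

	apply_offset(s, 2, off);
	printf("device sees %#llx\n", (unsigned long long)s[0].dma_address);
	remove_offset(s, 2, off);
	printf("host sees   %#llx\n", (unsigned long long)s[0].dma_address);
	return 0;
}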
280 * hl_access_cfg_region - access the config region
290 struct pci_mem_region *cfg_region = &hdev->pci_mem_region[PCI_REGION_CFG];
294 dev_err(hdev->dev, "address %#llx not a multiple of %zu\n", addr, sizeof(u32));
295 return -EINVAL;
300 *val = RREG32(addr - cfg_region->region_base);
303 WREG32(addr - cfg_region->region_base, *val);
306 val_l = RREG32(addr - cfg_region->region_base);
307 val_h = RREG32(addr + sizeof(u32) - cfg_region->region_base);
312 WREG32(addr - cfg_region->region_base, lower_32_bits(*val));
313 WREG32(addr + sizeof(u32) - cfg_region->region_base, upper_32_bits(*val));
316 dev_err(hdev->dev, "access type %d is not supported\n", acc_type);
317 return -EOPNOTSUPP;
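For the 64-bit branches above, here is a minimal sketch of composing one 64-bit access from two 32-bit ones, with a plain array standing in for MMIO; lower_32_bits()/upper_32_bits() behavior is modeled with casts and shifts.

#include <stdint.h>
#include <stdio.h>

static uint32_t regs[4];	/* toy register file in place of MMIO */

static uint32_t rreg32(uint64_t off) { return regs[off / 4]; }
static void wreg32(uint64_t off, uint32_t v) { regs[off / 4] = v; }

static uint64_t rreg64(uint64_t off)
{
	uint32_t lo = rreg32(off);
	uint32_t hi = rreg32(off + sizeof(uint32_t));	/* high word at addr + 4 */
	return ((uint64_t)hi << 32) | lo;
}

static void wreg64(uint64_t off, uint64_t v)
{
	wreg32(off, (uint32_t)v);		  /* lower_32_bits() */
	wreg32(off + sizeof(uint32_t), v >> 32);  /* upper_32_bits() */
}

int main(void)
{
	wreg64(0, 0x1122334455667788ULL);
	printf("%#llx\n", (unsigned long long)rreg64(0));
	return 0;
}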
324 * hl_access_dev_mem - access device memory
343 return -EFAULT;
361 if ((e->actual_size + str_size) < e->allocated_buf_size) {
363 vsnprintf(e->buf + e->actual_size, str_size, fmt, args);
370 e->actual_size += str_size;
377 if (hdev->device_fini_pending) {
379 } else if (hdev->reset_info.in_reset) {
380 if (hdev->reset_info.in_compute_reset)
384 } else if (hdev->reset_info.needs_reset) {
386 } else if (hdev->disabled) {
388 } else if (!hdev->init_done) {
445 dev_err(hdev->dev, "%s %s (mask %#llx_%016llx_%016llx_%016llx)\n",
446 dev_name(&hdev->pdev->dev), message,
449 dev_err(hdev->dev, "%s %s (mask %#llx_%016llx_%016llx)\n",
450 dev_name(&hdev->pdev->dev), message,
453 dev_err(hdev->dev, "%s %s (mask %#llx_%016llx)\n",
454 dev_name(&hdev->pdev->dev), message, idle_mask[1], idle_mask[0]);
456 dev_err(hdev->dev, "%s %s (mask %#llx)\n", dev_name(&hdev->pdev->dev), message,
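The tiered dev_err calls above pick the widest format whose top mask word is non-zero, keeping short masks readable. A compact sketch of that selection, assuming word 0 is the least significant:

#include <stdint.h>
#include <stdio.h>

static void print_idle_mask(const uint64_t m[4])
{
	if (m[3])
		printf("mask %#llx_%016llx_%016llx_%016llx\n",
		       (unsigned long long)m[3], (unsigned long long)m[2],
		       (unsigned long long)m[1], (unsigned long long)m[0]);
	else if (m[2])
		printf("mask %#llx_%016llx_%016llx\n",
		       (unsigned long long)m[2], (unsigned long long)m[1],
		       (unsigned long long)m[0]);
	else if (m[1])
		printf("mask %#llx_%016llx\n",
		       (unsigned long long)m[1], (unsigned long long)m[0]);
	else
		printf("mask %#llx\n", (unsigned long long)m[0]);
}

int main(void)
{
	uint64_t m[4] = { 0x3, 0x1, 0, 0 };

	print_idle_mask(m);	/* prints the two-word form */
	return 0;
}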
469 hdev = hpriv->hdev;
471 hdev->asic_funcs->send_device_activity(hdev, false);
475 mutex_destroy(&hpriv->ctx_lock);
476 mutex_destroy(&hpriv->restore_phase_mutex);
479 hl_mem_mgr_idr_destroy(&hpriv->mem_mgr);
481 /* Device should be reset if reset-upon-device-release is enabled, or if there is a pending
482 * reset that waits for device release.
484 reset_device = hdev->reset_upon_device_release || hdev->reset_info.watchdog_active;
486 /* Check the device idle status and reset if not idle.
487 * Skip it if already in reset, or if device is going to be reset in any case.
489 if (!hdev->reset_info.in_reset && !reset_device && !hdev->pldm)
490 device_is_idle = hdev->asic_funcs->is_device_idle(hdev, idle_mask,
498 /* We need to remove the user from the list to make sure the reset process won't
505 * a user open the device while a reset is about to happen.
507 mutex_lock(&hdev->fpriv_list_lock);
508 list_del(&hpriv->dev_node);
509 mutex_unlock(&hdev->fpriv_list_lock);
511 put_pid(hpriv->taskpid);
517 int rc = hdev->asic_funcs->scrub_device_mem(hdev);
520 dev_err(hdev->dev, "failed to scrub memory from hpriv release (%d)\n", rc);
525 /* Now we can mark the compute_ctx as not active. Even if a reset is running in a different
529 mutex_lock(&hdev->fpriv_list_lock);
530 hdev->is_compute_ctx_active = false;
531 mutex_unlock(&hdev->fpriv_list_lock);
533 hdev->compute_ctx_in_release = 0;
536 if (hpriv->notifier_event.eventfd)
537 eventfd_ctx_put(hpriv->notifier_event.eventfd);
539 mutex_destroy(&hpriv->notifier_event.lock);
546 kref_get(&hpriv->refcount);
551 return kref_put(&hpriv->refcount, hpriv_release);
569 offset += scnprintf(buf + offset, size - offset, " [%u active CS]", active_cs_num);
572 dmabuf_export_cnt = atomic_read(&hdev->dmabuf_export_cnt);
575 offset += scnprintf(buf + offset, size - offset, " [%u exported dma-buf]",
579 if (mm_fini_stats->n_busy_cb) {
581 offset += scnprintf(buf + offset, size - offset, " [%u live CB handles]",
582 mm_fini_stats->n_busy_cb);
586 scnprintf(buf + offset, size - offset, " [unknown reason]");
588 dev_notice(hdev->dev, "%s%s\n", message, buf);
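The `buf + offset, size - offset` pattern above appends each in-use reason into one line. A user-space sketch with snprintf follows; note that the kernel's scnprintf returns only the characters actually stored (never more), which is what makes the idiom safe there.

#include <stdio.h>

int main(void)
{
	char buf[128] = "";
	size_t size = sizeof(buf), offset = 0;

	/* each append lands at the current end, with the remaining space */
	offset += snprintf(buf + offset, size - offset, " [%u active CS]", 2u);
	offset += snprintf(buf + offset, size - offset, " [%u exported dma-buf]", 1u);
	printf("device is still in use%s\n", buf);
	return 0;
}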
592 * hl_device_release() - release function for habanalabs device.
600 struct hl_fpriv *hpriv = file_priv->driver_priv;
606 put_pid(hpriv->taskpid);
609 hl_ctx_mgr_fini(hdev, &hpriv->ctx_mgr);
614 hl_mem_mgr_fini(&hpriv->mem_mgr, &mm_fini_stats);
616 hdev->compute_ctx_in_release = 1;
624 hdev->last_open_session_duration_jif = jiffies - hdev->last_successful_open_jif;
629 struct hl_fpriv *hpriv = filp->private_data;
630 struct hl_device *hdev = hpriv->hdev;
632 filp->private_data = NULL;
639 mutex_lock(&hdev->fpriv_ctrl_list_lock);
640 list_del(&hpriv->dev_node);
641 mutex_unlock(&hdev->fpriv_ctrl_list_lock);
643 put_pid(hpriv->taskpid);
652 struct hl_device *hdev = hpriv->hdev;
657 return -ENODEV;
660 vm_pgoff = vma->vm_pgoff;
664 vma->vm_pgoff = HL_MMAP_OFFSET_VALUE_GET(vm_pgoff);
669 return hl_mem_mgr_mmap(&hpriv->mem_mgr, vma, NULL);
671 return -EINVAL;
675 * hl_mmap - mmap function for habanalabs device
685 struct drm_file *file_priv = filp->private_data;
686 struct hl_fpriv *hpriv = file_priv->driver_priv;
705 * device_init_cdev - Initialize cdev and device for habanalabs device
723 cdev->owner = THIS_MODULE;
727 return -ENOMEM;
730 (*dev)->devt = MKDEV(hdev->major, minor);
731 (*dev)->class = class;
732 (*dev)->release = device_release_func;
741 const struct class *accel_class = hdev->drm.accel->kdev->class;
745 hdev->cdev_idx = hdev->drm.accel->index;
748 snprintf(name, sizeof(name), "accel_controlD%d", hdev->cdev_idx);
749 rc = device_init_cdev(hdev, accel_class, hdev->cdev_idx, &hl_ctrl_ops, name,
750 &hdev->cdev_ctrl, &hdev->dev_ctrl);
754 rc = cdev_device_add(&hdev->cdev_ctrl, hdev->dev_ctrl);
756 dev_err(hdev->dev_ctrl,
763 dev_err(hdev->dev, "failed to initialize sysfs\n");
769 hdev->cdev_sysfs_debugfs_created = true;
774 cdev_device_del(&hdev->cdev_ctrl, hdev->dev_ctrl);
776 put_device(hdev->dev_ctrl);
782 if (!hdev->cdev_sysfs_debugfs_created)
787 cdev_device_del(&hdev->cdev_ctrl, hdev->dev_ctrl);
788 put_device(hdev->dev_ctrl);
795 struct hl_device *hdev = device_reset_work->hdev;
799 flags = device_reset_work->flags | HL_DRV_RESET_FROM_RESET_THR;
803 if ((rc == -EBUSY) && !hdev->device_fini_pending) {
810 dev_info(hdev->dev,
811 "Could not reset device (compute_ctx refcount %u). will try again in %u seconds",
812 kref_read(&ctx->refcount) - 1, HL_PENDING_RESET_PER_SEC);
815 dev_info(hdev->dev, "Could not reset device. Will try again in %u seconds",
819 queue_delayed_work(hdev->reset_wq, &device_reset_work->reset_work,
828 struct hl_device *hdev = watchdog_work->hdev;
831 dev_dbg(hdev->dev, "Device wasn't released in time. Initiating hard-reset.\n");
833 flags = watchdog_work->flags | HL_DRV_RESET_HARD | HL_DRV_RESET_FROM_WD_THR;
839 * device_early_init - do some early initialization for the habanalabs device
851 switch (hdev->asic_type) {
854 strscpy(hdev->asic_name, "GOYA", sizeof(hdev->asic_name));
858 strscpy(hdev->asic_name, "GAUDI", sizeof(hdev->asic_name));
862 strscpy(hdev->asic_name, "GAUDI SEC", sizeof(hdev->asic_name));
866 strscpy(hdev->asic_name, "GAUDI2", sizeof(hdev->asic_name));
870 strscpy(hdev->asic_name, "GAUDI2B", sizeof(hdev->asic_name));
874 strscpy(hdev->asic_name, "GAUDI2C", sizeof(hdev->asic_name));
878 strscpy(hdev->asic_name, "GAUDI2D", sizeof(hdev->asic_name));
881 dev_err(hdev->dev, "Unrecognized ASIC type %d\n",
882 hdev->asic_type);
883 return -EINVAL;
886 rc = hdev->asic_funcs->early_init(hdev);
894 if (hdev->asic_prop.completion_queues_count) {
895 hdev->cq_wq = kcalloc(hdev->asic_prop.completion_queues_count,
898 if (!hdev->cq_wq) {
899 rc = -ENOMEM;
904 for (i = 0 ; i < hdev->asic_prop.completion_queues_count ; i++) {
905 snprintf(workq_name, 32, "hl%u-free-jobs-%u", hdev->cdev_idx, (u32) i);
906 hdev->cq_wq[i] = create_singlethread_workqueue(workq_name);
907 if (hdev->cq_wq[i] == NULL) {
908 dev_err(hdev->dev, "Failed to allocate CQ workqueue\n");
909 rc = -ENOMEM;
914 snprintf(workq_name, 32, "hl%u-events", hdev->cdev_idx);
915 hdev->eq_wq = create_singlethread_workqueue(workq_name);
916 if (hdev->eq_wq == NULL) {
917 dev_err(hdev->dev, "Failed to allocate EQ workqueue\n");
918 rc = -ENOMEM;
922 snprintf(workq_name, 32, "hl%u-cs-completions", hdev->cdev_idx);
923 hdev->cs_cmplt_wq = alloc_workqueue(workq_name, WQ_UNBOUND, 0);
924 if (!hdev->cs_cmplt_wq) {
925 dev_err(hdev->dev,
927 rc = -ENOMEM;
931 snprintf(workq_name, 32, "hl%u-ts-free-obj", hdev->cdev_idx);
932 hdev->ts_free_obj_wq = alloc_workqueue(workq_name, WQ_UNBOUND, 0);
933 if (!hdev->ts_free_obj_wq) {
934 dev_err(hdev->dev,
936 rc = -ENOMEM;
940 snprintf(workq_name, 32, "hl%u-prefetch", hdev->cdev_idx);
941 hdev->prefetch_wq = alloc_workqueue(workq_name, WQ_UNBOUND, 0);
942 if (!hdev->prefetch_wq) {
943 dev_err(hdev->dev, "Failed to allocate MMU prefetch workqueue\n");
944 rc = -ENOMEM;
948 hdev->hl_chip_info = kzalloc(sizeof(struct hwmon_chip_info), GFP_KERNEL);
949 if (!hdev->hl_chip_info) {
950 rc = -ENOMEM;
958 hl_mem_mgr_init(hdev->dev, &hdev->kernel_mem_mgr);
960 snprintf(workq_name, 32, "hl%u_device_reset", hdev->cdev_idx);
961 hdev->reset_wq = create_singlethread_workqueue(workq_name);
962 if (!hdev->reset_wq) {
963 rc = -ENOMEM;
964 dev_err(hdev->dev, "Failed to create device reset WQ\n");
968 INIT_DELAYED_WORK(&hdev->work_heartbeat, hl_device_heartbeat);
970 INIT_DELAYED_WORK(&hdev->device_reset_work.reset_work, device_hard_reset_pending);
971 hdev->device_reset_work.hdev = hdev;
972 hdev->device_fini_pending = 0;
974 INIT_DELAYED_WORK(&hdev->device_release_watchdog_work.reset_work,
976 hdev->device_release_watchdog_work.hdev = hdev;
978 mutex_init(&hdev->send_cpu_message_lock);
979 mutex_init(&hdev->debug_lock);
980 INIT_LIST_HEAD(&hdev->cs_mirror_list);
981 spin_lock_init(&hdev->cs_mirror_lock);
982 spin_lock_init(&hdev->reset_info.lock);
983 INIT_LIST_HEAD(&hdev->fpriv_list);
984 INIT_LIST_HEAD(&hdev->fpriv_ctrl_list);
985 mutex_init(&hdev->fpriv_list_lock);
986 mutex_init(&hdev->fpriv_ctrl_list_lock);
987 mutex_init(&hdev->clk_throttling.lock);
992 hl_mem_mgr_fini(&hdev->kernel_mem_mgr, NULL);
993 hl_mem_mgr_idr_destroy(&hdev->kernel_mem_mgr);
995 kfree(hdev->hl_chip_info);
997 destroy_workqueue(hdev->prefetch_wq);
999 destroy_workqueue(hdev->ts_free_obj_wq);
1001 destroy_workqueue(hdev->cs_cmplt_wq);
1003 destroy_workqueue(hdev->eq_wq);
1005 for (i = 0 ; i < hdev->asic_prop.completion_queues_count ; i++)
1006 if (hdev->cq_wq[i])
1007 destroy_workqueue(hdev->cq_wq[i]);
1008 kfree(hdev->cq_wq);
1012 if (hdev->asic_funcs->early_fini)
1013 hdev->asic_funcs->early_fini(hdev);
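The free_* labels above form the kernel's usual goto unwind ladder: a failure jumps to the label that releases everything acquired so far, in reverse order of acquisition. A minimal user-space sketch with malloc standing in for workqueues and pools:

#include <stdio.h>
#include <stdlib.h>

static int early_init_like(void)
{
	void *a, *b, *c;
	int rc;

	a = malloc(16);
	if (!a)
		return -1;
	b = malloc(16);
	if (!b) {
		rc = -1;
		goto free_a;
	}
	c = malloc(16);
	if (!c) {
		rc = -1;
		goto free_b;
	}

	/* success: a real init would hand a, b and c to the caller */
	free(c);
	free(b);
	free(a);
	return 0;

free_b:
	free(b);	/* unwind in reverse order of acquisition */
free_a:
	free(a);
	return rc;
}

int main(void)
{
	printf("rc=%d\n", early_init_like());
	return 0;
}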
1019 * device_early_fini - finalize all that was done in device_early_init
1028 mutex_destroy(&hdev->debug_lock);
1029 mutex_destroy(&hdev->send_cpu_message_lock);
1031 mutex_destroy(&hdev->fpriv_list_lock);
1032 mutex_destroy(&hdev->fpriv_ctrl_list_lock);
1034 mutex_destroy(&hdev->clk_throttling.lock);
1036 hl_mem_mgr_fini(&hdev->kernel_mem_mgr, NULL);
1037 hl_mem_mgr_idr_destroy(&hdev->kernel_mem_mgr);
1039 kfree(hdev->hl_chip_info);
1041 destroy_workqueue(hdev->prefetch_wq);
1042 destroy_workqueue(hdev->ts_free_obj_wq);
1043 destroy_workqueue(hdev->cs_cmplt_wq);
1044 destroy_workqueue(hdev->eq_wq);
1045 destroy_workqueue(hdev->reset_wq);
1047 for (i = 0 ; i < hdev->asic_prop.completion_queues_count ; i++)
1048 destroy_workqueue(hdev->cq_wq[i]);
1049 kfree(hdev->cq_wq);
1053 if (hdev->asic_funcs->early_fini)
1054 hdev->asic_funcs->early_fini(hdev);
1061 if (!hdev->pdev)
1064 pci_read_config_word(hdev->pdev, PCI_DEVICE_ID, &device_id);
1066 return (device_id == hdev->pdev->device);
1072 time64_t seconds = is_pq_hb ? hdev->heartbeat_debug_info.last_pq_heartbeat_ts
1073 : hdev->heartbeat_debug_info.last_eq_heartbeat_ts;
1081 snprintf(time_str, size, "%ld-%02d-%02d %02d:%02d:%02d (UTC)",
1087 struct eq_heartbeat_debug_info *heartbeat_debug_info = &hdev->heartbeat_debug_info;
1088 u32 cpu_q_id = heartbeat_debug_info->cpu_queue_id, pq_pi_mask = (HL_QUEUE_LENGTH << 1) - 1;
1089 struct asic_fixed_properties *prop = &hdev->asic_prop;
1092 if (!prop->cpucp_info.eq_health_check_supported)
1095 if (!hdev->eq_heartbeat_received) {
1096 dev_err(hdev->dev, "EQ heartbeat event was not received!\n");
1100 dev_err(hdev->dev,
1102 hdev->event_queue.ci,
1103 heartbeat_debug_info->heartbeat_event_counter,
1105 hdev->kernel_queues[cpu_q_id].pi,
1106 atomic_read(&hdev->kernel_queues[cpu_q_id].ci),
1107 atomic_read(&hdev->kernel_queues[cpu_q_id].ci) & pq_pi_mask,
1110 hl_eq_dump(hdev, &hdev->event_queue);
1115 hdev->eq_heartbeat_received = false;
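The `(HL_QUEUE_LENGTH << 1) - 1` mask above implies PI/CI counters that run modulo twice the queue length, the classic trick that keeps "full" and "empty" distinguishable while the slot index uses only the low bits. A sketch, with LEN as an illustrative stand-in for HL_QUEUE_LENGTH:

#include <stdio.h>

#define LEN 8u
#define PQ_MASK ((LEN << 1) - 1)	/* counters wrap at 2 * LEN */

int main(void)
{
	unsigned int pi = 0, ci = 0;

	for (unsigned int i = 0; i < LEN; i++)
		pi = (pi + 1) & PQ_MASK;	/* produce LEN entries */

	printf("full: %d\n", ((pi - ci) & PQ_MASK) == LEN);
	printf("slot of next ci: %u\n", ci & (LEN - 1));

	ci = (ci + 1) & PQ_MASK;		/* consume one entry */
	printf("full: %d, empty: %d\n",
	       ((pi - ci) & PQ_MASK) == LEN, pi == ci);
	return 0;
}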
1128 if (!hl_device_operational(hdev, NULL) || !hdev->init_done)
1136 if (hl_device_eq_heartbeat_received(hdev) && (!hdev->asic_funcs->send_heartbeat(hdev)))
1140 dev_err(hdev->dev, "Device heartbeat failed! PCI link is %s\n",
1153 * heartbeat immediately post reset.
1155 * scheduled since last reset/init cycle.
1156 * So if the device is not already in a reset cycle, reset the flag
1157 * prev_reset_trigger as no reset occurred with HL_DRV_RESET_FW_FATAL_ERR
1161 if (!hdev->reset_info.in_reset)
1162 hdev->reset_info.prev_reset_trigger = HL_RESET_TRIGGER_DEFAULT;
1164 schedule_delayed_work(&hdev->work_heartbeat,
1169 * device_late_init - do late stuff initialization for the habanalabs device
1180 if (hdev->asic_funcs->late_init) {
1181 rc = hdev->asic_funcs->late_init(hdev);
1183 dev_err(hdev->dev,
1189 hdev->high_pll = hdev->asic_prop.high_pll;
1190 hdev->late_init_done = true;
1196 * device_late_fini - finalize all that was done in device_late_init
1203 if (!hdev->late_init_done)
1206 if (hdev->asic_funcs->late_fini)
1207 hdev->asic_funcs->late_fini(hdev);
1209 hdev->late_init_done = false;
1217 max_power = hdev->max_power;
1218 dc_power = hdev->asic_prop.dc_power_default;
1219 divisor = max_power - dc_power;
1221 dev_warn(hdev->dev, "device utilization is not supported\n");
1222 return -EOPNOTSUPP;
1231 dividend = (curr_power - dc_power) * 100;
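The utilization math above maps current draw onto the dynamic range between idle (dc) power and maximum power. A sketch of the same computation, including the non-positive-divisor guard that makes the metric unsupported:

#include <stdint.h>
#include <stdio.h>

static int utilization(int64_t curr, int64_t dc, int64_t max, uint32_t *pct)
{
	int64_t divisor = max - dc;	/* dynamic power range */

	if (divisor <= 0)
		return -1;		/* metric not supported */
	*pct = (uint32_t)(((curr - dc) * 100) / divisor);
	return 0;
}

int main(void)
{
	uint32_t pct;

	if (!utilization(350, 100, 600, &pct))
		printf("utilization: %u%%\n", pct);	/* (250 * 100) / 500 = 50 */
	return 0;
}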
1241 mutex_lock(&hdev->debug_lock);
1244 if (!hdev->in_debug) {
1245 dev_err(hdev->dev,
1246 "Failed to disable debug mode because device was not in debug mode\n");
1247 rc = -EFAULT;
1251 if (!hdev->reset_info.hard_reset_pending)
1252 hdev->asic_funcs->halt_coresight(hdev, ctx);
1254 hdev->in_debug = 0;
1259 if (hdev->in_debug) {
1260 dev_err(hdev->dev,
1262 rc = -EFAULT;
1266 hdev->in_debug = 1;
1269 mutex_unlock(&hdev->debug_lock);
1279 hdev->asic_funcs->hw_queues_lock(hdev);
1280 hdev->asic_funcs->hw_queues_unlock(hdev);
1283 mutex_lock(&hdev->send_cpu_message_lock);
1284 mutex_unlock(&hdev->send_cpu_message_lock);
1287 mutex_lock(&hdev->fpriv_list_lock);
1288 mutex_unlock(&hdev->fpriv_list_lock);
1289 mutex_lock(&hdev->fpriv_ctrl_list_lock);
1290 mutex_unlock(&hdev->fpriv_ctrl_list_lock);
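The lock/unlock pairs above are empty critical sections: taking and immediately dropping each lock guarantees that whoever held it when the reset began has finished, without keeping the lock afterwards. A pthread sketch of the idiom:

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;

static void flush_lock(pthread_mutex_t *m)
{
	pthread_mutex_lock(m);	 /* blocks until the current holder is done */
	pthread_mutex_unlock(m); /* the lock itself was never needed */
}

int main(void)
{
	flush_lock(&lock);
	printf("all prior critical sections have drained\n");
	return 0;
}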
1307 if (hdev->heartbeat)
1308 cancel_delayed_work_sync(&hdev->work_heartbeat);
1314 * Halt the engines and disable interrupts so we won't get any more
1318 hdev->asic_funcs->halt_engines(hdev, hard_reset, fw_reset);
1323 /* flush the MMU prefetch workqueue */
1324 flush_workqueue(hdev->prefetch_wq);
1330 * hl_device_suspend - initiate device suspend
1342 pci_save_state(hdev->pdev);
1345 spin_lock(&hdev->reset_info.lock);
1346 if (hdev->reset_info.in_reset) {
1347 spin_unlock(&hdev->reset_info.lock);
1348 dev_err(hdev->dev, "Can't suspend while in reset\n");
1349 return -EIO;
1351 hdev->reset_info.in_reset = 1;
1352 spin_unlock(&hdev->reset_info.lock);
1355 hdev->disabled = true;
1359 rc = hdev->asic_funcs->suspend(hdev);
1361 dev_err(hdev->dev,
1362 "Failed to disable PCI access of device CPU\n");
1365 pci_disable_device(hdev->pdev);
1366 pci_set_power_state(hdev->pdev, PCI_D3hot);
1372 * hl_device_resume - initiate device resume
1384 pci_set_power_state(hdev->pdev, PCI_D0);
1385 pci_restore_state(hdev->pdev);
1386 rc = pci_enable_device_mem(hdev->pdev);
1388 dev_err(hdev->dev,
1393 pci_set_master(hdev->pdev);
1395 rc = hdev->asic_funcs->resume(hdev);
1397 dev_err(hdev->dev, "Failed to resume device after suspend\n");
1403 * for hard reset to be performed
1405 spin_lock(&hdev->reset_info.lock);
1406 hdev->reset_info.in_reset = 0;
1407 spin_unlock(&hdev->reset_info.lock);
1411 dev_err(hdev->dev, "Failed to reset device during resume\n");
1418 pci_disable_device(hdev->pdev);
1431 hpriv_lock = control_dev ? &hdev->fpriv_ctrl_list_lock : &hdev->fpriv_list_lock;
1432 hpriv_list = control_dev ? &hdev->fpriv_ctrl_list : &hdev->fpriv_list;
1443 if (hdev->process_kill_trial_cnt) {
1459 task = get_pid_task(hpriv->taskpid, PIDTYPE_PID);
1461 dev_info(hdev->dev, "Killing user process pid=%d\n",
1468 dev_dbg(hdev->dev,
1470 pid_nr(hpriv->taskpid));
1479 * e.g. MMU unmappings, or running other long teardown flows even before
1482 * continuing with the reset.
1487 dev_dbg(hdev->dev,
1488 "Waiting for all unmap operations to finish before hard reset\n");
1490 pending_cnt--;
1500 if (hdev->process_kill_trial_cnt == HL_PENDING_RESET_MAX_TRIALS)
1501 return -ETIME;
1503 hdev->process_kill_trial_cnt++;
1505 return -EBUSY;
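The -EBUSY/-ETIME pair above implements a bounded retry: the reset thread requeues itself on -EBUSY and gives up after a fixed trial count. A sketch of that contract, with MAX_TRIALS standing in for HL_PENDING_RESET_MAX_TRIALS:

#include <errno.h>
#include <stdio.h>

#define MAX_TRIALS 10

static int kill_pending(unsigned int *trial_cnt, int still_busy)
{
	if (!still_busy)
		return 0;
	if (*trial_cnt == MAX_TRIALS)
		return -ETIME;	/* give up: processes would not die */
	(*trial_cnt)++;
	return -EBUSY;		/* caller requeues the reset work */
}

int main(void)
{
	unsigned int trials = 0;
	int rc;

	while ((rc = kill_pending(&trials, 1)) == -EBUSY)
		;	/* a real caller would sleep/requeue here */
	printf("rc=%d after %u trials\n", rc, trials);
	return 0;
}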
1514 hpriv_lock = control_dev ? &hdev->fpriv_ctrl_list_lock : &hdev->fpriv_list_lock;
1515 hpriv_list = control_dev ? &hdev->fpriv_ctrl_list : &hdev->fpriv_list;
1519 hpriv->hdev = NULL;
1525 /* If reset is due to heartbeat, device CPU is not responsive, in
1526 * which case there is no point sending a PCI disable message to it.
1530 /* Disable PCI access from device F/W so it won't send
1531 * us additional interrupts. We disable MSI/MSI-X at
1533 * sending us interrupts after that. We need to disable
1535 * disable, the message won't be sent. Also, in case
1536 * of heartbeat, the device CPU is marked as disabled
1545 if (hdev->cpu_queues_enable)
1546 disable_irq(pci_irq_vector(hdev->pdev, hdev->asic_prop.eq_interrupt_id));
1555 if (hdev->is_compute_ctx_active)
1559 * 'reset cause' is being updated here, because getting here
1565 hdev->reset_info.curr_reset_cause = HL_RESET_CAUSE_HEARTBEAT;
1568 hdev->reset_info.curr_reset_cause = HL_RESET_CAUSE_TDR;
1571 hdev->reset_info.curr_reset_cause = HL_RESET_CAUSE_UNKNOWN;
1574 hdev->reset_info.curr_reset_cause = HL_RESET_CAUSE_UNKNOWN;
1578 * If the reset cause is the same twice in a row, then reset_trigger_repeated
1579 * is set and if this reset is due to a fatal FW error
1582 if (hdev->reset_info.prev_reset_trigger != cur_reset_trigger) {
1583 hdev->reset_info.prev_reset_trigger = cur_reset_trigger;
1584 hdev->reset_info.reset_trigger_repeated = 0;
1586 hdev->reset_info.reset_trigger_repeated = 1;
1592 hdev->heartbeat_debug_info.last_pq_heartbeat_ts = 0;
1593 hdev->heartbeat_debug_info.last_eq_heartbeat_ts = 0;
1594 hdev->heartbeat_debug_info.heartbeat_event_counter = 0;
1599 if (!hdev->heartbeat)
1609 hdev->eq_heartbeat_received = true;
1611 schedule_delayed_work(&hdev->work_heartbeat,
1616 * hl_device_reset - reset the device
1619 * @flags: reset flags.
1624 * Re-initialize all internal data structures
1639 if (!hdev->init_done) {
1640 dev_err(hdev->dev, "Can't reset before initialization is done\n");
1650 reset_upon_device_release = hdev->reset_upon_device_release && from_dev_release;
1653 dev_dbg(hdev->dev, "soft-reset isn't supported on a malfunctioning device\n");
1657 if (!hard_reset && !hdev->asic_prop.supports_compute_reset) {
1658 dev_dbg(hdev->dev, "asic doesn't support compute reset - do hard-reset instead\n");
1664 dev_crit(hdev->dev,
1665 "Aborting reset because hard-reset is mutually exclusive with reset-on-device-release\n");
1666 return -EINVAL;
1672 if (!hard_reset && !hdev->asic_prop.allow_inference_soft_reset) {
1673 dev_dbg(hdev->dev,
1674 "asic doesn't allow inference soft reset - do hard-reset instead\n");
1679 /* Re-entry of reset thread */
1680 if (from_hard_reset_thread && hdev->process_kill_trial_cnt)
1684 * Prevent concurrency in this function - only one reset should be
1686 * get here from a dedicated hard reset thread.
1690 spin_lock(&hdev->reset_info.lock);
1691 if (hdev->reset_info.in_reset) {
1692 /* We allow scheduling of a hard reset only during a compute reset */
1693 if (hard_reset && hdev->reset_info.in_compute_reset)
1694 hdev->reset_info.hard_reset_schedule_flags = flags;
1695 spin_unlock(&hdev->reset_info.lock);
1700 * Update this before in_reset because in_compute_reset implies we are in reset
1702 hdev->reset_info.in_compute_reset = !hard_reset;
1704 hdev->reset_info.in_reset = 1;
1706 spin_unlock(&hdev->reset_info.lock);
1709 * In case of reset-upon-device-release while the release watchdog work is
1710 * scheduled due to a hard-reset, do hard-reset instead of compute-reset.
1712 if ((hard_reset || from_dev_release) && hdev->reset_info.watchdog_active) {
1714 &hdev->device_release_watchdog_work;
1716 hdev->reset_info.watchdog_active = 0;
1718 cancel_delayed_work_sync(&watchdog_work->reset_work);
1720 if (from_dev_release && (watchdog_work->flags & HL_DRV_RESET_HARD)) {
1721 hdev->reset_info.in_compute_reset = 0;
1736 hdev->disabled = true;
1741 dev_info(hdev->dev, "Going to reset device\n");
1743 dev_dbg(hdev->dev, "Going to reset device after release by user\n");
1745 dev_dbg(hdev->dev, "Going to reset engines of inference device\n");
1749 hdev->reset_info.hard_reset_pending = true;
1751 hdev->process_kill_trial_cnt = 0;
1753 hdev->device_reset_work.flags = flags;
1756 * Because the reset function can't run from heartbeat work,
1757 * we need to call the reset function from a dedicated work item.
1759 queue_delayed_work(hdev->reset_wq, &hdev->device_reset_work.reset_work, 0);
1774 if (rc == -EBUSY) {
1775 if (hdev->device_fini_pending) {
1776 dev_crit(hdev->dev,
1777 "%s Failed to kill all open processes, stopping hard reset\n",
1778 dev_name(&(hdev)->pdev->dev));
1782 /* signal reset thread to reschedule */
1787 dev_crit(hdev->dev,
1788 "%s Failed to kill all open processes, stopping hard reset\n",
1789 dev_name(&(hdev)->pdev->dev));
1794 * reading or writing to registers during the reset
1796 flush_workqueue(hdev->eq_wq);
1799 /* Reset the H/W. It will be in idle state after this returns */
1800 hw_fini_rc = hdev->asic_funcs->hw_fini(hdev, hard_reset, fw_reset);
1803 hdev->fw_loader.fw_comp_loaded = FW_TYPE_NONE;
1806 if (hdev->kernel_ctx && hl_ctx_put(hdev->kernel_ctx) == 1)
1807 hdev->kernel_ctx = NULL;
1811 hl_eq_reset(hdev, &hdev->event_queue);
1814 /* Re-initialize PI/CI to 0 in all queues (hw queue, cq) */
1816 for (i = 0 ; i < hdev->asic_prop.completion_queues_count ; i++)
1817 hl_cq_reset(hdev, &hdev->completion_queue[i]);
1822 atomic_set(&ctx->thread_ctx_switch_token, 1);
1823 ctx->thread_ctx_switch_wait_token = 0;
1831 /* Finished tear-down, starting to re-initialize */
1834 hdev->device_cpu_disabled = false;
1835 hdev->reset_info.hard_reset_pending = false;
1841 if (hdev->reset_info.reset_trigger_repeated &&
1842 (hdev->reset_info.prev_reset_trigger == HL_DRV_RESET_FW_FATAL_ERR ||
1843 hdev->reset_info.prev_reset_trigger ==
1845 dev_crit(hdev->dev,
1846 "%s Consecutive fatal errors, stopping hard reset\n",
1847 dev_name(&(hdev)->pdev->dev));
1848 rc = -EIO;
1852 if (hdev->kernel_ctx) {
1853 dev_crit(hdev->dev,
1854 "%s kernel ctx was alive during hard reset, something is terribly wrong\n",
1855 dev_name(&(hdev)->pdev->dev));
1856 rc = -EBUSY;
1862 dev_err(hdev->dev,
1863 "Failed to initialize MMU S/W after hard reset\n");
1868 hdev->kernel_ctx = kzalloc(sizeof(*hdev->kernel_ctx),
1870 if (!hdev->kernel_ctx) {
1871 rc = -ENOMEM;
1876 hdev->is_compute_ctx_active = false;
1878 rc = hl_ctx_init(hdev, hdev->kernel_ctx, true);
1880 dev_err(hdev->dev,
1881 "failed to init kernel ctx in hard reset\n");
1882 kfree(hdev->kernel_ctx);
1883 hdev->kernel_ctx = NULL;
1893 hdev->disabled = false;
1895 /* F/W security enabled indication might be updated after hard-reset */
1902 rc = hdev->asic_funcs->hw_init(hdev);
1904 dev_err(hdev->dev, "failed to initialize the H/W after reset\n");
1908 /* If the device is not idle, fail the reset process */
1909 if (!hdev->asic_funcs->is_device_idle(hdev, idle_mask,
1911 print_idle_status_mask(hdev, "device is not idle after reset", idle_mask);
1912 rc = -EIO;
1917 rc = hdev->asic_funcs->test_queues(hdev);
1919 dev_err(hdev->dev, "Failed to detect if device is alive after reset\n");
1926 dev_err(hdev->dev, "Failed late init after hard reset\n");
1932 dev_err(hdev->dev, "Failed to init memory module after hard reset\n");
1936 if (!hdev->asic_prop.fw_security_enabled)
1939 rc = hdev->asic_funcs->compute_reset_late_init(hdev);
1942 dev_err(hdev->dev,
1943 "Failed late init in reset after device release\n");
1945 dev_err(hdev->dev, "Failed late init after compute reset\n");
1950 rc = hdev->asic_funcs->scrub_device_mem(hdev);
1952 dev_err(hdev->dev, "scrub mem failed from device reset (%d)\n", rc);
1956 spin_lock(&hdev->reset_info.lock);
1957 hdev->reset_info.in_compute_reset = 0;
1959 /* Schedule hard reset only if requested and if not already in hard reset.
1960 * We keep 'in_reset' enabled, so no other reset can go in during the hard
1961 * reset schedule
1963 if (!hard_reset && hdev->reset_info.hard_reset_schedule_flags)
1966 hdev->reset_info.in_reset = 0;
1968 spin_unlock(&hdev->reset_info.lock);
1970 hdev->reset_info.needs_reset = false;
1973 dev_info(hdev->dev,
1975 dev_name(&(hdev)->pdev->dev));
1977 dev_dbg(hdev->dev,
1979 dev_name(&(hdev)->pdev->dev));
1982 hdev->reset_info.hard_reset_cnt++;
1986 /* After reset is done, we are ready to receive events from
1991 hdev->asic_funcs->enable_events_from_fw(hdev);
1994 hdev->reset_info.compute_reset_cnt++;
1997 dev_info(hdev->dev, "Performing hard reset scheduled during compute reset\n");
1998 flags = hdev->reset_info.hard_reset_schedule_flags;
1999 hdev->reset_info.hard_reset_schedule_flags = 0;
2008 hdev->disabled = true;
2010 spin_lock(&hdev->reset_info.lock);
2011 hdev->reset_info.in_compute_reset = 0;
2014 dev_err(hdev->dev,
2015 "%s Failed to reset! Device is NOT usable\n",
2016 dev_name(&(hdev)->pdev->dev));
2017 hdev->reset_info.hard_reset_cnt++;
2020 dev_err(hdev->dev, "Failed to reset device after user release\n");
2023 dev_err(hdev->dev, "Failed to do compute reset\n");
2024 hdev->reset_info.compute_reset_cnt++;
2027 spin_unlock(&hdev->reset_info.lock);
2033 hdev->reset_info.in_reset = 0;
2035 spin_unlock(&hdev->reset_info.lock);
2041 * hl_device_cond_reset() - conditionally reset the device.
2043 * @reset_flags: reset flags.
2046 * Conditionally reset the device, or alternatively schedule a watchdog work to reset the device
2047 * unless another reset precedes it.
2053 /* F/W reset cannot be postponed */
2057 /* Device release watchdog is relevant only if a user exists and gets a reset notification */
2059 dev_err(hdev->dev, "Resetting device without a reset indication to user\n");
2068 * There is no point in postponing the reset if the user is not registered for events.
2071 * case an immediate reset is not required.
2073 if (!ctx->hpriv->notifier_event.eventfd && !hdev->reset_info.watchdog_active)
2076 /* Schedule the device release watchdog work unless reset is already in progress or if the
2079 spin_lock(&hdev->reset_info.lock);
2080 if (hdev->reset_info.in_reset) {
2081 spin_unlock(&hdev->reset_info.lock);
2085 if (hdev->reset_info.watchdog_active) {
2086 hdev->device_release_watchdog_work.flags |= flags;
2090 hdev->device_release_watchdog_work.flags = flags;
2091 dev_dbg(hdev->dev, "Device is going to be hard-reset in %u sec unless being released\n",
2092 hdev->device_release_watchdog_timeout_sec);
2093 schedule_delayed_work(&hdev->device_release_watchdog_work.reset_work,
2094 secs_to_jiffies(hdev->device_release_watchdog_timeout_sec));
2095 hdev->reset_info.watchdog_active = 1;
2097 spin_unlock(&hdev->reset_info.lock);
2118 mutex_lock(¬ifier_event->lock);
2119 notifier_event->events_mask |= event_mask;
2121 if (notifier_event->eventfd)
2122 eventfd_signal(notifier_event->eventfd);
2124 mutex_unlock(¬ifier_event->lock);
2128 * hl_notifier_event_send_all - notify all user processes via eventfd
2139 dev_warn(hdev->dev, "Skip sending zero event");
2143 mutex_lock(&hdev->fpriv_list_lock);
2145 list_for_each_entry(hpriv, &hdev->fpriv_list, dev_node)
2146 hl_notifier_event_send(&hpriv->notifier_event, event_mask);
2148 mutex_unlock(&hdev->fpriv_list_lock);
2152 * hl_device_init - main initialization function for habanalabs device
2172 user_interrupt_cnt = hdev->asic_prop.user_dec_intr_count +
2173 hdev->asic_prop.user_interrupt_count;
2176 hdev->user_interrupt = kcalloc(user_interrupt_cnt, sizeof(*hdev->user_interrupt),
2178 if (!hdev->user_interrupt) {
2179 rc = -ENOMEM;
2184 if (hdev->asic_prop.first_available_cq[0] != USHRT_MAX) {
2189 rc = -ENOMEM;
2192 free_jobs_data = &hdev->user_interrupt[i].ts_free_jobs_data;
2193 free_jobs_data->free_nodes_pool = p;
2194 free_jobs_data->free_nodes_length = TIMESTAMP_FREE_NODES_NUM;
2195 free_jobs_data->next_avail_free_node_idx = 0;
2200 free_jobs_data = &hdev->common_user_cq_interrupt.ts_free_jobs_data;
2204 rc = -ENOMEM;
2208 free_jobs_data->free_nodes_pool = p;
2209 free_jobs_data->free_nodes_length = TIMESTAMP_FREE_NODES_NUM;
2210 free_jobs_data->next_avail_free_node_idx = 0;
2216 rc = hdev->asic_funcs->sw_init(hdev);
2231 dev_err(hdev->dev, "failed to initialize kernel queues\n");
2235 cq_cnt = hdev->asic_prop.completion_queues_count;
2243 hdev->completion_queue = kcalloc(cq_cnt,
2244 sizeof(*hdev->completion_queue),
2247 if (!hdev->completion_queue) {
2248 dev_err(hdev->dev,
2250 rc = -ENOMEM;
2256 rc = hl_cq_init(hdev, &hdev->completion_queue[i],
2257 hdev->asic_funcs->get_queue_id_for_cq(hdev, i));
2259 dev_err(hdev->dev,
2263 hdev->completion_queue[i].cq_idx = i;
2266 hdev->shadow_cs_queue = kcalloc(hdev->asic_prop.max_pending_cs,
2268 if (!hdev->shadow_cs_queue) {
2269 rc = -ENOMEM;
2278 rc = hl_eq_init(hdev, &hdev->event_queue);
2280 dev_err(hdev->dev, "failed to initialize event queue\n");
2284 /* MMU S/W must be initialized before kernel context is created */
2287 dev_err(hdev->dev, "Failed to initialize MMU S/W structures\n");
2292 hdev->kernel_ctx = kzalloc(sizeof(*hdev->kernel_ctx), GFP_KERNEL);
2293 if (!hdev->kernel_ctx) {
2294 rc = -ENOMEM;
2298 hdev->is_compute_ctx_active = false;
2300 hdev->asic_funcs->state_dump_init(hdev);
2302 hdev->device_release_watchdog_timeout_sec = HL_DEVICE_RELEASE_WATCHDOG_TIMEOUT_SEC;
2304 hdev->memory_scrub_val = MEM_SCRUB_DEFAULT_VAL;
2308 dev_err(hdev->dev, "failed to initialize debugfs entry structure\n");
2309 kfree(hdev->kernel_ctx);
2316 rc = hl_ctx_init(hdev, hdev->kernel_ctx, true);
2318 dev_err(hdev->dev, "failed to initialize kernel context\n");
2319 kfree(hdev->kernel_ctx);
2325 dev_err(hdev->dev, "failed to initialize CB pool\n");
2331 dev_err(hdev->dev, "Failed to initialize the decoder module\n");
2345 hdev->disabled = false;
2347 rc = hdev->asic_funcs->hw_init(hdev);
2349 dev_err(hdev->dev, "failed to initialize the H/W\n");
2355 rc = hdev->asic_funcs->test_queues(hdev);
2357 dev_err(hdev->dev, "Failed to detect if device is alive\n");
2364 dev_err(hdev->dev, "Failed late initialization\n");
2369 dev_info(hdev->dev, "Found %s device with %lluGB DRAM\n",
2370 hdev->asic_name,
2371 hdev->asic_prop.dram_size / SZ_1G);
2375 dev_err(hdev->dev, "Failed to initialize memory module\n");
2386 rc = drm_dev_register(&hdev->drm, 0);
2388 dev_err(hdev->dev, "Failed to register DRM device, rc %d\n", rc);
2395 dev_err(hdev->dev, "Failed to add char devices and sysfs/debugfs files\n");
2403 if (hdev->asic_prop.set_max_power_on_device_init &&
2404 !hdev->asic_prop.fw_security_enabled)
2410 * hwmon-related sensors the device supports.
2415 dev_err(hdev->dev, "Failed to initialize hwmon\n");
2426 dev_notice(hdev->dev,
2428 dev_name(&(hdev)->pdev->dev));
2435 hdev->asic_funcs->enable_events_from_fw(hdev);
2437 hdev->init_done = true;
2444 if (hl_ctx_put(hdev->kernel_ctx) != 1)
2445 dev_err(hdev->dev,
2452 hl_eq_fini(hdev, &hdev->event_queue);
2454 kfree(hdev->shadow_cs_queue);
2457 hl_cq_fini(hdev, &hdev->completion_queue[i]);
2458 kfree(hdev->completion_queue);
2462 hdev->asic_funcs->sw_fini(hdev);
2464 vfree(hdev->common_user_cq_interrupt.ts_free_jobs_data.free_nodes_pool);
2468 if (!hdev->user_interrupt[i].ts_free_jobs_data.free_nodes_pool)
2470 vfree(hdev->user_interrupt[i].ts_free_jobs_data.free_nodes_pool);
2472 kfree(hdev->user_interrupt);
2477 hdev->disabled = true;
2479 drm_dev_register(&hdev->drm, 0);
2484 hdev->cdev_idx, dev_name(&hdev->pdev->dev));
2490 * hl_device_fini - main tear-down function for habanalabs device
2504 dev_info(hdev->dev, "Removing device %s\n", dev_name(&(hdev)->pdev->dev));
2506 hdev->device_fini_pending = 1;
2507 flush_delayed_work(&hdev->device_reset_work.reset_work);
2509 if (hdev->pldm)
2515 * This function is competing with the reset function, so try to
2516 * take the reset atomic, and if we are already in the middle of a reset,
2517 * wait until the reset function is finished. The reset function is designed
2519 * ports, the hard reset could take between 10 and 30 seconds
2524 spin_lock(&hdev->reset_info.lock);
2525 device_in_reset = !!hdev->reset_info.in_reset;
2527 hdev->reset_info.in_reset = 1;
2528 spin_unlock(&hdev->reset_info.lock);
2533 spin_lock(&hdev->reset_info.lock);
2534 device_in_reset = !!hdev->reset_info.in_reset;
2536 hdev->reset_info.in_reset = 1;
2537 spin_unlock(&hdev->reset_info.lock);
2540 dev_crit(hdev->dev,
2541 "%s Failed to remove device because reset function did not finish\n",
2542 dev_name(&(hdev)->pdev->dev));
2547 cancel_delayed_work_sync(&hdev->device_release_watchdog_work.reset_work);
2549 /* Disable PCI access from device F/W so it won't send us additional
2550 * interrupts. We disable MSI/MSI-X at the halt_engines function and we
2552 * disable the access here because if the device is marked disabled, the
2554 * marked as disabled so this message won't be sent
2559 hdev->disabled = true;
2563 hdev->reset_info.hard_reset_pending = true;
2573 dev_info(hdev->dev,
2577 hdev->process_kill_trial_cnt = 0;
2580 dev_crit(hdev->dev, "Failed to kill all open processes (%d)\n", rc);
2584 hdev->process_kill_trial_cnt = 0;
2587 dev_crit(hdev->dev, "Failed to kill all control device open processes (%d)\n", rc);
2593 /* Reset the H/W. It will be in idle state after this returns */
2594 rc = hdev->asic_funcs->hw_fini(hdev, true, false);
2596 dev_err(hdev->dev, "hw_fini failed in device fini while removing device %d\n", rc);
2598 hdev->fw_loader.fw_comp_loaded = FW_TYPE_NONE;
2601 if ((hdev->kernel_ctx) && (hl_ctx_put(hdev->kernel_ctx) != 1))
2602 dev_err(hdev->dev, "kernel ctx is still alive\n");
2610 vfree(hdev->captured_err_info.page_fault_info.user_mappings);
2612 hl_eq_fini(hdev, &hdev->event_queue);
2614 kfree(hdev->shadow_cs_queue);
2616 for (i = 0 ; i < hdev->asic_prop.completion_queues_count ; i++)
2617 hl_cq_fini(hdev, &hdev->completion_queue[i]);
2618 kfree(hdev->completion_queue);
2620 user_interrupt_cnt = hdev->asic_prop.user_dec_intr_count +
2621 hdev->asic_prop.user_interrupt_count;
2624 if (hdev->asic_prop.first_available_cq[0] != USHRT_MAX) {
2626 vfree(hdev->user_interrupt[i].ts_free_jobs_data.free_nodes_pool);
2629 kfree(hdev->user_interrupt);
2632 vfree(hdev->common_user_cq_interrupt.ts_free_jobs_data.free_nodes_pool);
2637 hdev->asic_funcs->sw_fini(hdev);
2643 drm_dev_unregister(&hdev->drm);
2655 * hl_rreg - Read an MMIO register
2665 u32 val = readl(hdev->rmmio + reg);
2668 trace_habanalabs_rreg32(&(hdev)->pdev->dev, reg, val);
2674 * hl_wreg - Write to an MMIO register
2678 * @val: 32-bit value
2680 * Writes the 32-bit value into the MMIO register
2686 trace_habanalabs_wreg32(&(hdev)->pdev->dev, reg, val);
2688 writel(val, hdev->rmmio + reg);
2694 struct razwi_info *razwi_info = &hdev->captured_err_info.razwi_info;
2697 dev_err(hdev->dev,
2704 if (atomic_cmpxchg(&hdev->captured_err_info.razwi_info.razwi_detected, 0, 1))
2707 razwi_info->razwi.timestamp = ktime_to_ns(ktime_get());
2708 razwi_info->razwi.addr = addr;
2709 razwi_info->razwi.num_of_possible_engines = num_of_engines;
2710 memcpy(&razwi_info->razwi.engine_id[0], &engine_id[0],
2712 razwi_info->razwi.flags = flags;
2714 razwi_info->razwi_info_available = true;
2728 struct page_fault_info *pgf_info = &hdev->captured_err_info.page_fault_info;
2737 /* Reset previous session count */
2738 pgf_info->num_of_user_mappings = 0;
2742 dev_err(hdev->dev, "Can't get user context for user mappings\n");
2746 mutex_lock(&ctx->mem_hash_lock);
2747 hash_for_each(ctx->mem_hash, i, hnode, node) {
2748 vm_type = hnode->ptr;
2751 pgf_info->num_of_user_mappings++;
2755 if (!pgf_info->num_of_user_mappings)
2761 vfree(pgf_info->user_mappings);
2762 pgf_info->user_mappings =
2763 vzalloc(pgf_info->num_of_user_mappings * sizeof(struct hl_user_mapping));
2764 if (!pgf_info->user_mappings) {
2765 pgf_info->num_of_user_mappings = 0;
2769 hash_for_each(ctx->mem_hash, i, hnode, node) {
2770 vm_type = hnode->ptr;
2772 userptr = hnode->ptr;
2773 pgf_info->user_mappings[map_idx].dev_va = hnode->vaddr;
2774 pgf_info->user_mappings[map_idx].size = userptr->size;
2777 phys_pg_pack = hnode->ptr;
2778 pgf_info->user_mappings[map_idx].dev_va = hnode->vaddr;
2779 pgf_info->user_mappings[map_idx].size = phys_pg_pack->total_size;
2784 mutex_unlock(&ctx->mem_hash_lock);
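The capture above is two passes under one lock: count the mappings, size a buffer from the count, then fill it. A sketch with a plain array standing in for the mem hash and calloc standing in for vzalloc:

#include <stdio.h>
#include <stdlib.h>

struct mapping { unsigned long long dev_va; size_t size; };

int main(void)
{
	struct mapping live[3] = { {0x1000, 4096}, {0x5000, 8192}, {0x9000, 4096} };
	size_t n = 0, i;
	struct mapping *snap;

	for (i = 0; i < 3; i++)			/* pass 1: count */
		n++;

	snap = calloc(n, sizeof(*snap));	/* vzalloc() in the driver */
	if (!snap)
		return 1;

	for (i = 0; i < n; i++)			/* pass 2: fill */
		snap[i] = live[i];

	printf("captured %zu mappings, first at %#llx\n", n, snap[0].dev_va);
	free(snap);
	return 0;
}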
2790 struct page_fault_info *pgf_info = &hdev->captured_err_info.page_fault_info;
2793 if (atomic_cmpxchg(&pgf_info->page_fault_detected, 0, 1))
2796 pgf_info->page_fault.timestamp = ktime_to_ns(ktime_get());
2797 pgf_info->page_fault.addr = addr;
2798 pgf_info->page_fault.engine_id = eng_id;
2801 pgf_info->page_fault_info_available = true;
2815 struct hw_err_info *info = &hdev->captured_err_info.hw_err;
2818 if (atomic_cmpxchg(&info->event_detected, 0, 1))
2821 info->event.timestamp = ktime_to_ns(ktime_get());
2822 info->event.event_id = event_id;
2824 info->event_info_available = true;
2837 struct fw_err_info *info = &hdev->captured_err_info.fw_err;
2840 if (atomic_cmpxchg(&info->event_detected, 0, 1))
2843 info->event.timestamp = ktime_to_ns(ktime_get());
2844 info->event.err_type = fw_info->err_type;
2845 if (fw_info->err_type == HL_INFO_FW_REPORTED_ERR)
2846 info->event.event_id = fw_info->event_id;
2848 info->event_info_available = true;
2855 if (info->event_mask)
2856 *info->event_mask |= HL_NOTIFIER_EVENT_CRITICL_FW_ERR;
2861 struct engine_err_info *info = &hdev->captured_err_info.engine_err;
2864 if (atomic_cmpxchg(&info->event_detected, 0, 1))
2867 info->event.timestamp = ktime_to_ns(ktime_get());
2868 info->event.engine_id = engine_id;
2869 info->event.error_count = error_count;
2870 info->event_info_available = true;
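Each capture above is gated by atomic_cmpxchg so only the first event of a kind is recorded until the flag is cleared. A C11-atomics sketch of the same capture-once gate:

#include <stdatomic.h>
#include <stdio.h>

static atomic_int event_detected;	/* zero-initialized at file scope */
static unsigned int captured_id;

static void capture_event(unsigned int id)
{
	int expected = 0;

	/* only the 0 -> 1 transition wins; later reporters bail out */
	if (!atomic_compare_exchange_strong(&event_detected, &expected, 1))
		return;
	captured_id = id;
}

int main(void)
{
	capture_event(17);
	capture_event(99);	/* ignored: info for event 17 is preserved */
	printf("captured event id %u\n", captured_id);
	return 0;
}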
2875 vfree(captured_err_info->page_fault_info.user_mappings);
2877 atomic_set(&captured_err_info->cs_timeout.write_enable, 1);
2878 captured_err_info->undef_opcode.write_enable = true;
2884 struct cpumask *available_mask = &hdev->irq_affinity_mask;
2885 int numa_node = hdev->pdev->dev.numa_node, i;
2892 dev_err(hdev->dev, "No available affinities in current numa node\n");
2904 if (cpumask_empty(&hdev->irq_affinity_mask)) {
2905 dev_dbg(hdev->dev, "affinity mask is empty\n");
2909 if (irq_set_affinity_and_hint(irq, &hdev->irq_affinity_mask))
2910 dev_err(hdev->dev, "Failed setting irq %d affinity\n", irq);
2915 hdev->heartbeat_debug_info.heartbeat_event_counter++;
2916 hdev->heartbeat_debug_info.last_eq_heartbeat_ts = ktime_get_real_seconds();
2917 hdev->eq_heartbeat_received = true;
2922 struct hl_clk_throttle *clk_throttle = &hdev->clk_throttling;
2925 mutex_lock(&clk_throttle->lock);
2929 clk_throttle->current_reason |= HL_CLK_THROTTLE_POWER;
2930 clk_throttle->aggregated_reason |= HL_CLK_THROTTLE_POWER;
2931 clk_throttle->timestamp[HL_CLK_THROTTLE_TYPE_POWER].start = ktime_get();
2932 clk_throttle->timestamp[HL_CLK_THROTTLE_TYPE_POWER].end = zero_time;
2933 dev_dbg_ratelimited(hdev->dev, "Clock throttling due to power consumption\n");
2937 clk_throttle->current_reason &= ~HL_CLK_THROTTLE_POWER;
2938 clk_throttle->timestamp[HL_CLK_THROTTLE_TYPE_POWER].end = ktime_get();
2939 dev_dbg_ratelimited(hdev->dev, "Power envelope is safe, back to optimal clock\n");
2943 clk_throttle->current_reason |= HL_CLK_THROTTLE_THERMAL;
2944 clk_throttle->aggregated_reason |= HL_CLK_THROTTLE_THERMAL;
2945 clk_throttle->timestamp[HL_CLK_THROTTLE_TYPE_THERMAL].start = ktime_get();
2946 clk_throttle->timestamp[HL_CLK_THROTTLE_TYPE_THERMAL].end = zero_time;
2948 dev_info_ratelimited(hdev->dev, "Clock throttling due to overheating\n");
2952 clk_throttle->current_reason &= ~HL_CLK_THROTTLE_THERMAL;
2953 clk_throttle->timestamp[HL_CLK_THROTTLE_TYPE_THERMAL].end = ktime_get();
2955 dev_info_ratelimited(hdev->dev, "Thermal envelope is safe, back to optimal clock\n");
2959 dev_err(hdev->dev, "Received invalid clock change event %d\n", event_type);
2963 mutex_unlock(&clk_throttle->lock);
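The throttling handler above keeps a live reason bitmask, an aggregated one, and per-reason start/end timestamps. A sketch of that bookkeeping with illustrative bit values and plain time_t in place of ktime:

#include <stdio.h>
#include <time.h>

#define THROTTLE_POWER   (1u << 0)
#define THROTTLE_THERMAL (1u << 1)

struct throttle {
	unsigned int current_reason, aggregated_reason;
	time_t start[2], end[2];
};

static void begin(struct throttle *t, unsigned int bit, int idx)
{
	t->current_reason |= bit;	/* active right now */
	t->aggregated_reason |= bit;	/* seen since last clear */
	t->start[idx] = time(NULL);
	t->end[idx] = 0;		/* open interval */
}

static void finish(struct throttle *t, unsigned int bit, int idx)
{
	t->current_reason &= ~bit;
	t->end[idx] = time(NULL);	/* close the interval */
}

int main(void)
{
	struct throttle t = { 0 };

	begin(&t, THROTTLE_POWER, 0);
	finish(&t, THROTTLE_POWER, 0);
	printf("current=%#x aggregated=%#x\n", t.current_reason, t.aggregated_reason);
	return 0;
}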