Lines matching: max-heartbeat-sec
1 // SPDX-License-Identifier: GPL-2.0
4 * Copyright 2016-2022 HabanaLabs, Ltd.
36 * hl_set_dram_bar - sets the BAR to allow later access to address
54 struct asic_fixed_properties *prop = &hdev->asic_prop; in hl_set_dram_bar()
57 if (is_power_of_2(prop->dram_pci_bar_size)) in hl_set_dram_bar()
58 bar_base_addr = addr & ~(prop->dram_pci_bar_size - 0x1ull); in hl_set_dram_bar()
60 bar_base_addr = region->region_base + in hl_set_dram_bar()
61 div64_u64((addr - region->region_base), prop->dram_pci_bar_size) * in hl_set_dram_bar()
62 prop->dram_pci_bar_size; in hl_set_dram_bar()
64 old_base = hdev->asic_funcs->set_dram_bar_base(hdev, bar_base_addr); in hl_set_dram_bar()
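The two branches above compute a BAR-aligned base for the requested address: when the BAR size is a power of two, a mask rounds down to the window start; otherwise the window index is found by 64-bit division from the region base (div64_u64() in the kernel). A minimal standalone sketch of the same arithmetic; the sample addresses and sizes are hypothetical:

    #include <stdint.h>
    #include <stdio.h>

    /* Round 'addr' down to the start of its BAR-sized window. */
    static uint64_t bar_base_pow2(uint64_t addr, uint64_t bar_size)
    {
            return addr & ~(bar_size - 1);  /* valid only for power-of-2 sizes */
    }

    static uint64_t bar_base_generic(uint64_t addr, uint64_t region_base, uint64_t bar_size)
    {
            /* count whole windows from the region base, then scale back up */
            return region_base + ((addr - region_base) / bar_size) * bar_size;
    }

    int main(void)
    {
            uint64_t addr = 0x1000500000ULL;        /* hypothetical device address */

            printf("%#llx\n", (unsigned long long)bar_base_pow2(addr, 0x100000000ULL));
            printf("%#llx\n", (unsigned long long)bar_base_generic(addr, 0x1000000000ULL, 0xC0000000ULL));
            return 0;
    }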
76 struct pci_mem_region *region = &hdev->pci_mem_region[region_type]; in hl_access_sram_dram_region()
77 u64 old_base = 0, rc, bar_region_base = region->region_base; in hl_access_sram_dram_region()
83 return -EIO; in hl_access_sram_dram_region()
86 acc_addr = hdev->pcie_bar[region->bar_id] + region->offset_in_bar + in hl_access_sram_dram_region()
87 (addr - bar_region_base); in hl_access_sram_dram_region()
113 return -EIO; in hl_access_sram_dram_region()
127 ptr = hdev->asic_funcs->asic_dma_alloc_coherent(hdev, size, dma_handle, flag); in hl_dma_alloc_common()
130 ptr = hdev->asic_funcs->asic_dma_pool_zalloc(hdev, size, flag, dma_handle); in hl_dma_alloc_common()
135 trace_habanalabs_dma_alloc(&(hdev)->pdev->dev, (u64) (uintptr_t) ptr, *dma_handle, in hl_dma_alloc_common()
150 hdev->asic_funcs->asic_dma_free_coherent(hdev, size, cpu_addr, dma_handle); in hl_asic_dma_free_common()
153 hdev->asic_funcs->asic_dma_pool_free(hdev, cpu_addr, dma_handle); in hl_asic_dma_free_common()
157 trace_habanalabs_dma_free(&(hdev)->pdev->dev, store_cpu_addr, dma_handle, size, caller); in hl_asic_dma_free_common()
186 return hdev->asic_funcs->cpu_accessible_dma_pool_alloc(hdev, size, dma_handle); in hl_cpu_accessible_dma_pool_alloc()
191 hdev->asic_funcs->cpu_accessible_dma_pool_free(hdev, size, vaddr); in hl_cpu_accessible_dma_pool_free()
197 struct asic_fixed_properties *prop = &hdev->asic_prop; in hl_dma_map_sgtable_caller()
201 rc = hdev->asic_funcs->dma_map_sgtable(hdev, sgt, dir); in hl_dma_map_sgtable_caller()
209 trace_habanalabs_dma_map_page(&(hdev)->pdev->dev, in hl_dma_map_sgtable_caller()
211 sg->dma_address - prop->device_dma_offset_for_host_access, in hl_dma_map_sgtable_caller()
213 sg->dma_length, in hl_dma_map_sgtable_caller()
215 sg->length, in hl_dma_map_sgtable_caller()
225 struct asic_fixed_properties *prop = &hdev->asic_prop; in hl_asic_dma_map_sgtable()
229 rc = dma_map_sgtable(&hdev->pdev->dev, sgt, dir, 0); in hl_asic_dma_map_sgtable()
234 if (prop->device_dma_offset_for_host_access) in hl_asic_dma_map_sgtable()
236 sg->dma_address += prop->device_dma_offset_for_host_access; in hl_asic_dma_map_sgtable()
244 struct asic_fixed_properties *prop = &hdev->asic_prop; in hl_dma_unmap_sgtable_caller()
248 hdev->asic_funcs->dma_unmap_sgtable(hdev, sgt, dir); in hl_dma_unmap_sgtable_caller()
252 trace_habanalabs_dma_unmap_page(&(hdev)->pdev->dev, in hl_dma_unmap_sgtable_caller()
254 sg->dma_address - prop->device_dma_offset_for_host_access, in hl_dma_unmap_sgtable_caller()
256 sg->dma_length, in hl_dma_unmap_sgtable_caller()
258 sg->length, in hl_dma_unmap_sgtable_caller()
267 struct asic_fixed_properties *prop = &hdev->asic_prop; in hl_asic_dma_unmap_sgtable()
272 if (prop->device_dma_offset_for_host_access) in hl_asic_dma_unmap_sgtable()
274 sg->dma_address -= prop->device_dma_offset_for_host_access; in hl_asic_dma_unmap_sgtable()
276 dma_unmap_sgtable(&hdev->pdev->dev, sgt, dir, 0); in hl_asic_dma_unmap_sgtable()
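hl_asic_dma_map_sgtable() and hl_asic_dma_unmap_sgtable() are symmetric around prop->device_dma_offset_for_host_access: after mapping, the offset is added to every DMA address so the device reaches host memory through its own aperture, and it is subtracted again before the real dma_unmap_sgtable() call. A sketch of that pattern over a mapped scatter-gather table; shift_sgtable_dma() is an illustrative helper name, not a driver function:

    #include <linux/scatterlist.h>

    /* Shift all DMA addresses of an already-mapped sgtable by a device-side
     * offset; called with add=true after dma_map_sgtable() and add=false
     * just before dma_unmap_sgtable().
     */
    static void shift_sgtable_dma(struct sg_table *sgt, u64 offset, bool add)
    {
            struct scatterlist *sg;
            int i;

            if (!offset)
                    return;

            for_each_sgtable_dma_sg(sgt, sg, i) {
                    if (add)
                            sg->dma_address += offset;
                    else
                            sg->dma_address -= offset;
            }
    }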
280 * hl_access_cfg_region - access the config region
290 struct pci_mem_region *cfg_region = &hdev->pci_mem_region[PCI_REGION_CFG]; in hl_access_cfg_region()
294 dev_err(hdev->dev, "address %#llx not a multiple of %zu\n", addr, sizeof(u32)); in hl_access_cfg_region()
295 return -EINVAL; in hl_access_cfg_region()
300 *val = RREG32(addr - cfg_region->region_base); in hl_access_cfg_region()
303 WREG32(addr - cfg_region->region_base, *val); in hl_access_cfg_region()
306 val_l = RREG32(addr - cfg_region->region_base); in hl_access_cfg_region()
307 val_h = RREG32(addr + sizeof(u32) - cfg_region->region_base); in hl_access_cfg_region()
312 WREG32(addr - cfg_region->region_base, lower_32_bits(*val)); in hl_access_cfg_region()
313 WREG32(addr + sizeof(u32) - cfg_region->region_base, upper_32_bits(*val)); in hl_access_cfg_region()
316 dev_err(hdev->dev, "access type %d is not supported\n", acc_type); in hl_access_cfg_region()
317 return -EOPNOTSUPP; in hl_access_cfg_region()
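Each access above converts the absolute device address into a register offset by subtracting the region base, and 64-bit accesses are composed from two 32-bit ones. A sketch of the 64-bit composition, assuming the driver's RREG32/WREG32 macros (which expand against the local hdev) and the kernel's lower_32_bits()/upper_32_bits() helpers:

    /* Read/write a 64-bit value through two 32-bit config-region accesses.
     * 'base' is the config region base address.
     */
    static u64 cfg_read64(struct hl_device *hdev, u64 addr, u64 base)
    {
            u32 lo = RREG32(addr - base);
            u32 hi = RREG32(addr + sizeof(u32) - base);

            return ((u64)hi << 32) | lo;
    }

    static void cfg_write64(struct hl_device *hdev, u64 addr, u64 base, u64 val)
    {
            WREG32(addr - base, lower_32_bits(val));
            WREG32(addr + sizeof(u32) - base, upper_32_bits(val));
    }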
324 * hl_access_dev_mem - access device memory
343 return -EFAULT; in hl_access_dev_mem()
361 if ((e->actual_size + str_size) < e->allocated_buf_size) { in hl_engine_data_sprintf()
363 vsnprintf(e->buf + e->actual_size, str_size, fmt, args); in hl_engine_data_sprintf()
370 e->actual_size += str_size; in hl_engine_data_sprintf()
377 if (hdev->device_fini_pending) { in hl_device_status()
379 } else if (hdev->reset_info.in_reset) { in hl_device_status()
380 if (hdev->reset_info.in_compute_reset) in hl_device_status()
384 } else if (hdev->reset_info.needs_reset) { in hl_device_status()
386 } else if (hdev->disabled) { in hl_device_status()
388 } else if (!hdev->init_done) { in hl_device_status()
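hl_device_status() collapses several independent flags into one status value with a strict priority order: fini pending first, then in reset (compute vs. full), then needs reset, disabled, and not-yet-initialized, falling through to operational. A sketch of that ladder; the exact HL_DEVICE_STATUS_* value returned per branch is reconstructed from the flag names here and should be treated as illustrative:

    static enum hl_device_status status_of(struct hl_device *hdev)
    {
            if (hdev->device_fini_pending)
                    return HL_DEVICE_STATUS_MALFUNCTION;
            if (hdev->reset_info.in_reset)
                    return hdev->reset_info.in_compute_reset ?
                           HL_DEVICE_STATUS_IN_RESET_AFTER_DEVICE_RELEASE :
                           HL_DEVICE_STATUS_IN_RESET;
            if (hdev->reset_info.needs_reset)
                    return HL_DEVICE_STATUS_NEEDS_RESET;
            if (hdev->disabled)
                    return HL_DEVICE_STATUS_MALFUNCTION;
            if (!hdev->init_done)
                    return HL_DEVICE_STATUS_IN_DEVICE_CREATION;
            return HL_DEVICE_STATUS_OPERATIONAL;
    }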
445 dev_err(hdev->dev, "%s %s (mask %#llx_%016llx_%016llx_%016llx)\n", in print_idle_status_mask()
446 dev_name(&hdev->pdev->dev), message, in print_idle_status_mask()
449 dev_err(hdev->dev, "%s %s (mask %#llx_%016llx_%016llx)\n", in print_idle_status_mask()
450 dev_name(&hdev->pdev->dev), message, in print_idle_status_mask()
453 dev_err(hdev->dev, "%s %s (mask %#llx_%016llx)\n", in print_idle_status_mask()
454 dev_name(&hdev->pdev->dev), message, idle_mask[1], idle_mask[0]); in print_idle_status_mask()
456 dev_err(hdev->dev, "%s %s (mask %#llx)\n", dev_name(&hdev->pdev->dev), message, in print_idle_status_mask()
469 hdev = hpriv->hdev; in hpriv_release()
471 hdev->asic_funcs->send_device_activity(hdev, false); in hpriv_release()
475 mutex_destroy(&hpriv->ctx_lock); in hpriv_release()
476 mutex_destroy(&hpriv->restore_phase_mutex); in hpriv_release()
479 hl_mem_mgr_idr_destroy(&hpriv->mem_mgr); in hpriv_release()
481 /* Device should be reset if reset-upon-device-release is enabled, or if there is a pending in hpriv_release()
484 reset_device = hdev->reset_upon_device_release || hdev->reset_info.watchdog_active; in hpriv_release()
489 if (!hdev->reset_info.in_reset && !reset_device && !hdev->pldm) in hpriv_release()
490 device_is_idle = hdev->asic_funcs->is_device_idle(hdev, idle_mask, in hpriv_release()
507 mutex_lock(&hdev->fpriv_list_lock); in hpriv_release()
508 list_del(&hpriv->dev_node); in hpriv_release()
509 mutex_unlock(&hdev->fpriv_list_lock); in hpriv_release()
511 put_pid(hpriv->taskpid); in hpriv_release()
517 int rc = hdev->asic_funcs->scrub_device_mem(hdev); in hpriv_release()
520 dev_err(hdev->dev, "failed to scrub memory from hpriv release (%d)\n", rc); in hpriv_release()
529 mutex_lock(&hdev->fpriv_list_lock); in hpriv_release()
530 hdev->is_compute_ctx_active = false; in hpriv_release()
531 mutex_unlock(&hdev->fpriv_list_lock); in hpriv_release()
533 hdev->compute_ctx_in_release = 0; in hpriv_release()
536 if (hpriv->notifier_event.eventfd) in hpriv_release()
537 eventfd_ctx_put(hpriv->notifier_event.eventfd); in hpriv_release()
539 mutex_destroy(&hpriv->notifier_event.lock); in hpriv_release()
546 kref_get(&hpriv->refcount); in hl_hpriv_get()
551 return kref_put(&hpriv->refcount, hpriv_release); in hl_hpriv_put()
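hl_hpriv_get()/hl_hpriv_put() wrap the standard kref pattern: the last hl_hpriv_put() invokes hpriv_release(), which tears down the private data and may reset the device. A generic sketch of that lifetime idiom, with a placeholder payload type:

    #include <linux/kref.h>
    #include <linux/slab.h>

    struct obj {
            struct kref refcount;
            /* ... payload ... */
    };

    static void obj_release(struct kref *kref)
    {
            struct obj *o = container_of(kref, struct obj, refcount);

            kfree(o);       /* hpriv_release() additionally idles/resets the device */
    }

    /* kref_put() returns 1 when this call dropped the last reference. */
    static int obj_put(struct obj *o)
    {
            return kref_put(&o->refcount, obj_release);
    }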
569 offset += scnprintf(buf + offset, size - offset, " [%u active CS]", active_cs_num); in print_device_in_use_info()
572 dmabuf_export_cnt = atomic_read(&hdev->dmabuf_export_cnt); in print_device_in_use_info()
575 offset += scnprintf(buf + offset, size - offset, " [%u exported dma-buf]", in print_device_in_use_info()
579 if (mm_fini_stats->n_busy_cb) { in print_device_in_use_info()
581 offset += scnprintf(buf + offset, size - offset, " [%u live CB handles]", in print_device_in_use_info()
582 mm_fini_stats->n_busy_cb); in print_device_in_use_info()
586 scnprintf(buf + offset, size - offset, " [unknown reason]"); in print_device_in_use_info()
588 dev_notice(hdev->dev, "%s%s\n", message, buf); in print_device_in_use_info()
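print_device_in_use_info() accumulates its reasons with scnprintf(buf + offset, size - offset, ...), which is overflow-safe because scnprintf() returns the number of characters actually written, never more than the remaining space. A condensed sketch of that accumulation; build_reason() and its parameters are illustrative:

    #include <linux/kernel.h>

    /* Build a "why the device is still busy" string from sampled counters. */
    static void build_reason(char *buf, size_t size, u32 active_cs, u32 dmabufs)
    {
            int off = 0;

            if (active_cs)
                    off += scnprintf(buf + off, size - off, " [%u active CS]", active_cs);
            if (dmabufs)
                    off += scnprintf(buf + off, size - off, " [%u exported dma-buf]", dmabufs);
            if (!off)
                    scnprintf(buf, size, " [unknown reason]");
    }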
592 * hl_device_release() - release function for habanalabs device.
600 struct hl_fpriv *hpriv = file_priv->driver_priv; in hl_device_release()
606 put_pid(hpriv->taskpid); in hl_device_release()
609 hl_ctx_mgr_fini(hdev, &hpriv->ctx_mgr); in hl_device_release()
614 hl_mem_mgr_fini(&hpriv->mem_mgr, &mm_fini_stats); in hl_device_release()
616 hdev->compute_ctx_in_release = 1; in hl_device_release()
624 hdev->last_open_session_duration_jif = jiffies - hdev->last_successful_open_jif; in hl_device_release()
629 struct hl_fpriv *hpriv = filp->private_data; in hl_device_release_ctrl()
630 struct hl_device *hdev = hpriv->hdev; in hl_device_release_ctrl()
632 filp->private_data = NULL; in hl_device_release_ctrl()
639 mutex_lock(&hdev->fpriv_ctrl_list_lock); in hl_device_release_ctrl()
640 list_del(&hpriv->dev_node); in hl_device_release_ctrl()
641 mutex_unlock(&hdev->fpriv_ctrl_list_lock); in hl_device_release_ctrl()
643 put_pid(hpriv->taskpid); in hl_device_release_ctrl()
652 struct hl_device *hdev = hpriv->hdev; in __hl_mmap()
657 return -ENODEV; in __hl_mmap()
660 vm_pgoff = vma->vm_pgoff; in __hl_mmap()
664 vma->vm_pgoff = HL_MMAP_OFFSET_VALUE_GET(vm_pgoff); in __hl_mmap()
669 return hl_mem_mgr_mmap(&hpriv->mem_mgr, vma, NULL); in __hl_mmap()
671 return -EINVAL; in __hl_mmap()
675 * hl_mmap - mmap function for habanalabs device
685 struct drm_file *file_priv = filp->private_data; in hl_mmap()
686 struct hl_fpriv *hpriv = file_priv->driver_priv; in hl_mmap()
705 * device_init_cdev - Initialize cdev and device for habanalabs device
723 cdev->owner = THIS_MODULE; in device_init_cdev()
727 return -ENOMEM; in device_init_cdev()
730 (*dev)->devt = MKDEV(hdev->major, minor); in device_init_cdev()
731 (*dev)->class = class; in device_init_cdev()
732 (*dev)->release = device_release_func; in device_init_cdev()
741 const struct class *accel_class = hdev->drm.accel->kdev->class; in cdev_sysfs_debugfs_add()
745 hdev->cdev_idx = hdev->drm.accel->index; in cdev_sysfs_debugfs_add()
748 snprintf(name, sizeof(name), "accel_controlD%d", hdev->cdev_idx); in cdev_sysfs_debugfs_add()
749 rc = device_init_cdev(hdev, accel_class, hdev->cdev_idx, &hl_ctrl_ops, name, in cdev_sysfs_debugfs_add()
750 &hdev->cdev_ctrl, &hdev->dev_ctrl); in cdev_sysfs_debugfs_add()
754 rc = cdev_device_add(&hdev->cdev_ctrl, hdev->dev_ctrl); in cdev_sysfs_debugfs_add()
756 dev_err(hdev->dev_ctrl, in cdev_sysfs_debugfs_add()
763 dev_err(hdev->dev, "failed to initialize sysfs\n"); in cdev_sysfs_debugfs_add()
769 hdev->cdev_sysfs_debugfs_created = true; in cdev_sysfs_debugfs_add()
774 cdev_device_del(&hdev->cdev_ctrl, hdev->dev_ctrl); in cdev_sysfs_debugfs_add()
776 put_device(hdev->dev_ctrl); in cdev_sysfs_debugfs_add()
782 if (!hdev->cdev_sysfs_debugfs_created) in cdev_sysfs_debugfs_remove()
787 cdev_device_del(&hdev->cdev_ctrl, hdev->dev_ctrl); in cdev_sysfs_debugfs_remove()
788 put_device(hdev->dev_ctrl); in cdev_sysfs_debugfs_remove()
795 struct hl_device *hdev = device_reset_work->hdev; in device_hard_reset_pending()
799 flags = device_reset_work->flags | HL_DRV_RESET_FROM_RESET_THR; in device_hard_reset_pending()
803 if ((rc == -EBUSY) && !hdev->device_fini_pending) { in device_hard_reset_pending()
810 dev_info(hdev->dev, in device_hard_reset_pending()
812 kref_read(&ctx->refcount) - 1, HL_PENDING_RESET_PER_SEC); in device_hard_reset_pending()
815 dev_info(hdev->dev, "Could not reset device. Will try again in %u seconds", in device_hard_reset_pending()
819 queue_delayed_work(hdev->reset_wq, &device_reset_work->reset_work, in device_hard_reset_pending()
828 struct hl_device *hdev = watchdog_work->hdev; in device_release_watchdog_func()
831 dev_dbg(hdev->dev, "Device wasn't released in time. Initiate hard-reset.\n"); in device_release_watchdog_func()
833 flags = watchdog_work->flags | HL_DRV_RESET_HARD | HL_DRV_RESET_FROM_WD_THR; in device_release_watchdog_func()
839 * device_early_init - do some early initialization for the habanalabs device
851 switch (hdev->asic_type) { in device_early_init()
854 strscpy(hdev->asic_name, "GOYA", sizeof(hdev->asic_name)); in device_early_init()
858 strscpy(hdev->asic_name, "GAUDI", sizeof(hdev->asic_name)); in device_early_init()
862 strscpy(hdev->asic_name, "GAUDI SEC", sizeof(hdev->asic_name)); in device_early_init()
866 strscpy(hdev->asic_name, "GAUDI2", sizeof(hdev->asic_name)); in device_early_init()
870 strscpy(hdev->asic_name, "GAUDI2B", sizeof(hdev->asic_name)); in device_early_init()
874 strscpy(hdev->asic_name, "GAUDI2C", sizeof(hdev->asic_name)); in device_early_init()
878 strscpy(hdev->asic_name, "GAUDI2D", sizeof(hdev->asic_name)); in device_early_init()
881 dev_err(hdev->dev, "Unrecognized ASIC type %d\n", in device_early_init()
882 hdev->asic_type); in device_early_init()
883 return -EINVAL; in device_early_init()
886 rc = hdev->asic_funcs->early_init(hdev); in device_early_init()
894 if (hdev->asic_prop.completion_queues_count) { in device_early_init()
895 hdev->cq_wq = kcalloc(hdev->asic_prop.completion_queues_count, in device_early_init()
898 if (!hdev->cq_wq) { in device_early_init()
899 rc = -ENOMEM; in device_early_init()
904 for (i = 0 ; i < hdev->asic_prop.completion_queues_count ; i++) { in device_early_init()
905 snprintf(workq_name, 32, "hl%u-free-jobs-%u", hdev->cdev_idx, (u32) i); in device_early_init()
906 hdev->cq_wq[i] = create_singlethread_workqueue(workq_name); in device_early_init()
907 if (hdev->cq_wq[i] == NULL) { in device_early_init()
908 dev_err(hdev->dev, "Failed to allocate CQ workqueue\n"); in device_early_init()
909 rc = -ENOMEM; in device_early_init()
914 snprintf(workq_name, 32, "hl%u-events", hdev->cdev_idx); in device_early_init()
915 hdev->eq_wq = create_singlethread_workqueue(workq_name); in device_early_init()
916 if (hdev->eq_wq == NULL) { in device_early_init()
917 dev_err(hdev->dev, "Failed to allocate EQ workqueue\n"); in device_early_init()
918 rc = -ENOMEM; in device_early_init()
922 snprintf(workq_name, 32, "hl%u-cs-completions", hdev->cdev_idx); in device_early_init()
923 hdev->cs_cmplt_wq = alloc_workqueue(workq_name, WQ_UNBOUND, 0); in device_early_init()
924 if (!hdev->cs_cmplt_wq) { in device_early_init()
925 dev_err(hdev->dev, in device_early_init()
927 rc = -ENOMEM; in device_early_init()
931 snprintf(workq_name, 32, "hl%u-ts-free-obj", hdev->cdev_idx); in device_early_init()
932 hdev->ts_free_obj_wq = alloc_workqueue(workq_name, WQ_UNBOUND, 0); in device_early_init()
933 if (!hdev->ts_free_obj_wq) { in device_early_init()
934 dev_err(hdev->dev, in device_early_init()
936 rc = -ENOMEM; in device_early_init()
940 snprintf(workq_name, 32, "hl%u-prefetch", hdev->cdev_idx); in device_early_init()
941 hdev->prefetch_wq = alloc_workqueue(workq_name, WQ_UNBOUND, 0); in device_early_init()
942 if (!hdev->prefetch_wq) { in device_early_init()
943 dev_err(hdev->dev, "Failed to allocate MMU prefetch workqueue\n"); in device_early_init()
944 rc = -ENOMEM; in device_early_init()
948 hdev->hl_chip_info = kzalloc(sizeof(struct hwmon_chip_info), GFP_KERNEL); in device_early_init()
949 if (!hdev->hl_chip_info) { in device_early_init()
950 rc = -ENOMEM; in device_early_init()
958 hl_mem_mgr_init(hdev->dev, &hdev->kernel_mem_mgr); in device_early_init()
960 snprintf(workq_name, 32, "hl%u_device_reset", hdev->cdev_idx); in device_early_init()
961 hdev->reset_wq = create_singlethread_workqueue(workq_name); in device_early_init()
962 if (!hdev->reset_wq) { in device_early_init()
963 rc = -ENOMEM; in device_early_init()
964 dev_err(hdev->dev, "Failed to create device reset WQ\n"); in device_early_init()
968 INIT_DELAYED_WORK(&hdev->work_heartbeat, hl_device_heartbeat); in device_early_init()
970 INIT_DELAYED_WORK(&hdev->device_reset_work.reset_work, device_hard_reset_pending); in device_early_init()
971 hdev->device_reset_work.hdev = hdev; in device_early_init()
972 hdev->device_fini_pending = 0; in device_early_init()
974 INIT_DELAYED_WORK(&hdev->device_release_watchdog_work.reset_work, in device_early_init()
976 hdev->device_release_watchdog_work.hdev = hdev; in device_early_init()
978 mutex_init(&hdev->send_cpu_message_lock); in device_early_init()
979 mutex_init(&hdev->debug_lock); in device_early_init()
980 INIT_LIST_HEAD(&hdev->cs_mirror_list); in device_early_init()
981 spin_lock_init(&hdev->cs_mirror_lock); in device_early_init()
982 spin_lock_init(&hdev->reset_info.lock); in device_early_init()
983 INIT_LIST_HEAD(&hdev->fpriv_list); in device_early_init()
984 INIT_LIST_HEAD(&hdev->fpriv_ctrl_list); in device_early_init()
985 mutex_init(&hdev->fpriv_list_lock); in device_early_init()
986 mutex_init(&hdev->fpriv_ctrl_list_lock); in device_early_init()
987 mutex_init(&hdev->clk_throttling.lock); in device_early_init()
992 hl_mem_mgr_fini(&hdev->kernel_mem_mgr, NULL); in device_early_init()
993 hl_mem_mgr_idr_destroy(&hdev->kernel_mem_mgr); in device_early_init()
995 kfree(hdev->hl_chip_info); in device_early_init()
997 destroy_workqueue(hdev->prefetch_wq); in device_early_init()
999 destroy_workqueue(hdev->ts_free_obj_wq); in device_early_init()
1001 destroy_workqueue(hdev->cs_cmplt_wq); in device_early_init()
1003 destroy_workqueue(hdev->eq_wq); in device_early_init()
1005 for (i = 0 ; i < hdev->asic_prop.completion_queues_count ; i++) in device_early_init()
1006 if (hdev->cq_wq[i]) in device_early_init()
1007 destroy_workqueue(hdev->cq_wq[i]); in device_early_init()
1008 kfree(hdev->cq_wq); in device_early_init()
1012 if (hdev->asic_funcs->early_fini) in device_early_init()
1013 hdev->asic_funcs->early_fini(hdev); in device_early_init()
1019 * device_early_fini - finalize all that was done in device_early_init
1028 mutex_destroy(&hdev->debug_lock); in device_early_fini()
1029 mutex_destroy(&hdev->send_cpu_message_lock); in device_early_fini()
1031 mutex_destroy(&hdev->fpriv_list_lock); in device_early_fini()
1032 mutex_destroy(&hdev->fpriv_ctrl_list_lock); in device_early_fini()
1034 mutex_destroy(&hdev->clk_throttling.lock); in device_early_fini()
1036 hl_mem_mgr_fini(&hdev->kernel_mem_mgr, NULL); in device_early_fini()
1037 hl_mem_mgr_idr_destroy(&hdev->kernel_mem_mgr); in device_early_fini()
1039 kfree(hdev->hl_chip_info); in device_early_fini()
1041 destroy_workqueue(hdev->prefetch_wq); in device_early_fini()
1042 destroy_workqueue(hdev->ts_free_obj_wq); in device_early_fini()
1043 destroy_workqueue(hdev->cs_cmplt_wq); in device_early_fini()
1044 destroy_workqueue(hdev->eq_wq); in device_early_fini()
1045 destroy_workqueue(hdev->reset_wq); in device_early_fini()
1047 for (i = 0 ; i < hdev->asic_prop.completion_queues_count ; i++) in device_early_fini()
1048 destroy_workqueue(hdev->cq_wq[i]); in device_early_fini()
1049 kfree(hdev->cq_wq); in device_early_fini()
1053 if (hdev->asic_funcs->early_fini) in device_early_fini()
1054 hdev->asic_funcs->early_fini(hdev); in device_early_fini()
1061 if (!hdev->pdev) in is_pci_link_healthy()
1064 pci_read_config_word(hdev->pdev, PCI_DEVICE_ID, &device_id); in is_pci_link_healthy()
1066 return (device_id == hdev->pdev->device); in is_pci_link_healthy()
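is_pci_link_healthy() exploits the fact that config-space reads over a dead PCIe link return all ones, so a read-back of the device ID can never match the value cached at probe time. A sketch under that assumption:

    #include <linux/pci.h>

    static bool pci_link_alive(struct pci_dev *pdev)
    {
            u16 device_id;

            if (!pdev)
                    return false;

            /* a dead link yields 0xffff here, which never matches */
            pci_read_config_word(pdev, PCI_DEVICE_ID, &device_id);

            return device_id == pdev->device;
    }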
1072 time64_t seconds = is_pq_hb ? hdev->heartbeat_debug_info.last_pq_heartbeat_ts in stringify_time_of_last_heartbeat()
1073 : hdev->heartbeat_debug_info.last_eq_heartbeat_ts; in stringify_time_of_last_heartbeat()
1081 snprintf(time_str, size, "%ld-%02d-%02d %02d:%02d:%02d (UTC)", in stringify_time_of_last_heartbeat()
1087 struct eq_heartbeat_debug_info *heartbeat_debug_info = &hdev->heartbeat_debug_info; in hl_device_eq_heartbeat_received()
1088 u32 cpu_q_id = heartbeat_debug_info->cpu_queue_id, pq_pi_mask = (HL_QUEUE_LENGTH << 1) - 1; in hl_device_eq_heartbeat_received()
1089 struct asic_fixed_properties *prop = &hdev->asic_prop; in hl_device_eq_heartbeat_received()
1092 if (!prop->cpucp_info.eq_health_check_supported) in hl_device_eq_heartbeat_received()
1095 if (!hdev->eq_heartbeat_received) { in hl_device_eq_heartbeat_received()
1096 dev_err(hdev->dev, "EQ heartbeat event was not received!\n"); in hl_device_eq_heartbeat_received()
1100 dev_err(hdev->dev, in hl_device_eq_heartbeat_received()
1102 hdev->event_queue.ci, in hl_device_eq_heartbeat_received()
1103 heartbeat_debug_info->heartbeat_event_counter, in hl_device_eq_heartbeat_received()
1105 hdev->kernel_queues[cpu_q_id].pi, in hl_device_eq_heartbeat_received()
1106 atomic_read(&hdev->kernel_queues[cpu_q_id].ci), in hl_device_eq_heartbeat_received()
1107 atomic_read(&hdev->kernel_queues[cpu_q_id].ci) & pq_pi_mask, in hl_device_eq_heartbeat_received()
1110 hl_eq_dump(hdev, &hdev->event_queue); in hl_device_eq_heartbeat_received()
1115 hdev->eq_heartbeat_received = false; in hl_device_eq_heartbeat_received()
1127 /* Start heartbeat checks only after driver has enabled events from FW */ in hl_device_heartbeat()
1128 if (!hl_device_operational(hdev, NULL) || !hdev->init_done) in hl_device_heartbeat()
1132 * For the EQ health check, the driver needs to verify that the heartbeat EQ event was received in hl_device_heartbeat()
1134 * Reschedule only if both the EQ is healthy and the next heartbeat was sent successfully. in hl_device_heartbeat()
1136 if (hl_device_eq_heartbeat_received(hdev) && (!hdev->asic_funcs->send_heartbeat(hdev))) in hl_device_heartbeat()
1140 dev_err(hdev->dev, "Device heartbeat failed! PCI link is %s\n", in hl_device_heartbeat()
1153 * heartbeat immediately post reset. in hl_device_heartbeat()
1154 * If control reached here, then at least one heartbeat work has been in hl_device_heartbeat()
1158 * status for at least one heartbeat. From this point driver restarts in hl_device_heartbeat()
1161 if (!hdev->reset_info.in_reset) in hl_device_heartbeat()
1162 hdev->reset_info.prev_reset_trigger = HL_RESET_TRIGGER_DEFAULT; in hl_device_heartbeat()
1164 schedule_delayed_work(&hdev->work_heartbeat, in hl_device_heartbeat()
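The heartbeat worker is self-rearming: it verifies the EQ heartbeat was seen and sends the next heartbeat packet, and only when both succeed does it re-queue itself; on failure it falls through to report the PCI link state and trigger a hard reset. A structural sketch; check_and_send_hb() and trigger_hard_reset() are hypothetical stand-ins for the EQ check plus send_heartbeat() and for the reset path, and the 1 s interval is illustrative:

    #include <linux/workqueue.h>
    #include <linux/jiffies.h>

    extern bool check_and_send_hb(void);    /* hypothetical: EQ check + send_heartbeat() */
    extern void trigger_hard_reset(void);   /* hypothetical: hard reset with HB flags */

    static void heartbeat_work_fn(struct work_struct *work)
    {
            struct delayed_work *dwork = to_delayed_work(work);

            if (!check_and_send_hb()) {
                    trigger_hard_reset();   /* no reschedule on failure */
                    return;
            }

            schedule_delayed_work(dwork, msecs_to_jiffies(1000));
    }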
1169 * device_late_init - do late stuff initialization for the habanalabs device
1180 if (hdev->asic_funcs->late_init) { in device_late_init()
1181 rc = hdev->asic_funcs->late_init(hdev); in device_late_init()
1183 dev_err(hdev->dev, in device_late_init()
1189 hdev->high_pll = hdev->asic_prop.high_pll; in device_late_init()
1190 hdev->late_init_done = true; in device_late_init()
1196 * device_late_fini - finalize all that was done in device_late_init
1203 if (!hdev->late_init_done) in device_late_fini()
1206 if (hdev->asic_funcs->late_fini) in device_late_fini()
1207 hdev->asic_funcs->late_fini(hdev); in device_late_fini()
1209 hdev->late_init_done = false; in device_late_fini()
1217 max_power = hdev->max_power; in hl_device_utilization()
1218 dc_power = hdev->asic_prop.dc_power_default; in hl_device_utilization()
1219 divisor = max_power - dc_power; in hl_device_utilization()
1221 dev_warn(hdev->dev, "device utilization is not supported\n"); in hl_device_utilization()
1222 return -EOPNOTSUPP; in hl_device_utilization()
1231 dividend = (curr_power - dc_power) * 100; in hl_device_utilization()
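The utilization computed here is the current power's position within the dynamic range between idle (DC) power and max power, i.e. (curr - dc) * 100 / (max - dc); a non-positive divisor means the device cannot report utilization. A standalone arithmetic sketch with hypothetical milliwatt values (the driver additionally clamps the current power into [dc, max]):

    #include <stdint.h>
    #include <stdio.h>

    static int utilization_pct(uint64_t curr, uint64_t dc, uint64_t max)
    {
            if (max <= dc)
                    return -1;      /* no dynamic range: not supported */

            return (int)(((curr - dc) * 100) / (max - dc));
    }

    int main(void)
    {
            /* hypothetical: 350 W max, 100 W idle, 225 W current -> 50% */
            printf("%d%%\n", utilization_pct(225000, 100000, 350000));
            return 0;
    }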
1241 mutex_lock(&hdev->debug_lock); in hl_device_set_debug_mode()
1244 if (!hdev->in_debug) { in hl_device_set_debug_mode()
1245 dev_err(hdev->dev, in hl_device_set_debug_mode()
1247 rc = -EFAULT; in hl_device_set_debug_mode()
1251 if (!hdev->reset_info.hard_reset_pending) in hl_device_set_debug_mode()
1252 hdev->asic_funcs->halt_coresight(hdev, ctx); in hl_device_set_debug_mode()
1254 hdev->in_debug = 0; in hl_device_set_debug_mode()
1259 if (hdev->in_debug) { in hl_device_set_debug_mode()
1260 dev_err(hdev->dev, in hl_device_set_debug_mode()
1262 rc = -EFAULT; in hl_device_set_debug_mode()
1266 hdev->in_debug = 1; in hl_device_set_debug_mode()
1269 mutex_unlock(&hdev->debug_lock); in hl_device_set_debug_mode()
1279 hdev->asic_funcs->hw_queues_lock(hdev); in take_release_locks()
1280 hdev->asic_funcs->hw_queues_unlock(hdev); in take_release_locks()
1283 mutex_lock(&hdev->send_cpu_message_lock); in take_release_locks()
1284 mutex_unlock(&hdev->send_cpu_message_lock); in take_release_locks()
1287 mutex_lock(&hdev->fpriv_list_lock); in take_release_locks()
1288 mutex_unlock(&hdev->fpriv_list_lock); in take_release_locks()
1289 mutex_lock(&hdev->fpriv_ctrl_list_lock); in take_release_locks()
1290 mutex_unlock(&hdev->fpriv_ctrl_list_lock); in take_release_locks()
1307 if (hdev->heartbeat) in cleanup_resources()
1308 cancel_delayed_work_sync(&hdev->work_heartbeat); in cleanup_resources()
1318 hdev->asic_funcs->halt_engines(hdev, hard_reset, fw_reset); in cleanup_resources()
1324 flush_workqueue(hdev->prefetch_wq); in cleanup_resources()
1330 * hl_device_suspend - initiate device suspend
1342 pci_save_state(hdev->pdev); in hl_device_suspend()
1345 spin_lock(&hdev->reset_info.lock); in hl_device_suspend()
1346 if (hdev->reset_info.in_reset) { in hl_device_suspend()
1347 spin_unlock(&hdev->reset_info.lock); in hl_device_suspend()
1348 dev_err(hdev->dev, "Can't suspend while in reset\n"); in hl_device_suspend()
1349 return -EIO; in hl_device_suspend()
1351 hdev->reset_info.in_reset = 1; in hl_device_suspend()
1352 spin_unlock(&hdev->reset_info.lock); in hl_device_suspend()
1355 hdev->disabled = true; in hl_device_suspend()
1359 rc = hdev->asic_funcs->suspend(hdev); in hl_device_suspend()
1361 dev_err(hdev->dev, in hl_device_suspend()
1365 pci_disable_device(hdev->pdev); in hl_device_suspend()
1366 pci_set_power_state(hdev->pdev, PCI_D3hot); in hl_device_suspend()
1372 * hl_device_resume - initiate device resume
1384 pci_set_power_state(hdev->pdev, PCI_D0); in hl_device_resume()
1385 pci_restore_state(hdev->pdev); in hl_device_resume()
1386 rc = pci_enable_device_mem(hdev->pdev); in hl_device_resume()
1388 dev_err(hdev->dev, in hl_device_resume()
1393 pci_set_master(hdev->pdev); in hl_device_resume()
1395 rc = hdev->asic_funcs->resume(hdev); in hl_device_resume()
1397 dev_err(hdev->dev, "Failed to resume device after suspend\n"); in hl_device_resume()
1405 spin_lock(&hdev->reset_info.lock); in hl_device_resume()
1406 hdev->reset_info.in_reset = 0; in hl_device_resume()
1407 spin_unlock(&hdev->reset_info.lock); in hl_device_resume()
1411 dev_err(hdev->dev, "Failed to reset device during resume\n"); in hl_device_resume()
1418 pci_disable_device(hdev->pdev); in hl_device_resume()
1431 hpriv_lock = control_dev ? &hdev->fpriv_ctrl_list_lock : &hdev->fpriv_list_lock; in device_kill_open_processes()
1432 hpriv_list = control_dev ? &hdev->fpriv_ctrl_list : &hdev->fpriv_list; in device_kill_open_processes()
1443 if (hdev->process_kill_trial_cnt) { in device_kill_open_processes()
1459 task = get_pid_task(hpriv->taskpid, PIDTYPE_PID); in device_kill_open_processes()
1461 dev_info(hdev->dev, "Killing user process pid=%d\n", in device_kill_open_processes()
1468 dev_dbg(hdev->dev, in device_kill_open_processes()
1470 pid_nr(hpriv->taskpid)); in device_kill_open_processes()
1487 dev_dbg(hdev->dev, in device_kill_open_processes()
1490 pending_cnt--; in device_kill_open_processes()
1500 if (hdev->process_kill_trial_cnt == HL_PENDING_RESET_MAX_TRIALS) in device_kill_open_processes()
1501 return -ETIME; in device_kill_open_processes()
1503 hdev->process_kill_trial_cnt++; in device_kill_open_processes()
1505 return -EBUSY; in device_kill_open_processes()
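device_kill_open_processes() is designed to be re-entered: it returns -EBUSY while killed processes may still be exiting (incrementing process_kill_trial_cnt each trial) and -ETIME once HL_PENDING_RESET_MAX_TRIALS is reached, so the reset thread re-queues itself between trials. A sketch of the caller-side contract; try_kill() is a hypothetical stand-in, and the real code re-queues a work item instead of sleeping in a loop:

    #include <linux/delay.h>
    #include <linux/errno.h>

    struct hl_device;
    extern int try_kill(struct hl_device *hdev);    /* hypothetical wrapper */

    static int kill_with_retries(struct hl_device *hdev)
    {
            int rc;

            for (;;) {
                    rc = try_kill(hdev);
                    if (rc != -EBUSY)
                            return rc;      /* 0 on success, -ETIME after max trials */

                    ssleep(1);              /* give victim processes time to exit */
            }
    }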
1514 hpriv_lock = control_dev ? &hdev->fpriv_ctrl_list_lock : &hdev->fpriv_list_lock; in device_disable_open_processes()
1515 hpriv_list = control_dev ? &hdev->fpriv_ctrl_list : &hdev->fpriv_list; in device_disable_open_processes()
1519 hpriv->hdev = NULL; in device_disable_open_processes()
1525 /* If reset is due to heartbeat, the device CPU is not responsive in in send_disable_pci_access()
1531 * us additional interrupts. We disable MSI/MSI-X at in send_disable_pci_access()
1536 * of heartbeat, the device CPU is marked as disabled in send_disable_pci_access()
1545 if (hdev->cpu_queues_enable) in send_disable_pci_access()
1546 disable_irq(pci_irq_vector(hdev->pdev, hdev->asic_prop.eq_interrupt_id)); in send_disable_pci_access()
1555 if (hdev->is_compute_ctx_active) in handle_reset_trigger()
1565 hdev->reset_info.curr_reset_cause = HL_RESET_CAUSE_HEARTBEAT; in handle_reset_trigger()
1568 hdev->reset_info.curr_reset_cause = HL_RESET_CAUSE_TDR; in handle_reset_trigger()
1571 hdev->reset_info.curr_reset_cause = HL_RESET_CAUSE_UNKNOWN; in handle_reset_trigger()
1574 hdev->reset_info.curr_reset_cause = HL_RESET_CAUSE_UNKNOWN; in handle_reset_trigger()
1582 if (hdev->reset_info.prev_reset_trigger != cur_reset_trigger) { in handle_reset_trigger()
1583 hdev->reset_info.prev_reset_trigger = cur_reset_trigger; in handle_reset_trigger()
1584 hdev->reset_info.reset_trigger_repeated = 0; in handle_reset_trigger()
1586 hdev->reset_info.reset_trigger_repeated = 1; in handle_reset_trigger()
1592 hdev->heartbeat_debug_info.last_pq_heartbeat_ts = 0; in reset_heartbeat_debug_info()
1593 hdev->heartbeat_debug_info.last_eq_heartbeat_ts = 0; in reset_heartbeat_debug_info()
1594 hdev->heartbeat_debug_info.heartbeat_event_counter = 0; in reset_heartbeat_debug_info()
1599 if (!hdev->heartbeat) in device_heartbeat_schedule()
1605 * Before scheduling the heartbeat, the driver checks whether the EQ event has been received. in device_heartbeat_schedule()
1609 hdev->eq_heartbeat_received = true; in device_heartbeat_schedule()
1611 schedule_delayed_work(&hdev->work_heartbeat, in device_heartbeat_schedule()
1616 * hl_device_reset - reset the device
1624 * Re-initialize all internal data structures
1639 if (!hdev->init_done) { in hl_device_reset()
1640 dev_err(hdev->dev, "Can't reset before initialization is done\n"); in hl_device_reset()
1650 reset_upon_device_release = hdev->reset_upon_device_release && from_dev_release; in hl_device_reset()
1653 dev_dbg(hdev->dev, "soft-reset isn't supported on a malfunctioning device\n"); in hl_device_reset()
1657 if (!hard_reset && !hdev->asic_prop.supports_compute_reset) { in hl_device_reset()
1658 dev_dbg(hdev->dev, "asic doesn't support compute reset - do hard-reset instead\n"); in hl_device_reset()
1664 dev_crit(hdev->dev, in hl_device_reset()
1665 "Aborting reset because hard-reset is mutually exclusive with reset-on-device-release\n"); in hl_device_reset()
1666 return -EINVAL; in hl_device_reset()
1672 if (!hard_reset && !hdev->asic_prop.allow_inference_soft_reset) { in hl_device_reset()
1673 dev_dbg(hdev->dev, in hl_device_reset()
1674 "asic doesn't allow inference soft reset - do hard-reset instead\n"); in hl_device_reset()
1679 /* Re-entry of reset thread */ in hl_device_reset()
1680 if (from_hard_reset_thread && hdev->process_kill_trial_cnt) in hl_device_reset()
1684 * Prevent concurrency in this function - only one reset should be in hl_device_reset()
1690 spin_lock(&hdev->reset_info.lock); in hl_device_reset()
1691 if (hdev->reset_info.in_reset) { in hl_device_reset()
1693 if (hard_reset && hdev->reset_info.in_compute_reset) in hl_device_reset()
1694 hdev->reset_info.hard_reset_schedule_flags = flags; in hl_device_reset()
1695 spin_unlock(&hdev->reset_info.lock); in hl_device_reset()
1702 hdev->reset_info.in_compute_reset = !hard_reset; in hl_device_reset()
1704 hdev->reset_info.in_reset = 1; in hl_device_reset()
1706 spin_unlock(&hdev->reset_info.lock); in hl_device_reset()
1709 * In case of reset-upon-device-release while the release watchdog work is in hl_device_reset()
1710 * scheduled due to a hard-reset, do hard-reset instead of compute-reset. in hl_device_reset()
1712 if ((hard_reset || from_dev_release) && hdev->reset_info.watchdog_active) { in hl_device_reset()
1714 &hdev->device_release_watchdog_work; in hl_device_reset()
1716 hdev->reset_info.watchdog_active = 0; in hl_device_reset()
1718 cancel_delayed_work_sync(&watchdog_work->reset_work); in hl_device_reset()
1720 if (from_dev_release && (watchdog_work->flags & HL_DRV_RESET_HARD)) { in hl_device_reset()
1721 hdev->reset_info.in_compute_reset = 0; in hl_device_reset()
1736 hdev->disabled = true; in hl_device_reset()
1741 dev_info(hdev->dev, "Going to reset device\n"); in hl_device_reset()
1743 dev_dbg(hdev->dev, "Going to reset device after release by user\n"); in hl_device_reset()
1745 dev_dbg(hdev->dev, "Going to reset engines of inference device\n"); in hl_device_reset()
1749 hdev->reset_info.hard_reset_pending = true; in hl_device_reset()
1751 hdev->process_kill_trial_cnt = 0; in hl_device_reset()
1753 hdev->device_reset_work.flags = flags; in hl_device_reset()
1756 * Because the reset function can't run from heartbeat work, in hl_device_reset()
1759 queue_delayed_work(hdev->reset_wq, &hdev->device_reset_work.reset_work, 0); in hl_device_reset()
1774 if (rc == -EBUSY) { in hl_device_reset()
1775 if (hdev->device_fini_pending) { in hl_device_reset()
1776 dev_crit(hdev->dev, in hl_device_reset()
1778 dev_name(&(hdev)->pdev->dev)); in hl_device_reset()
1787 dev_crit(hdev->dev, in hl_device_reset()
1789 dev_name(&(hdev)->pdev->dev)); in hl_device_reset()
1796 flush_workqueue(hdev->eq_wq); in hl_device_reset()
1800 hw_fini_rc = hdev->asic_funcs->hw_fini(hdev, hard_reset, fw_reset); in hl_device_reset()
1803 hdev->fw_loader.fw_comp_loaded = FW_TYPE_NONE; in hl_device_reset()
1806 if (hdev->kernel_ctx && hl_ctx_put(hdev->kernel_ctx) == 1) in hl_device_reset()
1807 hdev->kernel_ctx = NULL; in hl_device_reset()
1811 hl_eq_reset(hdev, &hdev->event_queue); in hl_device_reset()
1814 /* Re-initialize PI, CI to 0 in all queues (hw queue, cq) */ in hl_device_reset()
1816 for (i = 0 ; i < hdev->asic_prop.completion_queues_count ; i++) in hl_device_reset()
1817 hl_cq_reset(hdev, &hdev->completion_queue[i]); in hl_device_reset()
1822 atomic_set(&ctx->thread_ctx_switch_token, 1); in hl_device_reset()
1823 ctx->thread_ctx_switch_wait_token = 0; in hl_device_reset()
1831 /* Finished tear-down, starting to re-initialize */ in hl_device_reset()
1834 hdev->device_cpu_disabled = false; in hl_device_reset()
1835 hdev->reset_info.hard_reset_pending = false; in hl_device_reset()
1841 if (hdev->reset_info.reset_trigger_repeated && in hl_device_reset()
1842 (hdev->reset_info.prev_reset_trigger == HL_DRV_RESET_FW_FATAL_ERR || in hl_device_reset()
1843 hdev->reset_info.prev_reset_trigger == in hl_device_reset()
1845 dev_crit(hdev->dev, in hl_device_reset()
1847 dev_name(&(hdev)->pdev->dev)); in hl_device_reset()
1848 rc = -EIO; in hl_device_reset()
1852 if (hdev->kernel_ctx) { in hl_device_reset()
1853 dev_crit(hdev->dev, in hl_device_reset()
1855 dev_name(&(hdev)->pdev->dev)); in hl_device_reset()
1856 rc = -EBUSY; in hl_device_reset()
1862 dev_err(hdev->dev, in hl_device_reset()
1868 hdev->kernel_ctx = kzalloc(sizeof(*hdev->kernel_ctx), in hl_device_reset()
1870 if (!hdev->kernel_ctx) { in hl_device_reset()
1871 rc = -ENOMEM; in hl_device_reset()
1876 hdev->is_compute_ctx_active = false; in hl_device_reset()
1878 rc = hl_ctx_init(hdev, hdev->kernel_ctx, true); in hl_device_reset()
1880 dev_err(hdev->dev, in hl_device_reset()
1882 kfree(hdev->kernel_ctx); in hl_device_reset()
1883 hdev->kernel_ctx = NULL; in hl_device_reset()
1893 hdev->disabled = false; in hl_device_reset()
1895 /* F/W security enabled indication might be updated after hard-reset */ in hl_device_reset()
1902 rc = hdev->asic_funcs->hw_init(hdev); in hl_device_reset()
1904 dev_err(hdev->dev, "failed to initialize the H/W after reset\n"); in hl_device_reset()
1909 if (!hdev->asic_funcs->is_device_idle(hdev, idle_mask, in hl_device_reset()
1912 rc = -EIO; in hl_device_reset()
1917 rc = hdev->asic_funcs->test_queues(hdev); in hl_device_reset()
1919 dev_err(hdev->dev, "Failed to detect if device is alive after reset\n"); in hl_device_reset()
1926 dev_err(hdev->dev, "Failed late init after hard reset\n"); in hl_device_reset()
1932 dev_err(hdev->dev, "Failed to init memory module after hard reset\n"); in hl_device_reset()
1936 if (!hdev->asic_prop.fw_security_enabled) in hl_device_reset()
1939 rc = hdev->asic_funcs->compute_reset_late_init(hdev); in hl_device_reset()
1942 dev_err(hdev->dev, in hl_device_reset()
1945 dev_err(hdev->dev, "Failed late init after compute reset\n"); in hl_device_reset()
1950 rc = hdev->asic_funcs->scrub_device_mem(hdev); in hl_device_reset()
1952 dev_err(hdev->dev, "scrub mem failed from device reset (%d)\n", rc); in hl_device_reset()
1956 spin_lock(&hdev->reset_info.lock); in hl_device_reset()
1957 hdev->reset_info.in_compute_reset = 0; in hl_device_reset()
1963 if (!hard_reset && hdev->reset_info.hard_reset_schedule_flags) in hl_device_reset()
1966 hdev->reset_info.in_reset = 0; in hl_device_reset()
1968 spin_unlock(&hdev->reset_info.lock); in hl_device_reset()
1970 hdev->reset_info.needs_reset = false; in hl_device_reset()
1973 dev_info(hdev->dev, in hl_device_reset()
1975 dev_name(&(hdev)->pdev->dev)); in hl_device_reset()
1977 dev_dbg(hdev->dev, in hl_device_reset()
1979 dev_name(&(hdev)->pdev->dev)); in hl_device_reset()
1982 hdev->reset_info.hard_reset_cnt++; in hl_device_reset()
1991 hdev->asic_funcs->enable_events_from_fw(hdev); in hl_device_reset()
1994 hdev->reset_info.compute_reset_cnt++; in hl_device_reset()
1997 dev_info(hdev->dev, "Performing hard reset scheduled during compute reset\n"); in hl_device_reset()
1998 flags = hdev->reset_info.hard_reset_schedule_flags; in hl_device_reset()
1999 hdev->reset_info.hard_reset_schedule_flags = 0; in hl_device_reset()
2008 hdev->disabled = true; in hl_device_reset()
2010 spin_lock(&hdev->reset_info.lock); in hl_device_reset()
2011 hdev->reset_info.in_compute_reset = 0; in hl_device_reset()
2014 dev_err(hdev->dev, in hl_device_reset()
2016 dev_name(&(hdev)->pdev->dev)); in hl_device_reset()
2017 hdev->reset_info.hard_reset_cnt++; in hl_device_reset()
2020 dev_err(hdev->dev, "Failed to reset device after user release\n"); in hl_device_reset()
2023 dev_err(hdev->dev, "Failed to do compute reset\n"); in hl_device_reset()
2024 hdev->reset_info.compute_reset_cnt++; in hl_device_reset()
2027 spin_unlock(&hdev->reset_info.lock); in hl_device_reset()
2033 hdev->reset_info.in_reset = 0; in hl_device_reset()
2035 spin_unlock(&hdev->reset_info.lock); in hl_device_reset()
2041 * hl_device_cond_reset() - conditionally reset the device.
2059 dev_err(hdev->dev, "Resetting device without a reset indication to user\n"); in hl_device_cond_reset()
2073 if (!ctx->hpriv->notifier_event.eventfd && !hdev->reset_info.watchdog_active) in hl_device_cond_reset()
2079 spin_lock(&hdev->reset_info.lock); in hl_device_cond_reset()
2080 if (hdev->reset_info.in_reset) { in hl_device_cond_reset()
2081 spin_unlock(&hdev->reset_info.lock); in hl_device_cond_reset()
2085 if (hdev->reset_info.watchdog_active) { in hl_device_cond_reset()
2086 hdev->device_release_watchdog_work.flags |= flags; in hl_device_cond_reset()
2090 hdev->device_release_watchdog_work.flags = flags; in hl_device_cond_reset()
2091 dev_dbg(hdev->dev, "Device is going to be hard-reset in %u sec unless released\n", in hl_device_cond_reset()
2092 hdev->device_release_watchdog_timeout_sec); in hl_device_cond_reset()
2093 schedule_delayed_work(&hdev->device_release_watchdog_work.reset_work, in hl_device_cond_reset()
2094 msecs_to_jiffies(hdev->device_release_watchdog_timeout_sec * 1000)); in hl_device_cond_reset()
2095 hdev->reset_info.watchdog_active = 1; in hl_device_cond_reset()
2097 spin_unlock(&hdev->reset_info.lock); in hl_device_cond_reset()
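Rather than resetting immediately, hl_device_cond_reset() arms a release watchdog: if one is already pending it only merges the new flags, otherwise it schedules the hard reset device_release_watchdog_timeout_sec seconds out, leaving the user a window to release the device. A sketch of arming such a one-shot deferred reset; the parameter layout here is illustrative:

    #include <linux/workqueue.h>
    #include <linux/jiffies.h>

    static void arm_release_watchdog(struct delayed_work *wd, u32 *pending_flags,
                                     u32 flags, u32 timeout_sec, bool *active)
    {
            if (*active) {
                    *pending_flags |= flags;        /* merge into the pending reset */
                    return;
            }

            *pending_flags = flags;
            schedule_delayed_work(wd, msecs_to_jiffies(timeout_sec * 1000));
            *active = true;
    }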
2118 mutex_lock(¬ifier_event->lock); in hl_notifier_event_send()
2119 notifier_event->events_mask |= event_mask; in hl_notifier_event_send()
2121 if (notifier_event->eventfd) in hl_notifier_event_send()
2122 eventfd_signal(notifier_event->eventfd); in hl_notifier_event_send()
2124 mutex_unlock(¬ifier_event->lock); in hl_notifier_event_send()
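hl_notifier_event_send() is the producer half of the eventfd notification path: it ORs the new events into a mask under the notifier mutex and, if the process registered an eventfd, signals it so the user-space waiter wakes and reads the accumulated mask. A sketch of that idiom; note that eventfd_signal() takes only the context on recent kernels, matching the call above:

    #include <linux/eventfd.h>
    #include <linux/mutex.h>

    struct notifier {
            struct mutex lock;
            u64 events_mask;
            struct eventfd_ctx *eventfd;    /* NULL if no fd was registered */
    };

    static void notifier_send(struct notifier *n, u64 event_mask)
    {
            mutex_lock(&n->lock);
            n->events_mask |= event_mask;

            if (n->eventfd)
                    eventfd_signal(n->eventfd);

            mutex_unlock(&n->lock);
    }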
2128 * hl_notifier_event_send_all - notify all user processes via eventfd
2139 dev_warn(hdev->dev, "Skip sending zero event"); in hl_notifier_event_send_all()
2143 mutex_lock(&hdev->fpriv_list_lock); in hl_notifier_event_send_all()
2145 list_for_each_entry(hpriv, &hdev->fpriv_list, dev_node) in hl_notifier_event_send_all()
2146 hl_notifier_event_send(&hpriv->notifier_event, event_mask); in hl_notifier_event_send_all()
2148 mutex_unlock(&hdev->fpriv_list_lock); in hl_notifier_event_send_all()
2152 * hl_device_init - main initialization function for habanalabs device
2172 user_interrupt_cnt = hdev->asic_prop.user_dec_intr_count + in hl_device_init()
2173 hdev->asic_prop.user_interrupt_count; in hl_device_init()
2176 hdev->user_interrupt = kcalloc(user_interrupt_cnt, sizeof(*hdev->user_interrupt), in hl_device_init()
2178 if (!hdev->user_interrupt) { in hl_device_init()
2179 rc = -ENOMEM; in hl_device_init()
2184 if (hdev->asic_prop.first_available_cq[0] != USHRT_MAX) { in hl_device_init()
2189 rc = -ENOMEM; in hl_device_init()
2192 free_jobs_data = &hdev->user_interrupt[i].ts_free_jobs_data; in hl_device_init()
2193 free_jobs_data->free_nodes_pool = p; in hl_device_init()
2194 free_jobs_data->free_nodes_length = TIMESTAMP_FREE_NODES_NUM; in hl_device_init()
2195 free_jobs_data->next_avail_free_node_idx = 0; in hl_device_init()
2200 free_jobs_data = &hdev->common_user_cq_interrupt.ts_free_jobs_data; in hl_device_init()
2204 rc = -ENOMEM; in hl_device_init()
2208 free_jobs_data->free_nodes_pool = p; in hl_device_init()
2209 free_jobs_data->free_nodes_length = TIMESTAMP_FREE_NODES_NUM; in hl_device_init()
2210 free_jobs_data->next_avail_free_node_idx = 0; in hl_device_init()
2216 rc = hdev->asic_funcs->sw_init(hdev); in hl_device_init()
2231 dev_err(hdev->dev, "failed to initialize kernel queues\n"); in hl_device_init()
2235 cq_cnt = hdev->asic_prop.completion_queues_count; in hl_device_init()
2243 hdev->completion_queue = kcalloc(cq_cnt, in hl_device_init()
2244 sizeof(*hdev->completion_queue), in hl_device_init()
2247 if (!hdev->completion_queue) { in hl_device_init()
2248 dev_err(hdev->dev, in hl_device_init()
2250 rc = -ENOMEM; in hl_device_init()
2256 rc = hl_cq_init(hdev, &hdev->completion_queue[i], in hl_device_init()
2257 hdev->asic_funcs->get_queue_id_for_cq(hdev, i)); in hl_device_init()
2259 dev_err(hdev->dev, in hl_device_init()
2263 hdev->completion_queue[i].cq_idx = i; in hl_device_init()
2266 hdev->shadow_cs_queue = kcalloc(hdev->asic_prop.max_pending_cs, in hl_device_init()
2268 if (!hdev->shadow_cs_queue) { in hl_device_init()
2269 rc = -ENOMEM; in hl_device_init()
2278 rc = hl_eq_init(hdev, &hdev->event_queue); in hl_device_init()
2280 dev_err(hdev->dev, "failed to initialize event queue\n"); in hl_device_init()
2287 dev_err(hdev->dev, "Failed to initialize MMU S/W structures\n"); in hl_device_init()
2292 hdev->kernel_ctx = kzalloc(sizeof(*hdev->kernel_ctx), GFP_KERNEL); in hl_device_init()
2293 if (!hdev->kernel_ctx) { in hl_device_init()
2294 rc = -ENOMEM; in hl_device_init()
2298 hdev->is_compute_ctx_active = false; in hl_device_init()
2300 hdev->asic_funcs->state_dump_init(hdev); in hl_device_init()
2302 hdev->device_release_watchdog_timeout_sec = HL_DEVICE_RELEASE_WATCHDOG_TIMEOUT_SEC; in hl_device_init()
2304 hdev->memory_scrub_val = MEM_SCRUB_DEFAULT_VAL; in hl_device_init()
2308 dev_err(hdev->dev, "failed to initialize debugfs entry structure\n"); in hl_device_init()
2309 kfree(hdev->kernel_ctx); in hl_device_init()
2316 rc = hl_ctx_init(hdev, hdev->kernel_ctx, true); in hl_device_init()
2318 dev_err(hdev->dev, "failed to initialize kernel context\n"); in hl_device_init()
2319 kfree(hdev->kernel_ctx); in hl_device_init()
2325 dev_err(hdev->dev, "failed to initialize CB pool\n"); in hl_device_init()
2331 dev_err(hdev->dev, "Failed to initialize the decoder module\n"); in hl_device_init()
2345 hdev->disabled = false; in hl_device_init()
2347 rc = hdev->asic_funcs->hw_init(hdev); in hl_device_init()
2349 dev_err(hdev->dev, "failed to initialize the H/W\n"); in hl_device_init()
2355 rc = hdev->asic_funcs->test_queues(hdev); in hl_device_init()
2357 dev_err(hdev->dev, "Failed to detect if device is alive\n"); in hl_device_init()
2364 dev_err(hdev->dev, "Failed late initialization\n"); in hl_device_init()
2369 dev_info(hdev->dev, "Found %s device with %lluGB DRAM\n", in hl_device_init()
2370 hdev->asic_name, in hl_device_init()
2371 hdev->asic_prop.dram_size / SZ_1G); in hl_device_init()
2375 dev_err(hdev->dev, "Failed to initialize memory module\n"); in hl_device_init()
2386 rc = drm_dev_register(&hdev->drm, 0); in hl_device_init()
2388 dev_err(hdev->dev, "Failed to register DRM device, rc %d\n", rc); in hl_device_init()
2395 dev_err(hdev->dev, "Failed to add char devices and sysfs/debugfs files\n"); in hl_device_init()
2400 /* Need to call this again because the max power might change, in hl_device_init()
2403 if (hdev->asic_prop.set_max_power_on_device_init && in hl_device_init()
2404 !hdev->asic_prop.fw_security_enabled) in hl_device_init()
2410 * hwmon-related sensors the device supports. in hl_device_init()
2415 dev_err(hdev->dev, "Failed to initialize hwmon\n"); in hl_device_init()
2420 /* Scheduling the EQ heartbeat thread must come after driver is done with all in hl_device_init()
2422 * to respond to heartbeat packets. in hl_device_init()
2426 dev_notice(hdev->dev, in hl_device_init()
2428 dev_name(&(hdev)->pdev->dev)); in hl_device_init()
2435 hdev->asic_funcs->enable_events_from_fw(hdev); in hl_device_init()
2437 hdev->init_done = true; in hl_device_init()
2444 if (hl_ctx_put(hdev->kernel_ctx) != 1) in hl_device_init()
2445 dev_err(hdev->dev, in hl_device_init()
2452 hl_eq_fini(hdev, &hdev->event_queue); in hl_device_init()
2454 kfree(hdev->shadow_cs_queue); in hl_device_init()
2457 hl_cq_fini(hdev, &hdev->completion_queue[i]); in hl_device_init()
2458 kfree(hdev->completion_queue); in hl_device_init()
2462 hdev->asic_funcs->sw_fini(hdev); in hl_device_init()
2464 vfree(hdev->common_user_cq_interrupt.ts_free_jobs_data.free_nodes_pool); in hl_device_init()
2468 if (!hdev->user_interrupt[i].ts_free_jobs_data.free_nodes_pool) in hl_device_init()
2470 vfree(hdev->user_interrupt[i].ts_free_jobs_data.free_nodes_pool); in hl_device_init()
2472 kfree(hdev->user_interrupt); in hl_device_init()
2477 hdev->disabled = true; in hl_device_init()
2479 drm_dev_register(&hdev->drm, 0); in hl_device_init()
2484 hdev->cdev_idx, dev_name(&hdev->pdev->dev)); in hl_device_init()
2490 * hl_device_fini - main tear-down function for habanalabs device
2504 dev_info(hdev->dev, "Removing device %s\n", dev_name(&(hdev)->pdev->dev)); in hl_device_fini()
2506 hdev->device_fini_pending = 1; in hl_device_fini()
2507 flush_delayed_work(&hdev->device_reset_work.reset_work); in hl_device_fini()
2509 if (hdev->pldm) in hl_device_fini()
2519 * ports, the hard reset could take between 10 and 30 seconds in hl_device_fini()
2524 spin_lock(&hdev->reset_info.lock); in hl_device_fini()
2525 device_in_reset = !!hdev->reset_info.in_reset; in hl_device_fini()
2527 hdev->reset_info.in_reset = 1; in hl_device_fini()
2528 spin_unlock(&hdev->reset_info.lock); in hl_device_fini()
2533 spin_lock(&hdev->reset_info.lock); in hl_device_fini()
2534 device_in_reset = !!hdev->reset_info.in_reset; in hl_device_fini()
2536 hdev->reset_info.in_reset = 1; in hl_device_fini()
2537 spin_unlock(&hdev->reset_info.lock); in hl_device_fini()
2540 dev_crit(hdev->dev, in hl_device_fini()
2542 dev_name(&(hdev)->pdev->dev)); in hl_device_fini()
2547 cancel_delayed_work_sync(&hdev->device_release_watchdog_work.reset_work); in hl_device_fini()
2550 * interrupts. We disable MSI/MSI-X at the halt_engines function and we in hl_device_fini()
2553 * message won't be sent. Also, in case of heartbeat, the device CPU is in hl_device_fini()
2559 hdev->disabled = true; in hl_device_fini()
2563 hdev->reset_info.hard_reset_pending = true; in hl_device_fini()
2573 dev_info(hdev->dev, in hl_device_fini()
2577 hdev->process_kill_trial_cnt = 0; in hl_device_fini()
2580 dev_crit(hdev->dev, "Failed to kill all open processes (%d)\n", rc); in hl_device_fini()
2584 hdev->process_kill_trial_cnt = 0; in hl_device_fini()
2587 dev_crit(hdev->dev, "Failed to kill all control device open processes (%d)\n", rc); in hl_device_fini()
2594 rc = hdev->asic_funcs->hw_fini(hdev, true, false); in hl_device_fini()
2596 dev_err(hdev->dev, "hw_fini failed in device fini while removing device %d\n", rc); in hl_device_fini()
2598 hdev->fw_loader.fw_comp_loaded = FW_TYPE_NONE; in hl_device_fini()
2601 if ((hdev->kernel_ctx) && (hl_ctx_put(hdev->kernel_ctx) != 1)) in hl_device_fini()
2602 dev_err(hdev->dev, "kernel ctx is still alive\n"); in hl_device_fini()
2610 vfree(hdev->captured_err_info.page_fault_info.user_mappings); in hl_device_fini()
2612 hl_eq_fini(hdev, &hdev->event_queue); in hl_device_fini()
2614 kfree(hdev->shadow_cs_queue); in hl_device_fini()
2616 for (i = 0 ; i < hdev->asic_prop.completion_queues_count ; i++) in hl_device_fini()
2617 hl_cq_fini(hdev, &hdev->completion_queue[i]); in hl_device_fini()
2618 kfree(hdev->completion_queue); in hl_device_fini()
2620 user_interrupt_cnt = hdev->asic_prop.user_dec_intr_count + in hl_device_fini()
2621 hdev->asic_prop.user_interrupt_count; in hl_device_fini()
2624 if (hdev->asic_prop.first_available_cq[0] != USHRT_MAX) { in hl_device_fini()
2626 vfree(hdev->user_interrupt[i].ts_free_jobs_data.free_nodes_pool); in hl_device_fini()
2629 kfree(hdev->user_interrupt); in hl_device_fini()
2632 vfree(hdev->common_user_cq_interrupt.ts_free_jobs_data.free_nodes_pool); in hl_device_fini()
2637 hdev->asic_funcs->sw_fini(hdev); in hl_device_fini()
2643 drm_dev_unregister(&hdev->drm); in hl_device_fini()
2655 * hl_rreg - Read an MMIO register
2665 u32 val = readl(hdev->rmmio + reg); in hl_rreg()
2668 trace_habanalabs_rreg32(&(hdev)->pdev->dev, reg, val); in hl_rreg()
2674 * hl_wreg - Write to an MMIO register
2678 * @val: 32-bit value
2680 * Writes the 32-bit value into the MMIO register
2686 trace_habanalabs_wreg32(&(hdev)->pdev->dev, reg, val); in hl_wreg()
2688 writel(val, hdev->rmmio + reg); in hl_wreg()
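hl_rreg()/hl_wreg() are the primitives behind the RREG32/WREG32 macros: a readl()/writel() against the ioremapped BAR at hdev->rmmio plus the register offset, optionally traced. A generic sketch of the same accessors:

    #include <linux/io.h>
    #include <linux/types.h>

    /* 'regs' is an ioremapped MMIO BAR (hdev->rmmio in this driver). */
    static u32 mmio_read32(void __iomem *regs, u32 reg)
    {
            return readl(regs + reg);
    }

    static void mmio_write32(void __iomem *regs, u32 reg, u32 val)
    {
            writel(val, regs + reg);
    }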
2694 struct razwi_info *razwi_info = &hdev->captured_err_info.razwi_info; in hl_capture_razwi()
2697 dev_err(hdev->dev, in hl_capture_razwi()
2704 if (atomic_cmpxchg(&hdev->captured_err_info.razwi_info.razwi_detected, 0, 1)) in hl_capture_razwi()
2707 razwi_info->razwi.timestamp = ktime_to_ns(ktime_get()); in hl_capture_razwi()
2708 razwi_info->razwi.addr = addr; in hl_capture_razwi()
2709 razwi_info->razwi.num_of_possible_engines = num_of_engines; in hl_capture_razwi()
2710 memcpy(&razwi_info->razwi.engine_id[0], &engine_id[0], in hl_capture_razwi()
2712 razwi_info->razwi.flags = flags; in hl_capture_razwi()
2714 razwi_info->razwi_info_available = true; in hl_capture_razwi()
2728 struct page_fault_info *pgf_info = &hdev->captured_err_info.page_fault_info; in hl_capture_user_mappings()
2738 pgf_info->num_of_user_mappings = 0; in hl_capture_user_mappings()
2742 dev_err(hdev->dev, "Can't get user context for user mappings\n"); in hl_capture_user_mappings()
2746 mutex_lock(&ctx->mem_hash_lock); in hl_capture_user_mappings()
2747 hash_for_each(ctx->mem_hash, i, hnode, node) { in hl_capture_user_mappings()
2748 vm_type = hnode->ptr; in hl_capture_user_mappings()
2751 pgf_info->num_of_user_mappings++; in hl_capture_user_mappings()
2755 if (!pgf_info->num_of_user_mappings) in hl_capture_user_mappings()
2761 vfree(pgf_info->user_mappings); in hl_capture_user_mappings()
2762 pgf_info->user_mappings = in hl_capture_user_mappings()
2763 vzalloc(pgf_info->num_of_user_mappings * sizeof(struct hl_user_mapping)); in hl_capture_user_mappings()
2764 if (!pgf_info->user_mappings) { in hl_capture_user_mappings()
2765 pgf_info->num_of_user_mappings = 0; in hl_capture_user_mappings()
2769 hash_for_each(ctx->mem_hash, i, hnode, node) { in hl_capture_user_mappings()
2770 vm_type = hnode->ptr; in hl_capture_user_mappings()
2772 userptr = hnode->ptr; in hl_capture_user_mappings()
2773 pgf_info->user_mappings[map_idx].dev_va = hnode->vaddr; in hl_capture_user_mappings()
2774 pgf_info->user_mappings[map_idx].size = userptr->size; in hl_capture_user_mappings()
2777 phys_pg_pack = hnode->ptr; in hl_capture_user_mappings()
2778 pgf_info->user_mappings[map_idx].dev_va = hnode->vaddr; in hl_capture_user_mappings()
2779 pgf_info->user_mappings[map_idx].size = phys_pg_pack->total_size; in hl_capture_user_mappings()
2784 mutex_unlock(&ctx->mem_hash_lock); in hl_capture_user_mappings()
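hl_capture_user_mappings() walks the context's mem_hash twice under mem_hash_lock: the first pass only counts matching mappings so a right-sized array can be vzalloc()ed, and the second pass records device VA and size per node (userptr and physical page-pack nodes carry their size differently). A condensed sketch of the count-then-fill pattern, with simplified hypothetical types standing in for the driver's hnode and hl_user_mapping:

    #include <linux/hashtable.h>
    #include <linux/vmalloc.h>

    struct entry {                          /* stand-in for the driver's hnode */
            struct hlist_node node;
            u64 vaddr;
            u64 size;
    };

    struct snap { u64 dev_va; u64 size; };  /* stand-in for hl_user_mapping */

    struct demo_ctx {
            DECLARE_HASHTABLE(mem_hash, 6); /* bucket count is illustrative */
    };

    static struct snap *snapshot_mappings(struct demo_ctx *ctx, u32 *count)
    {
            struct entry *e;
            struct snap *arr;
            u32 n = 0, idx = 0;
            int bkt;

            hash_for_each(ctx->mem_hash, bkt, e, node)      /* pass 1: count */
                    n++;

            if (!n)
                    return NULL;

            arr = vzalloc(n * sizeof(*arr));
            if (!arr)
                    return NULL;

            hash_for_each(ctx->mem_hash, bkt, e, node) {    /* pass 2: fill */
                    arr[idx].dev_va = e->vaddr;
                    arr[idx].size = e->size;
                    idx++;
            }

            *count = n;
            return arr;
    }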
2790 struct page_fault_info *pgf_info = &hdev->captured_err_info.page_fault_info; in hl_capture_page_fault()
2793 if (atomic_cmpxchg(&pgf_info->page_fault_detected, 0, 1)) in hl_capture_page_fault()
2796 pgf_info->page_fault.timestamp = ktime_to_ns(ktime_get()); in hl_capture_page_fault()
2797 pgf_info->page_fault.addr = addr; in hl_capture_page_fault()
2798 pgf_info->page_fault.engine_id = eng_id; in hl_capture_page_fault()
2801 pgf_info->page_fault_info_available = true; in hl_capture_page_fault()
2815 struct hw_err_info *info = &hdev->captured_err_info.hw_err; in hl_capture_hw_err()
2818 if (atomic_cmpxchg(&info->event_detected, 0, 1)) in hl_capture_hw_err()
2821 info->event.timestamp = ktime_to_ns(ktime_get()); in hl_capture_hw_err()
2822 info->event.event_id = event_id; in hl_capture_hw_err()
2824 info->event_info_available = true; in hl_capture_hw_err()
2837 struct fw_err_info *info = &hdev->captured_err_info.fw_err; in hl_capture_fw_err()
2840 if (atomic_cmpxchg(&info->event_detected, 0, 1)) in hl_capture_fw_err()
2843 info->event.timestamp = ktime_to_ns(ktime_get()); in hl_capture_fw_err()
2844 info->event.err_type = fw_info->err_type; in hl_capture_fw_err()
2845 if (fw_info->err_type == HL_INFO_FW_REPORTED_ERR) in hl_capture_fw_err()
2846 info->event.event_id = fw_info->event_id; in hl_capture_fw_err()
2848 info->event_info_available = true; in hl_capture_fw_err()
2855 if (info->event_mask) in hl_handle_fw_err()
2856 *info->event_mask |= HL_NOTIFIER_EVENT_CRITICL_FW_ERR; in hl_handle_fw_err()
2861 struct engine_err_info *info = &hdev->captured_err_info.engine_err; in hl_capture_engine_err()
2864 if (atomic_cmpxchg(&info->event_detected, 0, 1)) in hl_capture_engine_err()
2867 info->event.timestamp = ktime_to_ns(ktime_get()); in hl_capture_engine_err()
2868 info->event.engine_id = engine_id; in hl_capture_engine_err()
2869 info->event.error_count = error_count; in hl_capture_engine_err()
2870 info->event_info_available = true; in hl_capture_engine_err()
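All of the capture helpers above (razwi, page fault, HW/FW/engine errors) share one capture-once idiom: atomic_cmpxchg() on a "detected" flag lets only the first event after (re)arming record its details, and later events are dropped until hl_enable_err_info_capture() re-arms the flag. A sketch with a generic slot type:

    #include <linux/atomic.h>
    #include <linux/ktime.h>

    struct err_slot {
            atomic_t detected;      /* 0 = armed, 1 = already captured */
            u64 timestamp;
            u32 id;
            bool available;
    };

    static void capture_once(struct err_slot *s, u32 id)
    {
            /* atomic_cmpxchg() returns the old value: nonzero means we lost the race */
            if (atomic_cmpxchg(&s->detected, 0, 1))
                    return;

            s->timestamp = ktime_to_ns(ktime_get());
            s->id = id;
            s->available = true;
    }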
2875 vfree(captured_err_info->page_fault_info.user_mappings); in hl_enable_err_info_capture()
2877 atomic_set(&captured_err_info->cs_timeout.write_enable, 1); in hl_enable_err_info_capture()
2878 captured_err_info->undef_opcode.write_enable = true; in hl_enable_err_info_capture()
2884 struct cpumask *available_mask = &hdev->irq_affinity_mask; in hl_init_cpu_for_irq()
2885 int numa_node = hdev->pdev->dev.numa_node, i; in hl_init_cpu_for_irq()
2892 dev_err(hdev->dev, "No available affinities in current numa node\n"); in hl_init_cpu_for_irq()
2904 if (cpumask_empty(&hdev->irq_affinity_mask)) { in hl_set_irq_affinity()
2905 dev_dbg(hdev->dev, "affinity mask is empty\n"); in hl_set_irq_affinity()
2909 if (irq_set_affinity_and_hint(irq, &hdev->irq_affinity_mask)) in hl_set_irq_affinity()
2910 dev_err(hdev->dev, "Failed setting irq %d affinity\n", irq); in hl_set_irq_affinity()
2915 hdev->heartbeat_debug_info.heartbeat_event_counter++; in hl_eq_heartbeat_event_handle()
2916 hdev->heartbeat_debug_info.last_eq_heartbeat_ts = ktime_get_real_seconds(); in hl_eq_heartbeat_event_handle()
2917 hdev->eq_heartbeat_received = true; in hl_eq_heartbeat_event_handle()
2922 struct hl_clk_throttle *clk_throttle = &hdev->clk_throttling; in hl_handle_clk_change_event()
2925 mutex_lock(&clk_throttle->lock); in hl_handle_clk_change_event()
2929 clk_throttle->current_reason |= HL_CLK_THROTTLE_POWER; in hl_handle_clk_change_event()
2930 clk_throttle->aggregated_reason |= HL_CLK_THROTTLE_POWER; in hl_handle_clk_change_event()
2931 clk_throttle->timestamp[HL_CLK_THROTTLE_TYPE_POWER].start = ktime_get(); in hl_handle_clk_change_event()
2932 clk_throttle->timestamp[HL_CLK_THROTTLE_TYPE_POWER].end = zero_time; in hl_handle_clk_change_event()
2933 dev_dbg_ratelimited(hdev->dev, "Clock throttling due to power consumption\n"); in hl_handle_clk_change_event()
2937 clk_throttle->current_reason &= ~HL_CLK_THROTTLE_POWER; in hl_handle_clk_change_event()
2938 clk_throttle->timestamp[HL_CLK_THROTTLE_TYPE_POWER].end = ktime_get(); in hl_handle_clk_change_event()
2939 dev_dbg_ratelimited(hdev->dev, "Power envelope is safe, back to optimal clock\n"); in hl_handle_clk_change_event()
2943 clk_throttle->current_reason |= HL_CLK_THROTTLE_THERMAL; in hl_handle_clk_change_event()
2944 clk_throttle->aggregated_reason |= HL_CLK_THROTTLE_THERMAL; in hl_handle_clk_change_event()
2945 clk_throttle->timestamp[HL_CLK_THROTTLE_TYPE_THERMAL].start = ktime_get(); in hl_handle_clk_change_event()
2946 clk_throttle->timestamp[HL_CLK_THROTTLE_TYPE_THERMAL].end = zero_time; in hl_handle_clk_change_event()
2948 dev_info_ratelimited(hdev->dev, "Clock throttling due to overheating\n"); in hl_handle_clk_change_event()
2952 clk_throttle->current_reason &= ~HL_CLK_THROTTLE_THERMAL; in hl_handle_clk_change_event()
2953 clk_throttle->timestamp[HL_CLK_THROTTLE_TYPE_THERMAL].end = ktime_get(); in hl_handle_clk_change_event()
2955 dev_info_ratelimited(hdev->dev, "Thermal envelope is safe, back to optimal clock\n"); in hl_handle_clk_change_event()
2959 dev_err(hdev->dev, "Received invalid clock change event %d\n", event_type); in hl_handle_clk_change_event()
2963 mutex_unlock(&clk_throttle->lock); in hl_handle_clk_change_event()