Lines Matching +full:sys +full:- +full:mgr

101 	if (ras_block->block >= AMDGPU_RAS_BLOCK_COUNT ||  in get_ras_block_str()
102 ras_block->block >= ARRAY_SIZE(ras_block_string)) in get_ras_block_str()
105 if (ras_block->block == AMDGPU_RAS_BLOCK__MCA) in get_ras_block_str()
106 return ras_mca_block_string[ras_block->sub_block_index]; in get_ras_block_str()
108 return ras_block_string[ras_block->block]; in get_ras_block_str()
154 amdgpu_ras_get_context(adev)->error_query_ready = ready; in amdgpu_ras_set_error_query_ready()
160 return amdgpu_ras_get_context(adev)->error_query_ready; in amdgpu_ras_get_error_query_ready()
171 if ((address >= adev->gmc.mc_vram_size) || in amdgpu_reserve_page_direct()
173 dev_warn(adev->dev, in amdgpu_reserve_page_direct()
176 return -EINVAL; in amdgpu_reserve_page_direct()
180 dev_warn(adev->dev, in amdgpu_reserve_page_direct()
202 dev_warn(adev->dev, "WARNING: THIS IS ONLY FOR TEST PURPOSES AND WILL CORRUPT RAS EEPROM\n"); in amdgpu_reserve_page_direct()
203 dev_warn(adev->dev, "Clear EEPROM:\n"); in amdgpu_reserve_page_direct()
204 dev_warn(adev->dev, " echo 1 > /sys/kernel/debug/dri/0/ras/ras_eeprom_reset\n"); in amdgpu_reserve_page_direct()
212 struct ras_manager *obj = (struct ras_manager *)file_inode(f)->i_private; in amdgpu_ras_debugfs_read()
214 .head = obj->head, in amdgpu_ras_debugfs_read()
219 if (amdgpu_ras_query_error_status(obj->adev, &info)) in amdgpu_ras_debugfs_read()
220 return -EINVAL; in amdgpu_ras_debugfs_read()
223 if (amdgpu_ip_version(obj->adev, MP0_HWIP, 0) != IP_VERSION(11, 0, 2) && in amdgpu_ras_debugfs_read()
224 amdgpu_ip_version(obj->adev, MP0_HWIP, 0) != IP_VERSION(11, 0, 4)) { in amdgpu_ras_debugfs_read()
225 if (amdgpu_ras_reset_error_status(obj->adev, info.head.block)) in amdgpu_ras_debugfs_read()
226 dev_warn(obj->adev->dev, "Failed to reset error counter and error status"); in amdgpu_ras_debugfs_read()
235 s -= *pos; in amdgpu_ras_debugfs_read()
240 return -EINVAL; in amdgpu_ras_debugfs_read()
263 return -EINVAL; in amdgpu_ras_find_block_id_by_name()
274 int op = -1; in amdgpu_ras_debugfs_ctrl_parse_data()
282 return -EINVAL; in amdgpu_ras_debugfs_ctrl_parse_data()
289 return -EINVAL; in amdgpu_ras_debugfs_ctrl_parse_data()
301 return -EINVAL; in amdgpu_ras_debugfs_ctrl_parse_data()
303 if (op != -1) { in amdgpu_ras_debugfs_ctrl_parse_data()
307 return -EINVAL; in amdgpu_ras_debugfs_ctrl_parse_data()
309 data->op = op; in amdgpu_ras_debugfs_ctrl_parse_data()
310 data->inject.address = address; in amdgpu_ras_debugfs_ctrl_parse_data()
316 return -EINVAL; in amdgpu_ras_debugfs_ctrl_parse_data()
318 data->head.block = block_id; in amdgpu_ras_debugfs_ctrl_parse_data()
321 data->head.type = AMDGPU_RAS_ERROR__MULTI_UNCORRECTABLE; in amdgpu_ras_debugfs_ctrl_parse_data()
323 data->head.type = AMDGPU_RAS_ERROR__SINGLE_CORRECTABLE; in amdgpu_ras_debugfs_ctrl_parse_data()
325 data->head.type = AMDGPU_RAS_ERROR__POISON; in amdgpu_ras_debugfs_ctrl_parse_data()
327 return -EINVAL; in amdgpu_ras_debugfs_ctrl_parse_data()
329 data->op = op; in amdgpu_ras_debugfs_ctrl_parse_data()
340 return -EINVAL; in amdgpu_ras_debugfs_ctrl_parse_data()
341 data->head.sub_block_index = sub_block; in amdgpu_ras_debugfs_ctrl_parse_data()
342 data->inject.address = address; in amdgpu_ras_debugfs_ctrl_parse_data()
343 data->inject.value = value; in amdgpu_ras_debugfs_ctrl_parse_data()
344 data->inject.instance_mask = instance_mask; in amdgpu_ras_debugfs_ctrl_parse_data()
348 return -EINVAL; in amdgpu_ras_debugfs_ctrl_parse_data()
351 return -EINVAL; in amdgpu_ras_debugfs_ctrl_parse_data()
360 int num_xcc = adev->gfx.xcc_mask ? NUM_XCC(adev->gfx.xcc_mask) : 1; in amdgpu_ras_instance_mask_check()
361 uint32_t mask, inst_mask = data->inject.instance_mask; in amdgpu_ras_instance_mask_check()
365 data->inject.instance_mask = 0; in amdgpu_ras_instance_mask_check()
366 dev_dbg(adev->dev, in amdgpu_ras_instance_mask_check()
373 switch (data->head.block) { in amdgpu_ras_instance_mask_check()
375 mask = GENMASK(num_xcc - 1, 0); in amdgpu_ras_instance_mask_check()
378 mask = GENMASK(adev->sdma.num_instances - 1, 0); in amdgpu_ras_instance_mask_check()
382 mask = GENMASK(adev->vcn.num_vcn_inst - 1, 0); in amdgpu_ras_instance_mask_check()
390 data->inject.instance_mask &= mask; in amdgpu_ras_instance_mask_check()
391 if (inst_mask != data->inject.instance_mask) in amdgpu_ras_instance_mask_check()
392 dev_dbg(adev->dev, in amdgpu_ras_instance_mask_check()
394 inst_mask, data->inject.instance_mask); in amdgpu_ras_instance_mask_check()
419 * - 0: disable RAS on the block. Take ::head as its data.
420 * - 1: enable RAS on the block. Take ::head as its data.
421 * - 2: inject errors on the block. Take ::inject as its data.
432 * .. code-block:: bash
434 * echo "disable <block>" > /sys/kernel/debug/dri/<N>/ras/ras_ctrl
435 * echo "enable <block> <error>" > /sys/kernel/debug/dri/<N>/ras/ras_ctrl
436 * echo "inject <block> <error> <sub-block> <address> <value> <mask>" > /sys/kernel/debug/dri/<N>/ras/ras_ctrl
448 * ue is multi-uncorrectable
449 * ce is single-correctable
452 * The sub-block is the sub-block index; pass 0 if there is no sub-block.
458 * .. code-block:: bash
460 * echo inject umc ue 0x0 0x0 0x0 > /sys/kernel/debug/dri/0/ras/ras_ctrl
461 * echo inject umc ce 0 0 0 3 > /sys/kernel/debug/dri/0/ras/ras_ctrl
462 * echo disable umc > /sys/kernel/debug/dri/0/ras/ras_ctrl
467 * /sys/class/drm/card[0/1/2...]/device/ras/features
470 * /sys/class/drm/card[0/1/2...]/device/ras/[gfx|sdma|umc|...]_err_count
474 * Check the "ras" mask at /sys/module/amdgpu/parameters/ras_mask
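A minimal end-to-end sketch of the control interface documented above, assuming DRI instance 0, card0, and the umc block; the trailing instance-mask value is only illustrative, since amdgpu_ras_instance_mask_check() masks off any bits that do not map to a live instance:

.. code-block:: bash

   # enable RAS on the UMC block with the single-correctable error type
   echo "enable umc ce" > /sys/kernel/debug/dri/0/ras/ras_ctrl
   # inject a correctable error: sub-block 0, address 0, value 0, all instances
   echo "inject umc ce 0 0 0 0xffffffff" > /sys/kernel/debug/dri/0/ras/ras_ctrl
   # confirm the enabled-feature mask, then read the per-block counters
   cat /sys/class/drm/card0/device/ras/features
   cat /sys/class/drm/card0/device/ras/umc_err_count
   # disable RAS on the block again
   echo "disable umc" > /sys/kernel/debug/dri/0/ras/ras_ctrl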
482 struct amdgpu_device *adev = (struct amdgpu_device *)file_inode(f)->i_private; in amdgpu_ras_debugfs_ctrl_write()
487 dev_warn(adev->dev, "RAS WARN: error injection " in amdgpu_ras_debugfs_ctrl_write()
505 return -EINVAL; in amdgpu_ras_debugfs_ctrl_write()
515 if ((data.inject.address >= adev->gmc.mc_vram_size && in amdgpu_ras_debugfs_ctrl_write()
516 adev->gmc.mc_vram_size) || in amdgpu_ras_debugfs_ctrl_write()
518 dev_warn(adev->dev, "RAS WARN: input address " in amdgpu_ras_debugfs_ctrl_write()
521 ret = -EINVAL; in amdgpu_ras_debugfs_ctrl_write()
528 dev_warn(adev->dev, "RAS WARN: inject: 0x%llx has " in amdgpu_ras_debugfs_ctrl_write()
540 ret = -EINVAL; in amdgpu_ras_debugfs_ctrl_write()
559 * .. code-block:: bash
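The example body under this code-block directive was elided by the search, but it is recoverable from the warning printed in amdgpu_reserve_page_direct() above. A sketch, assuming DRI instance 0; note the warning there that this is test-only and corrupts the RAS EEPROM records:

.. code-block:: bash

   # reset the RAS EEPROM table (test only; wipes the stored bad-page records)
   echo 1 > /sys/kernel/debug/dri/0/ras/ras_eeprom_reset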
571 (struct amdgpu_device *)file_inode(f)->i_private; in amdgpu_ras_debugfs_eeprom_write()
575 &(amdgpu_ras_get_context(adev)->eeprom_control)); in amdgpu_ras_debugfs_eeprom_write()
580 amdgpu_ras_get_context(adev)->flags = RAS_DEFAULT_FLAGS; in amdgpu_ras_debugfs_eeprom_write()
605 * /sys/class/drm/card[0/1/2...]/device/ras/[gfx/sdma/...]_err_count
616 * .. code-block:: bash
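A usage sketch for the per-block counter nodes named above, assuming card0 and the gfx block; on kernels with this interface the node typically reports "ue" and "ce" lines, matching the ue/ce error types defined for ras_ctrl:

.. code-block:: bash

   # read the correctable/uncorrectable counters for one block
   cat /sys/class/drm/card0/device/ras/gfx_err_count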
627 .head = obj->head, in amdgpu_ras_sysfs_read()
630 if (!amdgpu_ras_get_error_query_ready(obj->adev)) in amdgpu_ras_sysfs_read()
633 if (amdgpu_ras_query_error_status(obj->adev, &info)) in amdgpu_ras_sysfs_read()
634 return -EINVAL; in amdgpu_ras_sysfs_read()
636 if (amdgpu_ip_version(obj->adev, MP0_HWIP, 0) != IP_VERSION(11, 0, 2) && in amdgpu_ras_sysfs_read()
637 amdgpu_ip_version(obj->adev, MP0_HWIP, 0) != IP_VERSION(11, 0, 4)) { in amdgpu_ras_sysfs_read()
638 if (amdgpu_ras_reset_error_status(obj->adev, info.head.block)) in amdgpu_ras_sysfs_read()
639 dev_warn(obj->adev->dev, "Failed to reset error counter and error status"); in amdgpu_ras_sysfs_read()
652 #define get_obj(obj) do { (obj)->use++; } while (0)
653 #define alive_obj(obj) ((obj)->use)
657 if (obj && (--obj->use == 0)) { in put_obj()
658 list_del(&obj->node); in put_obj()
659 amdgpu_ras_error_data_fini(&obj->err_data); in put_obj()
662 if (obj && (obj->use < 0)) in put_obj()
663 DRM_ERROR("RAS ERROR: Unbalance obj(%s) use\n", get_ras_block_str(&obj->head)); in put_obj()
673 if (!adev->ras_enabled || !con) in amdgpu_ras_create_obj()
676 if (head->block >= AMDGPU_RAS_BLOCK_COUNT) in amdgpu_ras_create_obj()
679 if (head->block == AMDGPU_RAS_BLOCK__MCA) { in amdgpu_ras_create_obj()
680 if (head->sub_block_index >= AMDGPU_RAS_MCA_BLOCK__LAST) in amdgpu_ras_create_obj()
683 obj = &con->objs[AMDGPU_RAS_BLOCK__LAST + head->sub_block_index]; in amdgpu_ras_create_obj()
685 obj = &con->objs[head->block]; in amdgpu_ras_create_obj()
691 if (amdgpu_ras_error_data_init(&obj->err_data)) in amdgpu_ras_create_obj()
694 obj->head = *head; in amdgpu_ras_create_obj()
695 obj->adev = adev; in amdgpu_ras_create_obj()
696 list_add(&obj->node, &con->head); in amdgpu_ras_create_obj()
710 if (!adev->ras_enabled || !con) in amdgpu_ras_find_obj()
714 if (head->block >= AMDGPU_RAS_BLOCK_COUNT) in amdgpu_ras_find_obj()
717 if (head->block == AMDGPU_RAS_BLOCK__MCA) { in amdgpu_ras_find_obj()
718 if (head->sub_block_index >= AMDGPU_RAS_MCA_BLOCK__LAST) in amdgpu_ras_find_obj()
721 obj = &con->objs[AMDGPU_RAS_BLOCK__LAST + head->sub_block_index]; in amdgpu_ras_find_obj()
723 obj = &con->objs[head->block]; in amdgpu_ras_find_obj()
729 obj = &con->objs[i]; in amdgpu_ras_find_obj()
743 return adev->ras_hw_enabled & BIT(head->block); in amdgpu_ras_is_feature_allowed()
751 return con->features & BIT(head->block); in amdgpu_ras_is_feature_enabled()
766 * Ras framework checks con->hw_supported to see if it needs to do in __amdgpu_ras_feature_enable()
768 * IP checks con->support to see if it needs to disable ras. in __amdgpu_ras_feature_enable()
777 return -EINVAL; in __amdgpu_ras_feature_enable()
782 con->features |= BIT(head->block); in __amdgpu_ras_feature_enable()
785 con->features &= ~BIT(head->block); in __amdgpu_ras_feature_enable()
802 return -EINVAL; in amdgpu_ras_feature_enable()
804 /* For non-gfx ip, do not enable ras feature if it is not allowed */ in amdgpu_ras_feature_enable()
807 if (head->block != AMDGPU_RAS_BLOCK__GFX && in amdgpu_ras_feature_enable()
812 if (head->block == AMDGPU_RAS_BLOCK__GFX && in amdgpu_ras_feature_enable()
817 return -ENOMEM; in amdgpu_ras_feature_enable()
820 info->disable_features = (struct ta_ras_disable_features_input) { in amdgpu_ras_feature_enable()
821 .block_id = amdgpu_ras_block_to_ta(head->block), in amdgpu_ras_feature_enable()
822 .error_type = amdgpu_ras_error_to_ta(head->type), in amdgpu_ras_feature_enable()
825 info->enable_features = (struct ta_ras_enable_features_input) { in amdgpu_ras_feature_enable()
826 .block_id = amdgpu_ras_block_to_ta(head->block), in amdgpu_ras_feature_enable()
827 .error_type = amdgpu_ras_error_to_ta(head->type), in amdgpu_ras_feature_enable()
831 ret = psp_ras_enable_features(&adev->psp, info, enable); in amdgpu_ras_feature_enable()
833 dev_err(adev->dev, "ras %s %s failed poison:%d ret:%d\n", in amdgpu_ras_feature_enable()
858 return -EINVAL; in amdgpu_ras_feature_enable_on_boot()
860 if (con->flags & AMDGPU_RAS_FLAG_INIT_BY_VBIOS) { in amdgpu_ras_feature_enable_on_boot()
866 * with error code -EAGAIN. in amdgpu_ras_feature_enable_on_boot()
873 if (ret == -EINVAL) { in amdgpu_ras_feature_enable_on_boot()
876 dev_info(adev->dev, in amdgpu_ras_feature_enable_on_boot()
886 /* gfx block ras disable cmd must be sent to ras-ta */ in amdgpu_ras_feature_enable_on_boot()
887 if (head->block == AMDGPU_RAS_BLOCK__GFX) in amdgpu_ras_feature_enable_on_boot()
888 con->features |= BIT(head->block); in amdgpu_ras_feature_enable_on_boot()
893 if (adev->ras_enabled && head->block == AMDGPU_RAS_BLOCK__GFX) in amdgpu_ras_feature_enable_on_boot()
894 con->features &= ~BIT(head->block); in amdgpu_ras_feature_enable_on_boot()
908 list_for_each_entry_safe(obj, tmp, &con->head, node) { in amdgpu_ras_disable_all_features()
913 if (__amdgpu_ras_feature_enable(adev, &obj->head, 0)) in amdgpu_ras_disable_all_features()
916 if (amdgpu_ras_feature_enable(adev, &obj->head, 0)) in amdgpu_ras_disable_all_features()
921 return con->features; in amdgpu_ras_disable_all_features()
974 return con->features; in amdgpu_ras_enable_all_features()
982 return -EINVAL; in amdgpu_ras_block_match_default()
984 if (block_obj->ras_comm.block == block) in amdgpu_ras_block_match_default()
987 return -EINVAL; in amdgpu_ras_block_match_default()
999 list_for_each_entry_safe(node, tmp, &adev->ras_list, node) { in amdgpu_ras_get_ras_block()
1000 if (!node->ras_obj) { in amdgpu_ras_get_ras_block()
1001 dev_warn(adev->dev, "Warning: abnormal ras list node.\n"); in amdgpu_ras_get_ras_block()
1005 obj = node->ras_obj; in amdgpu_ras_get_ras_block()
1006 if (obj->ras_block_match) { in amdgpu_ras_get_ras_block()
1007 if (obj->ras_block_match(obj, block, sub_block_index) == 0) in amdgpu_ras_get_ras_block()
1027 ret = amdgpu_dpm_get_ecc_info(adev, (void *)&(ras->umc_ecc)); in amdgpu_ras_get_ecc_info()
1028 if (ret == -EOPNOTSUPP) { in amdgpu_ras_get_ecc_info()
1029 if (adev->umc.ras && adev->umc.ras->ras_block.hw_ops && in amdgpu_ras_get_ecc_info()
1030 adev->umc.ras->ras_block.hw_ops->query_ras_error_count) in amdgpu_ras_get_ecc_info()
1031 adev->umc.ras->ras_block.hw_ops->query_ras_error_count(adev, err_data); in amdgpu_ras_get_ecc_info()
1036 if (adev->umc.ras && adev->umc.ras->ras_block.hw_ops && in amdgpu_ras_get_ecc_info()
1037 adev->umc.ras->ras_block.hw_ops->query_ras_error_address) in amdgpu_ras_get_ecc_info()
1038 adev->umc.ras->ras_block.hw_ops->query_ras_error_address(adev, err_data); in amdgpu_ras_get_ecc_info()
1040 if (adev->umc.ras && in amdgpu_ras_get_ecc_info()
1041 adev->umc.ras->ecc_info_query_ras_error_count) in amdgpu_ras_get_ecc_info()
1042 adev->umc.ras->ecc_info_query_ras_error_count(adev, err_data); in amdgpu_ras_get_ecc_info()
1044 if (adev->umc.ras && in amdgpu_ras_get_ecc_info()
1045 adev->umc.ras->ecc_info_query_ras_error_address) in amdgpu_ras_get_ecc_info()
1046 adev->umc.ras->ecc_info_query_ras_error_address(adev, err_data); in amdgpu_ras_get_ecc_info()
1061 u64 event_id = qctx->evid.event_id; in amdgpu_ras_error_print_error_data()
1065 err_info = &err_node->err_info; in amdgpu_ras_error_print_error_data()
1066 mcm_info = &err_info->mcm_info; in amdgpu_ras_error_print_error_data()
1067 if (err_info->ue_count) { in amdgpu_ras_error_print_error_data()
1070 mcm_info->socket_id, in amdgpu_ras_error_print_error_data()
1071 mcm_info->die_id, in amdgpu_ras_error_print_error_data()
1072 err_info->ue_count, in amdgpu_ras_error_print_error_data()
1077 for_each_ras_error(err_node, &ras_mgr->err_data) { in amdgpu_ras_error_print_error_data()
1078 err_info = &err_node->err_info; in amdgpu_ras_error_print_error_data()
1079 mcm_info = &err_info->mcm_info; in amdgpu_ras_error_print_error_data()
1082 mcm_info->socket_id, mcm_info->die_id, err_info->ue_count, blk_name); in amdgpu_ras_error_print_error_data()
1088 err_info = &err_node->err_info; in amdgpu_ras_error_print_error_data()
1089 mcm_info = &err_info->mcm_info; in amdgpu_ras_error_print_error_data()
1090 if (err_info->de_count) { in amdgpu_ras_error_print_error_data()
1093 mcm_info->socket_id, in amdgpu_ras_error_print_error_data()
1094 mcm_info->die_id, in amdgpu_ras_error_print_error_data()
1095 err_info->de_count, in amdgpu_ras_error_print_error_data()
1100 for_each_ras_error(err_node, &ras_mgr->err_data) { in amdgpu_ras_error_print_error_data()
1101 err_info = &err_node->err_info; in amdgpu_ras_error_print_error_data()
1102 mcm_info = &err_info->mcm_info; in amdgpu_ras_error_print_error_data()
1105 mcm_info->socket_id, mcm_info->die_id, in amdgpu_ras_error_print_error_data()
1106 err_info->de_count, blk_name); in amdgpu_ras_error_print_error_data()
1110 err_info = &err_node->err_info; in amdgpu_ras_error_print_error_data()
1111 mcm_info = &err_info->mcm_info; in amdgpu_ras_error_print_error_data()
1112 if (err_info->ce_count) { in amdgpu_ras_error_print_error_data()
1115 mcm_info->socket_id, in amdgpu_ras_error_print_error_data()
1116 mcm_info->die_id, in amdgpu_ras_error_print_error_data()
1117 err_info->ce_count, in amdgpu_ras_error_print_error_data()
1122 for_each_ras_error(err_node, &ras_mgr->err_data) { in amdgpu_ras_error_print_error_data()
1123 err_info = &err_node->err_info; in amdgpu_ras_error_print_error_data()
1124 mcm_info = &err_info->mcm_info; in amdgpu_ras_error_print_error_data()
1127 mcm_info->socket_id, mcm_info->die_id, in amdgpu_ras_error_print_error_data()
1128 err_info->ce_count, blk_name); in amdgpu_ras_error_print_error_data()
1136 return !list_empty(&data->err_node_list); in err_data_has_source_info()
1144 struct ras_manager *ras_mgr = amdgpu_ras_find_obj(adev, &query_if->head); in amdgpu_ras_error_generate_report()
1145 const char *blk_name = get_ras_block_str(&query_if->head); in amdgpu_ras_error_generate_report()
1146 u64 event_id = qctx->evid.event_id; in amdgpu_ras_error_generate_report()
1148 if (err_data->ce_count) { in amdgpu_ras_error_generate_report()
1152 } else if (!adev->aid_mask && in amdgpu_ras_error_generate_report()
1153 adev->smuio.funcs && in amdgpu_ras_error_generate_report()
1154 adev->smuio.funcs->get_socket_id && in amdgpu_ras_error_generate_report()
1155 adev->smuio.funcs->get_die_id) { in amdgpu_ras_error_generate_report()
1159 adev->smuio.funcs->get_socket_id(adev), in amdgpu_ras_error_generate_report()
1160 adev->smuio.funcs->get_die_id(adev), in amdgpu_ras_error_generate_report()
1161 ras_mgr->err_data.ce_count, in amdgpu_ras_error_generate_report()
1166 ras_mgr->err_data.ce_count, in amdgpu_ras_error_generate_report()
1171 if (err_data->ue_count) { in amdgpu_ras_error_generate_report()
1175 } else if (!adev->aid_mask && in amdgpu_ras_error_generate_report()
1176 adev->smuio.funcs && in amdgpu_ras_error_generate_report()
1177 adev->smuio.funcs->get_socket_id && in amdgpu_ras_error_generate_report()
1178 adev->smuio.funcs->get_die_id) { in amdgpu_ras_error_generate_report()
1182 adev->smuio.funcs->get_socket_id(adev), in amdgpu_ras_error_generate_report()
1183 adev->smuio.funcs->get_die_id(adev), in amdgpu_ras_error_generate_report()
1184 ras_mgr->err_data.ue_count, in amdgpu_ras_error_generate_report()
1189 ras_mgr->err_data.ue_count, in amdgpu_ras_error_generate_report()
1194 if (err_data->de_count) { in amdgpu_ras_error_generate_report()
1198 } else if (!adev->aid_mask && in amdgpu_ras_error_generate_report()
1199 adev->smuio.funcs && in amdgpu_ras_error_generate_report()
1200 adev->smuio.funcs->get_socket_id && in amdgpu_ras_error_generate_report()
1201 adev->smuio.funcs->get_die_id) { in amdgpu_ras_error_generate_report()
1205 adev->smuio.funcs->get_socket_id(adev), in amdgpu_ras_error_generate_report()
1206 adev->smuio.funcs->get_die_id(adev), in amdgpu_ras_error_generate_report()
1207 ras_mgr->err_data.de_count, in amdgpu_ras_error_generate_report()
1212 ras_mgr->err_data.de_count, in amdgpu_ras_error_generate_report()
1224 struct ras_manager *obj = amdgpu_ras_find_obj(adev, &query_if->head); in amdgpu_ras_virt_error_generate_report()
1225 const char *blk_name = get_ras_block_str(&query_if->head); in amdgpu_ras_virt_error_generate_report()
1226 u64 event_id = qctx->evid.event_id; in amdgpu_ras_virt_error_generate_report()
1228 new_ce = err_data->ce_count - obj->err_data.ce_count; in amdgpu_ras_virt_error_generate_report()
1229 new_ue = err_data->ue_count - obj->err_data.ue_count; in amdgpu_ras_virt_error_generate_report()
1230 new_de = err_data->de_count - obj->err_data.de_count; in amdgpu_ras_virt_error_generate_report()
1261 err_info = &err_node->err_info; in amdgpu_rasmgr_error_data_statistic_update()
1262 amdgpu_ras_error_statistic_de_count(&obj->err_data, in amdgpu_rasmgr_error_data_statistic_update()
1263 &err_info->mcm_info, err_info->de_count); in amdgpu_rasmgr_error_data_statistic_update()
1264 amdgpu_ras_error_statistic_ce_count(&obj->err_data, in amdgpu_rasmgr_error_data_statistic_update()
1265 &err_info->mcm_info, err_info->ce_count); in amdgpu_rasmgr_error_data_statistic_update()
1266 amdgpu_ras_error_statistic_ue_count(&obj->err_data, in amdgpu_rasmgr_error_data_statistic_update()
1267 &err_info->mcm_info, err_info->ue_count); in amdgpu_rasmgr_error_data_statistic_update()
1271 obj->err_data.ue_count += err_data->ue_count; in amdgpu_rasmgr_error_data_statistic_update()
1272 obj->err_data.ce_count += err_data->ce_count; in amdgpu_rasmgr_error_data_statistic_update()
1273 obj->err_data.de_count += err_data->de_count; in amdgpu_rasmgr_error_data_statistic_update()
1281 obj->err_data.ue_count = err_data->ue_count; in amdgpu_ras_mgr_virt_error_data_statistics_update()
1282 obj->err_data.ce_count = err_data->ce_count; in amdgpu_ras_mgr_virt_error_data_statistics_update()
1283 obj->err_data.de_count = err_data->de_count; in amdgpu_ras_mgr_virt_error_data_statistics_update()
1302 if (adev->in_suspend || amdgpu_reset_in_recovery(adev)) in amdgpu_ras_bind_aca()
1307 return -EINVAL; in amdgpu_ras_bind_aca()
1309 return amdgpu_aca_add_handle(adev, &obj->aca_handle, ras_block_str(blk), aca_info, data); in amdgpu_ras_bind_aca()
1318 return -EINVAL; in amdgpu_ras_unbind_aca()
1320 amdgpu_aca_remove_handle(&obj->aca_handle); in amdgpu_ras_unbind_aca()
1333 return -EINVAL; in amdgpu_aca_log_ras_error_data()
1335 return amdgpu_aca_get_error_data(adev, &obj->aca_handle, type, err_data, qctx); in amdgpu_aca_log_ras_error_data()
1343 .head = obj->head, in amdgpu_ras_aca_sysfs_read()
1346 if (!amdgpu_ras_get_error_query_ready(obj->adev)) in amdgpu_ras_aca_sysfs_read()
1349 if (amdgpu_ras_query_error_status(obj->adev, &info)) in amdgpu_ras_aca_sysfs_read()
1350 return -EINVAL; in amdgpu_ras_aca_sysfs_read()
1362 enum amdgpu_ras_block blk = info ? info->head.block : AMDGPU_RAS_BLOCK_COUNT; in amdgpu_ras_query_error_status_helper()
1367 return -EINVAL; in amdgpu_ras_query_error_status_helper()
1370 return -EINVAL; in amdgpu_ras_query_error_status_helper()
1375 if (info->head.block == AMDGPU_RAS_BLOCK__UMC) { in amdgpu_ras_query_error_status_helper()
1378 block_obj = amdgpu_ras_get_ras_block(adev, info->head.block, 0); in amdgpu_ras_query_error_status_helper()
1379 if (!block_obj || !block_obj->hw_ops) { in amdgpu_ras_query_error_status_helper()
1380 dev_dbg_once(adev->dev, "%s has no RAS function configured\n", in amdgpu_ras_query_error_status_helper()
1381 get_ras_block_str(&info->head)); in amdgpu_ras_query_error_status_helper()
1382 return -EINVAL; in amdgpu_ras_query_error_status_helper()
1385 if (block_obj->hw_ops->query_ras_error_count) in amdgpu_ras_query_error_status_helper()
1386 block_obj->hw_ops->query_ras_error_count(adev, err_data); in amdgpu_ras_query_error_status_helper()
1388 if ((info->head.block == AMDGPU_RAS_BLOCK__SDMA) || in amdgpu_ras_query_error_status_helper()
1389 (info->head.block == AMDGPU_RAS_BLOCK__GFX) || in amdgpu_ras_query_error_status_helper()
1390 (info->head.block == AMDGPU_RAS_BLOCK__MMHUB)) { in amdgpu_ras_query_error_status_helper()
1391 if (block_obj->hw_ops->query_ras_error_status) in amdgpu_ras_query_error_status_helper()
1392 block_obj->hw_ops->query_ras_error_status(adev); in amdgpu_ras_query_error_status_helper()
1423 struct ras_manager *obj = amdgpu_ras_find_obj(adev, &info->head); in amdgpu_ras_query_error_status_with_event()
1430 return -EINVAL; in amdgpu_ras_query_error_status_with_event()
1437 return -EINVAL; in amdgpu_ras_query_error_status_with_event()
1443 if (!down_read_trylock(&adev->reset_domain->sem)) { in amdgpu_ras_query_error_status_with_event()
1444 ret = -EIO; in amdgpu_ras_query_error_status_with_event()
1452 up_read(&adev->reset_domain->sem); in amdgpu_ras_query_error_status_with_event()
1468 info->ue_count = obj->err_data.ue_count; in amdgpu_ras_query_error_status_with_event()
1469 info->ce_count = obj->err_data.ce_count; in amdgpu_ras_query_error_status_with_event()
1470 info->de_count = obj->err_data.de_count; in amdgpu_ras_query_error_status_with_event()
1487 const struct amdgpu_mca_smu_funcs *mca_funcs = adev->mca.mca_funcs; in amdgpu_ras_reset_error_count()
1488 const struct aca_smu_funcs *smu_funcs = adev->aca.smu_funcs; in amdgpu_ras_reset_error_count()
1490 if (!block_obj || !block_obj->hw_ops) { in amdgpu_ras_reset_error_count()
1491 dev_dbg_once(adev->dev, "%s has no RAS function configured\n", in amdgpu_ras_reset_error_count()
1493 return -EOPNOTSUPP; in amdgpu_ras_reset_error_count()
1498 return -EOPNOTSUPP; in amdgpu_ras_reset_error_count()
1502 ((smu_funcs && smu_funcs->set_debug_mode) || in amdgpu_ras_reset_error_count()
1503 (mca_funcs && mca_funcs->mca_set_debug_mode))) in amdgpu_ras_reset_error_count()
1504 return -EOPNOTSUPP; in amdgpu_ras_reset_error_count()
1506 if (block_obj->hw_ops->reset_ras_error_count) in amdgpu_ras_reset_error_count()
1507 block_obj->hw_ops->reset_ras_error_count(adev); in amdgpu_ras_reset_error_count()
1517 if (amdgpu_ras_reset_error_count(adev, block) == -EOPNOTSUPP) in amdgpu_ras_reset_error_status()
1522 if (block_obj->hw_ops->reset_ras_error_status) in amdgpu_ras_reset_error_status()
1523 block_obj->hw_ops->reset_ras_error_status(adev); in amdgpu_ras_reset_error_status()
1533 struct ras_manager *obj = amdgpu_ras_find_obj(adev, &info->head); in amdgpu_ras_error_inject()
1535 .block_id = amdgpu_ras_block_to_ta(info->head.block), in amdgpu_ras_error_inject()
1536 .inject_error_type = amdgpu_ras_error_to_ta(info->head.type), in amdgpu_ras_error_inject()
1537 .sub_block_index = info->head.sub_block_index, in amdgpu_ras_error_inject()
1538 .address = info->address, in amdgpu_ras_error_inject()
1539 .value = info->value, in amdgpu_ras_error_inject()
1541 int ret = -EINVAL; in amdgpu_ras_error_inject()
1543 info->head.block, in amdgpu_ras_error_inject()
1544 info->head.sub_block_index); in amdgpu_ras_error_inject()
1551 return -EINVAL; in amdgpu_ras_error_inject()
1553 if (!block_obj || !block_obj->hw_ops) { in amdgpu_ras_error_inject()
1554 dev_dbg_once(adev->dev, "%s has no RAS function configured\n", in amdgpu_ras_error_inject()
1555 get_ras_block_str(&info->head)); in amdgpu_ras_error_inject()
1556 return -EINVAL; in amdgpu_ras_error_inject()
1560 if (adev->gmc.xgmi.num_physical_nodes > 1 && in amdgpu_ras_error_inject()
1561 info->head.block != AMDGPU_RAS_BLOCK__GFX) { in amdgpu_ras_error_inject()
1567 if (block_obj->hw_ops->ras_error_inject) { in amdgpu_ras_error_inject()
1568 if (info->head.block == AMDGPU_RAS_BLOCK__GFX) in amdgpu_ras_error_inject()
1569 ret = block_obj->hw_ops->ras_error_inject(adev, info, info->instance_mask); in amdgpu_ras_error_inject()
1571 ret = block_obj->hw_ops->ras_error_inject(adev, &block_info, in amdgpu_ras_error_inject()
1572 info->instance_mask); in amdgpu_ras_error_inject()
1575 ret = psp_ras_trigger_error(&adev->psp, &block_info, info->instance_mask); in amdgpu_ras_error_inject()
1579 dev_err(adev->dev, "ras inject %s failed %d\n", in amdgpu_ras_error_inject()
1580 get_ras_block_str(&info->head), ret); in amdgpu_ras_error_inject()
1586 * amdgpu_ras_query_error_count_helper -- Get error counter for specific IP
1610 *ce_count += query_info->ce_count; in amdgpu_ras_query_error_count_helper()
1611 *ue_count += query_info->ue_count; in amdgpu_ras_query_error_count_helper()
1617 if (amdgpu_ras_reset_error_status(adev, query_info->head.block)) in amdgpu_ras_query_error_count_helper()
1618 dev_warn(adev->dev, in amdgpu_ras_query_error_count_helper()
1626 * amdgpu_ras_query_error_count -- Get error counts of all IPs or specific IP
1637 * supports RAS. Return -EOPNOTSUPP if the device doesn't support RAS.
1649 if (!adev->ras_enabled || !con) in amdgpu_ras_query_error_count()
1650 return -EOPNOTSUPP; in amdgpu_ras_query_error_count()
1661 list_for_each_entry(obj, &con->head, node) { in amdgpu_ras_query_error_count()
1663 .head = obj->head, in amdgpu_ras_query_error_count()
1709 * /sys/class/drm/card[0/1/2...]/device/ras/gpu_vram_bad_pages
1728 * .. code-block:: bash
1741 struct amdgpu_device *adev = con->adev; in amdgpu_ras_sysfs_badpages_read()
1743 sizeof("0xabcdabcd : 0x12345678 : R\n") - 1; in amdgpu_ras_sysfs_badpages_read()
1744 unsigned int start = div64_ul(ppos + element_size - 1, element_size); in amdgpu_ras_sysfs_badpages_read()
1745 unsigned int end = div64_ul(ppos + count - 1, element_size); in amdgpu_ras_sysfs_badpages_read()
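A read sketch for the bad-page node, assuming card0; the record format is taken from the sizing string in amdgpu_ras_sysfs_badpages_read() above, and the status letter presumably reflects the page-reservation state queried from the VRAM manager (see the -EBUSY/-ENOENT handling in amdgpu_ras_badpages_read() further down):

.. code-block:: bash

   # each record is "<gpu pfn> : <page size> : <status>", e.g.
   # "0xabcdabcd : 0x12345678 : R"
   cat /sys/class/drm/card0/device/ras/gpu_vram_bad_pages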
1773 return sysfs_emit(buf, "feature mask: 0x%x\n", con->features); in amdgpu_ras_sysfs_features_read()
1781 return sysfs_emit(buf, "table version: 0x%x\n", con->eeprom_control.tbl_hdr.version); in amdgpu_ras_sysfs_version_show()
1789 return sysfs_emit(buf, "schema: 0x%x\n", con->schema); in amdgpu_ras_sysfs_schema_show()
1806 struct ras_event_manager *event_mgr = con->event_mgr; in amdgpu_ras_sysfs_event_state_show()
1811 return -EINVAL; in amdgpu_ras_sysfs_event_state_show()
1813 size += sysfs_emit_at(buf, size, "current seqno: %llu\n", atomic64_read(&event_mgr->seqno)); in amdgpu_ras_sysfs_event_state_show()
1815 event_state = &event_mgr->event_state[dump_event[i].type]; in amdgpu_ras_sysfs_event_state_show()
1818 atomic64_read(&event_state->count), in amdgpu_ras_sysfs_event_state_show()
1819 event_state->last_seqno); in amdgpu_ras_sysfs_event_state_show()
1829 if (adev->dev->kobj.sd) in amdgpu_ras_sysfs_remove_bad_page_node()
1830 sysfs_remove_file_from_group(&adev->dev->kobj, in amdgpu_ras_sysfs_remove_bad_page_node()
1831 &con->badpages_attr.attr, in amdgpu_ras_sysfs_remove_bad_page_node()
1839 &con->features_attr.attr, in amdgpu_ras_sysfs_remove_dev_attr_node()
1840 &con->version_attr.attr, in amdgpu_ras_sysfs_remove_dev_attr_node()
1841 &con->schema_attr.attr, in amdgpu_ras_sysfs_remove_dev_attr_node()
1842 &con->event_state_attr.attr, in amdgpu_ras_sysfs_remove_dev_attr_node()
1850 if (adev->dev->kobj.sd) in amdgpu_ras_sysfs_remove_dev_attr_node()
1851 sysfs_remove_group(&adev->dev->kobj, &group); in amdgpu_ras_sysfs_remove_dev_attr_node()
1864 if (!obj || obj->attr_inuse) in amdgpu_ras_sysfs_create()
1865 return -EINVAL; in amdgpu_ras_sysfs_create()
1869 snprintf(obj->fs_data.sysfs_name, sizeof(obj->fs_data.sysfs_name), in amdgpu_ras_sysfs_create()
1870 "%s_err_count", head->name); in amdgpu_ras_sysfs_create()
1872 obj->sysfs_attr = (struct device_attribute){ in amdgpu_ras_sysfs_create()
1874 .name = obj->fs_data.sysfs_name, in amdgpu_ras_sysfs_create()
1879 sysfs_attr_init(&obj->sysfs_attr.attr); in amdgpu_ras_sysfs_create()
1881 if (sysfs_add_file_to_group(&adev->dev->kobj, in amdgpu_ras_sysfs_create()
1882 &obj->sysfs_attr.attr, in amdgpu_ras_sysfs_create()
1885 return -EINVAL; in amdgpu_ras_sysfs_create()
1888 obj->attr_inuse = 1; in amdgpu_ras_sysfs_create()
1901 if (!obj || !obj->attr_inuse) in amdgpu_ras_sysfs_remove()
1902 return -EINVAL; in amdgpu_ras_sysfs_remove()
1904 if (adev->dev->kobj.sd) in amdgpu_ras_sysfs_remove()
1905 sysfs_remove_file_from_group(&adev->dev->kobj, in amdgpu_ras_sysfs_remove()
1906 &obj->sysfs_attr.attr, in amdgpu_ras_sysfs_remove()
1908 obj->attr_inuse = 0; in amdgpu_ras_sysfs_remove()
1919 list_for_each_entry_safe(obj, tmp, &con->head, node) { in amdgpu_ras_sysfs_remove_all()
1920 amdgpu_ras_sysfs_remove(adev, &obj->head); in amdgpu_ras_sysfs_remove_all()
1941 * /sys/kernel/debug/dri/[0/1/2...]/ras/auto_reboot
1945 * .. code-block:: bash
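A sketch for the auto_reboot toggle, assuming DRI instance 0; it is created as a plain boolean debugfs node (see the debugfs_create_bool() call below), so writing 1 opts in and 0 opts out:

.. code-block:: bash

   # opt in to the auto-reboot behavior on fatal RAS errors
   echo 1 > /sys/kernel/debug/dri/0/ras/auto_reboot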
1954 struct amdgpu_ras_eeprom_control *eeprom = &con->eeprom_control; in amdgpu_ras_debugfs_create_ctrl_node()
1955 struct drm_minor *minor = adev_to_drm(adev)->primary; in amdgpu_ras_debugfs_create_ctrl_node()
1958 dir = debugfs_create_dir(RAS_FS_NAME, minor->debugfs_root); in amdgpu_ras_debugfs_create_ctrl_node()
1964 &con->bad_page_cnt_threshold); in amdgpu_ras_debugfs_create_ctrl_node()
1965 debugfs_create_u32("ras_num_recs", 0444, dir, &eeprom->ras_num_recs); in amdgpu_ras_debugfs_create_ctrl_node()
1966 debugfs_create_x32("ras_hw_enabled", 0444, dir, &adev->ras_hw_enabled); in amdgpu_ras_debugfs_create_ctrl_node()
1967 debugfs_create_x32("ras_enabled", 0444, dir, &adev->ras_enabled); in amdgpu_ras_debugfs_create_ctrl_node()
1970 con->de_ras_eeprom_table = debugfs_create_file("ras_eeprom_table", in amdgpu_ras_debugfs_create_ctrl_node()
1973 amdgpu_ras_debugfs_set_ret_size(&con->eeprom_control); in amdgpu_ras_debugfs_create_ctrl_node()
1983 debugfs_create_bool("auto_reboot", S_IWUGO | S_IRUGO, dir, &con->reboot); in amdgpu_ras_debugfs_create_ctrl_node()
1990 &con->disable_ras_err_cnt_harvest); in amdgpu_ras_debugfs_create_ctrl_node()
1998 struct ras_manager *obj = amdgpu_ras_find_obj(adev, &head->head); in amdgpu_ras_debugfs_create()
2005 memcpy(obj->fs_data.debugfs_name, in amdgpu_ras_debugfs_create()
2006 head->debugfs_name, in amdgpu_ras_debugfs_create()
2007 sizeof(obj->fs_data.debugfs_name)); in amdgpu_ras_debugfs_create()
2009 debugfs_create_file(obj->fs_data.debugfs_name, S_IWUGO | S_IRUGO, dir, in amdgpu_ras_debugfs_create()
2047 list_for_each_entry(obj, &con->head, node) { in amdgpu_ras_debugfs_create_all()
2048 if (amdgpu_ras_is_supported(adev, obj->head.block) && in amdgpu_ras_debugfs_create_all()
2049 (obj->attr_inuse == 1)) { in amdgpu_ras_debugfs_create_all()
2051 get_ras_block_str(&obj->head)); in amdgpu_ras_debugfs_create_all()
2052 fs_info.head = obj->head; in amdgpu_ras_debugfs_create_all()
2085 &con->features_attr.attr, in amdgpu_ras_fs_init()
2086 &con->version_attr.attr, in amdgpu_ras_fs_init()
2087 &con->schema_attr.attr, in amdgpu_ras_fs_init()
2088 &con->event_state_attr.attr, in amdgpu_ras_fs_init()
2100 con->features_attr = dev_attr_features; in amdgpu_ras_fs_init()
2104 con->version_attr = dev_attr_version; in amdgpu_ras_fs_init()
2108 con->schema_attr = dev_attr_schema; in amdgpu_ras_fs_init()
2112 con->event_state_attr = dev_attr_event_state; in amdgpu_ras_fs_init()
2118 con->badpages_attr = bin_attr_gpu_vram_bad_pages; in amdgpu_ras_fs_init()
2119 bin_attrs[0] = &con->badpages_attr; in amdgpu_ras_fs_init()
2124 r = sysfs_create_group(&adev->dev->kobj, &group); in amdgpu_ras_fs_init()
2126 dev_err(adev->dev, "Failed to create RAS sysfs group!"); in amdgpu_ras_fs_init()
2137 list_for_each_entry_safe(con_obj, tmp, &con->head, node) { in amdgpu_ras_fs_fini()
2138 ip_obj = amdgpu_ras_find_obj(adev, &con_obj->head); in amdgpu_ras_fs_fini()
2162 * If the current interrupt is caused by a non-fatal RAS error, skip in amdgpu_ras_interrupt_fatal_error_handler()
2172 if (adev->nbio.ras && in amdgpu_ras_interrupt_fatal_error_handler()
2173 adev->nbio.ras->handle_ras_controller_intr_no_bifring) in amdgpu_ras_interrupt_fatal_error_handler()
2174 adev->nbio.ras->handle_ras_controller_intr_no_bifring(adev); in amdgpu_ras_interrupt_fatal_error_handler()
2176 if (adev->nbio.ras && in amdgpu_ras_interrupt_fatal_error_handler()
2177 adev->nbio.ras->handle_ras_err_event_athub_intr_no_bifring) in amdgpu_ras_interrupt_fatal_error_handler()
2178 adev->nbio.ras->handle_ras_err_event_athub_intr_no_bifring(adev); in amdgpu_ras_interrupt_fatal_error_handler()
2185 struct amdgpu_device *adev = obj->adev; in amdgpu_ras_interrupt_poison_consumption_handler()
2187 amdgpu_ras_get_ras_block(adev, obj->head.block, 0); in amdgpu_ras_interrupt_poison_consumption_handler()
2200 amdgpu_ras_set_err_poison(adev, block_obj->ras_comm.block); in amdgpu_ras_interrupt_poison_consumption_handler()
2205 if (block_obj->hw_ops && block_obj->hw_ops->query_poison_status) { in amdgpu_ras_interrupt_poison_consumption_handler()
2206 poison_stat = block_obj->hw_ops->query_poison_status(adev); in amdgpu_ras_interrupt_poison_consumption_handler()
2209 dev_info(adev->dev, "No RAS poison status in %s poison IH.\n", in amdgpu_ras_interrupt_poison_consumption_handler()
2210 block_obj->ras_comm.name); in amdgpu_ras_interrupt_poison_consumption_handler()
2216 amdgpu_umc_poison_handler(adev, obj->head.block, 0); in amdgpu_ras_interrupt_poison_consumption_handler()
2218 if (block_obj->hw_ops && block_obj->hw_ops->handle_poison_consumption) in amdgpu_ras_interrupt_poison_consumption_handler()
2219 poison_stat = block_obj->hw_ops->handle_poison_consumption(adev); in amdgpu_ras_interrupt_poison_consumption_handler()
2228 block_obj->ras_comm.name); in amdgpu_ras_interrupt_poison_consumption_handler()
2239 struct amdgpu_device *adev = obj->adev; in amdgpu_ras_interrupt_poison_creation_handler()
2251 if (amdgpu_ip_version(obj->adev, UMC_HWIP, 0) >= IP_VERSION(12, 0, 0)) { in amdgpu_ras_interrupt_poison_creation_handler()
2252 struct amdgpu_ras *con = amdgpu_ras_get_context(obj->adev); in amdgpu_ras_interrupt_poison_creation_handler()
2254 atomic_inc(&con->page_retirement_req_cnt); in amdgpu_ras_interrupt_poison_creation_handler()
2255 atomic_inc(&con->poison_creation_count); in amdgpu_ras_interrupt_poison_creation_handler()
2257 wake_up(&con->page_retirement_wq); in amdgpu_ras_interrupt_poison_creation_handler()
2264 struct ras_ih_data *data = &obj->ih_data; in amdgpu_ras_interrupt_umc_handler()
2268 if (!data->cb) in amdgpu_ras_interrupt_umc_handler()
2278 amdgpu_ras_set_fed(obj->adev, true); in amdgpu_ras_interrupt_umc_handler()
2279 ret = data->cb(obj->adev, &err_data, entry); in amdgpu_ras_interrupt_umc_handler()
2289 obj->err_data.ue_count += err_data.ue_count; in amdgpu_ras_interrupt_umc_handler()
2290 obj->err_data.ce_count += err_data.ce_count; in amdgpu_ras_interrupt_umc_handler()
2291 obj->err_data.de_count += err_data.de_count; in amdgpu_ras_interrupt_umc_handler()
2299 struct ras_ih_data *data = &obj->ih_data; in amdgpu_ras_interrupt_handler()
2302 while (data->rptr != data->wptr) { in amdgpu_ras_interrupt_handler()
2304 memcpy(&entry, &data->ring[data->rptr], in amdgpu_ras_interrupt_handler()
2305 data->element_size); in amdgpu_ras_interrupt_handler()
2308 data->rptr = (data->aligned_element_size + in amdgpu_ras_interrupt_handler()
2309 data->rptr) % data->ring_size; in amdgpu_ras_interrupt_handler()
2311 if (amdgpu_ras_is_poison_mode_supported(obj->adev)) { in amdgpu_ras_interrupt_handler()
2312 if (obj->head.block == AMDGPU_RAS_BLOCK__UMC) in amdgpu_ras_interrupt_handler()
2317 if (obj->head.block == AMDGPU_RAS_BLOCK__UMC) in amdgpu_ras_interrupt_handler()
2320 dev_warn(obj->adev->dev, in amdgpu_ras_interrupt_handler()
2321 "No RAS interrupt handler for non-UMC block with poison disabled.\n"); in amdgpu_ras_interrupt_handler()
2342 obj = amdgpu_ras_find_obj(adev, &info->head); in amdgpu_ras_interrupt_dispatch()
2344 return -EINVAL; in amdgpu_ras_interrupt_dispatch()
2346 data = &obj->ih_data; in amdgpu_ras_interrupt_dispatch()
2348 if (data->inuse == 0) in amdgpu_ras_interrupt_dispatch()
2352 memcpy(&data->ring[data->wptr], info->entry, in amdgpu_ras_interrupt_dispatch()
2353 data->element_size); in amdgpu_ras_interrupt_dispatch()
2356 data->wptr = (data->aligned_element_size + in amdgpu_ras_interrupt_dispatch()
2357 data->wptr) % data->ring_size; in amdgpu_ras_interrupt_dispatch()
2359 schedule_work(&data->ih_work); in amdgpu_ras_interrupt_dispatch()
2371 return -EINVAL; in amdgpu_ras_interrupt_remove_handler()
2373 data = &obj->ih_data; in amdgpu_ras_interrupt_remove_handler()
2374 if (data->inuse == 0) in amdgpu_ras_interrupt_remove_handler()
2377 cancel_work_sync(&data->ih_work); in amdgpu_ras_interrupt_remove_handler()
2379 kfree(data->ring); in amdgpu_ras_interrupt_remove_handler()
2397 return -EINVAL; in amdgpu_ras_interrupt_add_handler()
2403 data = &obj->ih_data; in amdgpu_ras_interrupt_add_handler()
2407 .cb = ras_obj->ras_cb, in amdgpu_ras_interrupt_add_handler()
2413 INIT_WORK(&data->ih_work, amdgpu_ras_interrupt_process_handler); in amdgpu_ras_interrupt_add_handler()
2415 data->aligned_element_size = ALIGN(data->element_size, 8); in amdgpu_ras_interrupt_add_handler()
2417 data->ring_size = 64 * data->aligned_element_size; in amdgpu_ras_interrupt_add_handler()
2418 data->ring = kmalloc(data->ring_size, GFP_KERNEL); in amdgpu_ras_interrupt_add_handler()
2419 if (!data->ring) { in amdgpu_ras_interrupt_add_handler()
2421 return -ENOMEM; in amdgpu_ras_interrupt_add_handler()
2425 data->inuse = 1; in amdgpu_ras_interrupt_add_handler()
2435 list_for_each_entry_safe(obj, tmp, &con->head, node) { in amdgpu_ras_interrupt_remove_all()
2436 amdgpu_ras_interrupt_remove_handler(adev, &obj->head); in amdgpu_ras_interrupt_remove_all()
2449 if (!adev->ras_enabled || !con) in amdgpu_ras_log_on_err_counter()
2452 list_for_each_entry(obj, &con->head, node) { in amdgpu_ras_log_on_err_counter()
2454 .head = obj->head, in amdgpu_ras_log_on_err_counter()
2486 dev_warn(adev->dev, "Failed to reset error counter and error status"); in amdgpu_ras_log_on_err_counter()
2500 if ((info->head.block != AMDGPU_RAS_BLOCK__GFX) && in amdgpu_ras_error_status_query()
2501 (info->head.block != AMDGPU_RAS_BLOCK__MMHUB)) in amdgpu_ras_error_status_query()
2505 info->head.block, in amdgpu_ras_error_status_query()
2506 info->head.sub_block_index); in amdgpu_ras_error_status_query()
2508 if (!block_obj || !block_obj->hw_ops) { in amdgpu_ras_error_status_query()
2509 dev_dbg_once(adev->dev, "%s has no RAS function configured\n", in amdgpu_ras_error_status_query()
2510 get_ras_block_str(&info->head)); in amdgpu_ras_error_status_query()
2514 if (block_obj->hw_ops->query_ras_error_status) in amdgpu_ras_error_status_query()
2515 block_obj->hw_ops->query_ras_error_status(adev); in amdgpu_ras_error_status_query()
2524 if (!adev->ras_enabled || !con) in amdgpu_ras_query_err_status()
2527 list_for_each_entry(obj, &con->head, node) { in amdgpu_ras_query_err_status()
2529 .head = obj->head, in amdgpu_ras_query_err_status()
2549 if (!con || !con->eh_data || !bps || !count) in amdgpu_ras_badpages_read()
2550 return -EINVAL; in amdgpu_ras_badpages_read()
2552 mutex_lock(&con->recovery_lock); in amdgpu_ras_badpages_read()
2553 data = con->eh_data; in amdgpu_ras_badpages_read()
2554 if (!data || data->count == 0) { in amdgpu_ras_badpages_read()
2556 ret = -EINVAL; in amdgpu_ras_badpages_read()
2560 *bps = kmalloc(sizeof(struct ras_badpage) * data->count, GFP_KERNEL); in amdgpu_ras_badpages_read()
2562 ret = -ENOMEM; in amdgpu_ras_badpages_read()
2566 for (; i < data->count; i++) { in amdgpu_ras_badpages_read()
2568 .bp = data->bps[i].retired_page, in amdgpu_ras_badpages_read()
2572 status = amdgpu_vram_mgr_query_page_status(&adev->mman.vram_mgr, in amdgpu_ras_badpages_read()
2573 data->bps[i].retired_page << AMDGPU_GPU_PAGE_SHIFT); in amdgpu_ras_badpages_read()
2574 if (status == -EBUSY) in amdgpu_ras_badpages_read()
2576 else if (status == -ENOENT) in amdgpu_ras_badpages_read()
2580 *count = data->count; in amdgpu_ras_badpages_read()
2582 mutex_unlock(&con->recovery_lock); in amdgpu_ras_badpages_read()
2592 list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) in amdgpu_ras_set_fed_all()
2606 hive_ras_recovery = atomic_read(&hive->ras_recovery); in amdgpu_ras_in_recovery()
2610 if (ras && (atomic_read(&ras->in_recovery) || hive_ras_recovery)) in amdgpu_ras_in_recovery()
2629 struct amdgpu_device *adev = ras->adev; in amdgpu_ras_do_recovery()
2635 atomic_set(&hive->ras_recovery, 1); in amdgpu_ras_do_recovery()
2642 list_for_each_entry(remote_adev, &hive->device_list, in amdgpu_ras_do_recovery()
2649 if (!ras->disable_ras_err_cnt_harvest) { in amdgpu_ras_do_recovery()
2652 if (hive && adev->gmc.xgmi.num_physical_nodes > 1) { in amdgpu_ras_do_recovery()
2653 device_list_handle = &hive->device_list; in amdgpu_ras_do_recovery()
2656 list_add_tail(&adev->gmc.xgmi.head, &device_list); in amdgpu_ras_do_recovery()
2669 if (amdgpu_device_should_recover_gpu(ras->adev)) { in amdgpu_ras_do_recovery()
2679 if (!amdgpu_ras_is_poison_mode_supported(ras->adev)) in amdgpu_ras_do_recovery()
2684 if (ras->gpu_reset_flags & AMDGPU_RAS_GPU_RESET_MODE2_RESET) { in amdgpu_ras_do_recovery()
2685 ras->gpu_reset_flags &= ~AMDGPU_RAS_GPU_RESET_MODE2_RESET; in amdgpu_ras_do_recovery()
2692 if (ras->gpu_reset_flags & AMDGPU_RAS_GPU_RESET_MODE1_RESET) { in amdgpu_ras_do_recovery()
2693 ras->gpu_reset_flags &= ~AMDGPU_RAS_GPU_RESET_MODE1_RESET; in amdgpu_ras_do_recovery()
2696 psp_fatal_error_recovery_quirk(&adev->psp); in amdgpu_ras_do_recovery()
2700 amdgpu_device_gpu_recover(ras->adev, NULL, &reset_context); in amdgpu_ras_do_recovery()
2702 atomic_set(&ras->in_recovery, 0); in amdgpu_ras_do_recovery()
2704 atomic_set(&hive->ras_recovery, 0); in amdgpu_ras_do_recovery()
2713 unsigned int old_space = data->count + data->space_left; in amdgpu_ras_realloc_eh_data_space()
2716 void *bps = kmalloc(align_space * sizeof(*data->bps), GFP_KERNEL); in amdgpu_ras_realloc_eh_data_space()
2719 return -ENOMEM; in amdgpu_ras_realloc_eh_data_space()
2722 if (data->bps) { in amdgpu_ras_realloc_eh_data_space()
2723 memcpy(bps, data->bps, in amdgpu_ras_realloc_eh_data_space()
2724 data->count * sizeof(*data->bps)); in amdgpu_ras_realloc_eh_data_space()
2725 kfree(data->bps); in amdgpu_ras_realloc_eh_data_space()
2728 data->bps = bps; in amdgpu_ras_realloc_eh_data_space()
2729 data->space_left += align_space - old_space; in amdgpu_ras_realloc_eh_data_space()
2741 if (adev->smuio.funcs && adev->smuio.funcs->get_socket_id) in amdgpu_ras_mca2pa_by_idx()
2742 socket = adev->smuio.funcs->get_socket_id(adev); in amdgpu_ras_mca2pa_by_idx()
2745 err_data->err_addr_cnt = 0; in amdgpu_ras_mca2pa_by_idx()
2746 err_data->err_addr_len = adev->umc.retire_unit; in amdgpu_ras_mca2pa_by_idx()
2749 addr_in.ma.err_addr = bps->address; in amdgpu_ras_mca2pa_by_idx()
2751 addr_in.ma.ch_inst = bps->mem_channel; in amdgpu_ras_mca2pa_by_idx()
2755 if (adev->umc.ras && adev->umc.ras->convert_ras_err_addr) in amdgpu_ras_mca2pa_by_idx()
2756 ret = adev->umc.ras->convert_ras_err_addr(adev, err_data, in amdgpu_ras_mca2pa_by_idx()
2769 if (adev->smuio.funcs && adev->smuio.funcs->get_socket_id) in amdgpu_ras_mca2pa()
2770 socket = adev->smuio.funcs->get_socket_id(adev); in amdgpu_ras_mca2pa()
2775 if (adev->umc.ras && adev->umc.ras->get_die_id_from_pa) in amdgpu_ras_mca2pa()
2776 die_id = adev->umc.ras->get_die_id_from_pa(adev, bps->address, in amdgpu_ras_mca2pa()
2777 bps->retired_page << AMDGPU_GPU_PAGE_SHIFT); in amdgpu_ras_mca2pa()
2779 return -EINVAL; in amdgpu_ras_mca2pa()
2782 err_data->err_addr_cnt = 0; in amdgpu_ras_mca2pa()
2783 err_data->err_addr_len = adev->umc.retire_unit; in amdgpu_ras_mca2pa()
2786 addr_in.ma.err_addr = bps->address; in amdgpu_ras_mca2pa()
2787 addr_in.ma.ch_inst = bps->mem_channel; in amdgpu_ras_mca2pa()
2788 addr_in.ma.umc_inst = bps->mcumc_id; in amdgpu_ras_mca2pa()
2792 if (adev->umc.ras && adev->umc.ras->convert_ras_err_addr) in amdgpu_ras_mca2pa()
2793 return adev->umc.ras->convert_ras_err_addr(adev, err_data, in amdgpu_ras_mca2pa()
2796 return -EINVAL; in amdgpu_ras_mca2pa()
2808 &adev->psp.ras_context.ras->eeprom_control; in amdgpu_ras_add_bad_pages()
2814 if (!con || !con->eh_data || !bps || pages <= 0) in amdgpu_ras_add_bad_pages()
2819 kcalloc(adev->umc.retire_unit, in amdgpu_ras_add_bad_pages()
2822 dev_warn(adev->dev, "Failed to alloc UMC error address record in mca2pa conversion!\n"); in amdgpu_ras_add_bad_pages()
2823 ret = -ENOMEM; in amdgpu_ras_add_bad_pages()
2828 loop_cnt = adev->umc.retire_unit; in amdgpu_ras_add_bad_pages()
2829 if (adev->gmc.gmc_funcs->query_mem_partition_mode) in amdgpu_ras_add_bad_pages()
2830 nps = adev->gmc.gmc_funcs->query_mem_partition_mode(adev); in amdgpu_ras_add_bad_pages()
2833 mutex_lock(&con->recovery_lock); in amdgpu_ras_add_bad_pages()
2834 data = con->eh_data; in amdgpu_ras_add_bad_pages()
2842 control->rec_type == AMDGPU_RAS_EEPROM_REC_MCA) { in amdgpu_ras_add_bad_pages()
2852 ret = -EINVAL; in amdgpu_ras_add_bad_pages()
2859 ret = -EOPNOTSUPP; in amdgpu_ras_add_bad_pages()
2866 ret = -EINVAL; in amdgpu_ras_add_bad_pages()
2875 ret = -EINVAL; in amdgpu_ras_add_bad_pages()
2891 /* non-nps1 mode, old RAS TA in amdgpu_ras_add_bad_pages()
2894 ret = -EOPNOTSUPP; in amdgpu_ras_add_bad_pages()
2901 i += (adev->umc.retire_unit - 1); in amdgpu_ras_add_bad_pages()
2912 if (!data->space_left && in amdgpu_ras_add_bad_pages()
2914 ret = -ENOMEM; in amdgpu_ras_add_bad_pages()
2920 memcpy(&data->bps[data->count], &(err_rec[j]), in amdgpu_ras_add_bad_pages()
2922 data->count++; in amdgpu_ras_add_bad_pages()
2923 data->space_left--; in amdgpu_ras_add_bad_pages()
2931 mutex_unlock(&con->recovery_lock); in amdgpu_ras_add_bad_pages()
2949 if (!con || !con->eh_data) { in amdgpu_ras_save_bad_pages()
2956 mutex_lock(&con->recovery_lock); in amdgpu_ras_save_bad_pages()
2957 control = &con->eeprom_control; in amdgpu_ras_save_bad_pages()
2958 data = con->eh_data; in amdgpu_ras_save_bad_pages()
2959 bad_page_num = control->ras_num_bad_pages; in amdgpu_ras_save_bad_pages()
2960 save_count = data->count - bad_page_num; in amdgpu_ras_save_bad_pages()
2961 mutex_unlock(&con->recovery_lock); in amdgpu_ras_save_bad_pages()
2963 unit_num = save_count / adev->umc.retire_unit; in amdgpu_ras_save_bad_pages()
2969 if (control->rec_type == AMDGPU_RAS_EEPROM_REC_PA) { in amdgpu_ras_save_bad_pages()
2971 &data->bps[control->ras_num_recs], in amdgpu_ras_save_bad_pages()
2973 dev_err(adev->dev, "Failed to save EEPROM table data!"); in amdgpu_ras_save_bad_pages()
2974 return -EIO; in amdgpu_ras_save_bad_pages()
2979 &data->bps[bad_page_num + i * adev->umc.retire_unit], in amdgpu_ras_save_bad_pages()
2981 dev_err(adev->dev, "Failed to save EEPROM table data!"); in amdgpu_ras_save_bad_pages()
2982 return -EIO; in amdgpu_ras_save_bad_pages()
2987 dev_info(adev->dev, "Saved %d pages to EEPROM table.\n", save_count); in amdgpu_ras_save_bad_pages()
3000 &adev->psp.ras_context.ras->eeprom_control; in amdgpu_ras_load_bad_pages()
3005 if (control->ras_num_recs == 0 || amdgpu_bad_page_threshold == 0) in amdgpu_ras_load_bad_pages()
3008 bps = kcalloc(control->ras_num_recs, sizeof(*bps), GFP_KERNEL); in amdgpu_ras_load_bad_pages()
3010 return -ENOMEM; in amdgpu_ras_load_bad_pages()
3012 ret = amdgpu_ras_eeprom_read(control, bps, control->ras_num_recs); in amdgpu_ras_load_bad_pages()
3014 dev_err(adev->dev, "Failed to load EEPROM table records!"); in amdgpu_ras_load_bad_pages()
3016 if (control->ras_num_recs > 1 && in amdgpu_ras_load_bad_pages()
3017 adev->umc.ras && adev->umc.ras->convert_ras_err_addr) { in amdgpu_ras_load_bad_pages()
3020 control->rec_type = AMDGPU_RAS_EEPROM_REC_PA; in amdgpu_ras_load_bad_pages()
3022 control->rec_type = AMDGPU_RAS_EEPROM_REC_MCA; in amdgpu_ras_load_bad_pages()
3031 ret = -EHWPOISON; in amdgpu_ras_load_bad_pages()
3035 ret = amdgpu_ras_add_bad_pages(adev, bps, control->ras_num_recs, true); in amdgpu_ras_load_bad_pages()
3046 struct ras_err_handler_data *data = con->eh_data; in amdgpu_ras_check_bad_page_unlock()
3050 for (i = 0; i < data->count; i++) in amdgpu_ras_check_bad_page_unlock()
3051 if (addr == data->bps[i].retired_page) in amdgpu_ras_check_bad_page_unlock()
3068 if (!con || !con->eh_data) in amdgpu_ras_check_bad_page()
3071 mutex_lock(&con->recovery_lock); in amdgpu_ras_check_bad_page()
3073 mutex_unlock(&con->recovery_lock); in amdgpu_ras_check_bad_page()
3086 * in eeprom or amdgpu_bad_page_threshold == -2, introduce two in amdgpu_ras_validate_threshold()
3090 * - If amdgpu_bad_page_threshold = -2, in amdgpu_ras_validate_threshold()
3093 * - When the value from user is 0 < amdgpu_bad_page_threshold < in amdgpu_ras_validate_threshold()
3097 * - If amdgpu_bad_page_threshold = 0, bad page retirement in amdgpu_ras_validate_threshold()
3103 u64 val = adev->gmc.mc_vram_size; in amdgpu_ras_validate_threshold()
3106 con->bad_page_cnt_threshold = min(lower_32_bits(val), in amdgpu_ras_validate_threshold()
3109 con->bad_page_cnt_threshold = min_t(int, max_count, in amdgpu_ras_validate_threshold()
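The threshold policy spelled out in the comment above is driven by a module parameter; a hedged sketch, assuming the usual amdgpu parameter name bad_page_threshold for the amdgpu_bad_page_threshold variable:

.. code-block:: bash

   # -2: derive the threshold from VRAM size (see the mc_vram_size math above)
   #  0: disable bad page retirement
   #  0 < n < max_count: use n directly
   modprobe amdgpu bad_page_threshold=-2
   # or on the kernel command line: amdgpu.bad_page_threshold=-2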
3129 ret = kfifo_put(&con->poison_fifo, poison_msg); in amdgpu_ras_put_poison_req()
3131 dev_err(adev->dev, "Poison message fifo is full!\n"); in amdgpu_ras_put_poison_req()
3132 return -ENOSPC; in amdgpu_ras_put_poison_req()
3143 return kfifo_get(&con->poison_fifo, poison_msg); in amdgpu_ras_get_poison_req()
3148 mutex_init(&ecc_log->lock); in amdgpu_ras_ecc_log_init()
3150 INIT_RADIX_TREE(&ecc_log->de_page_tree, GFP_KERNEL); in amdgpu_ras_ecc_log_init()
3151 ecc_log->de_queried_count = 0; in amdgpu_ras_ecc_log_init()
3152 ecc_log->prev_de_queried_count = 0; in amdgpu_ras_ecc_log_init()
3161 mutex_lock(&ecc_log->lock); in amdgpu_ras_ecc_log_fini()
3162 radix_tree_for_each_slot(slot, &ecc_log->de_page_tree, &iter, 0) { in amdgpu_ras_ecc_log_fini()
3164 kfree(ecc_err->err_pages.pfn); in amdgpu_ras_ecc_log_fini()
3166 radix_tree_iter_delete(&ecc_log->de_page_tree, &iter, slot); in amdgpu_ras_ecc_log_fini()
3168 mutex_unlock(&ecc_log->lock); in amdgpu_ras_ecc_log_fini()
3170 mutex_destroy(&ecc_log->lock); in amdgpu_ras_ecc_log_fini()
3171 ecc_log->de_queried_count = 0; in amdgpu_ras_ecc_log_fini()
3172 ecc_log->prev_de_queried_count = 0; in amdgpu_ras_ecc_log_fini()
3180 mutex_lock(&con->umc_ecc_log.lock); in amdgpu_ras_schedule_retirement_dwork()
3181 ret = radix_tree_tagged(&con->umc_ecc_log.de_page_tree, in amdgpu_ras_schedule_retirement_dwork()
3183 mutex_unlock(&con->umc_ecc_log.lock); in amdgpu_ras_schedule_retirement_dwork()
3186 schedule_delayed_work(&con->page_retirement_dwork, in amdgpu_ras_schedule_retirement_dwork()
3196 struct amdgpu_device *adev = con->adev; in amdgpu_ras_do_page_retirement()
3238 ecc_log = &ras->umc_ecc_log; in amdgpu_ras_poison_creation_handler()
3245 de_queried_count = ecc_log->de_queried_count; in amdgpu_ras_poison_creation_handler()
3246 if (de_queried_count > ecc_log->prev_de_queried_count) { in amdgpu_ras_poison_creation_handler()
3247 new_detect_count = de_queried_count - ecc_log->prev_de_queried_count; in amdgpu_ras_poison_creation_handler()
3248 ecc_log->prev_de_queried_count = de_queried_count; in amdgpu_ras_poison_creation_handler()
3261 if (!--timeout) { in amdgpu_ras_poison_creation_handler()
3271 dev_warn(adev->dev, "Can't find deferred error! count: %u\n", in amdgpu_ras_poison_creation_handler()
3272 (need_query_count - total_detect_count)); in amdgpu_ras_poison_creation_handler()
3273 return -ENOENT; in amdgpu_ras_poison_creation_handler()
3277 schedule_delayed_work(&ras->page_retirement_dwork, 0); in amdgpu_ras_poison_creation_handler()
3289 ret = kfifo_get(&con->poison_fifo, &msg); in amdgpu_ras_clear_poison_fifo()
3301 kgd2kfd_set_sram_ecc_flag(adev->kfd.dev); in amdgpu_ras_poison_consumption_handler()
3323 flush_delayed_work(&con->page_retirement_dwork); in amdgpu_ras_poison_consumption_handler()
3325 con->gpu_reset_flags |= reset; in amdgpu_ras_poison_consumption_handler()
3331 flush_work(&con->recovery_work); in amdgpu_ras_poison_consumption_handler()
3347 wait_event_interruptible(con->page_retirement_wq, in amdgpu_ras_page_retirement_thread()
3349 atomic_read(&con->page_retirement_req_cnt)); in amdgpu_ras_page_retirement_thread()
3357 poison_creation_count = atomic_read(&con->poison_creation_count); in amdgpu_ras_page_retirement_thread()
3359 if (ret == -EIO) in amdgpu_ras_page_retirement_thread()
3363 atomic_sub(poison_creation_count, &con->poison_creation_count); in amdgpu_ras_page_retirement_thread()
3364 atomic_sub(poison_creation_count, &con->page_retirement_req_cnt); in amdgpu_ras_page_retirement_thread()
3366 } while (atomic_read(&con->poison_creation_count)); in amdgpu_ras_page_retirement_thread()
3368 if (ret != -EIO) { in amdgpu_ras_page_retirement_thread()
3369 msg_count = kfifo_len(&con->poison_fifo); in amdgpu_ras_page_retirement_thread()
3373 if ((ret != -EIO) && in amdgpu_ras_page_retirement_thread()
3375 atomic_sub(msg_count, &con->page_retirement_req_cnt); in amdgpu_ras_page_retirement_thread()
3379 if ((ret == -EIO) || (gpu_reset == AMDGPU_RAS_GPU_RESET_MODE1_RESET)) { in amdgpu_ras_page_retirement_thread()
3380 /* a gpu mode-1 reset is ongoing, or a ras mode-1 reset just completed */ in amdgpu_ras_page_retirement_thread()
3382 atomic_set(&con->poison_creation_count, 0); in amdgpu_ras_page_retirement_thread()
3388 atomic_set(&con->page_retirement_req_cnt, 0); in amdgpu_ras_page_retirement_thread()
3390 if (ret == -EIO) { in amdgpu_ras_page_retirement_thread()
3391 /* Wait for mode-1 reset to complete */ in amdgpu_ras_page_retirement_thread()
3392 down_read(&adev->reset_domain->sem); in amdgpu_ras_page_retirement_thread()
3393 up_read(&adev->reset_domain->sem); in amdgpu_ras_page_retirement_thread()
3397 schedule_delayed_work(&con->page_retirement_dwork, 0); in amdgpu_ras_page_retirement_thread()
3399 /* gpu just completed mode-2 reset or other reset */ in amdgpu_ras_page_retirement_thread()
3401 msg_count = kfifo_len(&con->poison_fifo); in amdgpu_ras_page_retirement_thread()
3404 atomic_sub(msg_count, &con->page_retirement_req_cnt); in amdgpu_ras_page_retirement_thread()
3408 schedule_delayed_work(&con->page_retirement_dwork, 0); in amdgpu_ras_page_retirement_thread()
3424 control = &con->eeprom_control; in amdgpu_ras_init_badpage_info()
3429 if (!adev->umc.ras || !adev->umc.ras->convert_ras_err_addr) in amdgpu_ras_init_badpage_info()
3430 control->rec_type = AMDGPU_RAS_EEPROM_REC_PA; in amdgpu_ras_init_badpage_info()
3433 if (control->ras_num_recs <= 1 && in amdgpu_ras_init_badpage_info()
3434 adev->umc.ras && adev->umc.ras->convert_ras_err_addr) in amdgpu_ras_init_badpage_info()
3435 control->rec_type = AMDGPU_RAS_EEPROM_REC_MCA; in amdgpu_ras_init_badpage_info()
3437 if (control->ras_num_recs) { in amdgpu_ras_init_badpage_info()
3443 adev, control->ras_num_bad_pages); in amdgpu_ras_init_badpage_info()
3445 if (con->update_channel_flag == true) { in amdgpu_ras_init_badpage_info()
3447 adev, control->bad_channel_bitmap); in amdgpu_ras_init_badpage_info()
3448 con->update_channel_flag = false; in amdgpu_ras_init_badpage_info()
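/*
 * Editor's sketch of the record-format choice above: without a UMC
 * address converter the EEPROM table must hold physical-address (PA)
 * records; with one, a table holding at most one record may switch to
 * the MCA format. Condensed from the logic shown; assumes the driver's
 * amdgpu_ras_eeprom.h definitions.
 */
#include "amdgpu_ras_eeprom.h"

static void pick_eeprom_rec_type(struct amdgpu_ras_eeprom_control *control,
				 bool has_addr_converter)
{
	if (!has_addr_converter)
		control->rec_type = AMDGPU_RAS_EEPROM_REC_PA;
	else if (control->ras_num_recs <= 1)
		control->rec_type = AMDGPU_RAS_EEPROM_REC_MCA;
}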
3467 * adev->ras_enabled is unset, i.e. when "ras_enable" in amdgpu_ras_recovery_init()
3470 con->adev = adev; in amdgpu_ras_recovery_init()
3472 if (!adev->ras_enabled) in amdgpu_ras_recovery_init()
3475 data = &con->eh_data; in amdgpu_ras_recovery_init()
3478 ret = -ENOMEM; in amdgpu_ras_recovery_init()
3482 mutex_init(&con->recovery_lock); in amdgpu_ras_recovery_init()
3483 INIT_WORK(&con->recovery_work, amdgpu_ras_do_recovery); in amdgpu_ras_recovery_init()
3484 atomic_set(&con->in_recovery, 0); in amdgpu_ras_recovery_init()
3485 con->eeprom_control.bad_channel_bitmap = 0; in amdgpu_ras_recovery_init()
3487 max_eeprom_records_count = amdgpu_ras_eeprom_max_record_count(&con->eeprom_control); in amdgpu_ras_recovery_init()
3496 mutex_init(&con->page_rsv_lock); in amdgpu_ras_recovery_init()
3497 INIT_KFIFO(con->poison_fifo); in amdgpu_ras_recovery_init()
3498 mutex_init(&con->page_retirement_lock); in amdgpu_ras_recovery_init()
3499 init_waitqueue_head(&con->page_retirement_wq); in amdgpu_ras_recovery_init()
3500 atomic_set(&con->page_retirement_req_cnt, 0); in amdgpu_ras_recovery_init()
3501 atomic_set(&con->poison_creation_count, 0); in amdgpu_ras_recovery_init()
3502 con->page_retirement_thread = in amdgpu_ras_recovery_init()
3504 if (IS_ERR(con->page_retirement_thread)) { in amdgpu_ras_recovery_init()
3505 con->page_retirement_thread = NULL; in amdgpu_ras_recovery_init()
3506 dev_warn(adev->dev, "Failed to create umc_page_retirement thread!\n"); in amdgpu_ras_recovery_init()
3509 INIT_DELAYED_WORK(&con->page_retirement_dwork, amdgpu_ras_do_page_retirement); in amdgpu_ras_recovery_init()
3510 amdgpu_ras_ecc_log_init(&con->umc_ecc_log); in amdgpu_ras_recovery_init()
3512 if ((adev->asic_type == CHIP_ALDEBARAN) && in amdgpu_ras_recovery_init()
3513 (adev->gmc.xgmi.connected_to_cpu)) in amdgpu_ras_recovery_init()
3519 kfree((*data)->bps); in amdgpu_ras_recovery_init()
3521 con->eh_data = NULL; in amdgpu_ras_recovery_init()
3523 dev_warn(adev->dev, "Failed to initialize ras recovery! (%d)\n", ret); in amdgpu_ras_recovery_init()
3532 ret = -EINVAL; in amdgpu_ras_recovery_init()
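/*
 * Editor's sketch of the init/rollback idiom above: allocate the error
 * handler data, set up locks, counters and the retirement thread, and on
 * failure unwind through a single label (kfree(NULL) is a no-op). All
 * names are illustrative; retire_thread_fn is e.g. the sketch earlier.
 */
#include <linux/slab.h>
#include <linux/mutex.h>
#include <linux/kthread.h>
#include <linux/atomic.h>
#include <linux/err.h>

struct eh_data { int count; };		/* stand-in payload */

struct recovery_ctx {
	struct mutex recovery_lock;
	atomic_t in_recovery;
	struct task_struct *thread;
	struct eh_data *eh_data;
};

int retire_thread_fn(void *data);	/* thread body, sketched above */

static int recovery_init(struct recovery_ctx *ctx)
{
	int ret = 0;

	ctx->eh_data = kzalloc(sizeof(*ctx->eh_data), GFP_KERNEL);
	if (!ctx->eh_data) {
		ret = -ENOMEM;
		goto free;
	}

	mutex_init(&ctx->recovery_lock);
	atomic_set(&ctx->in_recovery, 0);

	ctx->thread = kthread_run(retire_thread_fn, ctx, "umc_page_retirement");
	if (IS_ERR(ctx->thread))
		ctx->thread = NULL;	/* degrade gracefully, as above */

	return 0;

free:
	kfree(ctx->eh_data);
	ctx->eh_data = NULL;
	return ret;
}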
3540 struct ras_err_handler_data *data = con->eh_data; in amdgpu_ras_recovery_fini()
3550 flush_delayed_work(&con->page_retirement_dwork); in amdgpu_ras_recovery_fini()
3552 } while (ret && max_flush_timeout--); in amdgpu_ras_recovery_fini()
3554 if (con->page_retirement_thread) in amdgpu_ras_recovery_fini()
3555 kthread_stop(con->page_retirement_thread); in amdgpu_ras_recovery_fini()
3557 atomic_set(&con->page_retirement_req_cnt, 0); in amdgpu_ras_recovery_fini()
3558 atomic_set(&con->poison_creation_count, 0); in amdgpu_ras_recovery_fini()
3560 mutex_destroy(&con->page_rsv_lock); in amdgpu_ras_recovery_fini()
3562 cancel_work_sync(&con->recovery_work); in amdgpu_ras_recovery_fini()
3564 cancel_delayed_work_sync(&con->page_retirement_dwork); in amdgpu_ras_recovery_fini()
3566 amdgpu_ras_ecc_log_fini(&con->umc_ecc_log); in amdgpu_ras_recovery_fini()
3568 mutex_lock(&con->recovery_lock); in amdgpu_ras_recovery_fini()
3569 con->eh_data = NULL; in amdgpu_ras_recovery_fini()
3570 kfree(data->bps); in amdgpu_ras_recovery_fini()
3572 mutex_unlock(&con->recovery_lock); in amdgpu_ras_recovery_fini()
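/*
 * Editor's sketch of the teardown ordering above: drain outstanding
 * retirement work with bounded retries, stop the kthread, cancel the
 * work items, and only then free the shared data under the lock so no
 * late worker can dereference it. Names (including the pending-work
 * check) are hypothetical.
 */
#include <linux/slab.h>
#include <linux/mutex.h>
#include <linux/kthread.h>
#include <linux/workqueue.h>

struct fini_ctx {
	struct mutex recovery_lock;
	struct task_struct *thread;
	struct work_struct recovery_work;
	struct delayed_work page_retirement_dwork;
	void *eh_data;
};

bool pages_still_pending(struct fini_ctx *ctx);	/* hypothetical check */

static void recovery_fini(struct fini_ctx *ctx)
{
	int max_flush_timeout = 5;

	do {
		flush_delayed_work(&ctx->page_retirement_dwork);
	} while (pages_still_pending(ctx) && max_flush_timeout--);

	if (ctx->thread)
		kthread_stop(ctx->thread);

	cancel_work_sync(&ctx->recovery_work);
	cancel_delayed_work_sync(&ctx->page_retirement_dwork);

	mutex_lock(&ctx->recovery_lock);
	kfree(ctx->eh_data);
	ctx->eh_data = NULL;
	mutex_unlock(&ctx->recovery_lock);
}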
3592 if (adev->asic_type == CHIP_IP_DISCOVERY) { in amdgpu_ras_asic_supported()
3606 return adev->asic_type == CHIP_VEGA10 || in amdgpu_ras_asic_supported()
3607 adev->asic_type == CHIP_VEGA20 || in amdgpu_ras_asic_supported()
3608 adev->asic_type == CHIP_ARCTURUS || in amdgpu_ras_asic_supported()
3609 adev->asic_type == CHIP_ALDEBARAN || in amdgpu_ras_asic_supported()
3610 adev->asic_type == CHIP_SIENNA_CICHLID; in amdgpu_ras_asic_supported()
3620 struct atom_context *ctx = adev->mode_info.atom_context; in amdgpu_ras_get_quirks()
3625 if (strnstr(ctx->vbios_pn, "D16406", in amdgpu_ras_get_quirks()
3626 sizeof(ctx->vbios_pn)) || in amdgpu_ras_get_quirks()
3627 strnstr(ctx->vbios_pn, "D36002", in amdgpu_ras_get_quirks()
3628 sizeof(ctx->vbios_pn))) in amdgpu_ras_get_quirks()
3629 adev->ras_hw_enabled |= (1 << AMDGPU_RAS_BLOCK__GFX); in amdgpu_ras_get_quirks()
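/*
 * Editor's sketch: the quirk above keys off the VBIOS part number with
 * strnstr(), bounded by the field size so an unterminated string cannot
 * overrun. Hypothetical helper around the two part numbers shown.
 */
#include <linux/string.h>

static bool vbios_forces_gfx_ras(const char *vbios_pn, size_t len)
{
	return strnstr(vbios_pn, "D16406", len) ||
	       strnstr(vbios_pn, "D36002", len);
}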
3637 dev_info(adev->dev, "MEM ECC is active.\n"); in amdgpu_ras_query_ras_capablity_from_vbios()
3638 adev->ras_hw_enabled |= (1 << AMDGPU_RAS_BLOCK__UMC | in amdgpu_ras_query_ras_capablity_from_vbios()
3641 dev_info(adev->dev, "MEM ECC is not present.\n"); in amdgpu_ras_query_ras_capablity_from_vbios()
3646 dev_info(adev->dev, "SRAM ECC is active.\n"); in amdgpu_ras_query_ras_capablity_from_vbios()
3648 adev->ras_hw_enabled |= ~(1 << AMDGPU_RAS_BLOCK__UMC | in amdgpu_ras_query_ras_capablity_from_vbios()
3651 adev->ras_hw_enabled |= (1 << AMDGPU_RAS_BLOCK__PCIE_BIF | in amdgpu_ras_query_ras_capablity_from_vbios()
3662 adev->ras_hw_enabled |= (1 << AMDGPU_RAS_BLOCK__VCN | in amdgpu_ras_query_ras_capablity_from_vbios()
3665 adev->ras_hw_enabled &= ~(1 << AMDGPU_RAS_BLOCK__VCN | in amdgpu_ras_query_ras_capablity_from_vbios()
3672 if (!adev->gmc.xgmi.num_physical_nodes) in amdgpu_ras_query_ras_capablity_from_vbios()
3673 adev->ras_hw_enabled &= ~(1 << AMDGPU_RAS_BLOCK__XGMI_WAFL); in amdgpu_ras_query_ras_capablity_from_vbios()
3675 dev_info(adev->dev, "SRAM ECC is not present.\n"); in amdgpu_ras_query_ras_capablity_from_vbios()
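/*
 * Editor's sketch of the capability masking above: MEM ECC gates the
 * memory (UMC) block, SRAM ECC gates every other block, and blocks with
 * no backing hardware (e.g. XGMI/WAFL with no xGMI nodes) are masked
 * back out. Simplified; the block enum comes from amdgpu_ras.h.
 */
#include <linux/bits.h>
#include "amdgpu_ras.h"

static unsigned long build_ras_hw_mask(bool mem_ecc, bool sram_ecc,
				       int xgmi_nodes)
{
	unsigned long mask = 0;

	if (mem_ecc)
		mask |= BIT(AMDGPU_RAS_BLOCK__UMC);
	if (sram_ecc)
		mask |= ~BIT(AMDGPU_RAS_BLOCK__UMC);	/* all non-UMC blocks */
	if (!xgmi_nodes)
		mask &= ~BIT(AMDGPU_RAS_BLOCK__XGMI_WAFL);

	return mask;
}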
3690 if (adev->gmc.xgmi.connected_to_cpu || in amdgpu_ras_query_poison_mode()
3691 adev->gmc.is_app_apu) { in amdgpu_ras_query_poison_mode()
3693 con->poison_supported = true; in amdgpu_ras_query_poison_mode()
3694 } else if (adev->df.funcs && in amdgpu_ras_query_poison_mode()
3695 adev->df.funcs->query_ras_poison_mode && in amdgpu_ras_query_poison_mode()
3696 adev->umc.ras && in amdgpu_ras_query_poison_mode()
3697 adev->umc.ras->query_ras_poison_mode) { in amdgpu_ras_query_poison_mode()
3699 adev->df.funcs->query_ras_poison_mode(adev); in amdgpu_ras_query_poison_mode()
3701 adev->umc.ras->query_ras_poison_mode(adev); in amdgpu_ras_query_poison_mode()
3705 con->poison_supported = true; in amdgpu_ras_query_poison_mode()
3707 dev_warn(adev->dev, in amdgpu_ras_query_poison_mode()
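/*
 * Editor's sketch of the agreement check above: poison mode is only
 * advertised when both the DF and UMC sides report it enabled, and a
 * mismatch is logged. The query wrappers and caps struct below are
 * hypothetical.
 */
#include <linux/device.h>

struct poison_caps { bool poison_supported; };

bool df_poison_mode_enabled(struct device *dev);	/* hypothetical */
bool umc_poison_mode_enabled(struct device *dev);	/* hypothetical */

static void query_poison_mode(struct device *dev, struct poison_caps *caps)
{
	bool df = df_poison_mode_enabled(dev);
	bool umc = umc_poison_mode_enabled(dev);

	if (df && umc)
		caps->poison_supported = true;
	else if (df != umc)
		dev_warn(dev, "df/umc poison settings disagree\n");
}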
3724 adev->ras_hw_enabled = adev->ras_enabled = 0; in amdgpu_ras_check_supported()
3735 if (amdgpu_psp_get_ras_capability(&adev->psp)) in amdgpu_ras_check_supported()
3739 if (!adev->gmc.xgmi.connected_to_cpu && !adev->gmc.is_app_apu) { in amdgpu_ras_check_supported()
3744 adev->ras_hw_enabled |= (1 << AMDGPU_RAS_BLOCK__GFX | in amdgpu_ras_check_supported()
3757 adev->ras_hw_enabled &= AMDGPU_RAS_BLOCK_MASK; in amdgpu_ras_check_supported()
3759 adev->ras_enabled = amdgpu_ras_enable == 0 ? 0 : in amdgpu_ras_check_supported()
3760 adev->ras_hw_enabled & amdgpu_ras_mask; in amdgpu_ras_check_supported()
3763 adev->aca.is_enabled = false; in amdgpu_ras_check_supported()
3766 if (adev->gmc.is_app_apu && in amdgpu_ras_check_supported()
3775 struct amdgpu_device *adev = con->adev; in amdgpu_ras_counte_dw()
3780 res = pm_runtime_get_sync(dev->dev); in amdgpu_ras_counte_dw()
3787 atomic_set(&con->ras_ce_count, ce_count); in amdgpu_ras_counte_dw()
3788 atomic_set(&con->ras_ue_count, ue_count); in amdgpu_ras_counte_dw()
3791 pm_runtime_mark_last_busy(dev->dev); in amdgpu_ras_counte_dw()
3793 pm_runtime_put_autosuspend(dev->dev); in amdgpu_ras_counte_dw()
3804 static void ras_event_mgr_init(struct ras_event_manager *mgr) in ras_event_mgr_init() argument
3809 memset(mgr, 0, sizeof(*mgr)); in ras_event_mgr_init()
3810 atomic64_set(&mgr->seqno, 0); in ras_event_mgr_init()
3812 for (i = 0; i < ARRAY_SIZE(mgr->event_state); i++) { in ras_event_mgr_init()
3813 event_state = &mgr->event_state[i]; in ras_event_mgr_init()
3814 event_state->last_seqno = RAS_EVENT_INVALID_ID; in ras_event_mgr_init()
3815 atomic64_set(&event_state->count, 0); in ras_event_mgr_init()
3828 ras->event_mgr = hive ? &hive->event_mgr : &ras->__event_mgr; in amdgpu_ras_event_mgr_init()
3832 if (!hive || adev->gmc.xgmi.node_id == 0) in amdgpu_ras_event_mgr_init()
3833 ras_event_mgr_init(ras->event_mgr); in amdgpu_ras_event_mgr_init()
3844 if (!con || (adev->flags & AMD_IS_APU)) in amdgpu_ras_init_reserved_vram_size()
3852 con->reserved_pages_in_bytes = AMDGPU_RAS_RESERVED_VRAM_SIZE; in amdgpu_ras_init_reserved_vram_size()
3872 return -ENOMEM; in amdgpu_ras_init()
3874 con->adev = adev; in amdgpu_ras_init()
3875 INIT_DELAYED_WORK(&con->ras_counte_delay_work, amdgpu_ras_counte_dw); in amdgpu_ras_init()
3876 atomic_set(&con->ras_ce_count, 0); in amdgpu_ras_init()
3877 atomic_set(&con->ras_ue_count, 0); in amdgpu_ras_init()
3879 con->objs = (struct ras_manager *)(con + 1); in amdgpu_ras_init()
3885 if (!adev->ras_enabled || adev->asic_type == CHIP_VEGA10) { in amdgpu_ras_init()
3889 if (!adev->ras_enabled && adev->asic_type == CHIP_VEGA20) { in amdgpu_ras_init()
3890 con->features |= BIT(AMDGPU_RAS_BLOCK__GFX); in amdgpu_ras_init()
3899 con->update_channel_flag = false; in amdgpu_ras_init()
3900 con->features = 0; in amdgpu_ras_init()
3901 con->schema = 0; in amdgpu_ras_init()
3902 INIT_LIST_HEAD(&con->head); in amdgpu_ras_init()
3904 con->flags = RAS_DEFAULT_FLAGS; in amdgpu_ras_init()
3913 if (!adev->gmc.xgmi.connected_to_cpu) in amdgpu_ras_init()
3914 adev->nbio.ras = &nbio_v7_4_ras; in amdgpu_ras_init()
3917 if (adev->ras_hw_enabled & (1 << AMDGPU_RAS_BLOCK__DF)) in amdgpu_ras_init()
3924 adev->nbio.ras = &nbio_v4_3_ras; in amdgpu_ras_init()
3927 if (adev->ras_hw_enabled & (1 << AMDGPU_RAS_BLOCK__DF)) in amdgpu_ras_init()
3935 adev->nbio.ras = &nbif_v6_3_1_ras; in amdgpu_ras_init()
3939 if (!adev->gmc.is_app_apu) in amdgpu_ras_init()
3940 adev->nbio.ras = &nbio_v7_9_ras; in amdgpu_ras_init()
3953 if (adev->nbio.ras && in amdgpu_ras_init()
3954 adev->nbio.ras->init_ras_controller_interrupt) { in amdgpu_ras_init()
3955 r = adev->nbio.ras->init_ras_controller_interrupt(adev); in amdgpu_ras_init()
3960 if (adev->nbio.ras && in amdgpu_ras_init()
3961 adev->nbio.ras->init_ras_err_event_athub_interrupt) { in amdgpu_ras_init()
3962 r = adev->nbio.ras->init_ras_err_event_athub_interrupt(adev); in amdgpu_ras_init()
3968 if (adev->smuio.funcs && in amdgpu_ras_init()
3969 adev->smuio.funcs->get_socket_id) in amdgpu_ras_init()
3970 con->features |= ((adev->smuio.funcs->get_socket_id(adev)) << in amdgpu_ras_init()
3974 con->schema = amdgpu_get_ras_schema(adev); in amdgpu_ras_init()
3979 r = -EINVAL; in amdgpu_ras_init()
3992 dev_info(adev->dev, "RAS INFO: ras initialized successfully, " in amdgpu_ras_init()
3994 adev->ras_hw_enabled, adev->ras_enabled); in amdgpu_ras_init()
4006 if (adev->gmc.xgmi.connected_to_cpu || in amdgpu_persistent_edc_harvesting_supported()
4007 adev->gmc.is_app_apu) in amdgpu_persistent_edc_harvesting_supported()
4025 if (amdgpu_ras_reset_error_status(adev, ras_block->block) != 0) in amdgpu_persistent_edc_harvesting()
4038 return con->poison_supported; in amdgpu_ras_is_poison_mode_supported()
4052 if (!amdgpu_ras_is_supported(adev, ras_block->block)) { in amdgpu_ras_block_late_init()
4059 if (adev->in_suspend || amdgpu_reset_in_recovery(adev)) { in amdgpu_ras_block_late_init()
4071 if (adev->in_suspend || amdgpu_reset_in_recovery(adev)) in amdgpu_ras_block_late_init()
4075 if (ras_obj->ras_cb || (ras_obj->hw_ops && in amdgpu_ras_block_late_init()
4076 (ras_obj->hw_ops->query_poison_status || in amdgpu_ras_block_late_init()
4077 ras_obj->hw_ops->handle_poison_consumption))) { in amdgpu_ras_block_late_init()
4083 if (ras_obj->hw_ops && in amdgpu_ras_block_late_init()
4084 (ras_obj->hw_ops->query_ras_error_count || in amdgpu_ras_block_late_init()
4085 ras_obj->hw_ops->query_ras_error_status)) { in amdgpu_ras_block_late_init()
4094 return -ENOMEM; in amdgpu_ras_block_late_init()
4095 memcpy(&query_info->head, ras_block, sizeof(struct ras_common_if)); in amdgpu_ras_block_late_init()
4098 atomic_set(&con->ras_ce_count, ce_count); in amdgpu_ras_block_late_init()
4099 atomic_set(&con->ras_ue_count, ue_count); in amdgpu_ras_block_late_init()
4108 if (ras_obj->ras_cb) in amdgpu_ras_block_late_init()
4132 if (ras_obj->ras_cb) in amdgpu_ras_block_late_fini()
4150 if (!adev->ras_enabled || !con) { in amdgpu_ras_resume()
4157 if (con->flags & AMDGPU_RAS_FLAG_INIT_BY_VBIOS) { in amdgpu_ras_resume()
4169 list_for_each_entry_safe(obj, tmp, &con->head, node) { in amdgpu_ras_resume()
4170 if (!amdgpu_ras_is_supported(adev, obj->head.block)) { in amdgpu_ras_resume()
4171 amdgpu_ras_feature_enable(adev, &obj->head, 0); in amdgpu_ras_resume()
4183 if (!adev->ras_enabled || !con) in amdgpu_ras_suspend()
4188 if (AMDGPU_RAS_GET_FEATURES(con->features)) in amdgpu_ras_suspend()
4222 list_for_each_entry_safe(node, tmp, &adev->ras_list, node) { in amdgpu_ras_late_init()
4223 obj = node->ras_obj; in amdgpu_ras_late_init()
4225 dev_warn(adev->dev, "Warning: abnormal ras list node.\n"); in amdgpu_ras_late_init()
4229 if (!amdgpu_ras_is_supported(adev, obj->ras_comm.block)) in amdgpu_ras_late_init()
4232 if (obj->ras_late_init) { in amdgpu_ras_late_init()
4233 r = obj->ras_late_init(adev, &obj->ras_comm); in amdgpu_ras_late_init()
4235 dev_err(adev->dev, "%s failed to execute ras_late_init! ret:%d\n", in amdgpu_ras_late_init()
4236 obj->ras_comm.name, r); in amdgpu_ras_late_init()
4240 amdgpu_ras_block_late_init_default(adev, &obj->ras_comm); in amdgpu_ras_late_init()
4251 if (!adev->ras_enabled || !con) in amdgpu_ras_pre_fini()
4256 if (AMDGPU_RAS_GET_FEATURES(con->features)) in amdgpu_ras_pre_fini()
4268 if (!adev->ras_enabled || !con) in amdgpu_ras_fini()
4271 list_for_each_entry_safe(ras_node, tmp, &adev->ras_list, node) { in amdgpu_ras_fini()
4272 if (ras_node->ras_obj) { in amdgpu_ras_fini()
4273 obj = ras_node->ras_obj; in amdgpu_ras_fini()
4274 if (amdgpu_ras_is_supported(adev, obj->ras_comm.block) && in amdgpu_ras_fini()
4275 obj->ras_fini) in amdgpu_ras_fini()
4276 obj->ras_fini(adev, &obj->ras_comm); in amdgpu_ras_fini()
4278 amdgpu_ras_block_late_fini_default(adev, &obj->ras_comm); in amdgpu_ras_fini()
4282 list_del(&ras_node->node); in amdgpu_ras_fini()
4296 WARN(AMDGPU_RAS_GET_FEATURES(con->features), "Feature mask is not cleared"); in amdgpu_ras_fini()
4298 if (AMDGPU_RAS_GET_FEATURES(con->features)) in amdgpu_ras_fini()
4301 cancel_delayed_work_sync(&con->ras_counte_delay_work); in amdgpu_ras_fini()
4317 return test_bit(AMDGPU_RAS_BLOCK__LAST, &ras->ras_err_state); in amdgpu_ras_get_fed_status()
4327 set_bit(AMDGPU_RAS_BLOCK__LAST, &ras->ras_err_state); in amdgpu_ras_set_fed()
4329 clear_bit(AMDGPU_RAS_BLOCK__LAST, &ras->ras_err_state); in amdgpu_ras_set_fed()
4339 ras->ras_err_state = 0; in amdgpu_ras_clear_err_state()
4349 set_bit(block, &ras->ras_err_state); in amdgpu_ras_set_err_poison()
4359 return (ras->ras_err_state != 0); in amdgpu_ras_is_err_state()
4361 return test_bit(block, &ras->ras_err_state) || in amdgpu_ras_is_err_state()
4363 &ras->ras_err_state); in amdgpu_ras_is_err_state()
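/*
 * Editor's sketch of the state encoding above: ras_err_state is a bitmap
 * with one bit per RAS block, and the AMDGPU_RAS_BLOCK__LAST bit doubles
 * as the global "fatal error detected" (FED) flag. Helpers below are
 * illustrative; the enum comes from amdgpu_ras.h.
 */
#include <linux/bitops.h>
#include "amdgpu_ras.h"

static void set_fed(unsigned long *err_state, bool on)
{
	if (on)
		set_bit(AMDGPU_RAS_BLOCK__LAST, err_state);
	else
		clear_bit(AMDGPU_RAS_BLOCK__LAST, err_state);
}

static bool block_in_err_state(unsigned long *err_state, int block)
{
	/* a block is in error if its own bit or the FED bit is set */
	return test_bit(block, err_state) ||
	       test_bit(AMDGPU_RAS_BLOCK__LAST, err_state);
}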
4377 return ras->event_mgr; in __get_ras_event_mgr()
4388 ret = -EINVAL; in amdgpu_ras_mark_ras_event_caller()
4394 ret = -EINVAL; in amdgpu_ras_mark_ras_event_caller()
4398 event_state = &event_mgr->event_state[type]; in amdgpu_ras_mark_ras_event_caller()
4399 event_state->last_seqno = atomic64_inc_return(&event_mgr->seqno); in amdgpu_ras_mark_ras_event_caller()
4400 atomic64_inc(&event_state->count); in amdgpu_ras_mark_ras_event_caller()
4404 dev_warn(adev->dev, "failed to mark ras event (%d) in %ps, ret:%d\n", in amdgpu_ras_mark_ras_event_caller()
4426 id = event_mgr->event_state[type].last_seqno; in amdgpu_ras_acquire_event_id()
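/*
 * Editor's sketch of the event bookkeeping above: one global 64-bit
 * sequence number is bumped for every marked event; each event type
 * records the last seqno it saw plus a running count, and acquiring an
 * event id just reads that last seqno back. Hypothetical layout.
 */
#include <linux/atomic.h>
#include <linux/types.h>

struct ev_state {
	u64 last_seqno;
	atomic64_t count;
};

struct ev_mgr {
	atomic64_t seqno;
	struct ev_state state[4];	/* one slot per event type */
};

static u64 mark_event(struct ev_mgr *mgr, int type)
{
	struct ev_state *st = &mgr->state[type];

	st->last_seqno = atomic64_inc_return(&mgr->seqno);
	atomic64_inc(&st->count);
	return st->last_seqno;
}

static u64 acquire_event_id(struct ev_mgr *mgr, int type)
{
	return mgr->state[type].last_seqno;
}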
4453 ras->gpu_reset_flags |= AMDGPU_RAS_GPU_RESET_MODE1_RESET; in amdgpu_ras_global_ras_isr()
4460 if (adev->asic_type == CHIP_VEGA20 && in amdgpu_ras_need_emergency_restart()
4461 adev->pm.fw_version <= 0x283400) { in amdgpu_ras_need_emergency_restart()
4476 if (!adev->ras_enabled && con->features & BIT(AMDGPU_RAS_BLOCK__GFX)) { in amdgpu_release_ras_context()
4477 con->features &= ~BIT(AMDGPU_RAS_BLOCK__GFX); in amdgpu_release_ras_context()
4492 if (adev && adev->gmc.xgmi.connected_to_cpu && in find_adev()
4493 adev->gmc.xgmi.physical_node_id == node_id) in find_adev()
4519 if (!m || !((smca_get_bank_type(m->extcpu, m->bank) == SMCA_UMC_V2) && in amdgpu_bad_page_notifier()
4520 (XEC(m->status, 0x3f) == 0x0))) in amdgpu_bad_page_notifier()
4532 gpu_id = GET_MCA_IPID_GPUID(m->ipid) - GPU_ID_OFFSET; in amdgpu_bad_page_notifier()
4545 umc_inst = GET_UMC_INST(m->ipid); in amdgpu_bad_page_notifier()
4546 ch_inst = GET_CHAN_INDEX(m->ipid); in amdgpu_bad_page_notifier()
4548 dev_info(adev->dev, "Uncorrectable error detected in UMC inst: %d, chan_idx: %d", in amdgpu_bad_page_notifier()
4551 if (!amdgpu_umc_page_retirement_mca(adev, m->addr, ch_inst, umc_inst)) in amdgpu_bad_page_notifier()
4590 return adev->psp.ras_context.ras; in amdgpu_ras_get_context()
4596 return -EINVAL; in amdgpu_ras_set_context()
4598 adev->psp.ras_context.ras = ras_con; in amdgpu_ras_set_context()
4612 ret = ras && (adev->ras_enabled & (1 << block)); in amdgpu_ras_is_supported()
4639 ras->gpu_reset_flags = 0; in amdgpu_ras_reset_gpu()
4640 ras->gpu_reset_flags |= AMDGPU_RAS_GPU_RESET_MODE1_RESET; in amdgpu_ras_reset_gpu()
4643 if (atomic_cmpxchg(&ras->in_recovery, 0, 1) == 0) { in amdgpu_ras_reset_gpu()
4648 hive_ras_recovery = atomic_read(&hive->ras_recovery); in amdgpu_ras_reset_gpu()
4656 amdgpu_reset_domain_schedule(ras->adev->reset_domain, &ras->recovery_work); in amdgpu_ras_reset_gpu()
4658 atomic_set(&ras->in_recovery, 0); in amdgpu_ras_reset_gpu()
4660 flush_work(&ras->recovery_work); in amdgpu_ras_reset_gpu()
4661 amdgpu_reset_domain_schedule(ras->adev->reset_domain, &ras->recovery_work); in amdgpu_ras_reset_gpu()
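/*
 * Editor's sketch of the single-recovery guard above: atomic_cmpxchg()
 * lets exactly one caller transition in_recovery from 0 to 1 and queue
 * the recovery work; if queueing fails, the flag is released so a later
 * caller can retry. Illustrative wrapper.
 */
#include <linux/atomic.h>

static void reset_gpu_once(atomic_t *in_recovery,
			   bool (*queue_recovery)(void))
{
	/* first caller wins; concurrent callers see 1 and back off */
	if (atomic_cmpxchg(in_recovery, 0, 1) == 0) {
		if (!queue_recovery())
			atomic_set(in_recovery, 0);	/* undo on failure */
	}
}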
4675 con->is_aca_debug_mode = enable; in amdgpu_ras_set_mca_debug_mode()
4692 con->is_aca_debug_mode = enable; in amdgpu_ras_set_aca_debug_mode()
4701 const struct aca_smu_funcs *smu_funcs = adev->aca.smu_funcs; in amdgpu_ras_get_aca_debug_mode()
4702 const struct amdgpu_mca_smu_funcs *mca_funcs = adev->mca.mca_funcs; in amdgpu_ras_get_aca_debug_mode()
4707 if ((amdgpu_aca_is_enabled(adev) && smu_funcs && smu_funcs->set_debug_mode) || in amdgpu_ras_get_aca_debug_mode()
4708 (!amdgpu_aca_is_enabled(adev) && mca_funcs && mca_funcs->mca_set_debug_mode)) in amdgpu_ras_get_aca_debug_mode()
4709 return con->is_aca_debug_mode; in amdgpu_ras_get_aca_debug_mode()
4718 const struct amdgpu_mca_smu_funcs *mca_funcs = adev->mca.mca_funcs; in amdgpu_ras_get_error_query_mode()
4719 const struct aca_smu_funcs *smu_funcs = adev->aca.smu_funcs; in amdgpu_ras_get_error_query_mode()
4728 } else if ((smu_funcs && smu_funcs->set_debug_mode) || (mca_funcs && mca_funcs->mca_set_debug_mode)) in amdgpu_ras_get_error_query_mode()
4730 (con->is_aca_debug_mode) ? AMDGPU_RAS_DIRECT_ERROR_QUERY : AMDGPU_RAS_FIRMWARE_ERROR_QUERY; in amdgpu_ras_get_error_query_mode()
4744 return -EINVAL; in amdgpu_ras_register_ras_block()
4748 return -ENOMEM; in amdgpu_ras_register_ras_block()
4750 INIT_LIST_HEAD(&ras_node->node); in amdgpu_ras_register_ras_block()
4751 ras_node->ras_obj = ras_block_obj; in amdgpu_ras_register_ras_block()
4752 list_add_tail(&ras_node->node, &adev->ras_list); in amdgpu_ras_register_ras_block()
4786 AMDGPU_RAS_REG_ENTRY_OFFSET(reg_entry->hwip, instance, in amdgpu_ras_inst_get_memory_id_field()
4787 reg_entry->seg_lo, reg_entry->reg_lo); in amdgpu_ras_inst_get_memory_id_field()
4790 if ((reg_entry->flags & AMDGPU_RAS_ERR_STATUS_VALID) && in amdgpu_ras_inst_get_memory_id_field()
4810 AMDGPU_RAS_REG_ENTRY_OFFSET(reg_entry->hwip, instance, in amdgpu_ras_inst_get_err_cnt_field()
4811 reg_entry->seg_hi, reg_entry->reg_hi); in amdgpu_ras_inst_get_err_cnt_field()
4814 if ((reg_entry->flags & AMDGPU_RAS_ERR_INFO_VALID) && in amdgpu_ras_inst_get_err_cnt_field()
4817 dev_dbg(adev->dev, "Invalid err_info field\n"); in amdgpu_ras_inst_get_err_cnt_field()
4857 dev_info(adev->dev, in amdgpu_ras_inst_query_ras_error_count()
4865 dev_info(adev->dev, in amdgpu_ras_inst_query_ras_error_count()
4901 INIT_LIST_HEAD(&err_data->err_node_list); in amdgpu_ras_error_data_init()
4911 list_del(&err_node->node); in amdgpu_ras_error_node_release()
4919 list_for_each_entry_safe(err_node, tmp, &err_data->err_node_list, node) in amdgpu_ras_error_data_fini()
4933 ref_id = &err_node->err_info.mcm_info; in amdgpu_ras_error_find_node_by_id()
4935 if (mcm_info->socket_id == ref_id->socket_id && in amdgpu_ras_error_find_node_by_id()
4936 mcm_info->die_id == ref_id->die_id) in amdgpu_ras_error_find_node_by_id()
4951 INIT_LIST_HEAD(&err_node->node); in amdgpu_ras_error_node_new()
4960 struct amdgpu_smuio_mcm_config_info *infoa = &nodea->err_info.mcm_info; in ras_err_info_cmp()
4961 struct amdgpu_smuio_mcm_config_info *infob = &nodeb->err_info.mcm_info; in ras_err_info_cmp()
4963 if (unlikely(infoa->socket_id != infob->socket_id)) in ras_err_info_cmp()
4964 return infoa->socket_id - infob->socket_id; in ras_err_info_cmp()
4966 return infoa->die_id - infob->die_id; in ras_err_info_cmp()
4978 return &err_node->err_info; in amdgpu_ras_error_get_info()
4984 memcpy(&err_node->err_info.mcm_info, mcm_info, sizeof(*mcm_info)); in amdgpu_ras_error_get_info()
4986 err_data->err_list_count++; in amdgpu_ras_error_get_info()
4987 list_add_tail(&err_node->node, &err_data->err_node_list); in amdgpu_ras_error_get_info()
4988 list_sort(NULL, &err_data->err_node_list, ras_err_info_cmp); in amdgpu_ras_error_get_info()
4990 return &err_node->err_info; in amdgpu_ras_error_get_info()
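/*
 * Editor's sketch of the list maintenance above: new nodes are appended
 * and the whole list is re-sorted with list_sort(), whose comparator
 * orders by (socket_id, die_id) exactly as ras_err_info_cmp does. The
 * node type is a stripped-down stand-in.
 */
#include <linux/list.h>
#include <linux/list_sort.h>

struct err_node_sk {
	struct list_head node;
	int socket_id, die_id;
};

static int err_node_cmp(void *priv, const struct list_head *a,
			const struct list_head *b)
{
	const struct err_node_sk *na = list_entry(a, struct err_node_sk, node);
	const struct err_node_sk *nb = list_entry(b, struct err_node_sk, node);

	if (na->socket_id != nb->socket_id)
		return na->socket_id - nb->socket_id;
	return na->die_id - nb->die_id;
}

static void add_err_node(struct list_head *head, struct err_node_sk *n)
{
	list_add_tail(&n->node, head);
	list_sort(NULL, head, err_node_cmp);	/* keep deterministic order */
}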
5000 return -EINVAL; in amdgpu_ras_error_statistic_ue_count()
5007 return -EINVAL; in amdgpu_ras_error_statistic_ue_count()
5009 err_info->ue_count += count; in amdgpu_ras_error_statistic_ue_count()
5010 err_data->ue_count += count; in amdgpu_ras_error_statistic_ue_count()
5022 return -EINVAL; in amdgpu_ras_error_statistic_ce_count()
5029 return -EINVAL; in amdgpu_ras_error_statistic_ce_count()
5031 err_info->ce_count += count; in amdgpu_ras_error_statistic_ce_count()
5032 err_data->ce_count += count; in amdgpu_ras_error_statistic_ce_count()
5044 return -EINVAL; in amdgpu_ras_error_statistic_de_count()
5051 return -EINVAL; in amdgpu_ras_error_statistic_de_count()
5053 err_info->de_count += count; in amdgpu_ras_error_statistic_de_count()
5054 err_data->de_count += count; in amdgpu_ras_error_statistic_de_count()
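/*
 * Editor's sketch of how a caller feeds these accumulators: look up (or
 * create) the per-(socket, die) info node, then bump both the
 * per-instance and the aggregate counters, mirroring each helper above.
 * Assumes the driver's types and amdgpu_ras_error_get_info() as shown
 * in this listing; the instance values are illustrative.
 */
#include "amdgpu_ras.h"

static int report_ue(struct ras_err_data *err_data, u64 count)
{
	struct amdgpu_smuio_mcm_config_info mcm = {
		.socket_id = 0,		/* illustrative instance */
		.die_id = 1,
	};
	struct ras_err_info *err_info;

	err_info = amdgpu_ras_error_get_info(err_data, &mcm);
	if (!err_info)
		return -EINVAL;

	err_info->ue_count += count;	/* per-instance tally */
	err_data->ue_count += count;	/* aggregate tally */
	return 0;
}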
5086 dev_info(adev->dev, in amdgpu_ras_boot_time_error_reporting()
5091 dev_info(adev->dev, in amdgpu_ras_boot_time_error_reporting()
5096 dev_info(adev->dev, in amdgpu_ras_boot_time_error_reporting()
5101 dev_info(adev->dev, in amdgpu_ras_boot_time_error_reporting()
5106 dev_info(adev->dev, in amdgpu_ras_boot_time_error_reporting()
5111 dev_info(adev->dev, in amdgpu_ras_boot_time_error_reporting()
5116 dev_info(adev->dev, in amdgpu_ras_boot_time_error_reporting()
5121 dev_info(adev->dev, in amdgpu_ras_boot_time_error_reporting()
5126 dev_info(adev->dev, in amdgpu_ras_boot_time_error_reporting()
5131 dev_info(adev->dev, in amdgpu_ras_boot_time_error_reporting()
5170 struct amdgpu_vram_mgr *mgr = &adev->mman.vram_mgr; in amdgpu_ras_reserve_page() local
5174 mutex_lock(&con->page_rsv_lock); in amdgpu_ras_reserve_page()
5175 ret = amdgpu_vram_mgr_query_page_status(mgr, start); in amdgpu_ras_reserve_page()
5176 if (ret == -ENOENT) in amdgpu_ras_reserve_page()
5177 ret = amdgpu_vram_mgr_reserve_range(mgr, start, AMDGPU_GPU_PAGE_SIZE); in amdgpu_ras_reserve_page()
5178 mutex_unlock(&con->page_rsv_lock); in amdgpu_ras_reserve_page()
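/*
 * Editor's sketch of the reserve idiom above: under the lock, query the
 * page's status first and only reserve when the manager reports -ENOENT
 * (page not yet tracked), which makes the operation idempotent. Uses
 * the vram-manager calls shown above; assumes the driver's headers.
 */
#include "amdgpu.h"

static int reserve_bad_page(struct amdgpu_vram_mgr *mgr, struct mutex *lock,
			    u64 start)
{
	int ret;

	mutex_lock(lock);
	ret = amdgpu_vram_mgr_query_page_status(mgr, start);
	if (ret == -ENOENT)	/* unknown page: reserve one GPU page */
		ret = amdgpu_vram_mgr_reserve_range(mgr, start,
						    AMDGPU_GPU_PAGE_SIZE);
	mutex_unlock(lock);

	return ret;
}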
5194 dev_printk(KERN_INFO, adev->dev, "{%llu}%pV", event_id, &vaf); in amdgpu_ras_event_log_print()
5196 dev_printk(KERN_INFO, adev->dev, "%pV", &vaf); in amdgpu_ras_event_log_print()
5208 return con->is_rma; in amdgpu_ras_is_rma()