From e2a75f88c3ad4b895b58d4abd877de827a12072f Mon Sep 17 00:00:00 2001 From: Alex Deucher Date: Thu, 27 Apr 2017 16:58:01 -0400 Subject: drm/amdgpu: parse the gpu_info firmware (v4) And populate the gfx structures from it. v2: update the structures updated by the table v3: rework based on new table structure v4: simplify things Reviewed-by: Junwei Zhang Tested-by: Junwei Zhang Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 98 ++++++++++++++++++++++++++++++ 1 file changed, 98 insertions(+) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_device.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index 43ca16b6eee2..c20ef335a6ab 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -54,6 +54,8 @@ #include #include +MODULE_FIRMWARE("amdgpu/vega10_gpu_info.bin"); + static int amdgpu_debugfs_regs_init(struct amdgpu_device *adev); static void amdgpu_debugfs_regs_cleanup(struct amdgpu_device *adev); @@ -1392,6 +1394,98 @@ static void amdgpu_device_enable_virtual_display(struct amdgpu_device *adev) } } +static int amdgpu_device_parse_gpu_info_fw(struct amdgpu_device *adev) +{ + const struct firmware *fw; + const char *chip_name; + char fw_name[30]; + int err; + const struct gpu_info_firmware_header_v1_0 *hdr; + + switch (adev->asic_type) { + case CHIP_TOPAZ: + case CHIP_TONGA: + case CHIP_FIJI: + case CHIP_POLARIS11: + case CHIP_POLARIS10: + case CHIP_POLARIS12: + case CHIP_CARRIZO: + case CHIP_STONEY: +#ifdef CONFIG_DRM_AMDGPU_SI + case CHIP_VERDE: + case CHIP_TAHITI: + case CHIP_PITCAIRN: + case CHIP_OLAND: + case CHIP_HAINAN: +#endif +#ifdef CONFIG_DRM_AMDGPU_CIK + case CHIP_BONAIRE: + case CHIP_HAWAII: + case CHIP_KAVERI: + case CHIP_KABINI: + case CHIP_MULLINS: +#endif + default: + return 0; + case CHIP_VEGA10: + chip_name = "vega10"; + break; + } + + snprintf(fw_name, sizeof(fw_name), "amdgpu/%s_gpu_info.bin", chip_name); + err = request_firmware(&fw, fw_name, adev->dev); + if (err) { + dev_err(adev->dev, + "Failed to load gpu_info firmware \"%s\"\n", + fw_name); + goto out; + } + err = amdgpu_ucode_validate(fw); + if (err) { + dev_err(adev->dev, + "Failed to validate gpu_info firmware \"%s\"\n", + fw_name); + goto out; + } + + hdr = (const struct gpu_info_firmware_header_v1_0 *)fw->data; + amdgpu_ucode_print_gpu_info_hdr(&hdr->header); + + switch (hdr->version_major) { + case 1: + { + const struct gpu_info_firmware_v1_0 *gpu_info_fw = + (const struct gpu_info_firmware_v1_0 *)(fw->data + + le32_to_cpu(hdr->header.ucode_array_offset_bytes)); + + adev->gfx.config.max_shader_engines = gpu_info_fw->gc_num_se; + adev->gfx.config.max_cu_per_sh = gpu_info_fw->gc_num_cu_per_sh; + adev->gfx.config.max_sh_per_se = gpu_info_fw->gc_num_sh_per_se; + adev->gfx.config.max_backends_per_se = gpu_info_fw->gc_num_rb_per_se; + adev->gfx.config.max_texture_channel_caches = + gpu_info_fw->gc_num_tccs; + adev->gfx.config.max_gprs = gpu_info_fw->gc_num_gprs; + adev->gfx.config.max_gs_threads = gpu_info_fw->gc_num_max_gs_thds; + adev->gfx.config.gs_vgt_table_depth = gpu_info_fw->gc_gs_table_depth; + adev->gfx.config.gs_prim_buffer_depth = gpu_info_fw->gc_gsprim_buff_depth; + adev->gfx.config.double_offchip_lds_buf = + gpu_info_fw->gc_double_offchip_lds_buffer; + adev->gfx.cu_info.wave_front_size = gpu_info_fw->gc_wave_size; + break; + } + default: + dev_err(adev->dev, + "Unsupported gpu_info table %d\n", hdr->header.ucode_version); + err = -EINVAL; + goto out; + } +out: + release_firmware(fw); + fw = NULL; + + return err; +} + static int amdgpu_early_init(struct amdgpu_device *adev) { int i, r; @@ -1456,6 +1550,10 @@ static int amdgpu_early_init(struct amdgpu_device *adev) return -EINVAL; } + r = amdgpu_device_parse_gpu_info_fw(adev); + if (r) + return r; + if (amdgpu_sriov_vf(adev)) { r = amdgpu_virt_request_full_gpu(adev, true); if (r) -- cgit v1.2.1 From 2cb681b6e4e317068153e217948d471f3117baee Mon Sep 17 00:00:00 2001 From: Monk Liu Date: Wed, 26 Apr 2017 12:00:49 +0800 Subject: drm/amdgpu:re-write sriov_reinit_early/late (v2) 1,this way we make those routines compatible with the sequence requirment for both Tonga and Vega10 2,ignore PSP hw init when doing TDR, because for SR-IOV device the ucode won't get lost after VF FLR, so no need to invoke PSP doing the ucode reloading again. v2: squash in ARRAY_SIZE fix Signed-off-by: Monk Liu Reviewed-by: Xiangliang Yu Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 63 ++++++++++++++++++------------ 1 file changed, 39 insertions(+), 24 deletions(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_device.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index c20ef335a6ab..9a7c0e4d9cc3 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -1815,19 +1815,27 @@ static int amdgpu_sriov_reinit_early(struct amdgpu_device *adev) { int i, r; - for (i = 0; i < adev->num_ip_blocks; i++) { - if (!adev->ip_blocks[i].status.valid) - continue; - - if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || - adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC || - adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH) - r = adev->ip_blocks[i].version->funcs->hw_init(adev); + static enum amd_ip_block_type ip_order[] = { + AMD_IP_BLOCK_TYPE_GMC, + AMD_IP_BLOCK_TYPE_COMMON, + AMD_IP_BLOCK_TYPE_GFXHUB, + AMD_IP_BLOCK_TYPE_MMHUB, + AMD_IP_BLOCK_TYPE_IH, + }; + + for (i = 0; i < ARRAY_SIZE(ip_order); i++) { + int j; + struct amdgpu_ip_block *block; + + for (j = 0; j < adev->num_ip_blocks; j++) { + block = &adev->ip_blocks[j]; + + if (block->version->type != ip_order[i] || + !block->status.valid) + continue; - if (r) { - DRM_ERROR("resume of IP block <%s> failed %d\n", - adev->ip_blocks[i].version->funcs->name, r); - return r; + r = block->version->funcs->hw_init(adev); + DRM_INFO("RE-INIT: %s %s\n", block->version->funcs->name, r?"failed":"successed"); } } @@ -1838,20 +1846,27 @@ static int amdgpu_sriov_reinit_late(struct amdgpu_device *adev) { int i, r; - for (i = 0; i < adev->num_ip_blocks; i++) { - if (!adev->ip_blocks[i].status.valid) - continue; + static enum amd_ip_block_type ip_order[] = { + AMD_IP_BLOCK_TYPE_SMC, + AMD_IP_BLOCK_TYPE_DCE, + AMD_IP_BLOCK_TYPE_GFX, + AMD_IP_BLOCK_TYPE_SDMA, + AMD_IP_BLOCK_TYPE_VCE, + }; - if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || - adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC || - adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH ) - continue; + for (i = 0; i < ARRAY_SIZE(ip_order); i++) { + int j; + struct amdgpu_ip_block *block; - r = adev->ip_blocks[i].version->funcs->hw_init(adev); - if (r) { - DRM_ERROR("resume of IP block <%s> failed %d\n", - adev->ip_blocks[i].version->funcs->name, r); - return r; + for (j = 0; j < adev->num_ip_blocks; j++) { + block = &adev->ip_blocks[j]; + + if (block->version->type != ip_order[i] || + !block->status.valid) + continue; + + r = block->version->funcs->hw_init(adev); + DRM_INFO("RE-INIT: %s %s\n", block->version->funcs->name, r?"failed":"successed"); } } -- cgit v1.2.1 From ea81a173ff2d4ac2807d016715cdc89c1656b20e Mon Sep 17 00:00:00 2001 From: Alex Xie Date: Mon, 8 May 2017 13:41:11 -0400 Subject: drm/amdgpu: fix errors in comments. Signed-off-by: Alex Xie Reviewed-by: Alex Deucher Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_device.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index 9a7c0e4d9cc3..4ca5af0e2bc6 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -480,9 +480,8 @@ void amdgpu_doorbell_get_kfd_info(struct amdgpu_device *adev, /* * amdgpu_wb_*() - * Writeback is the the method by which the the GPU updates special pages - * in memory with the status of certain GPU events (fences, ring pointers, - * etc.). + * Writeback is the method by which GPU updates special pages in memory + * with the status of certain GPU events (fences, ring pointers,etc.). */ /** @@ -508,7 +507,7 @@ static void amdgpu_wb_fini(struct amdgpu_device *adev) * * @adev: amdgpu_device pointer * - * Disables Writeback and frees the Writeback memory (all asics). + * Initialize writeback and allocates writeback memory (all asics). * Used at driver startup. * Returns 0 on success or an -error on failure. */ -- cgit v1.2.1 From 455a7bc27c2118c7e0531b7502dedc80ed2f9d40 Mon Sep 17 00:00:00 2001 From: Alex Xie Date: Mon, 8 May 2017 21:36:03 -0400 Subject: drm/amdgpu: Fix comments in source code MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Alex Xie Reviewed-by: Michel Dänzer Reviewed-by: Christian König Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_device.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index 4ca5af0e2bc6..e4d9aa470278 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -480,7 +480,7 @@ void amdgpu_doorbell_get_kfd_info(struct amdgpu_device *adev, /* * amdgpu_wb_*() - * Writeback is the method by which GPU updates special pages in memory + * Writeback is the method by which the GPU updates special pages in memory * with the status of certain GPU events (fences, ring pointers,etc.). */ @@ -507,7 +507,7 @@ static void amdgpu_wb_fini(struct amdgpu_device *adev) * * @adev: amdgpu_device pointer * - * Initialize writeback and allocates writeback memory (all asics). + * Initializes writeback and allocates writeback memory (all asics). * Used at driver startup. * Returns 0 on success or an -error on failure. */ @@ -615,7 +615,7 @@ void amdgpu_wb_free_64bit(struct amdgpu_device *adev, u32 wb) * @mc: memory controller structure holding memory informations * @base: base address at which to put VRAM * - * Function will place try to place VRAM at base address provided + * Function will try to place VRAM at base address provided * as parameter (which is so far either PCI aperture address or * for IGP TOM base address). * @@ -637,7 +637,7 @@ void amdgpu_wb_free_64bit(struct amdgpu_device *adev, u32 wb) * ones) * * Note: IGP TOM addr should be the same as the aperture addr, we don't - * explicitly check for that thought. + * explicitly check for that though. * * FIXME: when reducing VRAM size align new size on power of 2. */ -- cgit v1.2.1 From fcf0649fcc71d1d6a8d45d7cba21b6a0ad6489b7 Mon Sep 17 00:00:00 2001 From: Chunming Zhou Date: Fri, 5 May 2017 10:33:33 +0800 Subject: drm/amdgpu: fix ring0 failed on pro card MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit the root cause is vram content is lost completely after pci reset. Signed-off-by: Chunming Zhou Reviewed-by: Roger.He Acked-by: Christian König Acked-by: Alex Deucher Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 57 ++++++++++++++++++++++++++---- 1 file changed, 50 insertions(+), 7 deletions(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_device.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index e4d9aa470278..3c95e1858aa2 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -1872,13 +1872,40 @@ static int amdgpu_sriov_reinit_late(struct amdgpu_device *adev) return 0; } -static int amdgpu_resume(struct amdgpu_device *adev) +static int amdgpu_resume_phase1(struct amdgpu_device *adev) { int i, r; for (i = 0; i < adev->num_ip_blocks; i++) { if (!adev->ip_blocks[i].status.valid) continue; + if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || + adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC || + adev->ip_blocks[i].version->type == + AMD_IP_BLOCK_TYPE_IH) { + r = adev->ip_blocks[i].version->funcs->resume(adev); + if (r) { + DRM_ERROR("resume of IP block <%s> failed %d\n", + adev->ip_blocks[i].version->funcs->name, r); + return r; + } + } + } + + return 0; +} + +static int amdgpu_resume_phase2(struct amdgpu_device *adev) +{ + int i, r; + + for (i = 0; i < adev->num_ip_blocks; i++) { + if (!adev->ip_blocks[i].status.valid) + continue; + if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || + adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC || + adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH ) + continue; r = adev->ip_blocks[i].version->funcs->resume(adev); if (r) { DRM_ERROR("resume of IP block <%s> failed %d\n", @@ -1890,6 +1917,18 @@ static int amdgpu_resume(struct amdgpu_device *adev) return 0; } +static int amdgpu_resume(struct amdgpu_device *adev) +{ + int r; + + r = amdgpu_resume_phase1(adev); + if (r) + return r; + r = amdgpu_resume_phase2(adev); + + return r; +} + static void amdgpu_device_detect_sriov_bios(struct amdgpu_device *adev) { if (adev->is_atom_fw) { @@ -2753,16 +2792,20 @@ retry: if (!r) { dev_info(adev->dev, "GPU reset succeeded, trying to resume\n"); - r = amdgpu_resume(adev); + r = amdgpu_resume_phase1(adev); + if (r) + goto out; + r = amdgpu_ttm_recover_gart(adev); + if (r) + goto out; + r = amdgpu_resume_phase2(adev); + if (r) + goto out; } } +out: if (!r) { amdgpu_irq_gpu_reset_resume_helper(adev); - if (need_full_reset && amdgpu_need_backup(adev)) { - r = amdgpu_ttm_recover_gart(adev); - if (r) - DRM_ERROR("gart recovery failed!!!\n"); - } r = amdgpu_ib_ring_tests(adev); if (r) { dev_err(adev->dev, "ib ring test failed (%d).\n", r); -- cgit v1.2.1 From 6643be65d9e3e76960119957e5ad1acecb0b8dc0 Mon Sep 17 00:00:00 2001 From: Chunming Zhou Date: Fri, 5 May 2017 10:50:09 +0800 Subject: drm/amdgpu: print when gpu reset successed MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Chunming Zhou Reviewed-by: Roger.He Reviewed-by: Christian König Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_device.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index 3c95e1858aa2..ee0877342566 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -2867,10 +2867,11 @@ out: drm_helper_resume_force_mode(adev->ddev); ttm_bo_unlock_delayed_workqueue(&adev->mman.bdev, resched); - if (r) { + if (r) /* bad news, how to tell it to userspace ? */ dev_info(adev->dev, "GPU reset failed\n"); - } + else + dev_info(adev->dev, "GPU reset successed!\n"); return r; } -- cgit v1.2.1 From 4fbf87e2fe472110d8d3f66ffcbfb7fff911c191 Mon Sep 17 00:00:00 2001 From: Monk Liu Date: Fri, 5 May 2017 15:09:42 +0800 Subject: drm/amdgpu:don't invoke srio-gpu-reset in gpu-reset (v2) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit because we don't want to do sriov-gpu-reset under certain cases, so just split those two funtion and don't invoke sr-iov one from bare-metal one. V2: remove debugfs_gpu_reset routine on SRIOV case. Signed-off-by: Monk Liu Reviewed-by: Christian König Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_device.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index ee0877342566..5b8f7e59099e 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -2731,9 +2731,6 @@ int amdgpu_gpu_reset(struct amdgpu_device *adev) int resched; bool need_full_reset; - if (amdgpu_sriov_vf(adev)) - return amdgpu_sriov_gpu_reset(adev, true); - if (!amdgpu_check_soft_reset(adev)) { DRM_INFO("No hardware hang detected. Did some blocks stall?\n"); return 0; -- cgit v1.2.1 From 7225f8736c66b7130d3a6294217ed86f26b59489 Mon Sep 17 00:00:00 2001 From: Monk Liu Date: Wed, 26 Apr 2017 14:51:54 +0800 Subject: drm/amdgpu:use job* to replace voluntary MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit that way we can know which job cause hang and can do per sched reset/recovery instead of all sched. Signed-off-by: Monk Liu Reviewed-by: Christian König Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_device.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index 5b8f7e59099e..41c18700e275 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -2609,14 +2609,13 @@ err: * amdgpu_sriov_gpu_reset - reset the asic * * @adev: amdgpu device pointer - * @voluntary: if this reset is requested by guest. - * (true means by guest and false means by HYPERVISOR ) + * @job: which job trigger hang * * Attempt the reset the GPU if it has hung (all asics). * for SRIOV case. * Returns 0 for success or an error on failure. */ -int amdgpu_sriov_gpu_reset(struct amdgpu_device *adev, bool voluntary) +int amdgpu_sriov_gpu_reset(struct amdgpu_device *adev, struct amdgpu_job *job) { int i, r = 0; int resched; @@ -2646,7 +2645,7 @@ int amdgpu_sriov_gpu_reset(struct amdgpu_device *adev, bool voluntary) amdgpu_fence_driver_force_completion(adev); /* request to take full control of GPU before re-initialization */ - if (voluntary) + if (job) amdgpu_virt_reset_gpu(adev); else amdgpu_virt_request_full_gpu(adev, true); -- cgit v1.2.1 From 65781c78ad74e4260fbec92c0ecc05738044e177 Mon Sep 17 00:00:00 2001 From: Monk Liu Date: Thu, 11 May 2017 13:36:44 +0800 Subject: drm/amdgpu/SRIOV:implement guilty job TDR for(V2) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 1,TDR will kickout guilty job if it hang exceed the threshold of the given one from kernel paramter "job_hang_limit", that way a bad command stream will not infinitly cause GPU hang. by default this threshold is 1 so a job will be kicked out after it hang. 2,if a job timeout TDR routine will not reset all sched/ring, instead if will only reset on the givn one which is indicated by @job of amdgpu_sriov_gpu_reset, that way we don't need to reset and recover each sched/ring if we already know which job cause GPU hang. 3,unblock sriov_gpu_reset for AI family. V2: 1:put kickout guilty job after sched parked. 2:since parking scheduler prior to kickout already occupies a while, we can do last check on the in question job before doing hw_reset. TODO: 1:when a job is considered as guilty, we should mark some flag in its fence status flag, and let UMD side aware that this fence signaling is not due to job complete but job hang. 2:if gpu reset cause all video memory lost, we need introduce a new policy to implement TDR, like drop all jobs not yet signaled, and all IOCTL on this device will return ERROR DEVICE_LOST. this will be implemented later. Signed-off-by: Monk Liu Reviewed-by: Christian König Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 43 +++++++++++++++++++++++------- 1 file changed, 34 insertions(+), 9 deletions(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_device.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index 41c18700e275..8b0f4864a885 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -2617,7 +2617,7 @@ err: */ int amdgpu_sriov_gpu_reset(struct amdgpu_device *adev, struct amdgpu_job *job) { - int i, r = 0; + int i, j, r = 0; int resched; struct amdgpu_bo *bo, *tmp; struct amdgpu_ring *ring; @@ -2630,19 +2630,36 @@ int amdgpu_sriov_gpu_reset(struct amdgpu_device *adev, struct amdgpu_job *job) /* block TTM */ resched = ttm_bo_lock_delayed_workqueue(&adev->mman.bdev); - /* block scheduler */ - for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { - ring = adev->rings[i]; + /* we start from the ring trigger GPU hang */ + j = job ? job->ring->idx : 0; + /* block scheduler */ + for (i = j; i < j + AMDGPU_MAX_RINGS; ++i) { + ring = adev->rings[i % AMDGPU_MAX_RINGS]; if (!ring || !ring->sched.thread) continue; kthread_park(ring->sched.thread); + + if (job && j != i) + continue; + + /* here give the last chance to check if fence signaled + * since we already pay some time on kthread_park */ + if (job && dma_fence_is_signaled(&job->base.s_fence->finished)) { + kthread_unpark(ring->sched.thread); + goto give_up_reset; + } + + if (amd_sched_invalidate_job(&job->base, amdgpu_job_hang_limit)) + amd_sched_job_kickout(&job->base); + + /* only do job_reset on the hang ring if @job not NULL */ amd_sched_hw_job_reset(&ring->sched); - } - /* after all hw jobs are reset, hw fence is meaningless, so force_completion */ - amdgpu_fence_driver_force_completion(adev); + /* after all hw jobs are reset, hw fence is meaningless, so force_completion */ + amdgpu_fence_driver_force_completion_ring(ring); + } /* request to take full control of GPU before re-initialization */ if (job) @@ -2695,20 +2712,28 @@ int amdgpu_sriov_gpu_reset(struct amdgpu_device *adev, struct amdgpu_job *job) } dma_fence_put(fence); - for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { - struct amdgpu_ring *ring = adev->rings[i]; + for (i = j; i < j + AMDGPU_MAX_RINGS; ++i) { + ring = adev->rings[i % AMDGPU_MAX_RINGS]; if (!ring || !ring->sched.thread) continue; + if (job && j != i) { + kthread_unpark(ring->sched.thread); + continue; + } + amd_sched_job_recovery(&ring->sched); kthread_unpark(ring->sched.thread); } drm_helper_resume_force_mode(adev->ddev); +give_up_reset: ttm_bo_unlock_delayed_workqueue(&adev->mman.bdev, resched); if (r) { /* bad news, how to tell it to userspace ? */ dev_info(adev->dev, "GPU reset failed\n"); + } else { + dev_info(adev->dev, "GPU reset successed!\n"); } adev->gfx.in_reset = false; -- cgit v1.2.1 From 4f059ecdcec2dd6fab757a16cc552093bfd321ee Mon Sep 17 00:00:00 2001 From: Monk Liu Date: Thu, 11 May 2017 13:59:15 +0800 Subject: drm/amdgpu:use job's list instead of check fence because if the fence is really signaled, it could already released so the fence pointer is a wild pointer, but if we use job->base.node we are safe because job will not be released untill amdgpu_job_timedout finished. Signed-off-by: Monk Liu Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_device.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index 8b0f4864a885..d1385eba6f43 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -2644,9 +2644,9 @@ int amdgpu_sriov_gpu_reset(struct amdgpu_device *adev, struct amdgpu_job *job) if (job && j != i) continue; - /* here give the last chance to check if fence signaled + /* here give the last chance to check if job removed from mirror-list * since we already pay some time on kthread_park */ - if (job && dma_fence_is_signaled(&job->base.s_fence->finished)) { + if (job && list_empty(&job->base.node)) { kthread_unpark(ring->sched.thread); goto give_up_reset; } -- cgit v1.2.1 From 2ca8a5d2ebd12c72c8b3e5ce251a02c0cc7e18b1 Mon Sep 17 00:00:00 2001 From: Chunming Zhou Date: Wed, 7 Dec 2016 17:31:19 +0800 Subject: drm/amdgpu: add RAVEN family id definition RAVEN is a new APU. Signed-off-by: Chunming Zhou Reviewed-by: Hawking Zhang Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_device.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index d1385eba6f43..d0a26fff53f3 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -79,6 +79,7 @@ static const char *amdgpu_asic_name[] = { "POLARIS11", "POLARIS12", "VEGA10", + "RAVEN", "LAST", }; @@ -1537,8 +1538,12 @@ static int amdgpu_early_init(struct amdgpu_device *adev) return r; break; #endif - case CHIP_VEGA10: - adev->family = AMDGPU_FAMILY_AI; + case CHIP_VEGA10: + case CHIP_RAVEN: + if (adev->asic_type == CHIP_RAVEN) + adev->family = AMDGPU_FAMILY_RV; + else + adev->family = AMDGPU_FAMILY_AI; r = soc15_set_ip_blocks(adev); if (r) -- cgit v1.2.1 From 2d2e5e7e530722bc5815e5c646c5d5ec7479d55c Mon Sep 17 00:00:00 2001 From: Alex Deucher Date: Tue, 9 May 2017 12:27:35 -0400 Subject: drm/amdgpu: add raven gpu_info support Add support for parsing the gpu info table on raven. This is required to get the gpu config data for raven. Signed-off-by: Alex Deucher Signed-off-by: Tom St Denis Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_device.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index d0a26fff53f3..8eb162509c84 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -55,6 +55,7 @@ #include MODULE_FIRMWARE("amdgpu/vega10_gpu_info.bin"); +MODULE_FIRMWARE("amdgpu/raven_gpu_info.bin"); static int amdgpu_debugfs_regs_init(struct amdgpu_device *adev); static void amdgpu_debugfs_regs_cleanup(struct amdgpu_device *adev); @@ -1430,6 +1431,9 @@ static int amdgpu_device_parse_gpu_info_fw(struct amdgpu_device *adev) case CHIP_VEGA10: chip_name = "vega10"; break; + case CHIP_RAVEN: + chip_name = "raven"; + break; } snprintf(fw_name, sizeof(fw_name), "amdgpu/%s_gpu_info.bin", chip_name); -- cgit v1.2.1 From 0c49e0b8a43c8addb0498cd32390f4ef08b5dd27 Mon Sep 17 00:00:00 2001 From: Chunming Zhou Date: Mon, 15 May 2017 14:20:00 +0800 Subject: drm/amdgpu: check if vram is lost v2 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit backup first 64 byte of gart table as reset magic, check if magic is same after gpu hw reset. v2: use memcmp instead of manual innovation. Signed-off-by: Chunming Zhou Reviewed-by: Christian König Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 20 +++++++++++++++++++- 1 file changed, 19 insertions(+), 1 deletion(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_device.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index 8eb162509c84..5a170071702a 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -1658,6 +1658,17 @@ static int amdgpu_init(struct amdgpu_device *adev) return 0; } +static void amdgpu_fill_reset_magic(struct amdgpu_device *adev) +{ + memcpy(adev->reset_magic, adev->gart.ptr, AMDGPU_RESET_MAGIC_NUM); +} + +static bool amdgpu_check_vram_lost(struct amdgpu_device *adev) +{ + return !!memcmp(adev->gart.ptr, adev->reset_magic, + AMDGPU_RESET_MAGIC_NUM); +} + static int amdgpu_late_init(struct amdgpu_device *adev) { int i = 0, r; @@ -1688,6 +1699,8 @@ static int amdgpu_late_init(struct amdgpu_device *adev) } } + amdgpu_fill_reset_magic(adev); + return 0; } @@ -2762,7 +2775,7 @@ int amdgpu_gpu_reset(struct amdgpu_device *adev) { int i, r; int resched; - bool need_full_reset; + bool need_full_reset, vram_lost = false; if (!amdgpu_check_soft_reset(adev)) { DRM_INFO("No hardware hang detected. Did some blocks stall?\n"); @@ -2825,12 +2838,17 @@ retry: r = amdgpu_resume_phase1(adev); if (r) goto out; + vram_lost = amdgpu_check_vram_lost(adev); + if (vram_lost) + DRM_ERROR("VRAM is lost!\n"); r = amdgpu_ttm_recover_gart(adev); if (r) goto out; r = amdgpu_resume_phase2(adev); if (r) goto out; + if (vram_lost) + amdgpu_fill_reset_magic(adev); } } out: -- cgit v1.2.1 From f1892138abcb6d58359189f3b0a6c95f10613513 Mon Sep 17 00:00:00 2001 From: Chunming Zhou Date: Mon, 15 May 2017 16:48:27 +0800 Subject: drm/amdgpu: return -ENODEV to user space when vram is lost v2 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit below ioctl will return -ENODEV: amdgpu_cs_ioctl amdgpu_cs_wait_ioctl amdgpu_cs_wait_fences_ioctl amdgpu_gem_va_ioctl amdgpu_info_ioctl v2: only for map and replace cases in amdgpu_gem_va_ioctl Signed-off-by: Chunming Zhou Reviewed-by: Christian König Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_device.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index 5a170071702a..794e14d8e906 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -2839,8 +2839,10 @@ retry: if (r) goto out; vram_lost = amdgpu_check_vram_lost(adev); - if (vram_lost) + if (vram_lost) { DRM_ERROR("VRAM is lost!\n"); + atomic_inc(&adev->vram_lost_counter); + } r = amdgpu_ttm_recover_gart(adev); if (r) goto out; -- cgit v1.2.1 From b5ab16bf64347ebc9dbdc51a4f603511babda1e6 Mon Sep 17 00:00:00 2001 From: Alex Deucher Date: Thu, 11 May 2017 19:09:49 -0400 Subject: drm/amdgpu: properly byteswap gpu_info firmware MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit It's stored in LE format. Reviewed-by: Michel Dänzer Reviewed-by: Christian König Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_device.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index 794e14d8e906..a1d631ab8eb9 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -1462,19 +1462,19 @@ static int amdgpu_device_parse_gpu_info_fw(struct amdgpu_device *adev) (const struct gpu_info_firmware_v1_0 *)(fw->data + le32_to_cpu(hdr->header.ucode_array_offset_bytes)); - adev->gfx.config.max_shader_engines = gpu_info_fw->gc_num_se; - adev->gfx.config.max_cu_per_sh = gpu_info_fw->gc_num_cu_per_sh; - adev->gfx.config.max_sh_per_se = gpu_info_fw->gc_num_sh_per_se; - adev->gfx.config.max_backends_per_se = gpu_info_fw->gc_num_rb_per_se; + adev->gfx.config.max_shader_engines = le32_to_cpu(gpu_info_fw->gc_num_se); + adev->gfx.config.max_cu_per_sh = le32_to_cpu(gpu_info_fw->gc_num_cu_per_sh); + adev->gfx.config.max_sh_per_se = le32_to_cpu(gpu_info_fw->gc_num_sh_per_se); + adev->gfx.config.max_backends_per_se = le32_to_cpu(gpu_info_fw->gc_num_rb_per_se); adev->gfx.config.max_texture_channel_caches = - gpu_info_fw->gc_num_tccs; - adev->gfx.config.max_gprs = gpu_info_fw->gc_num_gprs; - adev->gfx.config.max_gs_threads = gpu_info_fw->gc_num_max_gs_thds; - adev->gfx.config.gs_vgt_table_depth = gpu_info_fw->gc_gs_table_depth; - adev->gfx.config.gs_prim_buffer_depth = gpu_info_fw->gc_gsprim_buff_depth; + le32_to_cpu(gpu_info_fw->gc_num_tccs); + adev->gfx.config.max_gprs = le32_to_cpu(gpu_info_fw->gc_num_gprs); + adev->gfx.config.max_gs_threads = le32_to_cpu(gpu_info_fw->gc_num_max_gs_thds); + adev->gfx.config.gs_vgt_table_depth = le32_to_cpu(gpu_info_fw->gc_gs_table_depth); + adev->gfx.config.gs_prim_buffer_depth = le32_to_cpu(gpu_info_fw->gc_gsprim_buff_depth); adev->gfx.config.double_offchip_lds_buf = - gpu_info_fw->gc_double_offchip_lds_buffer; - adev->gfx.cu_info.wave_front_size = gpu_info_fw->gc_wave_size; + le32_to_cpu(gpu_info_fw->gc_double_offchip_lds_buffer); + adev->gfx.cu_info.wave_front_size = le32_to_cpu(gpu_info_fw->gc_wave_size); break; } default: -- cgit v1.2.1 From 2dc80b00652f2a08f3f1a01e668e3c7ad716f55f Mon Sep 17 00:00:00 2001 From: Shirish S Date: Thu, 25 May 2017 10:05:25 +0530 Subject: drm/amdgpu: optimize amdgpu driver load & resume time amdgpu_device_resume() & amdgpu_device_init() have a high time consuming call of amdgpu_late_init() which sets the clock_gating state of all IP blocks and is blocking. This patch defers only this setting of clock gating state operation to post resume of amdgpu driver but ideally before the UI comes up or in some cases post ui as well. With this change the resume time of amdgpu_device comes down from 1.299s to 0.199s which further helps in reducing the overall system resume time. V1: made the optimization applicable during driver load as well. TEST:(For ChromiumOS on STONEY only) * UI comes up * amdgpu_late_init() call gets called consistently and no errors reported. Signed-off-by: Shirish S Reviewed-by: Huang Rui Reviewed-by: Alex Deucher Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 46 +++++++++++++++++++++++------- 1 file changed, 36 insertions(+), 10 deletions(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_device.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index a1d631ab8eb9..e731c4876a09 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -57,6 +57,8 @@ MODULE_FIRMWARE("amdgpu/vega10_gpu_info.bin"); MODULE_FIRMWARE("amdgpu/raven_gpu_info.bin"); +#define AMDGPU_RESUME_MS 2000 + static int amdgpu_debugfs_regs_init(struct amdgpu_device *adev); static void amdgpu_debugfs_regs_cleanup(struct amdgpu_device *adev); @@ -1669,22 +1671,13 @@ static bool amdgpu_check_vram_lost(struct amdgpu_device *adev) AMDGPU_RESET_MAGIC_NUM); } -static int amdgpu_late_init(struct amdgpu_device *adev) +static int amdgpu_late_set_cg_state(struct amdgpu_device *adev) { int i = 0, r; for (i = 0; i < adev->num_ip_blocks; i++) { if (!adev->ip_blocks[i].status.valid) continue; - if (adev->ip_blocks[i].version->funcs->late_init) { - r = adev->ip_blocks[i].version->funcs->late_init((void *)adev); - if (r) { - DRM_ERROR("late_init of IP block <%s> failed %d\n", - adev->ip_blocks[i].version->funcs->name, r); - return r; - } - adev->ip_blocks[i].status.late_initialized = true; - } /* skip CG for VCE/UVD, it's handled specially */ if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD && adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE) { @@ -1698,6 +1691,29 @@ static int amdgpu_late_init(struct amdgpu_device *adev) } } } + return 0; +} + +static int amdgpu_late_init(struct amdgpu_device *adev) +{ + int i = 0, r; + + for (i = 0; i < adev->num_ip_blocks; i++) { + if (!adev->ip_blocks[i].status.valid) + continue; + if (adev->ip_blocks[i].version->funcs->late_init) { + r = adev->ip_blocks[i].version->funcs->late_init((void *)adev); + if (r) { + DRM_ERROR("late_init of IP block <%s> failed %d\n", + adev->ip_blocks[i].version->funcs->name, r); + return r; + } + adev->ip_blocks[i].status.late_initialized = true; + } + } + + mod_delayed_work(system_wq, &adev->late_init_work, + msecs_to_jiffies(AMDGPU_RESUME_MS)); amdgpu_fill_reset_magic(adev); @@ -1791,6 +1807,13 @@ static int amdgpu_fini(struct amdgpu_device *adev) return 0; } +static void amdgpu_late_init_func_handler(struct work_struct *work) +{ + struct amdgpu_device *adev = + container_of(work, struct amdgpu_device, late_init_work.work); + amdgpu_late_set_cg_state(adev); +} + int amdgpu_suspend(struct amdgpu_device *adev) { int i, r; @@ -2050,6 +2073,8 @@ int amdgpu_device_init(struct amdgpu_device *adev, INIT_LIST_HEAD(&adev->gtt_list); spin_lock_init(&adev->gtt_list_lock); + INIT_DELAYED_WORK(&adev->late_init_work, amdgpu_late_init_func_handler); + if (adev->asic_type >= CHIP_BONAIRE) { adev->rmmio_base = pci_resource_start(adev->pdev, 5); adev->rmmio_size = pci_resource_len(adev->pdev, 5); @@ -2247,6 +2272,7 @@ void amdgpu_device_fini(struct amdgpu_device *adev) amdgpu_fbdev_fini(adev); r = amdgpu_fini(adev); adev->accel_working = false; + cancel_delayed_work_sync(&adev->late_init_work); /* free i2c buses */ amdgpu_i2c_fini(adev); amdgpu_atombios_fini(adev); -- cgit v1.2.1 From 795f2813e628bcf57a69f2dfe413360d14a1d7f4 Mon Sep 17 00:00:00 2001 From: Andres Rodriguez Date: Mon, 6 Mar 2017 16:27:55 -0500 Subject: drm/amdgpu: implement lru amdgpu_queue_mgr policy for compute v4 Use an LRU policy to map usermode rings to HW compute queues. Most compute clients use one queue, and usually the first queue available. This results in poor pipe/queue work distribution when multiple compute apps are running. In most cases pipe 0 queue 0 is the only queue that gets used. In order to better distribute work across multiple HW queues, we adopt a policy to map the usermode ring ids to the LRU HW queue. This fixes a large majority of multi-app compute workloads sharing the same HW queue, even though 7 other queues are available. v2: use ring->funcs->type instead of ring->hw_ip v3: remove amdgpu_queue_mapper_funcs v4: change ring_lru_list_lock to spinlock, grab only once in lru_get() Signed-off-by: Andres Rodriguez Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_device.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index e731c4876a09..cce94d836221 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -2073,6 +2073,9 @@ int amdgpu_device_init(struct amdgpu_device *adev, INIT_LIST_HEAD(&adev->gtt_list); spin_lock_init(&adev->gtt_list_lock); + INIT_LIST_HEAD(&adev->ring_lru_list); + spin_lock_init(&adev->ring_lru_list_lock); + INIT_DELAYED_WORK(&adev->late_init_work, amdgpu_late_init_func_handler); if (adev->asic_type >= CHIP_BONAIRE) { -- cgit v1.2.1 From e59c020598666ffc22c627910667e44ac2412304 Mon Sep 17 00:00:00 2001 From: Alex Xie Date: Thu, 1 Jun 2017 09:42:59 -0400 Subject: drm/amdgpu: Move compute vm bug logic to amdgpu_vm.c MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In review, Christian would like to keep the logic inside amdgpu_vm.c with a cost of slightly slower. The loop is still optimized out with this patch. v2: remove the if statement. Now it is not slower. Signed-off-by: Alex Xie Reviewed-by: Christian König Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_device.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index cce94d836221..01ee69b37a2b 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -2190,6 +2190,8 @@ int amdgpu_device_init(struct amdgpu_device *adev, adev->accel_working = true; + amdgpu_vm_check_compute_bug(adev); + /* Initialize the buffer migration limit. */ if (amdgpu_moverate >= 0) max_MBps = amdgpu_moverate; -- cgit v1.2.1 From 373f59232546f0450d0898dfb858a18e4b17e5d7 Mon Sep 17 00:00:00 2001 From: Huang Rui Date: Wed, 31 May 2017 23:46:26 +0800 Subject: drm/amdgpu: remove gfxhub ip MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Huang Rui Reviewed-by: Alex Deucher Acked-by: Christian König Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 1 - 1 file changed, 1 deletion(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_device.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index 01ee69b37a2b..85dc4640b193 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -1862,7 +1862,6 @@ static int amdgpu_sriov_reinit_early(struct amdgpu_device *adev) static enum amd_ip_block_type ip_order[] = { AMD_IP_BLOCK_TYPE_GMC, AMD_IP_BLOCK_TYPE_COMMON, - AMD_IP_BLOCK_TYPE_GFXHUB, AMD_IP_BLOCK_TYPE_MMHUB, AMD_IP_BLOCK_TYPE_IH, }; -- cgit v1.2.1 From 1191d110c31bde9683f3871e49466eb799ebdc0f Mon Sep 17 00:00:00 2001 From: Huang Rui Date: Wed, 31 May 2017 23:49:46 +0800 Subject: drm/amdgpu: remove mmhub ip MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Huang Rui Reviewed-by: Alex Deucher Acked-by: Christian König Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 1 - 1 file changed, 1 deletion(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_device.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index 85dc4640b193..383a943dc83a 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -1862,7 +1862,6 @@ static int amdgpu_sriov_reinit_early(struct amdgpu_device *adev) static enum amd_ip_block_type ip_order[] = { AMD_IP_BLOCK_TYPE_GMC, AMD_IP_BLOCK_TYPE_COMMON, - AMD_IP_BLOCK_TYPE_MMHUB, AMD_IP_BLOCK_TYPE_IH, }; -- cgit v1.2.1 From ed8cf00ce4dcdd7b50bf094d8015d8839ce770f3 Mon Sep 17 00:00:00 2001 From: Huang Rui Date: Wed, 3 May 2017 09:40:17 +0800 Subject: drm/amdgpu: add ip name print for selecting ips with ip_block_mask MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Huang Rui Reviewed-by: Alex Deucher Acked-by: Christian König Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_device.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index 383a943dc83a..e3ca67e3fca7 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -1572,7 +1572,8 @@ static int amdgpu_early_init(struct amdgpu_device *adev) for (i = 0; i < adev->num_ip_blocks; i++) { if ((amdgpu_ip_block_mask & (1 << i)) == 0) { - DRM_ERROR("disabled ip block: %d\n", i); + DRM_ERROR("disabled ip block: %d <%s>\n", + i, adev->ip_blocks[i].version->funcs->name); adev->ip_blocks[i].status.valid = false; } else { if (adev->ip_blocks[i].version->funcs->early_init) { -- cgit v1.2.1 From a0bae3577f46f6b61ccfa4cb0772fd804be1de96 Mon Sep 17 00:00:00 2001 From: Huang Rui Date: Wed, 3 May 2017 09:52:06 +0800 Subject: drm/amdgpu: add ip block number prints MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit User is able to follow the ip block number to write the ip_block_mask for selecting the one which user would like to enable. Signed-off-by: Huang Rui Reviewed-by: Alex Deucher Acked-by: Christian König Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_device.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index e3ca67e3fca7..0296c9efc356 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -1347,6 +1347,9 @@ int amdgpu_ip_block_add(struct amdgpu_device *adev, if (!ip_block_version) return -EINVAL; + DRM_DEBUG("add ip block number %d <%s>\n", adev->num_ip_blocks, + ip_block_version->funcs->name); + adev->ip_blocks[adev->num_ip_blocks++].version = ip_block_version; return 0; -- cgit v1.2.1 From 0fa4955838ea1ecde268456676540dd2e84dce26 Mon Sep 17 00:00:00 2001 From: Alex Xie Date: Thu, 8 Jun 2017 14:58:05 -0400 Subject: drm/amdgpu: move comment to the right place Signed-off-by: Alex Xie Reviewed-by: Alex Deucher Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_device.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index 0296c9efc356..de2abef922f5 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -2058,8 +2058,6 @@ int amdgpu_device_init(struct amdgpu_device *adev, amdgpu_check_arguments(adev); - /* Registers mapping */ - /* TODO: block userspace mapping of io register */ spin_lock_init(&adev->mmio_idx_lock); spin_lock_init(&adev->smc_idx_lock); spin_lock_init(&adev->pcie_idx_lock); @@ -2080,6 +2078,8 @@ int amdgpu_device_init(struct amdgpu_device *adev, INIT_DELAYED_WORK(&adev->late_init_work, amdgpu_late_init_func_handler); + /* Registers mapping */ + /* TODO: block userspace mapping of io register */ if (adev->asic_type >= CHIP_BONAIRE) { adev->rmmio_base = pci_resource_start(adev->pdev, 5); adev->rmmio_size = pci_resource_len(adev->pdev, 5); -- cgit v1.2.1 From 51fd0370677733785b1f5f31057a12738386ee25 Mon Sep 17 00:00:00 2001 From: Hawking Zhang Date: Fri, 9 Jun 2017 22:30:52 +0800 Subject: drm/amdgpu: add new member in gpu_info fw Signed-off-by: Hawking Zhang Reviewed-by: Alex Deucher Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_device.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index de2abef922f5..5d6175ede20b 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -1480,6 +1480,11 @@ static int amdgpu_device_parse_gpu_info_fw(struct amdgpu_device *adev) adev->gfx.config.double_offchip_lds_buf = le32_to_cpu(gpu_info_fw->gc_double_offchip_lds_buffer); adev->gfx.cu_info.wave_front_size = le32_to_cpu(gpu_info_fw->gc_wave_size); + adev->gfx.cu_info.max_waves_per_simd = + le32_to_cpu(gpu_info_fw->gc_max_waves_per_simd); + adev->gfx.cu_info.max_scratch_slots_per_cu = + le32_to_cpu(gpu_info_fw->gc_max_scratch_slots_per_cu); + adev->gfx.cu_info.lds_size = le32_to_cpu(gpu_info_fw->gc_lds_size); break; } default: -- cgit v1.2.1 From 4f0955fcc052b556446f6f041ad8c83d70c3b253 Mon Sep 17 00:00:00 2001 From: Huang Rui Date: Wed, 10 May 2017 23:04:06 +0800 Subject: drm/amdgpu: export test ib debugfs interface MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit As Christian and David's suggestion, submit the test ib ring debug interfaces. It's useful for debugging with the command submission without VM case. Signed-off-by: Huang Rui Reviewed-by: Alex Deucher Reviewed-by: Christian König Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 54 ++++++++++++++++++++++++++++++ 1 file changed, 54 insertions(+) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_device.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index 5d6175ede20b..f5c4e2e5c4ad 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -61,6 +61,7 @@ MODULE_FIRMWARE("amdgpu/raven_gpu_info.bin"); static int amdgpu_debugfs_regs_init(struct amdgpu_device *adev); static void amdgpu_debugfs_regs_cleanup(struct amdgpu_device *adev); +static int amdgpu_debugfs_test_ib_ring_init(struct amdgpu_device *adev); static const char *amdgpu_asic_name[] = { "TAHITI", @@ -2227,6 +2228,10 @@ int amdgpu_device_init(struct amdgpu_device *adev, if (r) DRM_ERROR("registering register debugfs failed (%d).\n", r); + r = amdgpu_debugfs_test_ib_ring_init(adev); + if (r) + DRM_ERROR("registering register test ib ring debugfs failed (%d).\n", r); + r = amdgpu_debugfs_firmware_init(adev); if (r) DRM_ERROR("registering firmware debugfs failed (%d).\n", r); @@ -3743,11 +3748,60 @@ static void amdgpu_debugfs_regs_cleanup(struct amdgpu_device *adev) } } +static int amdgpu_debugfs_test_ib(struct seq_file *m, void *data) +{ + struct drm_info_node *node = (struct drm_info_node *) m->private; + struct drm_device *dev = node->minor->dev; + struct amdgpu_device *adev = dev->dev_private; + int r = 0, i; + + /* hold on the scheduler */ + for (i = 0; i < AMDGPU_MAX_RINGS; i++) { + struct amdgpu_ring *ring = adev->rings[i]; + + if (!ring || !ring->sched.thread) + continue; + kthread_park(ring->sched.thread); + } + + seq_printf(m, "run ib test:\n"); + r = amdgpu_ib_ring_tests(adev); + if (r) + seq_printf(m, "ib ring tests failed (%d).\n", r); + else + seq_printf(m, "ib ring tests passed.\n"); + + /* go on the scheduler */ + for (i = 0; i < AMDGPU_MAX_RINGS; i++) { + struct amdgpu_ring *ring = adev->rings[i]; + + if (!ring || !ring->sched.thread) + continue; + kthread_unpark(ring->sched.thread); + } + + return 0; +} + +static const struct drm_info_list amdgpu_debugfs_test_ib_ring_list[] = { + {"amdgpu_test_ib", &amdgpu_debugfs_test_ib} +}; + +static int amdgpu_debugfs_test_ib_ring_init(struct amdgpu_device *adev) +{ + return amdgpu_debugfs_add_files(adev, + amdgpu_debugfs_test_ib_ring_list, 1); +} + int amdgpu_debugfs_init(struct drm_minor *minor) { return 0; } #else +static int amdgpu_debugfs_test_ib_init(struct amdgpu_device *adev) +{ + return 0; +} static int amdgpu_debugfs_regs_init(struct amdgpu_device *adev) { return 0; -- cgit v1.2.1 From ab4fe3e1f910a71aabf0b1c919c482d7ce9fc5c7 Mon Sep 17 00:00:00 2001 From: Huang Rui Date: Mon, 5 Jun 2017 22:11:59 +0800 Subject: drm/amdgpu: fix missed gpu info firmware when cache firmware during S3 gpu_info firmware is released after data is used. But when system enters into suspend, upper class driver will cache all firmware names. At that time, gpu_info will be failing to load. It seems an upper class issue, that we should not release gpu_info firmware until device finished. [ 903.236589] cache_firmware: amdgpu/vega10_sdma1.bin [ 903.236590] fw_set_page_data: fw-amdgpu/vega10_sdma1.bin buf=ffff88041eee10c0 data=ffffc90002561000 size=17408 [ 903.236591] cache_firmware: amdgpu/vega10_sdma1.bin ret=0 [ 903.464160] __allocate_fw_buf: fw-amdgpu/vega10_gpu_info.bin buf=ffff88041eee2c00 [ 903.471815] (NULL device *): loading /lib/firmware/updates/4.11.0-custom/amdgpu/vega10_gpu_info.bin failed with error -2 [ 903.482870] (NULL device *): loading /lib/firmware/updates/amdgpu/vega10_gpu_info.bin failed with error -2 [ 903.492716] (NULL device *): loading /lib/firmware/4.11.0-custom/amdgpu/vega10_gpu_info.bin failed with error -2 [ 903.503156] (NULL device *): direct-loading amdgpu/vega10_gpu_info.bin Signed-off-by: Huang Rui Reviewed-by: Alex Deucher Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_device.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index f5c4e2e5c4ad..875cde414be7 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -1403,12 +1403,13 @@ static void amdgpu_device_enable_virtual_display(struct amdgpu_device *adev) static int amdgpu_device_parse_gpu_info_fw(struct amdgpu_device *adev) { - const struct firmware *fw; const char *chip_name; char fw_name[30]; int err; const struct gpu_info_firmware_header_v1_0 *hdr; + adev->firmware.gpu_info_fw = NULL; + switch (adev->asic_type) { case CHIP_TOPAZ: case CHIP_TONGA: @@ -1443,14 +1444,14 @@ static int amdgpu_device_parse_gpu_info_fw(struct amdgpu_device *adev) } snprintf(fw_name, sizeof(fw_name), "amdgpu/%s_gpu_info.bin", chip_name); - err = request_firmware(&fw, fw_name, adev->dev); + err = request_firmware(&adev->firmware.gpu_info_fw, fw_name, adev->dev); if (err) { dev_err(adev->dev, "Failed to load gpu_info firmware \"%s\"\n", fw_name); goto out; } - err = amdgpu_ucode_validate(fw); + err = amdgpu_ucode_validate(adev->firmware.gpu_info_fw); if (err) { dev_err(adev->dev, "Failed to validate gpu_info firmware \"%s\"\n", @@ -1458,14 +1459,14 @@ static int amdgpu_device_parse_gpu_info_fw(struct amdgpu_device *adev) goto out; } - hdr = (const struct gpu_info_firmware_header_v1_0 *)fw->data; + hdr = (const struct gpu_info_firmware_header_v1_0 *)adev->firmware.gpu_info_fw->data; amdgpu_ucode_print_gpu_info_hdr(&hdr->header); switch (hdr->version_major) { case 1: { const struct gpu_info_firmware_v1_0 *gpu_info_fw = - (const struct gpu_info_firmware_v1_0 *)(fw->data + + (const struct gpu_info_firmware_v1_0 *)(adev->firmware.gpu_info_fw->data + le32_to_cpu(hdr->header.ucode_array_offset_bytes)); adev->gfx.config.max_shader_engines = le32_to_cpu(gpu_info_fw->gc_num_se); @@ -1495,9 +1496,6 @@ static int amdgpu_device_parse_gpu_info_fw(struct amdgpu_device *adev) goto out; } out: - release_firmware(fw); - fw = NULL; - return err; } @@ -2288,6 +2286,10 @@ void amdgpu_device_fini(struct amdgpu_device *adev) amdgpu_fence_driver_fini(adev); amdgpu_fbdev_fini(adev); r = amdgpu_fini(adev); + if (adev->firmware.gpu_info_fw) { + release_firmware(adev->firmware.gpu_info_fw); + adev->firmware.gpu_info_fw = NULL; + } adev->accel_working = false; cancel_delayed_work_sync(&adev->late_init_work); /* free i2c buses */ -- cgit v1.2.1 From 64dab074fe5e8852b0c981564dc146f39535a81a Mon Sep 17 00:00:00 2001 From: Alex Deucher Date: Thu, 15 Jun 2017 18:20:09 -0400 Subject: drm/amdgpu: don't check the default value for vm size MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Avoids printing spurious messages like this: [ 3.102059] amdgpu 0000:01:00.0: VM size (-1) must be a power of 2 Reviewed-by: Christian König Reviewed-by: Michel Dänzer Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_device.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index 875cde414be7..b2c960b2ea82 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -1073,6 +1073,10 @@ def_value: static void amdgpu_check_vm_size(struct amdgpu_device *adev) { + /* no need to check the default value */ + if (amdgpu_vm_size == -1) + return; + if (!amdgpu_check_pot_argument(amdgpu_vm_size)) { dev_warn(adev->dev, "VM size (%d) must be a power of 2\n", amdgpu_vm_size); -- cgit v1.2.1 From 27bad5b9a7caf4f2b144fcd862f6b2685c671079 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Wed, 21 Jun 2017 23:51:02 +0200 Subject: drm/amdgpu: fix typo in amdgpu_debugfs_test_ib_init The debugfs interface has calls a function that was evidently defined under the wrong name in some configurations: drivers/gpu/drm/amd/amdgpu/amdgpu_device.c:64:12: error: 'amdgpu_debugfs_test_ib_ring_init' used but never defined [-Werror] drivers/gpu/drm/amd/amdgpu/amdgpu_device.c:3803:12: error: 'amdgpu_debugfs_test_ib_init' defined but not used [-Werror=unused-function] This fixes the function name. Fixes: 4f0955fcc052 ("drm/amdgpu: export test ib debugfs interface") Signed-off-by: Arnd Bergmann Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_device.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index b2c960b2ea82..2fe1e0a20c17 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -3804,7 +3804,7 @@ int amdgpu_debugfs_init(struct drm_minor *minor) return 0; } #else -static int amdgpu_debugfs_test_ib_init(struct amdgpu_device *adev) +static int amdgpu_debugfs_test_ib_ring_init(struct amdgpu_device *adev) { return 0; } -- cgit v1.2.1