msm: kgsl: Recovery policy change
The recovery algorithm is changed to:
step 1: retry the same commands that hung the GPU
step 2: if step 1 fails, NOP out just the IB that hung the GPU
	and retry
step 3: if step 2 fails, skip the commands in the current context
	until the end of the frame and retry
step 4: if step 3 fails, mark the context as bad and execute
	the remaining commands from good contexts.
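As a rough, illustrative sketch (not part of the patch itself), the
escalation can be read as stepping through the fault tolerance step
values used in the code below; the stand-alone enum and helper are
only meant to show the ordering and are not the kernel definitions:

	/* Illustrative only: mirrors the order in which
	 * _adreno_recover_hang() escalates after a failed attempt.
	 * The real step values come from the driver's ft_policy. */
	enum ft_step {
		FT_REPLAY_BAD_CTXT_CMDS,	/* step 1: replay hanging cmds */
		FT_NOT_IB_BAD_CTXT_CMDS,	/* step 2: NOP hanging IB, replay */
		FT_SKIP_EOF_BAD_CTXT_CMDS,	/* step 3: skip to EOF, replay */
		FT_FAIL_BAD_CTXT_CMDS,		/* step 4: mark context bad */
	};

	static enum ft_step ft_next_step(enum ft_step step, int ib_found)
	{
		switch (step) {
		case FT_REPLAY_BAD_CTXT_CMDS:
			return FT_NOT_IB_BAD_CTXT_CMDS;
		case FT_NOT_IB_BAD_CTXT_CMDS:
			/* if the hanging IB cannot be located, give up
			 * on the bad commands instead of skipping to EOF */
			return ib_found ? FT_SKIP_EOF_BAD_CTXT_CMDS
					: FT_FAIL_BAD_CTXT_CMDS;
		default:
			return FT_FAIL_BAD_CTXT_CMDS;
		}
	}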
Previously we returned -EDEADLK when recovery succeeded, which is
the same error code returned when a context is not recoverable.
With the new policy, if recovery succeeds we return -EAGAIN so that
userspace can treat a recovered context differently from one that
is not recoverable. If recovery fails we mark the context as bad
and return -EDEADLK.
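From userspace the distinction could be consumed roughly as follows;
submit_cmds(), resubmit_state() and recreate_context() are placeholder
names for whatever the client library actually uses, not real entry
points:

	/* Hypothetical userspace handling of the new error codes;
	 * the helper names are illustrative placeholders. */
	if (submit_cmds(ctx) < 0) {
		if (errno == EAGAIN) {
			/* context hung but recovery succeeded: the
			 * context is still usable after resubmitting */
			resubmit_state(ctx);
		} else if (errno == EDEADLK) {
			/* context is marked bad: destroy it and
			 * continue with a freshly created context */
			recreate_context(&ctx);
		}
	}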
Change-Id: I9fa3c40801964186866b6002e62f19cf6aa41361
Signed-off-by: Tarun Karra <tkarra@codeaurora.org>
diff --git a/drivers/gpu/msm/adreno.c b/drivers/gpu/msm/adreno.c
index 5fd1230..88756c0 100644
--- a/drivers/gpu/msm/adreno.c
+++ b/drivers/gpu/msm/adreno.c
@@ -1383,6 +1383,7 @@
{
vfree(rec_data->rb_buffer);
vfree(rec_data->bad_rb_buffer);
+ vfree(rec_data->good_rb_buffer);
}
static int _find_start_of_cmd_seq(struct adreno_ringbuffer *rb,
@@ -1564,9 +1565,9 @@
current_context));
kgsl_sharedmem_readl(&device->memstore,
- &rec_data->global_eop,
- KGSL_MEMSTORE_OFFSET(KGSL_MEMSTORE_GLOBAL,
- eoptimestamp));
+ &rec_data->global_eop,
+ KGSL_MEMSTORE_OFFSET(KGSL_MEMSTORE_GLOBAL,
+ eoptimestamp));
rec_data->rb_buffer = vmalloc(rb->buffer_desc.size);
if (!rec_data->rb_buffer) {
@@ -1582,7 +1583,17 @@
ret = -ENOMEM;
goto done;
}
+
+ rec_data->good_rb_buffer = vmalloc(rb->buffer_desc.size);
+ if (!rec_data->good_rb_buffer) {
+ KGSL_MEM_ERR(device, "vmalloc(%d) failed\n",
+ rb->buffer_desc.size);
+ ret = -ENOMEM;
+ goto done;
+ }
rec_data->fault = device->mmu.fault;
+ rec_data->step = adreno_dev->ft_policy;
+
/* find the start of bad command sequence in rb */
context = idr_find(&device->context_idr, rec_data->context_id);
/* Look for the command stream that is right after the global eop */
@@ -1621,16 +1632,73 @@
if (ret) {
vfree(rec_data->rb_buffer);
vfree(rec_data->bad_rb_buffer);
+ vfree(rec_data->good_rb_buffer);
}
return ret;
}
static int
-_adreno_recover_hang(struct kgsl_device *device,
- struct adreno_recovery_data *rec_data,
- bool try_bad_commands)
+_adreno_restart_device(struct kgsl_device *device,
+ struct kgsl_context *context)
{
- int ret;
+
+ struct adreno_context *adreno_context = context ? context->devctxt : NULL;
+
+ /* restart device */
+ if (adreno_stop(device)) {
+ KGSL_DRV_ERR(device, "Device stop failed in recovery\n");
+ return 1;
+ }
+
+ if (adreno_start(device, true)) {
+ KGSL_DRV_ERR(device, "Device start failed in recovery\n");
+ return 1;
+ }
+
+ if (context)
+ kgsl_mmu_setstate(&device->mmu, adreno_context->pagetable,
+ KGSL_MEMSTORE_GLOBAL);
+
+ /* If iommu is used then we need to make sure that the iommu clocks
+ * are on since there could be commands in pipeline that touch iommu */
+ if (KGSL_MMU_TYPE_IOMMU == kgsl_mmu_get_mmutype()) {
+ if (kgsl_mmu_enable_clk(&device->mmu,
+ KGSL_IOMMU_CONTEXT_USER))
+ return 1;
+ }
+
+ return 0;
+}
+
+static int
+_adreno_recovery_resubmit(struct kgsl_device *device,
+ struct adreno_ringbuffer *rb,
+ struct kgsl_context *context,
+ struct adreno_recovery_data *rec_data,
+ unsigned int *buff, unsigned int size)
+{
+ int ret = 0;
+
+ if (_adreno_restart_device(device, context))
+ return 1;
+
+ if (size) {
+
+ /* submit commands and wait for them to pass */
+ adreno_ringbuffer_restore(rb, buff, size);
+
+ ret = adreno_idle(device);
+ }
+
+ return ret;
+}
+
+
+static int
+_adreno_recover_hang(struct kgsl_device *device,
+ struct adreno_recovery_data *rec_data)
+{
+ int ret = 0, i;
struct adreno_device *adreno_dev = ADRENO_DEVICE(device);
struct adreno_ringbuffer *rb = &adreno_dev->ringbuffer;
struct kgsl_context *context;
@@ -1655,103 +1723,117 @@
* hang */
adreno_ringbuffer_extract(rb, rec_data);
- /* restart device */
- ret = adreno_stop(device);
- if (ret) {
- KGSL_DRV_ERR(device, "Device stop failed in recovery\n");
- goto done;
- }
+ /* Do not try the bad commands if hang is due to a fault */
+ if (rec_data->fault)
+ goto play_good_cmds;
- ret = adreno_start(device, true);
- if (ret) {
- KGSL_DRV_ERR(device, "Device start failed in recovery\n");
- goto done;
- }
+ if (rec_data->step == FT_REPLAY_BAD_CTXT_CMDS) {
- if (context)
- kgsl_mmu_setstate(&device->mmu, adreno_context->pagetable,
- KGSL_MEMSTORE_GLOBAL);
+ ret = _adreno_recovery_resubmit(device, rb, context, rec_data,
+ rec_data->bad_rb_buffer, rec_data->bad_rb_size);
- /* If iommu is used then we need to make sure that the iommu clocks
- * are on since there could be commands in pipeline that touch iommu */
- if (KGSL_MMU_TYPE_IOMMU == kgsl_mmu_get_mmutype()) {
- ret = kgsl_mmu_enable_clk(&device->mmu,
- KGSL_IOMMU_CONTEXT_USER);
if (ret)
- goto done;
+ rec_data->step = FT_NOT_IB_BAD_CTXT_CMDS;
+ else
+ goto play_good_cmds;
+
}
- /* Do not try the bad commands if recovery has failed bad commands
- * once already or if hang is due to a fault */
- if (!try_bad_commands || rec_data->fault)
- rec_data->bad_rb_size = 0;
+ if (rec_data->step == FT_NOT_IB_BAD_CTXT_CMDS) {
- if (rec_data->bad_rb_size) {
- int idle_ret;
- /* submit the bad and good context commands and wait for
- * them to pass */
- adreno_ringbuffer_restore(rb, rec_data->bad_rb_buffer,
- rec_data->bad_rb_size);
- idle_ret = adreno_idle(device);
- if (idle_ret) {
- ret = adreno_stop(device);
- if (ret) {
- KGSL_DRV_ERR(device,
- "Device stop failed in recovery\n");
- goto done;
- }
- ret = adreno_start(device, true);
- if (ret) {
- KGSL_DRV_ERR(device,
- "Device start failed in recovery\n");
- goto done;
- }
- if (context)
- kgsl_mmu_setstate(&device->mmu,
- adreno_context->pagetable,
- KGSL_MEMSTORE_GLOBAL);
+ for (i = 0; i < rec_data->bad_rb_size; i++) {
+ if ((rec_data->bad_rb_buffer[i] ==
+ CP_HDR_INDIRECT_BUFFER_PFD) &&
+ (rec_data->bad_rb_buffer[i+1] ==
+ rec_data->ib1)) {
- if (KGSL_MMU_TYPE_IOMMU == kgsl_mmu_get_mmutype()) {
- ret = kgsl_mmu_enable_clk(&device->mmu,
- KGSL_IOMMU_CONTEXT_USER);
- if (ret)
- goto done;
+ rec_data->bad_rb_buffer[i] = cp_nop_packet(2);
+ rec_data->bad_rb_buffer[i+1] =
+ KGSL_NOP_IB_IDENTIFIER;
+ rec_data->bad_rb_buffer[i+2] =
+ KGSL_NOP_IB_IDENTIFIER;
+ break;
}
-
- ret = idle_ret;
- KGSL_DRV_ERR(device,
- "Bad context commands hung in recovery\n");
- } else {
- KGSL_DRV_ERR(device,
- "Bad context commands succeeded in recovery\n");
- if (adreno_context)
- adreno_context->flags = (adreno_context->flags &
- ~CTXT_FLAGS_GPU_HANG) |
- CTXT_FLAGS_GPU_HANG_RECOVERED;
- adreno_dev->drawctxt_active = last_active_ctx;
}
- }
- /* If either the bad command sequence failed or we did not play it */
- if (ret || !rec_data->bad_rb_size) {
- adreno_ringbuffer_restore(rb, rec_data->rb_buffer,
- rec_data->rb_size);
- ret = adreno_idle(device);
+
+ if ((i == (rec_data->bad_rb_size)) || (!rec_data->ib1)) {
+ KGSL_DRV_ERR(device, "Bad IB to NOP not found\n");
+ rec_data->step = FT_FAIL_BAD_CTXT_CMDS;
+ goto play_good_cmds;
+ }
+
+ ret = _adreno_recovery_resubmit(device, rb, context, rec_data,
+ rec_data->bad_rb_buffer, rec_data->bad_rb_size);
+
if (ret) {
- /* If we fail here we can try to invalidate another
- * context and try recovering again */
- ret = -EAGAIN;
- goto done;
- }
- /* ringbuffer now has data from the last valid context id,
- * so restore the active_ctx to the last valid context */
- if (rec_data->last_valid_ctx_id) {
- struct kgsl_context *last_ctx =
- idr_find(&device->context_idr,
- rec_data->last_valid_ctx_id);
- if (last_ctx)
- adreno_dev->drawctxt_active = last_ctx->devctxt;
- }
+ KGSL_DRV_ERR(device, "NOP faulty IB unsuccessful\n");
+ rec_data->step = FT_SKIP_EOF_BAD_CTXT_CMDS;
+ } else
+ goto play_good_cmds;
}
+
+ if (rec_data->step == FT_SKIP_EOF_BAD_CTXT_CMDS) {
+
+ for (i = 0; i < rec_data->bad_rb_size; i++) {
+ if (rec_data->bad_rb_buffer[i] ==
+ KGSL_END_OF_FRAME_IDENTIFIER) {
+ rec_data->bad_rb_buffer[0] = cp_nop_packet(i);
+ break;
+ }
+ }
+
+ /* EOF not found in RB, discard till EOF in
+ * next IB submission */
+ if (i == rec_data->bad_rb_size) {
+ adreno_context->flags |= CTXT_FLAGS_SKIP_EOF;
+ rec_data->bad_rb_buffer[0] = cp_nop_packet(i);
+ }
+
+ ret = _adreno_recovery_resubmit(device, rb, context, rec_data,
+ rec_data->bad_rb_buffer, rec_data->bad_rb_size);
+
+ if (ret) {
+ KGSL_DRV_ERR(device, "Skip EOF unsuccessful\n");
+ rec_data->step = FT_FAIL_BAD_CTXT_CMDS;
+ } else
+ goto play_good_cmds;
+ }
+
+play_good_cmds:
+
+ if (rec_data->step == FT_FAIL_BAD_CTXT_CMDS)
+ KGSL_DRV_ERR(device, "Bad context commands failed\n");
+ else {
+
+ if (adreno_context) {
+ adreno_context->flags = (adreno_context->flags &
+ ~CTXT_FLAGS_GPU_HANG) | CTXT_FLAGS_GPU_HANG_RECOVERED;
+ }
+ adreno_dev->drawctxt_active = last_active_ctx;
+ }
+
+ ret = _adreno_recovery_resubmit(device, rb, context, rec_data,
+ rec_data->good_rb_buffer, rec_data->good_rb_size);
+
+ if (ret) {
+ /* If we fail here we can try to invalidate another
+ * context and try fault tolerance again */
+ ret = -EAGAIN;
+ KGSL_DRV_ERR(device, "Playing good commands unsuccessful\n");
+ goto done;
+ }
+
+
+ /* ringbuffer now has data from the last valid context id,
+ * so restore the active_ctx to the last valid context */
+ if (rec_data->last_valid_ctx_id) {
+ struct kgsl_context *last_ctx =
+ idr_find(&device->context_idr,
+ rec_data->last_valid_ctx_id);
+ if (last_ctx)
+ adreno_dev->drawctxt_active = last_ctx->devctxt;
+ }
+
done:
/* Turn off iommu clocks */
if (KGSL_MMU_TYPE_IOMMU == kgsl_mmu_get_mmutype())
@@ -1779,10 +1861,8 @@
/* We may need to replay commands multiple times based on whether
* multiple contexts hang the GPU */
while (true) {
- if (!ret)
- ret = _adreno_recover_hang(device, rec_data, true);
- else
- ret = _adreno_recover_hang(device, rec_data, false);
+
+ ret = _adreno_recover_hang(device, rec_data);
if (-EAGAIN == ret) {
/* setup new recovery parameters and retry, this