msm: kgsl: Faster GPU hang detection
Check global GPU status registers in intervals of 2 seconds,
if the registers did not change after 2 seconds trigger a hang.
When there is an actual GPU hang and GPU is stalled, this reduces
the current hang detection time from 10 seconds to 2 seconds.
Faster GPU hang detection reduces the overall time to recover
from GPU hang.
Change-Id: If432ccacc1b77b4fc7f08b756886bd3ae5edf04f
Signed-off-by: Tarun Karra <tkarra@codeaurora.org>
diff --git a/drivers/gpu/msm/adreno.c b/drivers/gpu/msm/adreno.c
index ab47d40..bc687f5 100644
--- a/drivers/gpu/msm/adreno.c
+++ b/drivers/gpu/msm/adreno.c
@@ -111,6 +111,21 @@
.ib_check_level = 0,
};
+/* This set of registers are used for Hang detection
+ * If the values of these registers are same after
+ * KGSL_TIMEOUT_PART time, GPU hang is reported in
+ * kernel log.
+ */
+unsigned int hang_detect_regs[] = {
+ A3XX_RBBM_STATUS,
+ REG_CP_RB_RPTR,
+ REG_CP_IB1_BASE,
+ REG_CP_IB1_BUFSZ,
+ REG_CP_IB2_BASE,
+ REG_CP_IB2_BUFSZ,
+};
+
+const unsigned int hang_detect_regs_count = ARRAY_SIZE(hang_detect_regs);
/*
* This is the master list of all GPU cores that are supported by this
@@ -748,6 +763,10 @@
kgsl_mh_start(device);
}
+ /* Assign correct RBBM status register to hang detect regs
+ */
+ hang_detect_regs[0] = adreno_dev->gpudev->reg_rbbm_status;
+
status = kgsl_mmu_start(device);
if (status)
goto error_clk_off;
@@ -1114,7 +1133,10 @@
unsigned long wait_time_part;
unsigned int msecs;
unsigned int msecs_first;
- unsigned int msecs_part;
+ unsigned int msecs_part = KGSL_TIMEOUT_PART;
+ unsigned int prev_reg_val[hang_detect_regs_count];
+
+ memset(prev_reg_val, 0, sizeof(prev_reg_val));
kgsl_cffdump_regpoll(device->id,
adreno_dev->gpudev->reg_rbbm_status << 2,
@@ -1126,7 +1148,6 @@
if (rb->flags & KGSL_FLAGS_STARTED) {
msecs = adreno_dev->wait_timeout;
msecs_first = (msecs <= 100) ? ((msecs + 4) / 5) : 100;
- msecs_part = (msecs - msecs_first + 3) / 4;
wait_time = jiffies + wait_timeout;
wait_time_part = jiffies + msecs_to_jiffies(msecs_first);
adreno_poke(device);
@@ -1135,6 +1156,8 @@
adreno_poke(device);
wait_time_part = jiffies +
msecs_to_jiffies(msecs_part);
+ if ((adreno_hang_detect(device, prev_reg_val)))
+ goto err;
}
GSL_RB_GET_READPTR(rb, &rb->rptr);
if (time_after(jiffies, wait_time)) {
@@ -1147,6 +1170,7 @@
/* now, wait for the GPU to finish its operations */
wait_time = jiffies + wait_timeout;
+ wait_time_part = jiffies + msecs_to_jiffies(msecs_part);
while (time_before(jiffies, wait_time)) {
adreno_regread(device, adreno_dev->gpudev->reg_rbbm_status,
&rbbm_status);
@@ -1157,6 +1181,16 @@
if (!(rbbm_status & 0x80000000))
return 0;
}
+
+ /* Dont wait for timeout, detect hang faster.
+ */
+ if (time_after(jiffies, wait_time_part)) {
+ wait_time_part = jiffies +
+ msecs_to_jiffies(msecs_part);
+ if ((adreno_hang_detect(device, prev_reg_val)))
+ goto err;
+ }
+
}
err:
@@ -1422,6 +1456,32 @@
__ret; \
})
+
+
+unsigned int adreno_hang_detect(struct kgsl_device *device,
+ unsigned int *prev_reg_val)
+{
+ struct adreno_device *adreno_dev = ADRENO_DEVICE(device);
+ unsigned int curr_reg_val[hang_detect_regs_count];
+ unsigned int hang_detected = 1;
+ unsigned int i;
+
+ if (!adreno_dev->fast_hang_detect)
+ return 0;
+
+ for (i = 0; i < hang_detect_regs_count; i++) {
+ adreno_regread(device, hang_detect_regs[i],
+ &curr_reg_val[i]);
+ if (curr_reg_val[i] != prev_reg_val[i]) {
+ prev_reg_val[i] = curr_reg_val[i];
+ hang_detected = 0;
+ }
+ }
+
+ return hang_detected;
+}
+
+
/* MUST be called with the device mutex held */
static int adreno_waittimestamp(struct kgsl_device *device,
struct kgsl_context *context,
@@ -1433,16 +1493,20 @@
static uint io_cnt;
struct adreno_device *adreno_dev = ADRENO_DEVICE(device);
struct kgsl_pwrctrl *pwr = &device->pwrctrl;
- int retries;
+ int retries = 0;
unsigned int msecs_first;
- unsigned int msecs_part;
+ unsigned int msecs_part = KGSL_TIMEOUT_PART;
unsigned int ts_issued;
unsigned int context_id = _get_context_id(context);
+ unsigned int time_elapsed = 0;
+ unsigned int prev_reg_val[hang_detect_regs_count];
+
+ memset(prev_reg_val, 0, sizeof(prev_reg_val));
ts_issued = adreno_dev->ringbuffer.timestamp[context_id];
/* Don't wait forever, set a max value for now */
- if (msecs == -1)
+ if (msecs == KGSL_TIMEOUT_DEFAULT)
msecs = adreno_dev->wait_timeout;
if (timestamp_cmp(timestamp, ts_issued) > 0) {
@@ -1458,8 +1522,7 @@
* been updated properly.
*/
msecs_first = (msecs <= 100) ? ((msecs + 4) / 5) : 100;
- msecs_part = (msecs - msecs_first + 3) / 4;
- for (retries = 0; retries < 5; retries++) {
+ do {
/*
* If the context ID is invalid, we are in a race with
* the context being destroyed by userspace so bail.
@@ -1484,6 +1547,11 @@
if (io_cnt <
pwr->pwrlevels[pwr->active_pwrlevel].io_fraction)
io = 0;
+
+ if ((retries > 0) &&
+ (adreno_hang_detect(device, prev_reg_val)))
+ goto hang_dump;
+
mutex_unlock(&device->mutex);
/* We need to make sure that the process is
* placed in wait-q before its condition is called
@@ -1505,7 +1573,14 @@
goto done;
}
/*this wait timed out*/
- }
+
+ time_elapsed = time_elapsed +
+ (retries ? msecs_part : msecs_first);
+ retries++;
+
+ } while (time_elapsed < msecs);
+
+hang_dump:
status = -ETIMEDOUT;
KGSL_DRV_ERR(device,
"Device hang detected while waiting for timestamp: "
diff --git a/drivers/gpu/msm/adreno.h b/drivers/gpu/msm/adreno.h
index 88ed895..e1d1eb9 100644
--- a/drivers/gpu/msm/adreno.h
+++ b/drivers/gpu/msm/adreno.h
@@ -82,6 +82,7 @@
unsigned int pix_shader_start;
unsigned int instruction_size;
unsigned int ib_check_level;
+ unsigned int fast_hang_detect;
};
struct adreno_gpudev {
@@ -124,6 +125,10 @@
extern const unsigned int a3xx_registers[];
extern const unsigned int a3xx_registers_count;
+extern unsigned int hang_detect_regs[];
+extern const unsigned int hang_detect_regs_count;
+
+
int adreno_idle(struct kgsl_device *device, unsigned int timeout);
void adreno_regread(struct kgsl_device *device, unsigned int offsetwords,
unsigned int *value);
@@ -146,6 +151,9 @@
int adreno_dump_and_recover(struct kgsl_device *device);
+unsigned int adreno_hang_detect(struct kgsl_device *device,
+ unsigned int *prev_reg_val);
+
static inline int adreno_is_a200(struct adreno_device *adreno_dev)
{
return (adreno_dev->gpurev == ADRENO_REV_A200);
diff --git a/drivers/gpu/msm/adreno_debugfs.c b/drivers/gpu/msm/adreno_debugfs.c
index 822cf14..e3c9a18 100644
--- a/drivers/gpu/msm/adreno_debugfs.c
+++ b/drivers/gpu/msm/adreno_debugfs.c
@@ -117,6 +117,11 @@
debugfs_create_u32("ib_check", 0644, device->d_debugfs,
&adreno_dev->ib_check_level);
+ /* By Default enable fast hang detection */
+ adreno_dev->fast_hang_detect = 1;
+ debugfs_create_u32("fast_hang_detect", 0644, device->d_debugfs,
+ &adreno_dev->fast_hang_detect);
+
/* Create post mortem control files */
pm_d_debugfs = debugfs_create_dir("postmortem", device->d_debugfs);
diff --git a/drivers/gpu/msm/adreno_ringbuffer.c b/drivers/gpu/msm/adreno_ringbuffer.c
index 73b4513..afcceee 100644
--- a/drivers/gpu/msm/adreno_ringbuffer.c
+++ b/drivers/gpu/msm/adreno_ringbuffer.c
@@ -56,6 +56,11 @@
struct adreno_device *adreno_dev = ADRENO_DEVICE(rb->device);
unsigned long wait_timeout = msecs_to_jiffies(adreno_dev->wait_timeout);
unsigned long wait_time;
+ unsigned long wait_time_part;
+ unsigned int msecs_part = KGSL_TIMEOUT_PART;
+ unsigned int prev_reg_val[hang_detect_regs_count];
+
+ memset(prev_reg_val, 0, sizeof(prev_reg_val));
/* if wptr ahead, fill the remaining with NOPs */
if (wptr_ahead) {
@@ -83,6 +88,7 @@
}
wait_time = jiffies + wait_timeout;
+ wait_time_part = jiffies + msecs_to_jiffies(msecs_part);
/* wait for space in ringbuffer */
while (1) {
GSL_RB_GET_READPTR(rb, &rb->rptr);
@@ -92,16 +98,34 @@
if (freecmds == 0 || freecmds > numcmds)
break;
+ /* Dont wait for timeout, detect hang faster.
+ */
+ if (time_after(jiffies, wait_time_part)) {
+ wait_time_part = jiffies +
+ msecs_to_jiffies(msecs_part);
+ if ((adreno_hang_detect(rb->device,
+ prev_reg_val))){
+ KGSL_DRV_ERR(rb->device,
+ "Hang detected while waiting for freespace in"
+ "ringbuffer rptr: 0x%x, wptr: 0x%x\n",
+ rb->rptr, rb->wptr);
+ goto err;
+ }
+ }
+
if (time_after(jiffies, wait_time)) {
KGSL_DRV_ERR(rb->device,
"Timed out while waiting for freespace in ringbuffer "
"rptr: 0x%x, wptr: 0x%x\n", rb->rptr, rb->wptr);
- if (!adreno_dump_and_recover(rb->device))
+ goto err;
+ }
+
+err:
+ if (!adreno_dump_and_recover(rb->device))
wait_time = jiffies + wait_timeout;
else
/* GPU is hung and we cannot recover */
BUG();
- }
}
}
diff --git a/drivers/gpu/msm/kgsl_device.h b/drivers/gpu/msm/kgsl_device.h
index d464cf1..75b688b 100644
--- a/drivers/gpu/msm/kgsl_device.h
+++ b/drivers/gpu/msm/kgsl_device.h
@@ -25,6 +25,7 @@
#define KGSL_TIMEOUT_NONE 0
#define KGSL_TIMEOUT_DEFAULT 0xFFFFFFFF
+#define KGSL_TIMEOUT_PART 2000 /* 2 sec */
#define FIRST_TIMEOUT (HZ / 2)