Execute shader on wakeup from power collapse

After wakeup from power collapse, a shader execution is needed
to reset the GPU to a known and stable state on certain targets.
This change builds the required shader command buffer at start-up
and executes it with the first command submission after each
power-on (initialization or recovery from power collapse) on the
affected A3xx GPU targets (A305 and A320).

CRs-fixed: 526544

Change-Id: Ibd33578ecbb94df5ff276d1e460d94235a82dc43
Signed-off-by: Richard Ruigrok <rruigrok@codeaurora.org>
Signed-off-by: Ananta Kishore K <akollipa@codeaurora.org>
diff --git a/drivers/gpu/msm/a3xx_reg.h b/drivers/gpu/msm/a3xx_reg.h
index 21d4759..f81fc67 100644
--- a/drivers/gpu/msm/a3xx_reg.h
+++ b/drivers/gpu/msm/a3xx_reg.h
@@ -231,6 +231,7 @@
 #define A3XX_PC_PERFCOUNTER1_SELECT 0xC49
 #define A3XX_PC_PERFCOUNTER2_SELECT 0xC4A
 #define A3XX_PC_PERFCOUNTER3_SELECT 0xC4B
+#define A3XX_GRAS_TSE_DEBUG_ECO 0xC81
 #define A3XX_GRAS_PERFCOUNTER0_SELECT 0xC88
 #define A3XX_GRAS_PERFCOUNTER1_SELECT 0xC89
 #define A3XX_GRAS_PERFCOUNTER2_SELECT 0xC8A
@@ -268,6 +269,10 @@
 #define A3XX_HLSQ_PERFCOUNTER3_SELECT 0xE03
 #define A3XX_HLSQ_PERFCOUNTER4_SELECT 0xE04
 #define A3XX_HLSQ_PERFCOUNTER5_SELECT 0xE05
+#define A3XX_RB_DEBUG_ECO_CONTROLS_ADDR 0xCC1
+#define A3XX_RB_PERFCOUNTER0_SELECT   0xCC6
+#define A3XX_RB_PERFCOUNTER1_SELECT   0xCC7
+#define A3XX_RB_FRAME_BUFFER_DIMENSION 0xCE0
 #define A3XX_VFD_PERFCOUNTER0_SELECT 0xE44
 #define A3XX_VFD_PERFCOUNTER1_SELECT 0xE45
 #define A3XX_VPC_VPC_DEBUG_RAM_SEL 0xE61
@@ -299,6 +304,9 @@
 #define A3XX_GRAS_CL_CLIP_CNTL 0x2040
 #define A3XX_GRAS_CL_GB_CLIP_ADJ 0x2044
 #define A3XX_GRAS_CL_VPORT_XOFFSET 0x2048
+#define A3XX_GRAS_CL_VPORT_XSCALE 0x2049
+#define A3XX_GRAS_CL_VPORT_YOFFSET 0x204A
+#define A3XX_GRAS_CL_VPORT_YSCALE 0x204B
 #define A3XX_GRAS_CL_VPORT_ZOFFSET 0x204C
 #define A3XX_GRAS_CL_VPORT_ZSCALE 0x204D
 #define A3XX_GRAS_SU_POINT_MINMAX 0x2068
@@ -314,30 +322,75 @@
 #define A3XX_RB_MODE_CONTROL 0x20C0
 #define A3XX_RB_RENDER_CONTROL 0x20C1
 #define A3XX_RB_MSAA_CONTROL 0x20C2
+#define A3XX_RB_ALPHA_REFERENCE 0x20C3
 #define A3XX_RB_MRT_CONTROL0 0x20C4
 #define A3XX_RB_MRT_BUF_INFO0 0x20C5
+#define A3XX_RB_MRT_BUF_BASE0 0x20C6
 #define A3XX_RB_MRT_BLEND_CONTROL0 0x20C7
+#define A3XX_RB_MRT_CONTROL1 0x20C8
+#define A3XX_RB_MRT_BUF_INFO1 0x20C9
+#define A3XX_RB_MRT_BUF_BASE1 0x20CA
 #define A3XX_RB_MRT_BLEND_CONTROL1 0x20CB
+#define A3XX_RB_MRT_CONTROL2 0x20CC
+#define A3XX_RB_MRT_BUF_INFO2 0x20CD
+#define A3XX_RB_MRT_BUF_BASE2 0x20CE
 #define A3XX_RB_MRT_BLEND_CONTROL2 0x20CF
+#define A3XX_RB_MRT_CONTROL3 0x20D0
+#define A3XX_RB_MRT_BUF_INFO3 0x20D1
+#define A3XX_RB_MRT_BUF_BASE3 0x20D2
 #define A3XX_RB_MRT_BLEND_CONTROL3 0x20D3
 #define A3XX_RB_BLEND_RED 0x20E4
+#define A3XX_RB_BLEND_GREEN 0x20E5
+#define A3XX_RB_BLEND_BLUE 0x20E6
+#define A3XX_RB_BLEND_ALPHA 0x20E7
+#define A3XX_RB_CLEAR_COLOR_DW0 0x20E8
+#define A3XX_RB_CLEAR_COLOR_DW1 0x20E9
+#define A3XX_RB_CLEAR_COLOR_DW2 0x20EA
+#define A3XX_RB_CLEAR_COLOR_DW3 0x20EB
 #define A3XX_RB_COPY_CONTROL 0x20EC
+#define A3XX_RB_COPY_DEST_BASE 0x20ED
+#define A3XX_RB_COPY_DEST_PITCH 0x20EE
 #define A3XX_RB_COPY_DEST_INFO 0x20EF
 #define A3XX_RB_DEPTH_CONTROL 0x2100
+#define A3XX_RB_DEPTH_CLEAR 0x2101
+#define A3XX_RB_DEPTH_BUF_INFO 0x2102
+#define A3XX_RB_DEPTH_BUF_PITCH 0x2103
 #define A3XX_RB_STENCIL_CONTROL 0x2104
+#define A3XX_RB_STENCIL_CLEAR 0x2105
+#define A3XX_RB_STENCIL_BUF_INFO 0x2106
+#define A3XX_RB_STENCIL_BUF_PITCH 0x2107
+#define A3XX_RB_STENCIL_REF_MASK 0x2108
+#define A3XX_RB_STENCIL_REF_MASK_BF 0x2109
+#define A3XX_RB_LRZ_VSC_CONTROL 0x210C
+#define A3XX_RB_WINDOW_OFFSET 0x210E
+#define A3XX_RB_SAMPLE_COUNT_CONTROL 0x2110
+#define A3XX_RB_SAMPLE_COUNT_ADDR 0x2111
+#define A3XX_RB_Z_CLAMP_MIN 0x2114
+#define A3XX_RB_Z_CLAMP_MAX 0x2115
 #define A3XX_PC_VSTREAM_CONTROL 0x21E4
 #define A3XX_PC_VERTEX_REUSE_BLOCK_CNTL 0x21EA
 #define A3XX_PC_PRIM_VTX_CNTL 0x21EC
 #define A3XX_PC_RESTART_INDEX 0x21ED
 #define A3XX_HLSQ_CONTROL_0_REG 0x2200
+#define A3XX_HLSQ_CONTROL_1_REG 0x2201
+#define A3XX_HLSQ_CONTROL_2_REG 0x2202
+#define A3XX_HLSQ_CONTROL_3_REG 0x2203
 #define A3XX_HLSQ_VS_CONTROL_REG 0x2204
+#define A3XX_HLSQ_FS_CONTROL_REG 0x2205
+#define A3XX_HLSQ_CONST_VSPRESV_RANGE_REG 0x2206
 #define A3XX_HLSQ_CONST_FSPRESV_RANGE_REG 0x2207
 #define A3XX_HLSQ_CL_NDRANGE_0_REG 0x220A
+#define A3XX_HLSQ_CL_NDRANGE_1_REG 0x220B
 #define A3XX_HLSQ_CL_NDRANGE_2_REG 0x220C
+#define A3XX_HLSQ_CL_NDRANGE_3_REG 0x220D
+#define A3XX_HLSQ_CL_NDRANGE_4_REG 0x220E
+#define A3XX_HLSQ_CL_NDRANGE_5_REG 0x220F
+#define A3XX_HLSQ_CL_NDRANGE_6_REG 0x2210
 #define A3XX_HLSQ_CL_CONTROL_0_REG 0x2211
 #define A3XX_HLSQ_CL_CONTROL_1_REG 0x2212
 #define A3XX_HLSQ_CL_KERNEL_CONST_REG 0x2214
 #define A3XX_HLSQ_CL_KERNEL_GROUP_X_REG 0x2215
+#define A3XX_HLSQ_CL_KERNEL_GROUP_Y_REG 0x2216
 #define A3XX_HLSQ_CL_KERNEL_GROUP_Z_REG 0x2217
 #define A3XX_HLSQ_CL_WG_OFFSET_REG 0x221A
 #define A3XX_VFD_CONTROL_0 0x2240
@@ -354,10 +407,21 @@
 #define A3XX_SP_VS_CTRL_REG0 0x22C4
 #define A3XX_SP_VS_CTRL_REG1 0x22C5
 #define A3XX_SP_VS_PARAM_REG 0x22C6
+#define A3XX_SP_VS_OUT_REG_0 0x22C7
+#define A3XX_SP_VS_OUT_REG_1 0x22C8
+#define A3XX_SP_VS_OUT_REG_2 0x22C9
+#define A3XX_SP_VS_OUT_REG_3 0x22CA
+#define A3XX_SP_VS_OUT_REG_4 0x22CB
+#define A3XX_SP_VS_OUT_REG_5 0x22CC
+#define A3XX_SP_VS_OUT_REG_6 0x22CD
 #define A3XX_SP_VS_OUT_REG_7 0x22CE
 #define A3XX_SP_VS_VPC_DST_REG_0 0x22D0
+#define A3XX_SP_VS_VPC_DST_REG_1 0x22D1
+#define A3XX_SP_VS_VPC_DST_REG_2 0x22D2
+#define A3XX_SP_VS_VPC_DST_REG_3 0x22D3
 #define A3XX_SP_VS_OBJ_OFFSET_REG 0x22D4
 #define A3XX_SP_VS_OBJ_START_REG 0x22D5
+#define A3XX_SP_VS_PVT_MEM_PARAM_REG 0x22D6
 #define A3XX_SP_VS_PVT_MEM_ADDR_REG 0x22D7
 #define A3XX_SP_VS_PVT_MEM_SIZE_REG 0x22D8
 #define A3XX_SP_VS_LENGTH_REG 0x22DF
@@ -365,13 +429,19 @@
 #define A3XX_SP_FS_CTRL_REG1 0x22E1
 #define A3XX_SP_FS_OBJ_OFFSET_REG 0x22E2
 #define A3XX_SP_FS_OBJ_START_REG 0x22E3
+#define A3XX_SP_FS_PVT_MEM_PARAM_REG 0x22E4
 #define A3XX_SP_FS_PVT_MEM_ADDR_REG 0x22E5
 #define A3XX_SP_FS_PVT_MEM_SIZE_REG 0x22E6
 #define A3XX_SP_FS_FLAT_SHAD_MODE_REG_0 0x22E8
 #define A3XX_SP_FS_FLAT_SHAD_MODE_REG_1 0x22E9
 #define A3XX_SP_FS_OUTPUT_REG 0x22EC
 #define A3XX_SP_FS_MRT_REG_0 0x22F0
+#define A3XX_SP_FS_MRT_REG_1 0x22F1
+#define A3XX_SP_FS_MRT_REG_2 0x22F2
+#define A3XX_SP_FS_MRT_REG_3 0x22F3
 #define A3XX_SP_FS_IMAGE_OUTPUT_REG_0 0x22F4
+#define A3XX_SP_FS_IMAGE_OUTPUT_REG_1 0x22F5
+#define A3XX_SP_FS_IMAGE_OUTPUT_REG_2 0x22F6
 #define A3XX_SP_FS_IMAGE_OUTPUT_REG_3 0x22F7
 #define A3XX_SP_FS_LENGTH_REG 0x22FF
 #define A3XX_TPL1_TP_VS_TEX_OFFSET 0x2340
diff --git a/drivers/gpu/msm/adreno.c b/drivers/gpu/msm/adreno.c
index cf8970f..bb7de19 100755
--- a/drivers/gpu/msm/adreno.c
+++ b/drivers/gpu/msm/adreno.c
@@ -572,13 +572,15 @@
 
 	kgsl_mmu_unmap(pagetable, &device->memstore);
 
+	kgsl_mmu_unmap(pagetable, &adreno_dev->pwron_fixup);
+
 	kgsl_mmu_unmap(pagetable, &device->mmu.setstate_memory);
 }
 
 static int adreno_setup_pt(struct kgsl_device *device,
 			struct kgsl_pagetable *pagetable)
 {
-	int result = 0;
+	int result;
 	struct adreno_device *adreno_dev = ADRENO_DEVICE(device);
 	struct adreno_ringbuffer *rb = &adreno_dev->ringbuffer;
 
@@ -594,6 +596,10 @@
 	if (result)
 		goto unmap_memptrs_desc;
 
+	result = kgsl_mmu_map_global(pagetable, &adreno_dev->pwron_fixup);
+	if (result)
+		goto unmap_memstore_only;
+
 	result = kgsl_mmu_map_global(pagetable, &device->mmu.setstate_memory);
 	if (result)
 		goto unmap_memstore_desc;
@@ -622,6 +628,9 @@
 unmap_memstore_desc:
+	/* pwron_fixup is mapped on this path; unmap it first */
+	kgsl_mmu_unmap(pagetable, &adreno_dev->pwron_fixup);
+unmap_memstore_only:
 	kgsl_mmu_unmap(pagetable, &device->memstore);
 
 unmap_memptrs_desc:
 	kgsl_mmu_unmap(pagetable, &rb->memptrs_desc);
 
@@ -1654,6 +1663,15 @@
 	kgsl_pwrctrl_enable(device);
 
 	/* Set up a2xx special case */
+
+	/* Certain targets need the fixup.  You know who you are */
+	if (adreno_is_a305(adreno_dev) || adreno_is_a320(adreno_dev))
+		adreno_a3xx_pwron_fixup_init(adreno_dev);
+
+	/* Set the bit to indicate that we've just powered on */
+	set_bit(ADRENO_DEVICE_PWRON, &adreno_dev->priv);
+
+	/* Set up the MMU */
 	if (adreno_is_a2xx(adreno_dev)) {
 		/*
 		 * the MH_CLNT_INTF_CTRL_CONFIG registers aren't present
@@ -3241,6 +3259,9 @@
 	if (kgsl_gpuaddr_in_memdesc(&device->memstore, gpuaddr, size))
 		return &device->memstore;
 
+	if (kgsl_gpuaddr_in_memdesc(&adreno_dev->pwron_fixup, gpuaddr, size))
+		return &adreno_dev->pwron_fixup;
+
 	if (kgsl_gpuaddr_in_memdesc(&device->mmu.setstate_memory, gpuaddr,
 					size))
 		return &device->mmu.setstate_memory;
diff --git a/drivers/gpu/msm/adreno.h b/drivers/gpu/msm/adreno.h
index 79ea40c..1444851 100755
--- a/drivers/gpu/msm/adreno.h
+++ b/drivers/gpu/msm/adreno.h
@@ -35,6 +35,7 @@
 #define KGSL_CMD_FLAGS_PMODE		0x00000001
 #define KGSL_CMD_FLAGS_INTERNAL_ISSUE	0x00000002
 #define KGSL_CMD_FLAGS_GET_INT		0x00000004
+#define KGSL_CMD_FLAGS_PWRON_FIXUP      0x00000008
 #define KGSL_CMD_FLAGS_EOF	        0x00000100
 
 /* Command identifiers */
@@ -45,6 +46,7 @@
 #define KGSL_END_OF_IB_IDENTIFIER	0x2ABEDEAD
 #define KGSL_END_OF_FRAME_IDENTIFIER	0x2E0F2E0F
 #define KGSL_NOP_IB_IDENTIFIER	        0x20F20F20
+#define KGSL_PWRON_FIXUP_IDENTIFIER	0x2AFAFAFA
 
 #ifdef CONFIG_MSM_SCM
 #define ADRENO_DEFAULT_PWRSCALE_POLICY  (&kgsl_pwrscale_policy_tz)
@@ -80,6 +82,7 @@
 
 struct adreno_device {
 	struct kgsl_device dev;    /* Must be first field in this struct */
+	unsigned long priv;
 	unsigned int chip_id;
 	enum adreno_gpurev gpurev;
 	unsigned long gmem_base;
@@ -115,6 +118,8 @@
 	struct kgsl_memdesc on_resume_cmd;
 	unsigned int on_resume_ib[3];
 	bool on_resume_issueib;
+	struct kgsl_memdesc pwron_fixup;
+	unsigned int pwron_fixup_dwords;
 };
 
 #define PERFCOUNTER_FLAG_NONE 0x0
@@ -155,6 +160,17 @@
 	unsigned int group_count;
 };
 
+/**
+ * enum adreno_device_flags - Bit numbers of the flags in adreno_device->priv
+ * @ADRENO_DEVICE_PWRON: Set at power-on, cleared by the next command submit
+ * @ADRENO_DEVICE_PWRON_FIXUP: Set once the power-on shader fixup buffer has
+ * been built for the target
+ */
+enum adreno_device_flags {
+	ADRENO_DEVICE_PWRON = 0,
+	ADRENO_DEVICE_PWRON_FIXUP = 1,
+};
+
 struct adreno_gpudev {
 	/*
 	 * These registers are in a different location on A3XX,  so define
@@ -313,6 +329,8 @@
 int adreno_ft_init_sysfs(struct kgsl_device *device);
 void adreno_ft_uninit_sysfs(struct kgsl_device *device);
 
+int adreno_a3xx_pwron_fixup_init(struct adreno_device *adreno_dev);
+
 static inline int adreno_is_a200(struct adreno_device *adreno_dev)
 {
 	return (adreno_dev->gpurev == ADRENO_REV_A200);
diff --git a/drivers/gpu/msm/adreno_a3xx.c b/drivers/gpu/msm/adreno_a3xx.c
index aa0f614..8db9524 100644
--- a/drivers/gpu/msm/adreno_a3xx.c
+++ b/drivers/gpu/msm/adreno_a3xx.c
@@ -2507,6 +2507,270 @@
 	}
 }
 
+static const unsigned int _a3xx_pwron_fixup_fs_instructions[] = {
+	0x00000000, 0x10000400, 0x00000000, 0x00000000,
+	0x00000000, 0x00000000, 0x00000000, 0x03000000,
+};
+
+/**
+ * adreno_a3xx_pwron_fixup_init() - Initialize a special command buffer to run a
+ * post-power collapse shader workaround
+ * @adreno_dev: Pointer to an adreno_device struct
+ *
+ * A3xx targets require a CL Exec after recovery from power-collapse.
+ * Construct the IB once at init time and keep it handy.
+ *
+ * Returns: 0 on success or if already built, else the allocation error code
+ */
+int adreno_a3xx_pwron_fixup_init(struct adreno_device *adreno_dev)
+{
+	unsigned int *cmds;
+	int count = sizeof(_a3xx_pwron_fixup_fs_instructions) >> 2; /* dwords */
+	int ret;
+	/* Return if the fixup is already in place */
+	if (test_bit(ADRENO_DEVICE_PWRON_FIXUP, &adreno_dev->priv))
+		return 0;
+
+	ret = kgsl_allocate_contiguous(&adreno_dev->pwron_fixup, PAGE_SIZE);
+
+	if (ret)
+		return ret;
+	adreno_dev->pwron_fixup.flags |= KGSL_MEMFLAGS_GPUREADONLY;
+	cmds = adreno_dev->pwron_fixup.hostptr;
+
+	*cmds++ = cp_type0_packet(A3XX_UCHE_CACHE_INVALIDATE0_REG, 2);
+	*cmds++ = 0x00000000;
+	*cmds++ = 0x90000000;
+	*cmds++ = cp_type3_packet(CP_WAIT_FOR_IDLE, 1);
+	*cmds++ = 0x00000000;
+	*cmds++ = cp_type3_packet(CP_REG_RMW, 3);
+	*cmds++ = A3XX_RBBM_CLOCK_CTL;
+	*cmds++ = 0xFFFCFFFF;
+	*cmds++ = 0x00010000;
+	*cmds++ = cp_type3_packet(CP_WAIT_FOR_IDLE, 1);
+	*cmds++ = 0x00000000;
+	*cmds++ = cp_type0_packet(A3XX_HLSQ_CONTROL_0_REG, 1);
+	*cmds++ = 0x1E000150;
+	*cmds++ = cp_type3_packet(CP_WAIT_FOR_IDLE, 1);
+	*cmds++ = 0x00000000;
+	*cmds++ = cp_type3_packet(CP_SET_CONSTANT, 2);
+	*cmds++ = CP_REG(A3XX_HLSQ_CONTROL_0_REG) | (0x1 << 30);
+	*cmds++ = 0x1E000150; /* NOTE(review): confirm bit 30 on line above */
+	*cmds++ = cp_type3_packet(CP_WAIT_FOR_IDLE, 1);
+	*cmds++ = 0x00000000;
+	*cmds++ = cp_type0_packet(A3XX_HLSQ_CONTROL_0_REG, 1);
+	*cmds++ = 0x1E000150;
+	*cmds++ = cp_type0_packet(A3XX_HLSQ_CONTROL_1_REG, 1);
+	*cmds++ = 0x00000040;
+	*cmds++ = cp_type0_packet(A3XX_HLSQ_CONTROL_2_REG, 1);
+	*cmds++ = 0x80000000;
+	*cmds++ = cp_type0_packet(A3XX_HLSQ_CONTROL_3_REG, 1);
+	*cmds++ = 0x00000000;
+	*cmds++ = cp_type0_packet(A3XX_HLSQ_VS_CONTROL_REG, 1);
+	*cmds++ = 0x00000001;
+	*cmds++ = cp_type0_packet(A3XX_HLSQ_FS_CONTROL_REG, 1);
+	*cmds++ = 0x00001002 | (count >> 3) << 24;
+	*cmds++ = cp_type0_packet(A3XX_HLSQ_CONST_VSPRESV_RANGE_REG, 1);
+	*cmds++ = 0x00000000;
+	*cmds++ = cp_type0_packet(A3XX_HLSQ_CONST_FSPRESV_RANGE_REG, 1);
+	*cmds++ = 0x00000000;
+	*cmds++ = cp_type0_packet(A3XX_HLSQ_CL_NDRANGE_0_REG, 1);
+	*cmds++ = 0x00401101;
+	*cmds++ = cp_type0_packet(A3XX_HLSQ_CL_NDRANGE_1_REG, 1);
+	*cmds++ = 0x00000400;
+	*cmds++ = cp_type0_packet(A3XX_HLSQ_CL_NDRANGE_2_REG, 1);
+	*cmds++ = 0x00000000;
+	*cmds++ = cp_type0_packet(A3XX_HLSQ_CL_NDRANGE_3_REG, 1);
+	*cmds++ = 0x00000001;
+	*cmds++ = cp_type0_packet(A3XX_HLSQ_CL_NDRANGE_4_REG, 1);
+	*cmds++ = 0x00000000;
+	*cmds++ = cp_type0_packet(A3XX_HLSQ_CL_NDRANGE_5_REG, 1);
+	*cmds++ = 0x00000001;
+	*cmds++ = cp_type0_packet(A3XX_HLSQ_CL_NDRANGE_6_REG, 1);
+	*cmds++ = 0x00000000;
+	*cmds++ = cp_type0_packet(A3XX_HLSQ_CL_CONTROL_0_REG, 1);
+	*cmds++ = 0x00000000;
+	*cmds++ = cp_type0_packet(A3XX_HLSQ_CL_CONTROL_1_REG, 1);
+	*cmds++ = 0x00000000;
+	*cmds++ = cp_type0_packet(A3XX_HLSQ_CL_KERNEL_CONST_REG, 1);
+	*cmds++ = 0x00000000;
+	*cmds++ = cp_type0_packet(A3XX_HLSQ_CL_KERNEL_GROUP_X_REG, 1);
+	*cmds++ = 0x00000010;
+	*cmds++ = cp_type0_packet(A3XX_HLSQ_CL_KERNEL_GROUP_Y_REG, 1);
+	*cmds++ = 0x00000001;
+	*cmds++ = cp_type0_packet(A3XX_HLSQ_CL_KERNEL_GROUP_Z_REG, 1);
+	*cmds++ = 0x00000001;
+	*cmds++ = cp_type0_packet(A3XX_HLSQ_CL_WG_OFFSET_REG, 1);
+	*cmds++ = 0x00000000;
+	*cmds++ = cp_type0_packet(A3XX_SP_SP_CTRL_REG, 1);
+	*cmds++ = 0x00040000;
+	*cmds++ = cp_type0_packet(A3XX_SP_VS_CTRL_REG0, 1);
+	*cmds++ = 0x0000000A;
+	*cmds++ = cp_type0_packet(A3XX_SP_VS_CTRL_REG1, 1);
+	*cmds++ = 0x00000001;
+	*cmds++ = cp_type0_packet(A3XX_SP_VS_PARAM_REG, 1);
+	*cmds++ = 0x00000000;
+	*cmds++ = cp_type0_packet(A3XX_SP_VS_OUT_REG_0, 1);
+	*cmds++ = 0x00000000;
+	*cmds++ = cp_type0_packet(A3XX_SP_VS_OUT_REG_1, 1);
+	*cmds++ = 0x00000000;
+	*cmds++ = cp_type0_packet(A3XX_SP_VS_OUT_REG_2, 1);
+	*cmds++ = 0x00000000;
+	*cmds++ = cp_type0_packet(A3XX_SP_VS_OUT_REG_3, 1);
+	*cmds++ = 0x00000000;
+	*cmds++ = cp_type0_packet(A3XX_SP_VS_OUT_REG_4, 1);
+	*cmds++ = 0x00000000;
+	*cmds++ = cp_type0_packet(A3XX_SP_VS_OUT_REG_5, 1);
+	*cmds++ = 0x00000000;
+	*cmds++ = cp_type0_packet(A3XX_SP_VS_OUT_REG_6, 1);
+	*cmds++ = 0x00000000;
+	*cmds++ = cp_type0_packet(A3XX_SP_VS_OUT_REG_7, 1);
+	*cmds++ = 0x00000000;
+	*cmds++ = cp_type0_packet(A3XX_SP_VS_VPC_DST_REG_0, 1);
+	*cmds++ = 0x00000000;
+	*cmds++ = cp_type0_packet(A3XX_SP_VS_VPC_DST_REG_1, 1);
+	*cmds++ = 0x00000000;
+	*cmds++ = cp_type0_packet(A3XX_SP_VS_VPC_DST_REG_2, 1);
+	*cmds++ = 0x00000000;
+	*cmds++ = cp_type0_packet(A3XX_SP_VS_VPC_DST_REG_3, 1);
+	*cmds++ = 0x00000000;
+	*cmds++ = cp_type0_packet(A3XX_SP_VS_OBJ_OFFSET_REG, 1);
+	*cmds++ = 0x00000000;
+	*cmds++ = cp_type0_packet(A3XX_SP_VS_OBJ_START_REG, 1);
+	*cmds++ = 0x00000004;
+	*cmds++ = cp_type0_packet(A3XX_SP_VS_PVT_MEM_PARAM_REG, 1);
+	*cmds++ = 0x04008001;
+	*cmds++ = cp_type0_packet(A3XX_SP_VS_PVT_MEM_ADDR_REG, 1);
+	*cmds++ = 0x00000000;
+	*cmds++ = cp_type0_packet(A3XX_SP_VS_PVT_MEM_SIZE_REG, 1);
+	*cmds++ = 0x00000000;
+	*cmds++ = cp_type0_packet(A3XX_SP_VS_LENGTH_REG, 1);
+	*cmds++ = 0x00000000;
+	*cmds++ = cp_type0_packet(A3XX_SP_FS_CTRL_REG0, 1);
+	*cmds++ = 0x00B0400A | (count >> 3) << 24;
+	*cmds++ = cp_type0_packet(A3XX_SP_FS_CTRL_REG1, 1);
+	*cmds++ = 0x00300402;
+	*cmds++ = cp_type0_packet(A3XX_SP_FS_OBJ_OFFSET_REG, 1);
+	*cmds++ = 0x00010000;
+	*cmds++ = cp_type0_packet(A3XX_SP_FS_OBJ_START_REG, 1);
+	*cmds++ = 0x00000000;
+	*cmds++ = cp_type0_packet(A3XX_SP_FS_PVT_MEM_PARAM_REG, 1);
+	*cmds++ = 0x04008001;
+	*cmds++ = cp_type0_packet(A3XX_SP_FS_PVT_MEM_ADDR_REG, 1);
+	*cmds++ = 0x00000000;
+	*cmds++ = cp_type0_packet(A3XX_SP_FS_PVT_MEM_SIZE_REG, 1);
+	*cmds++ = 0x00000000;
+	*cmds++ = cp_type0_packet(A3XX_SP_FS_FLAT_SHAD_MODE_REG_0, 1);
+	*cmds++ = 0x00000000;
+	*cmds++ = cp_type0_packet(A3XX_SP_FS_FLAT_SHAD_MODE_REG_1, 1);
+	*cmds++ = 0x00000000;
+	*cmds++ = cp_type0_packet(A3XX_SP_FS_OUTPUT_REG, 1);
+	*cmds++ = 0x00000000;
+	*cmds++ = cp_type0_packet(A3XX_SP_FS_MRT_REG_0, 1);
+	*cmds++ = 0x00000000;
+	*cmds++ = cp_type0_packet(A3XX_SP_FS_MRT_REG_1, 1);
+	*cmds++ = 0x00000000;
+	*cmds++ = cp_type0_packet(A3XX_SP_FS_MRT_REG_2, 1);
+	*cmds++ = 0x00000000;
+	*cmds++ = cp_type0_packet(A3XX_SP_FS_MRT_REG_3, 1);
+	*cmds++ = 0x00000000;
+	*cmds++ = cp_type0_packet(A3XX_SP_FS_IMAGE_OUTPUT_REG_0, 1);
+	*cmds++ = 0x00000000;
+	*cmds++ = cp_type0_packet(A3XX_SP_FS_IMAGE_OUTPUT_REG_1, 1);
+	*cmds++ = 0x00000000;
+	*cmds++ = cp_type0_packet(A3XX_SP_FS_IMAGE_OUTPUT_REG_2, 1);
+	*cmds++ = 0x00000000;
+	*cmds++ = cp_type0_packet(A3XX_SP_FS_IMAGE_OUTPUT_REG_3, 1);
+	*cmds++ = 0x00000000;
+	*cmds++ = cp_type0_packet(A3XX_SP_FS_LENGTH_REG, 1);
+	*cmds++ = count >> 3;
+	*cmds++ = cp_type0_packet(A3XX_RB_MODE_CONTROL, 1);
+	*cmds++ = 0x00008000;
+	*cmds++ = cp_type0_packet(A3XX_RB_RENDER_CONTROL, 1);
+	*cmds++ = 0x00000000;
+	*cmds++ = cp_type0_packet(A3XX_RB_MSAA_CONTROL, 1);
+	*cmds++ = 0x00000000;
+	*cmds++ = cp_type0_packet(A3XX_RB_ALPHA_REFERENCE, 1);
+	*cmds++ = 0x00000000;
+	*cmds++ = cp_type0_packet(A3XX_RB_MRT_CONTROL0, 1);
+	*cmds++ = 0x00000000;
+	*cmds++ = cp_type0_packet(A3XX_RB_MRT_CONTROL1, 1);
+	*cmds++ = 0x00000000;
+	*cmds++ = cp_type0_packet(A3XX_RB_MRT_CONTROL2, 1);
+	*cmds++ = 0x00000000;
+	*cmds++ = cp_type0_packet(A3XX_RB_MRT_CONTROL3, 1);
+	*cmds++ = 0x00000000;
+	*cmds++ = cp_type0_packet(A3XX_RB_MRT_BUF_INFO0, 1);
+	*cmds++ = 0x00000000;
+	*cmds++ = cp_type0_packet(A3XX_RB_MRT_BUF_INFO1, 1);
+	*cmds++ = 0x00000000;
+	*cmds++ = cp_type0_packet(A3XX_RB_MRT_BUF_INFO2, 1);
+	*cmds++ = 0x00000000;
+	*cmds++ = cp_type0_packet(A3XX_RB_MRT_BUF_INFO3, 1);
+	*cmds++ = 0x00000000;
+	*cmds++ = cp_type0_packet(A3XX_RB_MRT_BUF_BASE0, 1);
+	*cmds++ = 0x00000000;
+	*cmds++ = cp_type0_packet(A3XX_RB_MRT_BUF_BASE1, 1);
+	*cmds++ = 0x00000000;
+	*cmds++ = cp_type0_packet(A3XX_RB_MRT_BUF_BASE2, 1);
+	*cmds++ = 0x00000000;
+	*cmds++ = cp_type0_packet(A3XX_RB_MRT_BUF_BASE3, 1);
+	*cmds++ = 0x00000000;
+
+	*cmds++ = cp_type0_packet(A3XX_RB_PERFCOUNTER0_SELECT, 1);
+	*cmds++ = 0x00000000;
+	*cmds++ = cp_type0_packet(A3XX_RB_PERFCOUNTER1_SELECT, 1);
+	*cmds++ = 0x00000000;
+	*cmds++ = cp_type0_packet(A3XX_RB_FRAME_BUFFER_DIMENSION, 1);
+	*cmds++ = 0x00000000;
+	*cmds++ = cp_type3_packet(CP_WAIT_FOR_IDLE, 1);
+	*cmds++ = 0x00000000;
+	/* Load the fixup shader instructions inline with CP_LOAD_STATE */
+	*cmds++ = cp_type3_packet(CP_LOAD_STATE, 2 + count);
+	*cmds++ = (6 << CP_LOADSTATE_STATEBLOCKID_SHIFT) |
+		  ((count >> 3) << CP_LOADSTATE_NUMOFUNITS_SHIFT);
+	*cmds++ = 0x00000000;
+	memcpy(cmds, _a3xx_pwron_fixup_fs_instructions, count << 2);
+	cmds += count;
+	/* Execute the CL kernel and wait for the GPU to go idle */
+	*cmds++ = cp_type3_packet(CP_EXEC_CL, 1);
+	*cmds++ = 0x00000000;
+	*cmds++ = cp_nop_packet(1);
+	*cmds++ = 0x00000000;
+	*cmds++ = cp_type3_packet(CP_WAIT_FOR_IDLE, 1);
+	*cmds++ = 0x00000000;
+	*cmds++ = cp_type0_packet(A3XX_HLSQ_CL_CONTROL_0_REG, 1);
+	*cmds++ = 0x00000000;
+	*cmds++ = cp_type3_packet(CP_WAIT_FOR_IDLE, 1);
+	*cmds++ = 0x00000000;
+	*cmds++ = cp_type0_packet(A3XX_HLSQ_CONTROL_0_REG, 1);
+	*cmds++ = 0x1E000150;
+	*cmds++ = cp_type3_packet(CP_WAIT_FOR_IDLE, 1);
+	*cmds++ = 0x00000000;
+	*cmds++ = cp_type3_packet(CP_SET_CONSTANT, 2);
+	*cmds++ = CP_REG(A3XX_HLSQ_CONTROL_0_REG);
+	*cmds++ = 0x1E000050;
+	*cmds++ = cp_type3_packet(CP_WAIT_FOR_IDLE, 1);
+	*cmds++ = 0x00000000;
+	*cmds++ = cp_type3_packet(CP_REG_RMW, 3);
+	*cmds++ = A3XX_RBBM_CLOCK_CTL;
+	*cmds++ = 0xFFFCFFFF;
+	*cmds++ = 0x00000000;
+	*cmds++ = cp_type3_packet(CP_WAIT_FOR_IDLE, 1);
+	*cmds++ = 0x00000000;
+
+	/*
+	 * Remember the number of dwords in the command buffer for when we
+	 * program the indirect buffer call in the ringbuffer
+	 */
+	adreno_dev->pwron_fixup_dwords =
+		(cmds - (unsigned int *)adreno_dev->pwron_fixup.hostptr);
+
+	/* Mark the flag in ->priv to show that we have the fix */
+	set_bit(ADRENO_DEVICE_PWRON_FIXUP, &adreno_dev->priv);
+	return 0;
+}
+
 #define QUAD_RESTORE_LEN 14
 
 static unsigned int gmem_restore_quad[QUAD_RESTORE_LEN] = {
diff --git a/drivers/gpu/msm/adreno_pm4types.h b/drivers/gpu/msm/adreno_pm4types.h
index a3fa312..e6ec91d 100644
--- a/drivers/gpu/msm/adreno_pm4types.h
+++ b/drivers/gpu/msm/adreno_pm4types.h
@@ -1,4 +1,4 @@
-/* Copyright (c) 2002,2007-2012, The Linux Foundation. All rights reserved.
+/* Copyright (c) 2002,2007-2013, The Linux Foundation. All rights reserved.
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License version 2 and
@@ -177,6 +177,8 @@
 /* Load a buffer with pre-fetch enabled */
 #define CP_INDIRECT_BUFFER_PFE 0x3F
 
+#define CP_EXEC_CL 0x31
+
 #define CP_LOADSTATE_DSTOFFSET_SHIFT 0x00000000
 #define CP_LOADSTATE_STATESRC_SHIFT 0x00000010
 #define CP_LOADSTATE_STATEBLOCKID_SHIFT 0x00000013
diff --git a/drivers/gpu/msm/adreno_ringbuffer.c b/drivers/gpu/msm/adreno_ringbuffer.c
index bc7a5c2..c4d12ec 100644
--- a/drivers/gpu/msm/adreno_ringbuffer.c
+++ b/drivers/gpu/msm/adreno_ringbuffer.c
@@ -584,6 +584,10 @@
 	if (flags & KGSL_CMD_FLAGS_EOF)
 		total_sizedwords += 2;
 
+	/* Add space for the power on shader fixup if we need it */
+	if (flags & KGSL_CMD_FLAGS_PWRON_FIXUP)
+		total_sizedwords += 5;
+
 	ringcmds = adreno_ringbuffer_allocspace(rb, context, total_sizedwords);
 	if (!ringcmds)
 		return -ENOSPC;
@@ -591,6 +595,18 @@
 	rcmd_gpu = rb->buffer_desc.gpuaddr
 		+ sizeof(uint)*(rb->wptr-total_sizedwords);
 
+	if (flags & KGSL_CMD_FLAGS_PWRON_FIXUP) {
+		GSL_RB_WRITE(ringcmds, rcmd_gpu, cp_nop_packet(1));
+		GSL_RB_WRITE(ringcmds, rcmd_gpu,
+				KGSL_PWRON_FIXUP_IDENTIFIER);
+		GSL_RB_WRITE(ringcmds, rcmd_gpu,
+			CP_HDR_INDIRECT_BUFFER_PFD);
+		GSL_RB_WRITE(ringcmds, rcmd_gpu,
+			adreno_dev->pwron_fixup.gpuaddr);
+		GSL_RB_WRITE(ringcmds, rcmd_gpu,
+			adreno_dev->pwron_fixup_dwords);
+	}
+
 	GSL_RB_WRITE(ringcmds, rcmd_gpu, cp_nop_packet(1));
 	GSL_RB_WRITE(ringcmds, rcmd_gpu, KGSL_CMD_IDENTIFIER);
 
@@ -1069,9 +1085,20 @@
 	} else
 		drawctxt->timestamp++;
 
+	/*
+	 * Some targets need a dummy shader run after a power collapse.  Only
+	 * the driver may request it, so drop any caller-supplied bit first.
+	 */
+	flags &= ~KGSL_CMD_FLAGS_PWRON_FIXUP;
+
+	if (test_and_clear_bit(ADRENO_DEVICE_PWRON, &adreno_dev->priv) &&
+	    test_bit(ADRENO_DEVICE_PWRON_FIXUP, &adreno_dev->priv))
+		flags |= KGSL_CMD_FLAGS_PWRON_FIXUP;
+
 	ret = adreno_ringbuffer_addcmds(&adreno_dev->ringbuffer,
 					drawctxt,
-					(flags & KGSL_CMD_FLAGS_EOF),
+					flags & (KGSL_CMD_FLAGS_EOF |
+						 KGSL_CMD_FLAGS_PWRON_FIXUP),
 					&link[0], (cmds - link));
 	if (ret)
 		goto done;