Copied CAF 2.5.1 video/gpu genlock and rotator changes [WIP]
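
Bring over the Code Aurora Forum (CAF) 2.5.1 video/gpu, genlock and
rotator changes. In drivers/gpu/msm this means:

 - copyright headers move from Code Aurora Forum to The Linux Foundation
 - the local Kconfig options (MSM_KGSL_PAGE_TABLE_SIZE_FOR_IOMMU,
   MSM_KGSL_GPU_USAGE, MSM_KGSL_GPU_USAGE_SYSTRACE,
   MSM_KGSL_DEFAULT_GPUMMU, MSM_KGSL_KILL_HANG_PROCESS) and the local
   blocking-process table in adreno.c are dropped
 - kgsl_events.o, kgsl_sync.o (under CONFIG_SYNC) and z180_postmortem.o
   are added to the build
 - adreno gains device-tree probing, DCVS/bus-scale data parsing,
   OCMEM-backed GMEM and an A330 entry in the GPU list
 - the GPU hang recovery path is reworked into the fault tolerance
   framework (ft_* naming, KGSL_FT_REPLAY/SKIPIB/SKIPFRAME policies)

For reference, here is a rough sketch of the device-tree node that the
new adreno_of_get_pdata()/adreno_of_get_pwrlevels() path consumes. All
node names, the unit address and the numeric values below are
placeholders, not taken from any real board file, and a complete node
also needs the qcom,dcvs-core-info subnode, the bus-scale tables and an
iommu phandle that the rest of the parser reads:

	qcom,kgsl-3d0@fdb00000 {
		compatible = "qcom,kgsl-3d0";
		label = "kgsl-3d0";
		qcom,id = <0>;
		/* packed ADRENO_CHIPID value, here core 3, major 2 */
		qcom,chipid = <0x03020000>;
		qcom,initial-pwrlevel = <1>;
		qcom,idle-timeout = <83>;	/* 83 is also the fallback */
		qcom,nap-allowed = <1>;
		qcom,clk-map = <0x00000016>;	/* placeholder clock bitmap */

		qcom,gpu-pwrlevels {
			compatible = "qcom,gpu-pwrlevels";
			#address-cells = <1>;
			#size-cells = <0>;

			qcom,gpu-pwrlevel@0 {
				reg = <0>;
				qcom,gpu-freq = <450000000>;
				qcom,bus-freq = <3>;
				qcom,io-fraction = <0>;
			};

			qcom,gpu-pwrlevel@1 {
				reg = <1>;
				qcom,gpu-freq = <320000000>;
				qcom,bus-freq = <2>;
				qcom,io-fraction = <33>;
			};
		};
	};
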
diff --git a/drivers/gpu/msm/Kconfig b/drivers/gpu/msm/Kconfig
index 0b293a7..ba63fbc 100644
--- a/drivers/gpu/msm/Kconfig
+++ b/drivers/gpu/msm/Kconfig
@@ -78,13 +78,6 @@
 	  Sets the pagetable size used by the MMU.  The max value
 	  is 0xFFF0000 or (256M - 64K).
 
-config MSM_KGSL_PAGE_TABLE_SIZE_FOR_IOMMU
-	hex "Size of pagetables for iommu"
-	default 0x1FE00000
-	---help---
-	Sets the pagetable size used by the IOMMU.  The max value
-	is 0x1FE00000 or (512M - 1536K - little interval).
-
 config MSM_KGSL_PAGE_TABLE_COUNT
 	int "Minimum of concurrent pagetables to support"
 	default 8
@@ -103,21 +96,3 @@
 	bool "Disable register shadow writes for context switches"
 	default n
 	depends on MSM_KGSL
-
-config MSM_KGSL_GPU_USAGE
-	bool "Enable sysfs node of GPU usage per process"
-	default n
-
-config MSM_KGSL_DEFAULT_GPUMMU
-	bool "Prefer gpummu than iommu"
-	default n
-
-config MSM_KGSL_GPU_USAGE_SYSTRACE
-        bool "Enable kgsl_usage node for ftrace gpu usage event"
-        default y
-
-config MSM_KGSL_KILL_HANG_PROCESS
-	bool "Enable killing recoverable gpu hang process routine"
-	default y
-	---help---
-	We only enable this config in CRC branch.
diff --git a/drivers/gpu/msm/Makefile b/drivers/gpu/msm/Makefile
index 6cdb5f1..fec5363 100644
--- a/drivers/gpu/msm/Makefile
+++ b/drivers/gpu/msm/Makefile
@@ -9,7 +9,8 @@
 	kgsl_mmu.o \
 	kgsl_gpummu.o \
 	kgsl_iommu.o \
-	kgsl_snapshot.o
+	kgsl_snapshot.o \
+	kgsl_events.o
 
 msm_kgsl_core-$(CONFIG_DEBUG_FS) += kgsl_debugfs.o
 msm_kgsl_core-$(CONFIG_MSM_KGSL_CFF_DUMP) += kgsl_cffdump.o
@@ -17,6 +18,7 @@
 msm_kgsl_core-$(CONFIG_MSM_SCM) += kgsl_pwrscale_trustzone.o
 msm_kgsl_core-$(CONFIG_MSM_SLEEP_STATS_DEVICE) += kgsl_pwrscale_idlestats.o
 msm_kgsl_core-$(CONFIG_MSM_DCVS) += kgsl_pwrscale_msm.o
+msm_kgsl_core-$(CONFIG_SYNC) += kgsl_sync.o
 
 msm_adreno-y += \
 	adreno_ringbuffer.o \
@@ -35,6 +37,7 @@
 
 msm_z180-y += \
 	z180.o \
+	z180_postmortem.o \
 	z180_trace.o
 
 msm_kgsl_core-objs = $(msm_kgsl_core-y)
diff --git a/drivers/gpu/msm/a2xx_reg.h b/drivers/gpu/msm/a2xx_reg.h
index bde8784..c70c4eb 100644
--- a/drivers/gpu/msm/a2xx_reg.h
+++ b/drivers/gpu/msm/a2xx_reg.h
@@ -1,4 +1,4 @@
-/* Copyright (c) 2002,2007-2012, Code Aurora Forum. All rights reserved.
+/* Copyright (c) 2002,2007-2012, The Linux Foundation. All rights reserved.
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License version 2 and
@@ -386,6 +386,7 @@
 #define REG_COHER_STATUS_PM4             0xA2B
 #define REG_COHER_SIZE_PM4               0xA29
 
+/* registers added in adreno220 */
 #define REG_A220_PC_INDX_OFFSET          REG_VGT_INDX_OFFSET
 #define REG_A220_PC_VERTEX_REUSE_BLOCK_CNTL REG_VGT_VERTEX_REUSE_BLOCK_CNTL
 #define REG_A220_PC_MAX_VTX_INDX         REG_VGT_MAX_VTX_INDX
@@ -394,12 +395,14 @@
 #define REG_A220_VSC_BIN_SIZE            0x0C01
 #define REG_A220_VSC_PIPE_DATA_LENGTH_7  0x0C1D
 
+/* registers added in adreno225 */
 #define REG_A225_RB_COLOR_INFO3          0x2005
 #define REG_A225_PC_MULTI_PRIM_IB_RESET_INDX 0x2103
 #define REG_A225_GRAS_UCP0X              0x2340
 #define REG_A225_GRAS_UCP5W              0x2357
 #define REG_A225_GRAS_UCP_ENABLED        0x2360
 
+/* Debug registers used by snapshot */
 #define REG_PA_SU_DEBUG_CNTL            0x0C80
 #define REG_PA_SU_DEBUG_DATA            0x0C81
 #define REG_RB_DEBUG_CNTL               0x0F26
@@ -432,4 +435,4 @@
 #define REG_SQ_DEBUG_MISC_0             0x2309
 #define REG_SQ_DEBUG_MISC_1             0x230A
 
-#endif 
+#endif /* __A200_REG_H */
diff --git a/drivers/gpu/msm/a3xx_reg.h b/drivers/gpu/msm/a3xx_reg.h
index 77bd1d0..be9f3ac 100644
--- a/drivers/gpu/msm/a3xx_reg.h
+++ b/drivers/gpu/msm/a3xx_reg.h
@@ -1,4 +1,4 @@
-/* Copyright (c) 2012, Code Aurora Forum. All rights reserved.
+/* Copyright (c) 2012-2013, The Linux Foundation. All rights reserved.
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License version 2 and
@@ -14,6 +14,7 @@
 #ifndef _A300_REG_H
 #define _A300_REG_H
 
+/* Interrupt bit positions within RBBM_INT_0 */
 
 #define A3XX_INT_RBBM_GPU_IDLE 0
 #define A3XX_INT_RBBM_AHB_ERROR 1
@@ -40,6 +41,7 @@
 #define A3XX_INT_MISC_HANG_DETECT 24
 #define A3XX_INT_UCHE_OOB_ACCESS 25
 
+/* Register definitions */
 
 #define A3XX_RBBM_HW_VERSION 0x000
 #define A3XX_RBBM_HW_RELEASE 0x001
@@ -52,6 +54,7 @@
 #define A3XX_RBBM_AHB_CMD 0x022
 #define A3XX_RBBM_AHB_ERROR_STATUS 0x027
 #define A3XX_RBBM_GPR0_CTL 0x02E
+/* This is the same register as on A2XX, just in a different place */
 #define A3XX_RBBM_STATUS 0x030
 #define A3XX_RBBM_WAIT_IDLE_CLOCKS_CTL 0x33
 #define A3XX_RBBM_INTERFACE_HANG_INT_CTL 0x50
@@ -62,17 +65,29 @@
 #define A3XX_RBBM_INT_CLEAR_CMD 0x061
 #define A3XX_RBBM_INT_0_MASK 0x063
 #define A3XX_RBBM_INT_0_STATUS 0x064
+#define A3XX_RBBM_PERFCTR_CTL 0x80
 #define A3XX_RBBM_GPU_BUSY_MASKED 0x88
+#define A3XX_RBBM_PERFCTR_SP_5_LO 0xDC
+#define A3XX_RBBM_PERFCTR_SP_5_HI 0xDD
+#define A3XX_RBBM_PERFCTR_SP_6_LO 0xDE
+#define A3XX_RBBM_PERFCTR_SP_6_HI 0xDF
+#define A3XX_RBBM_PERFCTR_SP_7_LO 0xE0
+#define A3XX_RBBM_PERFCTR_SP_7_HI 0xE1
 #define A3XX_RBBM_RBBM_CTL 0x100
 #define A3XX_RBBM_RBBM_CTL 0x100
 #define A3XX_RBBM_PERFCTR_PWR_1_LO 0x0EC
 #define A3XX_RBBM_PERFCTR_PWR_1_HI 0x0ED
 #define A3XX_RBBM_DEBUG_BUS_CTL             0x111
 #define A3XX_RBBM_DEBUG_BUS_DATA_STATUS     0x112
+
+/* Following two are same as on A2XX, just in a different place */
 #define A3XX_CP_PFP_UCODE_ADDR 0x1C9
 #define A3XX_CP_PFP_UCODE_DATA 0x1CA
 #define A3XX_CP_ROQ_ADDR 0x1CC
 #define A3XX_CP_ROQ_DATA 0x1CD
+#define A3XX_CP_MERCIU_ADDR 0x1D1
+#define A3XX_CP_MERCIU_DATA 0x1D2
+#define A3XX_CP_MERCIU_DATA2 0x1D3
 #define A3XX_CP_MEQ_ADDR 0x1DA
 #define A3XX_CP_MEQ_DATA 0x1DB
 #define A3XX_CP_HW_FAULT  0x45C
@@ -147,10 +162,15 @@
 #define A3XX_GRAS_CL_USER_PLANE_Y5 0xCB5
 #define A3XX_GRAS_CL_USER_PLANE_Z5 0xCB6
 #define A3XX_GRAS_CL_USER_PLANE_W5 0xCB7
+#define A3XX_RB_GMEM_BASE_ADDR 0xCC0
 #define A3XX_VFD_PERFCOUNTER0_SELECT 0xE44
 #define A3XX_VPC_VPC_DEBUG_RAM_SEL 0xE61
 #define A3XX_VPC_VPC_DEBUG_RAM_READ 0xE62
+#define A3XX_UCHE_CACHE_MODE_CONTROL_REG 0xE82
 #define A3XX_UCHE_CACHE_INVALIDATE0_REG 0xEA0
+#define A3XX_SP_PERFCOUNTER5_SELECT 0xEC9
+#define A3XX_SP_PERFCOUNTER6_SELECT 0xECA
+#define A3XX_SP_PERFCOUNTER7_SELECT 0xECB
 #define A3XX_GRAS_CL_CLIP_CNTL 0x2040
 #define A3XX_GRAS_CL_GB_CLIP_ADJ 0x2044
 #define A3XX_GRAS_CL_VPORT_XOFFSET 0x2048
@@ -230,6 +250,7 @@
 #define A3XX_TPL1_TP_VS_TEX_OFFSET 0x2340
 #define A3XX_TPL1_TP_FS_TEX_OFFSET 0x2342
 #define A3XX_TPL1_TP_FS_BORDER_COLOR_BASE_ADDR 0x2343
+#define A3XX_VBIF_CLKON 0x3001
 #define A3XX_VBIF_FIXED_SORT_EN 0x300C
 #define A3XX_VBIF_FIXED_SORT_SEL0 0x300D
 #define A3XX_VBIF_FIXED_SORT_SEL1 0x300E
@@ -244,12 +265,16 @@
 #define A3XX_VBIF_OUT_WR_LIM_CONF0 0x3035
 #define A3XX_VBIF_DDR_OUT_MAX_BURST 0x3036
 #define A3XX_VBIF_ARB_CTL 0x303C
+#define A3XX_VBIF_ROUND_ROBIN_QOS_ARB 0x3049
+#define A3XX_VBIF_OUT_AXI_AMEMTYPE_CONF0 0x3058
 #define A3XX_VBIF_OUT_AXI_AOOO_EN 0x305E
 #define A3XX_VBIF_OUT_AXI_AOOO 0x305F
 
+/* Bit flags for RBBM_CTL */
 #define RBBM_RBBM_CTL_RESET_PWR_CTR1  (1 << 1)
 #define RBBM_RBBM_CTL_ENABLE_PWR_CTR1  (1 << 17)
 
+/* Various flags used by the context switch code */
 
 #define SP_MULTI 0
 #define SP_BUFFER_MODE 1
@@ -302,6 +327,11 @@
 #define UCHE_ENTIRE_CACHE 1
 #define UCHE_OP_INVALIDATE 1
 
+/*
+ * The following are bit field shifts within some of the registers defined
+ * above. These are used in the context switch code in conjunction with the
+ * _SET macro
+ */
 
 #define GRAS_CL_CLIP_CNTL_CLIP_DISABLE 16
 #define GRAS_CL_CLIP_CNTL_IJ_PERSP_CENTER 12
@@ -476,6 +506,7 @@
 #define VPC_VPCVARPSREPLMODE_COMPONENT16 28
 #define VPC_VPCVARPSREPLMODE_COMPONENT17 30
 
+/* RBBM Debug bus block IDs */
 #define RBBM_BLOCK_ID_NONE             0x0
 #define RBBM_BLOCK_ID_CP               0x1
 #define RBBM_BLOCK_ID_RBBM             0x2
@@ -505,6 +536,13 @@
 #define RBBM_BLOCK_ID_MARB_2           0x2a
 #define RBBM_BLOCK_ID_MARB_3           0x2b
 
+/* RBBM_CLOCK_CTL default value */
 #define A3XX_RBBM_CLOCK_CTL_DEFAULT 0xBFFFFFFF
 
+/* COUNTABLE FOR SP PERFCOUNTER */
+#define SP_FS_FULL_ALU_INSTRUCTIONS    0x0E
+#define SP_ALU_ACTIVE_CYCLES           0x1D
+#define SP0_ICL1_MISSES                0x1A
+#define SP_FS_CFLOW_INSTRUCTIONS       0x0C
+
 #endif
diff --git a/drivers/gpu/msm/adreno.c b/drivers/gpu/msm/adreno.c
index 3b3fba1..1886e04 100644
--- a/drivers/gpu/msm/adreno.c
+++ b/drivers/gpu/msm/adreno.c
@@ -1,4 +1,4 @@
-/* Copyright (c) 2002,2007-2012, Code Aurora Forum. All rights reserved.
+/* Copyright (c) 2002,2007-2013, The Linux Foundation. All rights reserved.
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License version 2 and
@@ -10,15 +10,20 @@
  * GNU General Public License for more details.
  *
  */
-#include <linux/delay.h>
 #include <linux/module.h>
 #include <linux/uaccess.h>
 #include <linux/vmalloc.h>
 #include <linux/ioctl.h>
 #include <linux/sched.h>
+#include <linux/of.h>
+#include <linux/of_device.h>
+#include <linux/msm_kgsl.h>
 
 #include <mach/socinfo.h>
-#include <mach/board.h>
+#include <mach/msm_bus_board.h>
+#include <mach/msm_bus.h>
+#include <mach/msm_dcvs.h>
+#include <mach/msm_dcvs_scm.h>
 
 #include "kgsl.h"
 #include "kgsl_pwrscale.h"
@@ -28,8 +33,6 @@
 
 #include "adreno.h"
 #include "adreno_pm4types.h"
-#include "adreno_debugfs.h"
-#include "adreno_postmortem.h"
 
 #include "a2xx_reg.h"
 #include "a3xx_reg.h"
@@ -37,6 +40,7 @@
 #define DRIVER_VERSION_MAJOR   3
 #define DRIVER_VERSION_MINOR   1
 
+/* Adreno MH arbiter config*/
 #define ADRENO_CFG_MHARB \
 	(0x10 \
 		| (0 << MH_ARBITER_CONFIG__SAME_PAGE_GRANULARITY__SHIFT) \
@@ -69,9 +73,6 @@
 	 | (MMU_CONFIG << MH_MMU_CONFIG__PA_W_CLNT_BEHAVIOR__SHIFT))
 
 static const struct kgsl_functable adreno_functable;
-static volatile int adreno_regwrite_footprint = 0;
-static volatile unsigned int *adreno_regwrite_reg;
-static volatile unsigned int adreno_regwrite_val;
 
 static struct adreno_device device_3d0 = {
 	.dev = {
@@ -80,7 +81,14 @@
 		.id = KGSL_DEVICE_3D0,
 		.mh = {
 			.mharb  = ADRENO_CFG_MHARB,
+			/* Remove 1k boundary check in z470 to avoid a GPU
+			 * hang.  Notice that this solution won't work if
+			 * both EBI and SMI are used
+			 */
 			.mh_intf_cfg1 = 0x00032f07,
+			/* turn off memory protection unit by setting
+			   acceptable physical address range to include
+			   all pages. */
 			.mpu_base = 0x00000000,
 			.mpu_range =  0xFFFFF000,
 		},
@@ -104,21 +112,43 @@
 	.gmem_size = SZ_256K,
 	.pfp_fw = NULL,
 	.pm4_fw = NULL,
-	.wait_timeout = 0, 
+	.wait_timeout = 0, /* in milliseconds, 0 means disabled */
 	.ib_check_level = 0,
 };
 
-unsigned int hang_detect_regs[] = {
+/* This set of registers is used for hang detection.
+ * If the values of these registers are the same after
+ * KGSL_TIMEOUT_PART time, a GPU hang is reported in the
+ * kernel log.
+ * *****ALERT******ALERT********ALERT*************
+ * Order of registers below is important, registers
+ * from LONG_IB_DETECT_REG_INDEX_START to
+ * LONG_IB_DETECT_REG_INDEX_END are used in long ib detection.
+ */
+#define LONG_IB_DETECT_REG_INDEX_START 1
+#define LONG_IB_DETECT_REG_INDEX_END 5
+
+unsigned int ft_detect_regs[] = {
 	A3XX_RBBM_STATUS,
-	REG_CP_RB_RPTR,
+	REG_CP_RB_RPTR,   /* LONG_IB_DETECT_REG_INDEX_START */
 	REG_CP_IB1_BASE,
 	REG_CP_IB1_BUFSZ,
 	REG_CP_IB2_BASE,
-	REG_CP_IB2_BUFSZ,
+	REG_CP_IB2_BUFSZ, /* LONG_IB_DETECT_REG_INDEX_END */
+	0,
+	0,
+	0,
+	0,
+	0,
+	0
 };
 
-const unsigned int hang_detect_regs_count = ARRAY_SIZE(hang_detect_regs);
+const unsigned int ft_detect_regs_count = ARRAY_SIZE(ft_detect_regs);
 
+/*
+ * This is the master list of all GPU cores that are supported by this
+ * driver.
+ */
 
 #define ANY_ID (~0)
 #define NO_VER (~0)
@@ -131,11 +161,15 @@
 	struct adreno_gpudev *gpudev;
 	unsigned int istore_size;
 	unsigned int pix_shader_start;
-	
+	/* Size of an instruction in dwords */
 	unsigned int instruction_size;
-	
+	/* size of gmem for gpu*/
 	unsigned int gmem_size;
+	/* version of pm4 microcode that supports sync_lock
+	   between CPU and GPU for SMMU-v1 programming */
 	unsigned int sync_lock_pm4_ver;
+	/* version of pfp microcode that supports sync_lock
+	   between CPU and GPU for SMMU-v1 programming */
 	unsigned int sync_lock_pfp_ver;
 } adreno_gpulist[] = {
 	{ ADRENO_REV_A200, 0, 2, ANY_ID, ANY_ID,
@@ -150,6 +184,10 @@
 	{ ADRENO_REV_A220, 2, 1, ANY_ID, ANY_ID,
 		"leia_pm4_470.fw", "leia_pfp_470.fw", &adreno_a2xx_gpudev,
 		512, 384, 3, SZ_512K, NO_VER, NO_VER },
+	/*
+	 * patchlevel 5 (8960v2) needs special pm4 firmware to work around
+	 * a hardware problem.
+	 */
 	{ ADRENO_REV_A225, 2, 2, 0, 5,
 		"a225p5_pm4.fw", "a225_pfp.fw", &adreno_a2xx_gpudev,
 		1536, 768, 3, SZ_512K, NO_VER, NO_VER },
@@ -159,26 +197,17 @@
 	{ ADRENO_REV_A225, 2, 2, ANY_ID, ANY_ID,
 		"a225_pm4.fw", "a225_pfp.fw", &adreno_a2xx_gpudev,
 		1536, 768, 3, SZ_512K, 0x225011, 0x225002 },
-	
+	/* A3XX doesn't use the pix_shader_start */
 	{ ADRENO_REV_A305, 3, 0, 5, ANY_ID,
 		"a300_pm4.fw", "a300_pfp.fw", &adreno_a3xx_gpudev,
 		512, 0, 2, SZ_256K, 0x3FF037, 0x3FF016 },
-	
-	{ ADRENO_REV_A320, 3, 2, 0, ANY_ID,
+	/* A3XX doesn't use the pix_shader_start */
+	{ ADRENO_REV_A320, 3, 2, ANY_ID, ANY_ID,
 		"a300_pm4.fw", "a300_pfp.fw", &adreno_a3xx_gpudev,
 		512, 0, 2, SZ_512K, 0x3FF037, 0x3FF016 },
-};
-
-struct kgsl_process_name {
-        char name[TASK_COMM_LEN+1];
-};
-
-static const struct kgsl_process_name kgsl_blocking_process_tbl[] = {
-        {"SurfaceFlinger"},
-        {"surfaceflinger"},
-        {"ndroid.systemui"},
-	{"droid.htcdialer"},
-	{"mediaserver"},
+	{ ADRENO_REV_A330, 3, 3, 0, 0,
+		"a330_pm4.fw", "a330_pfp.fw", &adreno_a3xx_gpudev,
+		512, 0, 2, SZ_1M, NO_VER, NO_VER },
 };
 
 static irqreturn_t adreno_irq_handler(struct kgsl_device *device)
@@ -197,7 +226,7 @@
 		}
 	}
 
-	
+	/* Reset the time-out in our idle timer */
 	mod_timer_pending(&device->idle_timer,
 		jiffies + device->pwrctrl.interval_timeout);
 	return result;
@@ -269,25 +298,26 @@
 	unsigned int *cmds = &link[0];
 	int sizedwords = 0;
 	struct adreno_device *adreno_dev = ADRENO_DEVICE(device);
-	struct kgsl_memdesc **reg_map_desc;
-	void *reg_map_array = NULL;
 	int num_iommu_units, i;
 	struct kgsl_context *context;
 	struct adreno_context *adreno_ctx = NULL;
 
 	if (!adreno_dev->drawctxt_active)
 		return kgsl_mmu_device_setstate(&device->mmu, flags);
-	num_iommu_units = kgsl_mmu_get_reg_map_desc(&device->mmu,
-							&reg_map_array);
+	num_iommu_units = kgsl_mmu_get_num_iommu_units(&device->mmu);
 
 	context = idr_find(&device->context_idr, context_id);
+	if (context == NULL)
+		return;
 	adreno_ctx = context->devctxt;
 
-	reg_map_desc = reg_map_array;
-
 	if (kgsl_mmu_enable_clk(&device->mmu,
 				KGSL_IOMMU_CONTEXT_USER))
-		goto done;
+		return;
+
+	cmds += __adreno_add_idle_indirect_cmds(cmds,
+		device->mmu.setstate_memory.gpuaddr +
+		KGSL_IOMMU_SETSTATE_NOP_OFFSET);
 
 	if (cpu_is_msm8960())
 		cmds += adreno_add_change_mh_phys_limit_cmds(cmds, 0xFFFFF000,
@@ -301,46 +331,53 @@
 
 	cmds += adreno_add_idle_cmds(adreno_dev, cmds);
 
-	
+	/* Acquire GPU-CPU sync Lock here */
 	cmds += kgsl_mmu_sync_lock(&device->mmu, cmds);
 
-	pt_val = kgsl_mmu_pt_get_base_addr(device->mmu.hwpagetable);
+	pt_val = kgsl_mmu_get_pt_base_addr(&device->mmu,
+					device->mmu.hwpagetable);
 	if (flags & KGSL_MMUFLAGS_PTUPDATE) {
+		/*
+		 * We need to perform the following operations for all
+		 * IOMMU units
+		 */
 		for (i = 0; i < num_iommu_units; i++) {
-			reg_pt_val = (pt_val &
-				(KGSL_IOMMU_TTBR0_PA_MASK <<
-				KGSL_IOMMU_TTBR0_PA_SHIFT)) +
-				kgsl_mmu_get_pt_lsb(&device->mmu, i,
-					KGSL_IOMMU_CONTEXT_USER);
+			reg_pt_val = (pt_val + kgsl_mmu_get_pt_lsb(&device->mmu,
+						i, KGSL_IOMMU_CONTEXT_USER));
+			/*
+			 * Set address of the new pagetable by writing to IOMMU
+			 * TTBR0 register
+			 */
 			*cmds++ = cp_type3_packet(CP_MEM_WRITE, 2);
-			*cmds++ = reg_map_desc[i]->gpuaddr +
-				(KGSL_IOMMU_CONTEXT_USER <<
-				KGSL_IOMMU_CTX_SHIFT) + KGSL_IOMMU_TTBR0;
+			*cmds++ = kgsl_mmu_get_reg_gpuaddr(&device->mmu, i,
+				KGSL_IOMMU_CONTEXT_USER, KGSL_IOMMU_CTX_TTBR0);
 			*cmds++ = reg_pt_val;
 			*cmds++ = cp_type3_packet(CP_WAIT_FOR_IDLE, 1);
 			*cmds++ = 0x00000000;
 
+			/*
+			 * Read back the ttbr0 register as a barrier to ensure
+			 * above writes have completed
+			 */
 			cmds += adreno_add_read_cmds(device, cmds,
-				reg_map_desc[i]->gpuaddr +
-				(KGSL_IOMMU_CONTEXT_USER <<
-				KGSL_IOMMU_CTX_SHIFT) + KGSL_IOMMU_TTBR0,
+				kgsl_mmu_get_reg_gpuaddr(&device->mmu, i,
+				KGSL_IOMMU_CONTEXT_USER, KGSL_IOMMU_CTX_TTBR0),
 				reg_pt_val,
 				device->mmu.setstate_memory.gpuaddr +
 				KGSL_IOMMU_SETSTATE_NOP_OFFSET);
 		}
 	}
 	if (flags & KGSL_MMUFLAGS_TLBFLUSH) {
+		/*
+		 * tlb flush
+		 */
 		for (i = 0; i < num_iommu_units; i++) {
-			reg_pt_val = (pt_val &
-				(KGSL_IOMMU_TTBR0_PA_MASK <<
-				KGSL_IOMMU_TTBR0_PA_SHIFT)) +
-				kgsl_mmu_get_pt_lsb(&device->mmu, i,
-					KGSL_IOMMU_CONTEXT_USER);
+			reg_pt_val = (pt_val + kgsl_mmu_get_pt_lsb(&device->mmu,
+						i, KGSL_IOMMU_CONTEXT_USER));
 
 			*cmds++ = cp_type3_packet(CP_MEM_WRITE, 2);
-			*cmds++ = (reg_map_desc[i]->gpuaddr +
-				(KGSL_IOMMU_CONTEXT_USER <<
-				KGSL_IOMMU_CTX_SHIFT) +
+			*cmds++ = kgsl_mmu_get_reg_gpuaddr(&device->mmu, i,
+				KGSL_IOMMU_CONTEXT_USER,
 				KGSL_IOMMU_CTX_TLBIALL);
 			*cmds++ = 1;
 
@@ -349,21 +386,22 @@
 			KGSL_IOMMU_SETSTATE_NOP_OFFSET);
 
 			cmds += adreno_add_read_cmds(device, cmds,
-				reg_map_desc[i]->gpuaddr +
-				(KGSL_IOMMU_CONTEXT_USER <<
-				KGSL_IOMMU_CTX_SHIFT) + KGSL_IOMMU_TTBR0,
+				kgsl_mmu_get_reg_gpuaddr(&device->mmu, i,
+					KGSL_IOMMU_CONTEXT_USER,
+					KGSL_IOMMU_CTX_TTBR0),
 				reg_pt_val,
 				device->mmu.setstate_memory.gpuaddr +
 				KGSL_IOMMU_SETSTATE_NOP_OFFSET);
 		}
 	}
 
-	
+	/* Release GPU-CPU sync Lock here */
 	cmds += kgsl_mmu_sync_unlock(&device->mmu, cmds);
 
 	if (cpu_is_msm8960())
 		cmds += adreno_add_change_mh_phys_limit_cmds(cmds,
-			reg_map_desc[num_iommu_units - 1]->gpuaddr - PAGE_SIZE,
+			kgsl_mmu_get_reg_gpuaddr(&device->mmu, 0,
+						0, KGSL_IOMMU_GLOBAL_BASE),
 			device->mmu.setstate_memory.gpuaddr +
 			KGSL_IOMMU_SETSTATE_NOP_OFFSET);
 	else
@@ -376,26 +414,23 @@
 
 	sizedwords += (cmds - &link[0]);
 	if (sizedwords) {
-		
+		/* invalidate all base pointers */
 		*cmds++ = cp_type3_packet(CP_INVALIDATE_STATE, 1);
 		*cmds++ = 0x7fff;
 		sizedwords += 2;
-		*cmds++ = cp_type3_packet(CP_INTERRUPT, 1);
-		*cmds++ = CP_INT_CNTL__RB_INT_MASK;
-		sizedwords += 2;
+		/* This returns the per context timestamp but we need to
+		 * use the global timestamp for iommu clock disablement */
 		adreno_ringbuffer_issuecmds(device, adreno_ctx,
 			KGSL_CMD_FLAGS_PMODE,
 			&link[0], sizedwords);
 		kgsl_mmu_disable_clk_on_ts(&device->mmu,
 		adreno_dev->ringbuffer.timestamp[KGSL_MEMSTORE_GLOBAL], true);
 	}
+
 	if (sizedwords > (sizeof(link)/sizeof(unsigned int))) {
 		KGSL_DRV_ERR(device, "Temp command buffer overflow\n");
 		BUG();
 	}
-done:
-	if (num_iommu_units)
-		kfree(reg_map_array);
 }
 
 static void adreno_gpummu_setstate(struct kgsl_device *device,
@@ -406,24 +441,36 @@
 	unsigned int link[32];
 	unsigned int *cmds = &link[0];
 	int sizedwords = 0;
-	unsigned int mh_mmu_invalidate = 0x00000003; 
+	unsigned int mh_mmu_invalidate = 0x00000003; /*invalidate all and tc */
 	struct kgsl_context *context;
 	struct adreno_context *adreno_ctx = NULL;
 
+	/*
+	 * Fix target freeze issue by adding TLB flush for each submit
+	 * on A20X based targets.
+	 */
 	if (adreno_is_a20x(adreno_dev))
 		flags |= KGSL_MMUFLAGS_TLBFLUSH;
+	/*
+	 * If possible, then set the state via the command stream to avoid
+	 * a CPU idle.  Otherwise, use the default setstate which uses register
+	 * writes. For CFF dump we must idle and use the registers so that it is
+	 * easier to filter out the mmu accesses from the dump
+	 */
 	if (!kgsl_cff_dump_enable && adreno_dev->drawctxt_active) {
 		context = idr_find(&device->context_idr, context_id);
+		if (context == NULL)
+			return;
 		adreno_ctx = context->devctxt;
 
 		if (flags & KGSL_MMUFLAGS_PTUPDATE) {
-			
+			/* wait for graphics pipe to be idle */
 			*cmds++ = cp_type3_packet(CP_WAIT_FOR_IDLE, 1);
 			*cmds++ = 0x00000000;
 
-			
+			/* set page table base */
 			*cmds++ = cp_type0_packet(MH_MMU_PT_BASE, 1);
-			*cmds++ = kgsl_mmu_pt_get_base_addr(
+			*cmds++ = kgsl_mmu_get_pt_base_addr(&device->mmu,
 					device->mmu.hwpagetable);
 			sizedwords += 4;
 		}
@@ -442,28 +489,40 @@
 
 		if (flags & KGSL_MMUFLAGS_PTUPDATE &&
 			adreno_is_a20x(adreno_dev)) {
+			/* HW workaround: to resolve MMU page fault interrupts
+			* caused by the VGT. It prevents the CP PFP from filling
+			* the VGT DMA request fifo too early, thereby ensuring
+			* that the VGT will not fetch vertex/bin data until
+			* after the page table base register has been updated.
+			*
+			* Two null DRAW_INDX_BIN packets are inserted right
+			* after the page table base update, followed by a
+			* wait for idle. The null packets will fill up the
+			* VGT DMA request fifo and prevent any further
+			* vertex/bin updates from occurring until the wait
+			* has finished. */
 			*cmds++ = cp_type3_packet(CP_SET_CONSTANT, 2);
 			*cmds++ = (0x4 << 16) |
 				(REG_PA_SU_SC_MODE_CNTL - 0x2000);
-			*cmds++ = 0;	  
+			*cmds++ = 0;	  /* disable faceness generation */
 			*cmds++ = cp_type3_packet(CP_SET_BIN_BASE_OFFSET, 1);
 			*cmds++ = device->mmu.setstate_memory.gpuaddr;
 			*cmds++ = cp_type3_packet(CP_DRAW_INDX_BIN, 6);
-			*cmds++ = 0;	  
-			*cmds++ = 0x0003C004; 
-			*cmds++ = 0;	  
-			*cmds++ = 3;	  
+			*cmds++ = 0;	  /* viz query info */
+			*cmds++ = 0x0003C004; /* draw indicator */
+			*cmds++ = 0;	  /* bin base */
+			*cmds++ = 3;	  /* bin size */
 			*cmds++ =
-			device->mmu.setstate_memory.gpuaddr; 
-			*cmds++ = 6;	  
+			device->mmu.setstate_memory.gpuaddr; /* dma base */
+			*cmds++ = 6;	  /* dma size */
 			*cmds++ = cp_type3_packet(CP_DRAW_INDX_BIN, 6);
-			*cmds++ = 0;	  
-			*cmds++ = 0x0003C004; 
-			*cmds++ = 0;	  
-			*cmds++ = 3;	  
-			
+			*cmds++ = 0;	  /* viz query info */
+			*cmds++ = 0x0003C004; /* draw indicator */
+			*cmds++ = 0;	  /* bin base */
+			*cmds++ = 3;	  /* bin size */
+			/* dma base */
 			*cmds++ = device->mmu.setstate_memory.gpuaddr;
-			*cmds++ = 6;	  
+			*cmds++ = 6;	  /* dma size */
 			*cmds++ = cp_type3_packet(CP_WAIT_FOR_IDLE, 1);
 			*cmds++ = 0x00000000;
 			sizedwords += 21;
@@ -472,7 +531,7 @@
 
 		if (flags & (KGSL_MMUFLAGS_PTUPDATE | KGSL_MMUFLAGS_TLBFLUSH)) {
 			*cmds++ = cp_type3_packet(CP_INVALIDATE_STATE, 1);
-			*cmds++ = 0x7fff; 
+			*cmds++ = 0x7fff; /* invalidate all base pointers */
 			sizedwords += 2;
 		}
 
@@ -488,7 +547,7 @@
 			unsigned int context_id,
 			uint32_t flags)
 {
-	
+	/* call the mmu specific handler */
 	if (KGSL_MMU_TYPE_GPU == kgsl_mmu_get_mmutype())
 		return adreno_gpummu_setstate(device, context_id, flags);
 	else if (KGSL_MMU_TYPE_IOMMU == kgsl_mmu_get_mmutype())
@@ -498,42 +557,16 @@
 static unsigned int
 a3xx_getchipid(struct kgsl_device *device)
 {
-	unsigned int majorid = 0, minorid = 0, patchid = 0;
+	struct kgsl_device_platform_data *pdata =
+		kgsl_device_get_drvdata(device);
 
+	/*
+	 * All current A3XX chipids are detected at the SOC level. Leave this
+	 * function here to support any future GPUs that have working
+	 * chip ID registers
+	 */
 
-	unsigned int version = socinfo_get_version();
-
-	if (cpu_is_apq8064() || cpu_is_apq8064ab()) {
-
-		
-		majorid = 2;
-		minorid = 0;
-
-
-		if (SOCINFO_VERSION_MAJOR(version) == 2) {
-			patchid = 2;
-		} else {
-			if ((SOCINFO_VERSION_MAJOR(version) == 1) &&
-				(SOCINFO_VERSION_MINOR(version) == 1))
-					patchid = 1;
-			else
-					patchid = 0;
-		}
-	} else if (cpu_is_msm8930() || cpu_is_msm8930aa() || cpu_is_msm8627()) {
-
-		
-		majorid = 0;
-		minorid = 5;
-
-
-		if ((SOCINFO_VERSION_MAJOR(version) == 1) &&
-			(SOCINFO_VERSION_MINOR(version) == 2))
-			patchid = 2;
-		else
-			patchid = 0;
-	}
-
-	return (0x03 << 24) | (majorid << 16) | (minorid << 8) | patchid;
+	return pdata->chipid;
 }
 
 static unsigned int
@@ -541,13 +574,23 @@
 {
 	unsigned int chipid = 0;
 	unsigned int coreid, majorid, minorid, patchid, revid;
-	uint32_t soc_platform_version = socinfo_get_version();
+	struct kgsl_device_platform_data *pdata =
+		kgsl_device_get_drvdata(device);
+
+	/* If the chip id is set at the platform level, then just use that */
+
+	if (pdata->chipid != 0)
+		return pdata->chipid;
 
 	adreno_regread(device, REG_RBBM_PERIPHID1, &coreid);
 	adreno_regread(device, REG_RBBM_PERIPHID2, &majorid);
 	adreno_regread(device, REG_RBBM_PATCH_RELEASE, &revid);
 
-	if (cpu_is_msm8960() || cpu_is_msm8x60())
+	/*
+	* adreno 22x gpus are indicated by coreid 2,
+	* but REG_RBBM_PERIPHID1 always contains 0 for this field
+	*/
+	if (cpu_is_msm8x60())
 		chipid = 2 << 24;
 	else
 		chipid = (coreid & 0xF) << 24;
@@ -558,14 +601,10 @@
 
 	patchid = ((revid >> 16) & 0xFF);
 
-	
-	
-	
+	/* 8x50 returns 0 for patch release, but it should be 1 */
+	/* 8x25 returns 0 for minor id, but it should be 1 */
 	if (cpu_is_qsd8x50())
 		patchid = 1;
-	else if (cpu_is_msm8960() &&
-			SOCINFO_VERSION_MAJOR(soc_platform_version) == 3)
-		patchid = 6;
 	else if (cpu_is_msm8625() && minorid == 0)
 		minorid = 1;
 
@@ -577,11 +616,18 @@
 static unsigned int
 adreno_getchipid(struct kgsl_device *device)
 {
-	if (cpu_is_apq8064() || cpu_is_apq8064ab() || cpu_is_msm8930() ||
-		cpu_is_msm8930aa() || cpu_is_msm8627())
-		return a3xx_getchipid(device);
-	else
+	struct kgsl_device_platform_data *pdata =
+		kgsl_device_get_drvdata(device);
+
+	/*
+	 * All A3XX chipsets will have pdata set, so assume !pdata->chipid is
+	 * an A2XX processor
+	 */
+
+	if (pdata->chipid == 0 || ADRENO_CHIPID_MAJOR(pdata->chipid) == 2)
 		return a2xx_getchipid(device);
+	else
+		return a3xx_getchipid(device);
 }
 
 static inline bool _rev_match(unsigned int id, unsigned int entry)
@@ -596,10 +642,10 @@
 
 	adreno_dev->chip_id = adreno_getchipid(&adreno_dev->dev);
 
-	core = (adreno_dev->chip_id >> 24) & 0xff;
-	major = (adreno_dev->chip_id >> 16) & 0xff;
-	minor = (adreno_dev->chip_id >> 8) & 0xff;
-	patchid = (adreno_dev->chip_id & 0xff);
+	core = ADRENO_CHIPID_CORE(adreno_dev->chip_id);
+	major = ADRENO_CHIPID_MAJOR(adreno_dev->chip_id);
+	minor = ADRENO_CHIPID_MINOR(adreno_dev->chip_id);
+	patchid = ADRENO_CHIPID_PATCH(adreno_dev->chip_id);
 
 	for (i = 0; i < ARRAY_SIZE(adreno_gpulist); i++) {
 		if (core == adreno_gpulist[i].core &&
@@ -626,12 +672,490 @@
 
 }
 
+static struct platform_device_id adreno_id_table[] = {
+	{ DEVICE_3D0_NAME, (kernel_ulong_t)&device_3d0.dev, },
+	{},
+};
+
+MODULE_DEVICE_TABLE(platform, adreno_id_table);
+
+static struct of_device_id adreno_match_table[] = {
+	{ .compatible = "qcom,kgsl-3d0", },
+	{}
+};
+
+static inline int adreno_of_read_property(struct device_node *node,
+	const char *prop, unsigned int *ptr)
+{
+	int ret = of_property_read_u32(node, prop, ptr);
+	if (ret)
+		KGSL_CORE_ERR("Unable to read '%s'\n", prop);
+	return ret;
+}
+
+static struct device_node *adreno_of_find_subnode(struct device_node *parent,
+	const char *name)
+{
+	struct device_node *child;
+
+	for_each_child_of_node(parent, child) {
+		if (of_device_is_compatible(child, name))
+			return child;
+	}
+
+	return NULL;
+}
+
+static int adreno_of_get_pwrlevels(struct device_node *parent,
+	struct kgsl_device_platform_data *pdata)
+{
+	struct device_node *node, *child;
+	int ret = -EINVAL;
+
+	node = adreno_of_find_subnode(parent, "qcom,gpu-pwrlevels");
+
+	if (node == NULL) {
+		KGSL_CORE_ERR("Unable to find 'qcom,gpu-pwrlevels'\n");
+		return -EINVAL;
+	}
+
+	pdata->num_levels = 0;
+
+	for_each_child_of_node(node, child) {
+		unsigned int index;
+		struct kgsl_pwrlevel *level;
+
+		if (adreno_of_read_property(child, "reg", &index))
+			goto done;
+
+		if (index >= KGSL_MAX_PWRLEVELS) {
+			KGSL_CORE_ERR("Pwrlevel index %d is out of range\n",
+				index);
+			continue;
+		}
+
+		if (index >= pdata->num_levels)
+			pdata->num_levels = index + 1;
+
+		level = &pdata->pwrlevel[index];
+
+		if (adreno_of_read_property(child, "qcom,gpu-freq",
+			&level->gpu_freq))
+			goto done;
+
+		if (adreno_of_read_property(child, "qcom,bus-freq",
+			&level->bus_freq))
+			goto done;
+
+		if (adreno_of_read_property(child, "qcom,io-fraction",
+			&level->io_fraction))
+			level->io_fraction = 0;
+	}
+
+	if (adreno_of_read_property(parent, "qcom,initial-pwrlevel",
+		&pdata->init_level))
+		pdata->init_level = 1;
+
+	if (pdata->init_level < 0 || pdata->init_level > pdata->num_levels) {
+		KGSL_CORE_ERR("Initial power level out of range\n");
+		pdata->init_level = 1;
+	}
+
+	ret = 0;
+done:
+	return ret;
+
+}
+
+static struct msm_dcvs_core_info *adreno_of_get_dcvs(struct device_node *parent)
+{
+	struct device_node *node, *child;
+	struct msm_dcvs_core_info *info = NULL;
+	int count = 0;
+	int ret = -EINVAL;
+
+	node = adreno_of_find_subnode(parent, "qcom,dcvs-core-info");
+	if (node == NULL)
+		return ERR_PTR(-EINVAL);
+
+	info = kzalloc(sizeof(*info), GFP_KERNEL);
+
+	if (info == NULL) {
+		KGSL_CORE_ERR("kzalloc(%d) failed\n", sizeof(*info));
+		ret = -ENOMEM;
+		goto err;
+	}
+
+	for_each_child_of_node(node, child)
+		count++;
+
+	info->power_param.num_freq = count;
+
+	info->freq_tbl = kzalloc(info->power_param.num_freq *
+			sizeof(struct msm_dcvs_freq_entry),
+			GFP_KERNEL);
+
+	if (info->freq_tbl == NULL) {
+		KGSL_CORE_ERR("kzalloc(%d) failed\n",
+			info->power_param.num_freq *
+			sizeof(struct msm_dcvs_freq_entry));
+		ret = -ENOMEM;
+		goto err;
+	}
+
+	for_each_child_of_node(node, child) {
+		unsigned int index;
+
+		if (adreno_of_read_property(child, "reg", &index))
+			goto err;
+
+		if (index >= info->power_param.num_freq) {
+			KGSL_CORE_ERR("DCVS freq entry %d is out of range\n",
+				index);
+			continue;
+		}
+
+		if (adreno_of_read_property(child, "qcom,freq",
+			&info->freq_tbl[index].freq))
+			goto err;
+
+		if (adreno_of_read_property(child, "qcom,voltage",
+			&info->freq_tbl[index].voltage))
+			info->freq_tbl[index].voltage = 0;
+
+		if (adreno_of_read_property(child, "qcom,is_trans_level",
+			&info->freq_tbl[index].is_trans_level))
+			info->freq_tbl[index].is_trans_level = 0;
+
+		if (adreno_of_read_property(child, "qcom,active-energy-offset",
+			&info->freq_tbl[index].active_energy_offset))
+			info->freq_tbl[index].active_energy_offset = 0;
+
+		if (adreno_of_read_property(child, "qcom,leakage-energy-offset",
+			&info->freq_tbl[index].leakage_energy_offset))
+			info->freq_tbl[index].leakage_energy_offset = 0;
+	}
+
+	if (adreno_of_read_property(node, "qcom,num-cores", &info->num_cores))
+		goto err;
+
+	info->sensors = kzalloc(info->num_cores *
+			sizeof(int),
+			GFP_KERNEL);
+
+	for (count = 0; count < info->num_cores; count++) {
+		if (adreno_of_read_property(node, "qcom,sensors",
+			&(info->sensors[count])))
+			goto err;
+	}
+
+	if (adreno_of_read_property(node, "qcom,core-core-type",
+		&info->core_param.core_type))
+		goto err;
+
+	if (adreno_of_read_property(node, "qcom,algo-disable-pc-threshold",
+		&info->algo_param.disable_pc_threshold))
+		goto err;
+	if (adreno_of_read_property(node, "qcom,algo-em-win-size-min-us",
+		&info->algo_param.em_win_size_min_us))
+		goto err;
+	if (adreno_of_read_property(node, "qcom,algo-em-win-size-max-us",
+		&info->algo_param.em_win_size_max_us))
+		goto err;
+	if (adreno_of_read_property(node, "qcom,algo-em-max-util-pct",
+		&info->algo_param.em_max_util_pct))
+		goto err;
+	if (adreno_of_read_property(node, "qcom,algo-group-id",
+		&info->algo_param.group_id))
+		goto err;
+	if (adreno_of_read_property(node, "qcom,algo-max-freq-chg-time-us",
+		&info->algo_param.max_freq_chg_time_us))
+		goto err;
+	if (adreno_of_read_property(node, "qcom,algo-slack-mode-dynamic",
+		&info->algo_param.slack_mode_dynamic))
+		goto err;
+	if (adreno_of_read_property(node, "qcom,algo-slack-weight-thresh-pct",
+		&info->algo_param.slack_weight_thresh_pct))
+		goto err;
+	if (adreno_of_read_property(node, "qcom,algo-slack-time-min-us",
+		&info->algo_param.slack_time_min_us))
+		goto err;
+	if (adreno_of_read_property(node, "qcom,algo-slack-time-max-us",
+		&info->algo_param.slack_time_max_us))
+		goto err;
+	if (adreno_of_read_property(node, "qcom,algo-ss-win-size-min-us",
+		&info->algo_param.ss_win_size_min_us))
+		goto err;
+	if (adreno_of_read_property(node, "qcom,algo-ss-win-size-max-us",
+		&info->algo_param.ss_win_size_max_us))
+		goto err;
+	if (adreno_of_read_property(node, "qcom,algo-ss-util-pct",
+		&info->algo_param.ss_util_pct))
+		goto err;
+	if (adreno_of_read_property(node, "qcom,algo-ss-no-corr-below-freq",
+		&info->algo_param.ss_no_corr_below_freq))
+		goto err;
+
+	if (adreno_of_read_property(node, "qcom,energy-active-coeff-a",
+		&info->energy_coeffs.active_coeff_a))
+		goto err;
+	if (adreno_of_read_property(node, "qcom,energy-active-coeff-b",
+		&info->energy_coeffs.active_coeff_b))
+		goto err;
+	if (adreno_of_read_property(node, "qcom,energy-active-coeff-c",
+		&info->energy_coeffs.active_coeff_c))
+		goto err;
+	if (adreno_of_read_property(node, "qcom,energy-leakage-coeff-a",
+		&info->energy_coeffs.leakage_coeff_a))
+		goto err;
+	if (adreno_of_read_property(node, "qcom,energy-leakage-coeff-b",
+		&info->energy_coeffs.leakage_coeff_b))
+		goto err;
+	if (adreno_of_read_property(node, "qcom,energy-leakage-coeff-c",
+		&info->energy_coeffs.leakage_coeff_c))
+		goto err;
+	if (adreno_of_read_property(node, "qcom,energy-leakage-coeff-d",
+		&info->energy_coeffs.leakage_coeff_d))
+		goto err;
+
+	if (adreno_of_read_property(node, "qcom,power-current-temp",
+		&info->power_param.current_temp))
+		goto err;
+
+	return info;
+
+err:
+	if (info)
+		kfree(info->freq_tbl);
+
+	kfree(info);
+
+	return ERR_PTR(ret);
+}
+
+static int adreno_of_get_iommu(struct device_node *parent,
+	struct kgsl_device_platform_data *pdata)
+{
+	struct device_node *node, *child;
+	struct kgsl_device_iommu_data *data = NULL;
+	struct kgsl_iommu_ctx *ctxs = NULL;
+	u32 reg_val[2];
+	int ctx_index = 0;
+
+	node = of_parse_phandle(parent, "iommu", 0);
+	if (node == NULL)
+		return -EINVAL;
+
+	data = kzalloc(sizeof(*data), GFP_KERNEL);
+	if (data == NULL) {
+		KGSL_CORE_ERR("kzalloc(%d) failed\n", sizeof(*data));
+		goto err;
+	}
+
+	if (of_property_read_u32_array(node, "reg", reg_val, 2))
+		goto err;
+
+	data->physstart = reg_val[0];
+	data->physend = data->physstart + reg_val[1] - 1;
+
+	data->iommu_ctx_count = 0;
+
+	for_each_child_of_node(node, child)
+		data->iommu_ctx_count++;
+
+	ctxs = kzalloc(data->iommu_ctx_count * sizeof(struct kgsl_iommu_ctx),
+		GFP_KERNEL);
+
+	if (ctxs == NULL) {
+		KGSL_CORE_ERR("kzalloc(%d) failed\n",
+			data->iommu_ctx_count * sizeof(struct kgsl_iommu_ctx));
+		goto err;
+	}
+
+	for_each_child_of_node(node, child) {
+		int ret = of_property_read_string(child, "label",
+				&ctxs[ctx_index].iommu_ctx_name);
+
+		if (ret) {
+			KGSL_CORE_ERR("Unable to read KGSL IOMMU 'label'\n");
+			goto err;
+		}
+
+		if (adreno_of_read_property(child, "qcom,iommu-ctx-sids",
+			&ctxs[ctx_index].ctx_id))
+			goto err;
+
+		ctx_index++;
+	}
+
+	data->iommu_ctxs = ctxs;
+
+	pdata->iommu_data = data;
+	pdata->iommu_count = 1;
+
+	return 0;
+
+err:
+	kfree(ctxs);
+	kfree(data);
+
+	return -EINVAL;
+}
+
+static int adreno_of_get_pdata(struct platform_device *pdev)
+{
+	struct kgsl_device_platform_data *pdata = NULL;
+	struct kgsl_device *device;
+	int ret = -EINVAL;
+
+	pdev->id_entry = adreno_id_table;
+
+	pdata = pdev->dev.platform_data;
+	if (pdata)
+		return 0;
+
+	if (of_property_read_string(pdev->dev.of_node, "label", &pdev->name)) {
+		KGSL_CORE_ERR("Unable to read 'label'\n");
+		goto err;
+	}
+
+	if (adreno_of_read_property(pdev->dev.of_node, "qcom,id", &pdev->id))
+		goto err;
+
+	pdata = kzalloc(sizeof(*pdata), GFP_KERNEL);
+	if (pdata == NULL) {
+		KGSL_CORE_ERR("kzalloc(%d) failed\n", sizeof(*pdata));
+		ret = -ENOMEM;
+		goto err;
+	}
+
+	if (adreno_of_read_property(pdev->dev.of_node, "qcom,chipid",
+		&pdata->chipid))
+		goto err;
+
+	/* pwrlevel Data */
+	ret = adreno_of_get_pwrlevels(pdev->dev.of_node, pdata);
+	if (ret)
+		goto err;
+
+	/* Default value is 83, if not found in DT */
+	if (adreno_of_read_property(pdev->dev.of_node, "qcom,idle-timeout",
+		&pdata->idle_timeout))
+		pdata->idle_timeout = 83;
+
+	if (adreno_of_read_property(pdev->dev.of_node, "qcom,nap-allowed",
+		&pdata->nap_allowed))
+		pdata->nap_allowed = 1;
+
+	if (adreno_of_read_property(pdev->dev.of_node, "qcom,clk-map",
+		&pdata->clk_map))
+		goto err;
+
+	device = (struct kgsl_device *)pdev->id_entry->driver_data;
+
+	if (device->id != KGSL_DEVICE_3D0)
+		goto err;
+
+	/* Bus Scale Data */
+
+	pdata->bus_scale_table = msm_bus_cl_get_pdata(pdev);
+	if (IS_ERR_OR_NULL(pdata->bus_scale_table)) {
+		ret = PTR_ERR(pdata->bus_scale_table);
+		goto err;
+	}
+
+	pdata->core_info = adreno_of_get_dcvs(pdev->dev.of_node);
+	if (IS_ERR_OR_NULL(pdata->core_info)) {
+		ret = PTR_ERR(pdata->core_info);
+		goto err;
+	}
+
+	ret = adreno_of_get_iommu(pdev->dev.of_node, pdata);
+	if (ret)
+		goto err;
+
+	pdev->dev.platform_data = pdata;
+	return 0;
+
+err:
+	if (pdata) {
+		if (pdata->core_info)
+			kfree(pdata->core_info->freq_tbl);
+		kfree(pdata->core_info);
+
+		if (pdata->iommu_data)
+			kfree(pdata->iommu_data->iommu_ctxs);
+
+		kfree(pdata->iommu_data);
+	}
+
+	kfree(pdata);
+
+	return ret;
+}
+
+#ifdef CONFIG_MSM_OCMEM
+static int
+adreno_ocmem_gmem_malloc(struct adreno_device *adreno_dev)
+{
+	if (!adreno_is_a330(adreno_dev))
+		return 0;
+
+	/* OCMEM is only needed once, do not support consecutive allocation */
+	if (adreno_dev->ocmem_hdl != NULL)
+		return 0;
+
+	adreno_dev->ocmem_hdl =
+		ocmem_allocate(OCMEM_GRAPHICS, adreno_dev->gmem_size);
+	if (adreno_dev->ocmem_hdl == NULL)
+		return -ENOMEM;
+
+	adreno_dev->gmem_size = adreno_dev->ocmem_hdl->len;
+	adreno_dev->ocmem_base = adreno_dev->ocmem_hdl->addr;
+
+	return 0;
+}
+
+static void
+adreno_ocmem_gmem_free(struct adreno_device *adreno_dev)
+{
+	if (!adreno_is_a330(adreno_dev))
+		return;
+
+	if (adreno_dev->ocmem_hdl == NULL)
+		return;
+
+	ocmem_free(OCMEM_GRAPHICS, adreno_dev->ocmem_hdl);
+	adreno_dev->ocmem_hdl = NULL;
+}
+#else
+static int
+adreno_ocmem_gmem_malloc(struct adreno_device *adreno_dev)
+{
+	return 0;
+}
+
+static void
+adreno_ocmem_gmem_free(struct adreno_device *adreno_dev)
+{
+}
+#endif
+
 static int __devinit
 adreno_probe(struct platform_device *pdev)
 {
 	struct kgsl_device *device;
 	struct adreno_device *adreno_dev;
 	int status = -EINVAL;
+	bool is_dt;
+
+	is_dt = of_match_device(adreno_match_table, &pdev->dev);
+
+	if (is_dt && pdev->dev.of_node) {
+		status = adreno_of_get_pdata(pdev);
+		if (status)
+			goto error_return;
+	}
 
 	device = (struct kgsl_device *)pdev->id_entry->driver_data;
 	adreno_dev = ADRENO_DEVICE(device);
@@ -657,6 +1181,7 @@
 	adreno_ringbuffer_close(&adreno_dev->ringbuffer);
 error:
 	device->parentdev = NULL;
+error_return:
 	return status;
 }
 
@@ -682,13 +1207,13 @@
 	int status = -EINVAL;
 	struct adreno_device *adreno_dev = ADRENO_DEVICE(device);
 
-	if (KGSL_STATE_DUMP_AND_RECOVER != device->state)
+	if (KGSL_STATE_DUMP_AND_FT != device->state)
 		kgsl_pwrctrl_set_state(device, KGSL_STATE_INIT);
 
-	
+	/* Power up the device */
 	kgsl_pwrctrl_enable(device);
 
-	
+	/* Identify the specific GPU */
 	adreno_identify_gpu(adreno_dev);
 
 	if (adreno_ringbuffer_read_pm4_ucode(device)) {
@@ -710,6 +1235,10 @@
 	}
 
 
+	/*
+	 * Check if firmware supports the sync lock PM4 packets needed
+	 * for IOMMUv1
+	 */
 
 	if ((adreno_dev->pm4_fw_version >=
 		adreno_gpulist[adreno_dev->gpulist_index].sync_lock_pm4_ver) &&
@@ -717,8 +1246,12 @@
 		adreno_gpulist[adreno_dev->gpulist_index].sync_lock_pfp_ver))
 		device->mmu.flags |= KGSL_MMU_FLAGS_IOMMU_SYNC;
 
-	
+	/* Set up the MMU */
 	if (adreno_is_a2xx(adreno_dev)) {
+		/*
+		 * the MH_CLNT_INTF_CTRL_CONFIG registers aren't present
+		 * on older gpus
+		 */
 		if (adreno_is_a20x(adreno_dev)) {
 			device->mh.mh_intf_cfg1 = 0;
 			device->mh.mh_intf_cfg2 = 0;
@@ -727,13 +1260,31 @@
 		kgsl_mh_start(device);
 	}
 
-	hang_detect_regs[0] = adreno_dev->gpudev->reg_rbbm_status;
+	/* Assign correct RBBM status register to hang detect regs
+	 */
+	ft_detect_regs[0] = adreno_dev->gpudev->reg_rbbm_status;
+
+	/* Add A3XX specific registers for hang detection */
+	if (adreno_is_a3xx(adreno_dev)) {
+		ft_detect_regs[6] = A3XX_RBBM_PERFCTR_SP_7_LO;
+		ft_detect_regs[7] = A3XX_RBBM_PERFCTR_SP_7_HI;
+		ft_detect_regs[8] = A3XX_RBBM_PERFCTR_SP_6_LO;
+		ft_detect_regs[9] = A3XX_RBBM_PERFCTR_SP_6_HI;
+		ft_detect_regs[10] = A3XX_RBBM_PERFCTR_SP_5_LO;
+		ft_detect_regs[11] = A3XX_RBBM_PERFCTR_SP_5_HI;
+	}
 
 	status = kgsl_mmu_start(device);
 	if (status)
 		goto error_clk_off;
 
-	
+	status = adreno_ocmem_gmem_malloc(adreno_dev);
+	if (status) {
+		KGSL_DRV_ERR(device, "OCMEM malloc failed\n");
+		goto error_mmu_off;
+	}
+
+	/* Start the GPU */
 	adreno_dev->gpudev->start(adreno_dev);
 
 	kgsl_pwrctrl_irq(device, KGSL_PWRFLAGS_ON);
@@ -741,13 +1292,18 @@
 
 	status = adreno_ringbuffer_start(&adreno_dev->ringbuffer, init_ram);
 	if (status == 0) {
-		if (KGSL_STATE_DUMP_AND_RECOVER != device->state)
+		/* While fault tolerance is on we do not want timer to
+		 * fire and attempt to change any device state */
+		if (KGSL_STATE_DUMP_AND_FT != device->state)
 			mod_timer(&device->idle_timer, jiffies + FIRST_TIMEOUT);
 		return 0;
 	}
 
 	kgsl_pwrctrl_irq(device, KGSL_PWRFLAGS_OFF);
+
+error_mmu_off:
 	kgsl_mmu_stop(&device->mmu);
+
 error_clk_off:
 	kgsl_pwrctrl_disable(device);
 
@@ -762,32 +1318,41 @@
 
 	adreno_ringbuffer_stop(&adreno_dev->ringbuffer);
 
+	kgsl_mmu_stop(&device->mmu);
+
 	device->ftbl->irqctrl(device, 0);
 	kgsl_pwrctrl_irq(device, KGSL_PWRFLAGS_OFF);
 	del_timer_sync(&device->idle_timer);
 
-	kgsl_mmu_stop(&device->mmu);
-	
+	adreno_ocmem_gmem_free(adreno_dev);
+
+	/* Power down the device */
 	kgsl_pwrctrl_disable(device);
 
 	return 0;
 }
 
 static void adreno_mark_context_status(struct kgsl_device *device,
-					int recovery_status)
+					int ft_status)
 {
 	struct kgsl_context *context;
 	int next = 0;
+	/*
+	 * Set the reset status of all contexts to
+	 * INNOCENT_CONTEXT_RESET_EXT except for the bad context
+	 * since thats the guilty party, if fault tolerance failed then
+	 * mark all as guilty
+	 */
 	while ((context = idr_get_next(&device->context_idr, &next))) {
 		struct adreno_context *adreno_context = context->devctxt;
-		if (recovery_status) {
+		if (ft_status) {
 			context->reset_status =
 					KGSL_CTX_STAT_GUILTY_CONTEXT_RESET_EXT;
 			adreno_context->flags |= CTXT_FLAGS_GPU_HANG;
 		} else if (KGSL_CTX_STAT_GUILTY_CONTEXT_RESET_EXT !=
 			context->reset_status) {
-			if (adreno_context->flags & (CTXT_FLAGS_GPU_HANG ||
-				CTXT_FLAGS_GPU_HANG_RECOVERED))
+			if (adreno_context->flags & (CTXT_FLAGS_GPU_HANG |
+				CTXT_FLAGS_GPU_HANG_FT))
 				context->reset_status =
 				KGSL_CTX_STAT_GUILTY_CONTEXT_RESET_EXT;
 			else
@@ -822,209 +1387,610 @@
 	}
 }
 
-static void adreno_destroy_recovery_data(struct adreno_recovery_data *rec_data)
+static void adreno_destroy_ft_data(struct adreno_ft_data *ft_data)
 {
-	vfree(rec_data->rb_buffer);
-	vfree(rec_data->bad_rb_buffer);
+	vfree(ft_data->rb_buffer);
+	vfree(ft_data->bad_rb_buffer);
+	vfree(ft_data->good_rb_buffer);
 }
 
-static int adreno_setup_recovery_data(struct kgsl_device *device,
-					struct adreno_recovery_data *rec_data)
+static int _find_start_of_cmd_seq(struct adreno_ringbuffer *rb,
+					unsigned int *ptr,
+					bool inc)
+{
+	int status = -EINVAL;
+	unsigned int val1;
+	unsigned int size = rb->buffer_desc.size;
+	unsigned int start_ptr = *ptr;
+
+	while ((start_ptr / sizeof(unsigned int)) != rb->wptr) {
+		if (inc)
+			start_ptr = adreno_ringbuffer_inc_wrapped(start_ptr,
+									size);
+		else
+			start_ptr = adreno_ringbuffer_dec_wrapped(start_ptr,
+									size);
+		kgsl_sharedmem_readl(&rb->buffer_desc, &val1, start_ptr);
+		if (KGSL_CMD_IDENTIFIER == val1) {
+			if ((start_ptr / sizeof(unsigned int)) != rb->wptr)
+				start_ptr = adreno_ringbuffer_dec_wrapped(
+							start_ptr, size);
+				*ptr = start_ptr;
+				status = 0;
+				break;
+		}
+	}
+	return status;
+}
+
+static int _find_cmd_seq_after_eop_ts(struct adreno_ringbuffer *rb,
+					unsigned int *rb_rptr,
+					unsigned int global_eop,
+					bool inc)
+{
+	int status = -EINVAL;
+	unsigned int temp_rb_rptr = *rb_rptr;
+	unsigned int size = rb->buffer_desc.size;
+	unsigned int val[3];
+	int i = 0;
+	bool check = false;
+
+	if (inc && temp_rb_rptr / sizeof(unsigned int) != rb->wptr)
+		return status;
+
+	do {
+		/*
+		 * when decrementing we need to decrement first and
+		 * then read to make sure we cover all the data
+		 */
+		if (!inc)
+			temp_rb_rptr = adreno_ringbuffer_dec_wrapped(
+					temp_rb_rptr, size);
+		kgsl_sharedmem_readl(&rb->buffer_desc, &val[i],
+					temp_rb_rptr);
+
+		if (check && ((inc && val[i] == global_eop) ||
+			(!inc && (val[i] ==
+			cp_type3_packet(CP_MEM_WRITE, 2) ||
+			val[i] == CACHE_FLUSH_TS)))) {
+			/* decrement i, i.e i = (i - 1 + 3) % 3 if
+			 * we are going forward, else increment i */
+			i = (i + 2) % 3;
+			if (val[i] == rb->device->memstore.gpuaddr +
+				KGSL_MEMSTORE_OFFSET(KGSL_MEMSTORE_GLOBAL,
+						eoptimestamp)) {
+				int j = ((i + 2) % 3);
+				if ((inc && (val[j] == CACHE_FLUSH_TS ||
+						val[j] == cp_type3_packet(
+							CP_MEM_WRITE, 2))) ||
+					(!inc && val[j] == global_eop)) {
+						/* Found the global eop */
+						status = 0;
+						break;
+				}
+			}
+			/* if no match found then increment i again
+			 * since we decremented before matching */
+			i = (i + 1) % 3;
+		}
+		if (inc)
+			temp_rb_rptr = adreno_ringbuffer_inc_wrapped(
+						temp_rb_rptr, size);
+
+		i = (i + 1) % 3;
+		if (2 == i)
+			check = true;
+	} while (temp_rb_rptr / sizeof(unsigned int) != rb->wptr);
+	/* temp_rb_rptr points to the command stream after global eop,
+	 * move backward till the start of command sequence */
+	if (!status) {
+		status = _find_start_of_cmd_seq(rb, &temp_rb_rptr, false);
+		if (!status) {
+			*rb_rptr = temp_rb_rptr;
+			KGSL_FT_INFO(rb->device,
+			"Offset of cmd sequence after eop timestamp: 0x%x\n",
+			temp_rb_rptr / sizeof(unsigned int));
+		}
+	}
+	if (status)
+		KGSL_FT_ERR(rb->device,
+		"Failed to find the command sequence after eop timestamp\n");
+	return status;
+}
+
+static int _find_hanging_ib_sequence(struct adreno_ringbuffer *rb,
+				unsigned int *rb_rptr,
+				unsigned int ib1)
+{
+	int status = -EINVAL;
+	unsigned int temp_rb_rptr = *rb_rptr;
+	unsigned int size = rb->buffer_desc.size;
+	unsigned int val[2];
+	int i = 0;
+	bool check = false;
+	bool ctx_switch = false;
+
+	while (temp_rb_rptr / sizeof(unsigned int) != rb->wptr) {
+		kgsl_sharedmem_readl(&rb->buffer_desc, &val[i], temp_rb_rptr);
+
+		if (check && val[i] == ib1) {
+			/* decrement i, i.e i = (i - 1 + 2) % 2 */
+			i = (i + 1) % 2;
+			if (adreno_cmd_is_ib(val[i])) {
+				/* go till start of command sequence */
+				status = _find_start_of_cmd_seq(rb,
+						&temp_rb_rptr, false);
+
+				KGSL_FT_INFO(rb->device,
+				"Found the hanging IB at offset 0x%x\n",
+				temp_rb_rptr / sizeof(unsigned int));
+				break;
+			}
+			/* if no match then increment i since we decremented
+			 * before checking */
+			i = (i + 1) % 2;
+		}
+		/* Make sure you do not encounter a context switch twice; we can
+		 * encounter it once for the bad context as the start of search
+		 * can point to the context switch */
+		if (val[i] == KGSL_CONTEXT_TO_MEM_IDENTIFIER) {
+			if (ctx_switch) {
+				KGSL_FT_ERR(rb->device,
+				"Context switch encountered before bad "
+				"IB found\n");
+				break;
+			}
+			ctx_switch = true;
+		}
+		i = (i + 1) % 2;
+		if (1 == i)
+			check = true;
+		temp_rb_rptr = adreno_ringbuffer_inc_wrapped(temp_rb_rptr,
+								size);
+	}
+	if  (!status)
+		*rb_rptr = temp_rb_rptr;
+	return status;
+}
+
+static int adreno_setup_ft_data(struct kgsl_device *device,
+					struct adreno_ft_data *ft_data)
 {
 	int ret = 0;
-	unsigned int ib1_sz, ib2_sz;
 	struct adreno_device *adreno_dev = ADRENO_DEVICE(device);
 	struct adreno_ringbuffer *rb = &adreno_dev->ringbuffer;
+	struct kgsl_context *context;
+	struct adreno_context *adreno_context;
+	unsigned int rb_rptr = rb->wptr * sizeof(unsigned int);
 
-	memset(rec_data, 0, sizeof(*rec_data));
+	memset(ft_data, 0, sizeof(*ft_data));
+	ft_data->start_of_replay_cmds = 0xFFFFFFFF;
+	ft_data->replay_for_snapshot = 0xFFFFFFFF;
 
-	adreno_regread(device, REG_CP_IB1_BUFSZ, &ib1_sz);
-	adreno_regread(device, REG_CP_IB2_BUFSZ, &ib2_sz);
-	if (ib1_sz || ib2_sz)
-		adreno_regread(device, REG_CP_IB1_BASE, &rec_data->ib1);
+	adreno_regread(device, REG_CP_IB1_BASE, &ft_data->ib1);
 
-	kgsl_sharedmem_readl(&device->memstore, &rec_data->context_id,
+	kgsl_sharedmem_readl(&device->memstore, &ft_data->context_id,
 			KGSL_MEMSTORE_OFFSET(KGSL_MEMSTORE_GLOBAL,
 			current_context));
 
 	kgsl_sharedmem_readl(&device->memstore,
-				&rec_data->global_eop,
-				KGSL_MEMSTORE_OFFSET(KGSL_MEMSTORE_GLOBAL,
-				eoptimestamp));
+			&ft_data->global_eop,
+			KGSL_MEMSTORE_OFFSET(KGSL_MEMSTORE_GLOBAL,
+			eoptimestamp));
 
-	rec_data->rb_buffer = vmalloc(rb->buffer_desc.size);
-	if (!rec_data->rb_buffer) {
+	ft_data->rb_buffer = vmalloc(rb->buffer_desc.size);
+	if (!ft_data->rb_buffer) {
 		KGSL_MEM_ERR(device, "vmalloc(%d) failed\n",
 				rb->buffer_desc.size);
 		return -ENOMEM;
 	}
 
-	rec_data->bad_rb_buffer = vmalloc(rb->buffer_desc.size);
-	if (!rec_data->bad_rb_buffer) {
+	ft_data->bad_rb_buffer = vmalloc(rb->buffer_desc.size);
+	if (!ft_data->bad_rb_buffer) {
 		KGSL_MEM_ERR(device, "vmalloc(%d) failed\n",
 				rb->buffer_desc.size);
 		ret = -ENOMEM;
 		goto done;
 	}
 
+	ft_data->good_rb_buffer = vmalloc(rb->buffer_desc.size);
+	if (!ft_data->good_rb_buffer) {
+		KGSL_MEM_ERR(device, "vmalloc(%d) failed\n",
+				rb->buffer_desc.size);
+		ret = -ENOMEM;
+		goto done;
+	}
+
+	ft_data->status =  0;
+
+	/* find the start of bad command sequence in rb */
+	context = idr_find(&device->context_idr, ft_data->context_id);
+	/* Look for the command stream that is right after the global eop */
+
+	if (!context) {
+		/*
+		 * If there is no context then fault tolerance does not need to
+		 * replay anything, just reset GPU and thats it
+		 * replay anything, just reset GPU and that's it
+		goto done;
+	}
+	ret = _find_cmd_seq_after_eop_ts(rb, &rb_rptr,
+					ft_data->global_eop + 1, false);
+	if (ret)
+		goto done;
+
+	ft_data->start_of_replay_cmds = rb_rptr;
+
+	if (!adreno_dev->ft_policy)
+		adreno_dev->ft_policy = KGSL_FT_DEFAULT_POLICY;
+
+	ft_data->ft_policy = adreno_dev->ft_policy;
+
+
+	adreno_context = context->devctxt;
+	if (adreno_context->flags & CTXT_FLAGS_PREAMBLE) {
+		if (ft_data->ib1) {
+			ret = _find_hanging_ib_sequence(rb,
+					&rb_rptr, ft_data->ib1);
+			if (ret) {
+				KGSL_FT_ERR(device,
+				"Start not found for replay IB sequence\n");
+				ret = 0;
+				goto done;
+			}
+			ft_data->start_of_replay_cmds = rb_rptr;
+			ft_data->replay_for_snapshot = rb_rptr;
+		}
+	}
+
 done:
 	if (ret) {
-		vfree(rec_data->rb_buffer);
-		vfree(rec_data->bad_rb_buffer);
+		vfree(ft_data->rb_buffer);
+		vfree(ft_data->bad_rb_buffer);
+		vfree(ft_data->good_rb_buffer);
 	}
 	return ret;
 }
 
 static int
-_adreno_recover_hang(struct kgsl_device *device,
-			struct adreno_recovery_data *rec_data,
-			bool try_bad_commands)
+_adreno_check_long_ib(struct kgsl_device *device)
 {
-	int ret;
 	struct adreno_device *adreno_dev = ADRENO_DEVICE(device);
-	struct adreno_ringbuffer *rb = &adreno_dev->ringbuffer;
-	struct kgsl_context *context;
-	struct adreno_context *adreno_context = NULL;
-	struct adreno_context *last_active_ctx = adreno_dev->drawctxt_active;
+	unsigned int curr_global_ts = 0;
 
-	context = idr_find(&device->context_idr, rec_data->context_id);
-	if (context == NULL) {
-		KGSL_DRV_ERR(device, "Last context unknown id:%d\n",
-			rec_data->context_id);
+	/* check if the global ts is still the same */
+	kgsl_sharedmem_readl(&device->memstore,
+			&curr_global_ts,
+			KGSL_MEMSTORE_OFFSET(KGSL_MEMSTORE_GLOBAL,
+			eoptimestamp));
+
+	/* Mark long ib as handled */
+	adreno_dev->long_ib = 0;
+
+	if (curr_global_ts == adreno_dev->long_ib_ts) {
+		KGSL_FT_ERR(device,
+			"IB ran too long, invalidate ctxt\n");
+		return 1;
 	} else {
-		adreno_context = context->devctxt;
-		adreno_context->flags |= CTXT_FLAGS_GPU_HANG;
+		/* Do nothing GPU has gone ahead */
+		KGSL_FT_INFO(device, "false long ib detection return\n");
+		return 0;
+	}
+}
+
+static int
+_adreno_ft_restart_device(struct kgsl_device *device,
+		   struct kgsl_context *context,
+		   struct adreno_ft_data *ft_data)
+{
+
+	struct adreno_context *adreno_context = context->devctxt;
+
+	/* restart device */
+	if (adreno_stop(device)) {
+		KGSL_FT_ERR(device, "Device stop failed\n");
+		return 1;
 	}
 
-	ret = adreno_ringbuffer_extract(rb, rec_data);
-	if (ret)
-		goto done;
-
-	
-	ret = adreno_stop(device);
-	if (ret) {
-		KGSL_DRV_ERR(device, "Device stop failed in recovery\n");
-		goto done;
-	}
-
-	ret = adreno_start(device, true);
-	if (ret) {
-		KGSL_DRV_ERR(device, "Device start failed in recovery\n");
-		goto done;
+	if (adreno_start(device, true)) {
+		KGSL_FT_ERR(device, "Device start failed\n");
+		return 1;
 	}
 
 	if (context)
 		kgsl_mmu_setstate(&device->mmu, adreno_context->pagetable,
 			KGSL_MEMSTORE_GLOBAL);
 
+	/* If iommu is used then we need to make sure that the iommu clocks
+	 * are on since there could be commands in pipeline that touch iommu */
 	if (KGSL_MMU_TYPE_IOMMU == kgsl_mmu_get_mmutype()) {
-		ret = kgsl_mmu_enable_clk(&device->mmu,
-			KGSL_IOMMU_CONTEXT_USER);
-		if (ret)
-			goto done;
+		if (kgsl_mmu_enable_clk(&device->mmu,
+				KGSL_IOMMU_CONTEXT_USER))
+			return 1;
 	}
 
-	if (!try_bad_commands)
-		rec_data->bad_rb_size = 0;
+	return 0;
+}
 
-	if (rec_data->bad_rb_size) {
-		int idle_ret;
-		adreno_ringbuffer_restore(rb, rec_data->bad_rb_buffer,
-					rec_data->bad_rb_size);
-		idle_ret = adreno_idle(device);
-		if (idle_ret) {
-			ret = adreno_stop(device);
-			if (ret) {
-				KGSL_DRV_ERR(device,
-				"Device stop failed in recovery\n");
-				goto done;
-			}
-			ret = adreno_start(device, true);
-			if (ret) {
-				KGSL_DRV_ERR(device,
-				"Device start failed in recovery\n");
-				goto done;
-			}
-			if (context)
-				kgsl_mmu_setstate(&device->mmu,
-						adreno_context->pagetable,
-						KGSL_MEMSTORE_GLOBAL);
+static inline void
+_adreno_debug_ft_info(struct kgsl_device *device,
+			struct adreno_ft_data *ft_data)
+{
 
-			if (KGSL_MMU_TYPE_IOMMU == kgsl_mmu_get_mmutype()) {
-				ret = kgsl_mmu_enable_clk(&device->mmu,
-						KGSL_IOMMU_CONTEXT_USER);
-				if (ret)
-					goto done;
-			}
+	/*
+	 * Dumping rb is a very useful tool to debug FT.
+	 * It will tell us if we are extracting the rb correctly
+	 * It will tell us if we are extracting the rb correctly,
+	 */
+	if (device->ft_log >= 7)  {
 
-			ret = idle_ret;
-			KGSL_DRV_ERR(device,
-			"Bad context commands hung in recovery\n");
-		} else {
-			KGSL_DRV_ERR(device,
-			"Bad context commands succeeded in recovery\n");
-			if (adreno_context)
-				adreno_context->flags = (adreno_context->flags &
-					~CTXT_FLAGS_GPU_HANG) |
-					CTXT_FLAGS_GPU_HANG_RECOVERED;
-			adreno_dev->drawctxt_active = last_active_ctx;
-		}
+		/* Print fault tolerance data here */
+		KGSL_FT_INFO(device, "Temp RB buffer size 0x%X\n",
+			ft_data->rb_size);
+		adreno_dump_rb(device, ft_data->rb_buffer,
+			ft_data->rb_size<<2, 0, ft_data->rb_size);
+
+		KGSL_FT_INFO(device, "Bad RB buffer size 0x%X\n",
+			ft_data->bad_rb_size);
+		adreno_dump_rb(device, ft_data->bad_rb_buffer,
+			ft_data->bad_rb_size<<2, 0, ft_data->bad_rb_size);
+
+		KGSL_FT_INFO(device, "Good RB buffer size 0x%X\n",
+			ft_data->good_rb_size);
+		adreno_dump_rb(device, ft_data->good_rb_buffer,
+			ft_data->good_rb_size<<2, 0, ft_data->good_rb_size);
+
 	}
-	
-	if (ret || !rec_data->bad_rb_size) {
-		adreno_ringbuffer_restore(rb, rec_data->rb_buffer,
-				rec_data->rb_size);
+}
+
+static int
+_adreno_ft_resubmit_rb(struct kgsl_device *device,
+			struct adreno_ringbuffer *rb,
+			struct kgsl_context *context,
+			struct adreno_ft_data *ft_data,
+			unsigned int *buff, unsigned int size)
+{
+	unsigned int ret = 0;
+
+	_adreno_debug_ft_info(device, ft_data);
+
+	if (_adreno_ft_restart_device(device, context, ft_data))
+		return 1;
+
+	if (size) {
+
+		/* submit commands and wait for them to pass */
+		adreno_ringbuffer_restore(rb, buff, size);
+
 		ret = adreno_idle(device);
-		if (ret) {
-			ret = -EAGAIN;
-			goto done;
+	}
+
+	return ret;
+}
+
+
+static int
+_adreno_ft(struct kgsl_device *device,
+			struct adreno_ft_data *ft_data)
+{
+	int ret = 0, i;
+	struct adreno_device *adreno_dev = ADRENO_DEVICE(device);
+	struct adreno_ringbuffer *rb = &adreno_dev->ringbuffer;
+	struct kgsl_context *context;
+	struct adreno_context *adreno_context = NULL;
+	struct adreno_context *last_active_ctx = adreno_dev->drawctxt_active;
+
+	context = idr_find(&device->context_idr, ft_data->context_id);
+	if (context == NULL) {
+		KGSL_FT_CRIT(device, "Last context unknown id:%d\n",
+			ft_data->context_id);
+	} else {
+		adreno_context = context->devctxt;
+		adreno_context->flags |= CTXT_FLAGS_GPU_HANG;
+		/*
+		 * set the invalid ts flag to 0 for this context since we have
+		 * detected a hang for it
+		 */
+		context->wait_on_invalid_ts = false;
+
+		/*
+		 *  This flag will be set by userspace for contexts
+		 *  that do not want to be fault tolerant (ex: OPENCL)
+		 */
+		if (adreno_context->flags & CTXT_FLAGS_NO_FAULT_TOLERANCE) {
+			KGSL_FT_ERR(device,
+			"No FT set for this context play good cmds\n");
+			goto play_good_cmds;
 		}
-		if (rec_data->last_valid_ctx_id) {
-			struct kgsl_context *last_ctx =
-					idr_find(&device->context_idr,
-					rec_data->last_valid_ctx_id);
-			if (last_ctx)
-				adreno_dev->drawctxt_active = last_ctx->devctxt;
+
+	}
+
+	/*
+	 * Extract valid contents from rb which can still be executed after
+	 * hang
+	 */
+	adreno_ringbuffer_extract(rb, ft_data);
+
+	/* Check if we detected a long running IB;
+	 * if so, do not attempt replay of the bad cmds */
+	if (adreno_dev->long_ib) {
+		if (_adreno_check_long_ib(device)) {
+			ft_data->status = 1;
+			_adreno_debug_ft_info(device, ft_data);
+			goto play_good_cmds;
+		} else {
+			adreno_context->flags &= ~CTXT_FLAGS_GPU_HANG;
+			return 0;
 		}
 	}
+
+	/* Do not try the bad commands if the hang is due to a fault */
+	if (device->mmu.fault) {
+		KGSL_FT_ERR(device, "MMU fault skipping bad cmds\n");
+		device->mmu.fault = 0;
+		goto play_good_cmds;
+	}
+
+	if (ft_data->ft_policy & KGSL_FT_DISABLE) {
+		KGSL_FT_ERR(device, "NO FT policy play only good cmds\n");
+		goto play_good_cmds;
+	}
+
+	if (ft_data->ft_policy & KGSL_FT_REPLAY) {
+
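+		/* First try simply replaying the faulting commands unchanged
+		 * and see if they go through this time */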
+		ret = _adreno_ft_resubmit_rb(device, rb, context, ft_data,
+				ft_data->bad_rb_buffer, ft_data->bad_rb_size);
+
+		if (ret) {
+			KGSL_FT_ERR(device, "Replay unsuccessful\n");
+			ft_data->status = 1;
+		} else
+			goto play_good_cmds;
+	}
+
+	if (ft_data->ft_policy & KGSL_FT_SKIPIB) {
+
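+		/* Search the faulting commands for the indirect buffer packet
+		 * that points at the hanging IB (ib1) and replace it with a
+		 * NOP packet so the rest of the commands can still be run */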
+		for (i = 0; i < ft_data->bad_rb_size; i++) {
+			if ((ft_data->bad_rb_buffer[i] ==
+					CP_HDR_INDIRECT_BUFFER_PFD) &&
+				(ft_data->bad_rb_buffer[i+1] == ft_data->ib1)) {
+
+				ft_data->bad_rb_buffer[i] = cp_nop_packet(2);
+				ft_data->bad_rb_buffer[i+1] =
+							KGSL_NOP_IB_IDENTIFIER;
+				ft_data->bad_rb_buffer[i+2] =
+							KGSL_NOP_IB_IDENTIFIER;
+				break;
+			}
+		}
+
+		if ((i == (ft_data->bad_rb_size)) || (!ft_data->ib1)) {
+			KGSL_FT_ERR(device, "Bad IB to NOP not found\n");
+			ft_data->status = 1;
+			goto play_good_cmds;
+		}
+
+		ret = _adreno_ft_resubmit_rb(device, rb, context, ft_data,
+				ft_data->bad_rb_buffer, ft_data->bad_rb_size);
+
+		if (ret) {
+			KGSL_FT_ERR(device, "NOP faulty IB unsuccessful\n");
+			ft_data->status = 1;
+		} else {
+			ft_data->status = 0;
+			goto play_good_cmds;
+		}
+	}
+
+	if (ft_data->ft_policy & KGSL_FT_SKIPFRAME) {
+
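+		/* Look for the end-of-frame marker in the faulting commands;
+		 * everything up to it is replaced with a NOP so that the
+		 * remainder of the bad frame is skipped */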
+		for (i = 0; i < ft_data->bad_rb_size; i++) {
+			if (ft_data->bad_rb_buffer[i] ==
+					KGSL_END_OF_FRAME_IDENTIFIER) {
+				ft_data->bad_rb_buffer[0] = cp_nop_packet(i);
+				break;
+			}
+		}
+
+		/* EOF not found in the RB, discard until EOF in
+		 * the next IB submission */
+		if (i == ft_data->bad_rb_size) {
+			adreno_context->flags |= CTXT_FLAGS_SKIP_EOF;
+			KGSL_FT_INFO(device,
+			"EOF not found in RB, skip next issueib till EOF\n");
+			ft_data->bad_rb_buffer[0] = cp_nop_packet(i);
+		}
+
+		ret = _adreno_ft_resubmit_rb(device, rb, context, ft_data,
+				ft_data->bad_rb_buffer, ft_data->bad_rb_size);
+
+		if (ret) {
+			KGSL_FT_ERR(device, "Skip EOF unsuccessful\n");
+			ft_data->status = 1;
+		} else {
+			ft_data->status = 0;
+			goto play_good_cmds;
+		}
+	}
+
+play_good_cmds:
+
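+	/* Whatever happened with the bad commands, always resubmit the
+	 * commands extracted from the good contexts so their work is not
+	 * lost */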
+	if (ft_data->status)
+		KGSL_FT_ERR(device, "Bad context commands failed\n");
+	else {
+		KGSL_FT_INFO(device, "Bad context commands success\n");
+
+		if (adreno_context) {
+			adreno_context->flags = (adreno_context->flags &
+				~CTXT_FLAGS_GPU_HANG) | CTXT_FLAGS_GPU_HANG_FT;
+		}
+		adreno_dev->drawctxt_active = last_active_ctx;
+	}
+
+	ret = _adreno_ft_resubmit_rb(device, rb, context, ft_data,
+			ft_data->good_rb_buffer, ft_data->good_rb_size);
+
+	if (ret) {
+		/* If we fail here we can try to invalidate another
+		 * context and try fault tolerance again */
+		ret = -EAGAIN;
+		KGSL_FT_ERR(device, "Playing good commands unsuccessful\n");
+		goto done;
+	} else
+		KGSL_FT_INFO(device, "Playing good commands successful\n");
+
+	/* ringbuffer now has data from the last valid context id,
+	 * so restore the active_ctx to the last valid context */
+	if (ft_data->last_valid_ctx_id) {
+		struct kgsl_context *last_ctx =
+				idr_find(&device->context_idr,
+				ft_data->last_valid_ctx_id);
+		if (last_ctx)
+			adreno_dev->drawctxt_active = last_ctx->devctxt;
+	}
+
 done:
-	
+	/* Turn off iommu clocks */
 	if (KGSL_MMU_TYPE_IOMMU == kgsl_mmu_get_mmutype())
 		kgsl_mmu_disable_clk_on_ts(&device->mmu, 0, false);
 	return ret;
 }
 
 static int
-adreno_recover_hang(struct kgsl_device *device,
-			struct adreno_recovery_data *rec_data)
+adreno_ft(struct kgsl_device *device,
+			struct adreno_ft_data *ft_data)
 {
 	int ret = 0;
 	struct adreno_device *adreno_dev = ADRENO_DEVICE(device);
 	struct adreno_ringbuffer *rb = &adreno_dev->ringbuffer;
 	unsigned int timestamp;
 
-	KGSL_DRV_ERR(device,
-	"Starting recovery from 3D GPU hang. Recovery parameters: IB1: 0x%X, "
+	KGSL_FT_INFO(device,
+	"Start Parameters: IB1: 0x%X, "
 	"Bad context_id: %u, global_eop: 0x%x\n",
-	rec_data->ib1, rec_data->context_id, rec_data->global_eop);
+	ft_data->ib1, ft_data->context_id, ft_data->global_eop);
 
 	timestamp = rb->timestamp[KGSL_MEMSTORE_GLOBAL];
-	KGSL_DRV_ERR(device, "Last issued global timestamp: %x\n", timestamp);
+	KGSL_FT_INFO(device, "Last issued global timestamp: %x\n", timestamp);
 
+	/* We may need to replay commands multiple times based on whether
+	 * multiple contexts hang the GPU */
 	while (true) {
-		if (!ret)
-			ret = _adreno_recover_hang(device, rec_data, true);
-		else
-			ret = _adreno_recover_hang(device, rec_data, false);
+
+		ret = _adreno_ft(device, ft_data);
 
 		if (-EAGAIN == ret) {
-			adreno_destroy_recovery_data(rec_data);
-			adreno_setup_recovery_data(device, rec_data);
-			KGSL_DRV_ERR(device,
-			"Retry recovery from 3D GPU hang. Recovery parameters: "
+			/* Set up new fault tolerance parameters and retry; this
+			 * means more than one context is causing the hang */
+			adreno_destroy_ft_data(ft_data);
+			ret = adreno_setup_ft_data(device, ft_data);
+			if (ret)
+				goto done;
+			KGSL_FT_INFO(device,
+			"Retry. Parameters: "
 			"IB1: 0x%X, Bad context_id: %u, global_eop: 0x%x\n",
-			rec_data->ib1, rec_data->context_id,
-			rec_data->global_eop);
+			ft_data->ib1, ft_data->context_id,
+			ft_data->global_eop);
 		} else {
 			break;
 		}
@@ -1033,7 +1999,7 @@
 	if (ret)
 		goto done;
 
-	
+	/* Restore correct states after fault tolerance */
 	if (adreno_dev->drawctxt_active)
 		device->mmu.hwpagetable =
 			adreno_dev->drawctxt_active->pagetable;
@@ -1044,115 +2010,88 @@
 			KGSL_MEMSTORE_OFFSET(KGSL_MEMSTORE_GLOBAL,
 			eoptimestamp),
 			rb->timestamp[KGSL_MEMSTORE_GLOBAL]);
+
+	/* switch to NULL ctxt */
+	if (adreno_dev->drawctxt_active != NULL)
+		adreno_drawctxt_switch(adreno_dev, NULL, 0);
+
 done:
 	adreno_set_max_ts_for_bad_ctxs(device);
 	adreno_mark_context_status(device, ret);
-
-	if (!ret)
-		KGSL_DRV_ERR(device, "Recovery succeeded\n");
-	else
-		KGSL_DRV_ERR(device, "Recovery failed\n");
-	return ret;
-}
-
-static int adreno_kill_suspect(struct kgsl_device *device)
-{
-	int ret = 1;
-#ifdef CONFIG_MSM_KGSL_KILL_HANG_PROCESS
-	int cankill = 1;
-	char suspect_task_comm[TASK_COMM_LEN+1];
-	char suspect_task_parent_comm[TASK_COMM_LEN+1];
-	int suspect_tgid;
-	struct task_struct *suspect_task = get_current();
-	struct task_struct *suspect_parent_task = suspect_task->group_leader;
-	int i = 0;
-
-	suspect_tgid = task_tgid_nr(suspect_task);
-	get_task_comm(suspect_task_comm, suspect_task);
-
-	if (suspect_parent_task)
-		get_task_comm(suspect_task_parent_comm, suspect_parent_task);
-	else
-		suspect_task_parent_comm[0] = '\0';
-
-	
-
-	for (i = 0; i < ARRAY_SIZE(kgsl_blocking_process_tbl); i++) {
-		if (!((strncmp(suspect_task_comm,
-			kgsl_blocking_process_tbl[i].name, TASK_COMM_LEN)) &&
-			(strncmp(suspect_task_parent_comm,
-			kgsl_blocking_process_tbl[i].name, TASK_COMM_LEN)))) {
-			cankill=0;
-			break;
-		}
-	}
-
-	if (cankill) {
-		KGSL_DRV_ERR(device, "We need to kill suspect process "
-		"causing gpu hung, tgid=%d, name=%s, pname=%s\n",
-		suspect_tgid, suspect_task_comm, suspect_task_parent_comm);
-
-		do_send_sig_info(SIGKILL,
-		SEND_SIG_FORCED, suspect_task, true);
-		ret = 0;
-	}
-#endif
+	KGSL_FT_ERR(device, "policy 0x%X status 0x%x\n",
+			ft_data->ft_policy, ret);
 	return ret;
 }
 
 int
-adreno_dump_and_recover(struct kgsl_device *device)
+adreno_dump_and_exec_ft(struct kgsl_device *device)
 {
 	int result = -ETIMEDOUT;
-	struct adreno_recovery_data rec_data;
+	struct adreno_ft_data ft_data;
+	struct adreno_device *adreno_dev = ADRENO_DEVICE(device);
+	struct kgsl_pwrctrl *pwr = &device->pwrctrl;
+	unsigned int curr_pwrlevel;
 
 	if (device->state == KGSL_STATE_HUNG)
 		goto done;
-	if (device->state == KGSL_STATE_DUMP_AND_RECOVER) {
+	if (device->state == KGSL_STATE_DUMP_AND_FT) {
 		mutex_unlock(&device->mutex);
-		wait_for_completion(&device->recovery_gate);
+		wait_for_completion(&device->ft_gate);
 		mutex_lock(&device->mutex);
 		if (device->state != KGSL_STATE_HUNG)
 			result = 0;
 	} else {
-		kgsl_pwrctrl_set_state(device, KGSL_STATE_DUMP_AND_RECOVER);
-		INIT_COMPLETION(device->recovery_gate);
-		
+		kgsl_pwrctrl_set_state(device, KGSL_STATE_DUMP_AND_FT);
+		INIT_COMPLETION(device->ft_gate);
+		/* Detected a hang */
 
-		
-		result = adreno_setup_recovery_data(device, &rec_data);
-		adreno_postmortem_dump(device, 0);
+		/* Run fault tolerance at max power level */
+		curr_pwrlevel = pwr->active_pwrlevel;
+		kgsl_pwrctrl_pwrlevel_change(device, pwr->max_pwrlevel);
 
-		kgsl_device_snapshot(device, 1);
+		/* Get the fault tolerance data as soon as hang is detected */
+		result = adreno_setup_ft_data(device, &ft_data);
 
-		result = adreno_recover_hang(device, &rec_data);
-		adreno_destroy_recovery_data(&rec_data);
+		/*
+		 * If a long ib is detected, do not attempt postmortem or
+		 * snapshot; if the GPU is still executing commands
+		 * we will get errors
+		 */
+		if (!adreno_dev->long_ib) {
+			/*
+			 * Trigger an automatic dump of the state to
+			 * the console
+			 */
+			kgsl_postmortem_dump(device, 0);
+
+			/*
+			 * Make a GPU snapshot.  For now, do it after the
+			 * PM dump so we can at least be sure the PM dump
+			 * will work as it always has
+			 */
+			kgsl_device_snapshot(device, 1);
+		}
+
+		if (!result) {
+			result = adreno_ft(device, &ft_data);
+			adreno_destroy_ft_data(&ft_data);
+		}
+
+		/* restore power level */
+		kgsl_pwrctrl_pwrlevel_change(device, curr_pwrlevel);
+
 		if (result) {
 			kgsl_pwrctrl_set_state(device, KGSL_STATE_HUNG);
 		} else {
 			kgsl_pwrctrl_set_state(device, KGSL_STATE_ACTIVE);
 			mod_timer(&device->idle_timer, jiffies + FIRST_TIMEOUT);
 		}
-		complete_all(&device->recovery_gate);
-
-		
-		if (!device->snapshot_no_panic) {
-			if (result) {
-				msleep(10000);
-				panic("GPU Hang");
-			} else {
-				if (board_mfg_mode() ||
-					adreno_kill_suspect(device)) {
-					msleep(10000);
-					panic("Recoverable GPU Hang");
-				}
-			}
-		}
+		complete_all(&device->ft_gate);
 	}
 done:
 	return result;
 }
-EXPORT_SYMBOL(adreno_dump_and_recover);
+EXPORT_SYMBOL(adreno_dump_and_exec_ft);
 
 static int adreno_getproperty(struct kgsl_device *device,
 				enum kgsl_property_type type,
@@ -1198,8 +2137,13 @@
 			}
 			memset(&shadowprop, 0, sizeof(shadowprop));
 			if (device->memstore.hostptr) {
-				shadowprop.gpuaddr = device->memstore.physaddr;
+				/* NOTE: with mmu enabled, gpuaddr doesn't mean
+				 * anything to mmap().
+				 */
+				shadowprop.gpuaddr = device->memstore.gpuaddr;
 				shadowprop.size = device->memstore.size;
+				/* GSL needs this to be set, even if it
+				   appears to be meaningless */
 				shadowprop.flags = KGSL_FLAGS_INITIALIZED |
 					KGSL_FLAGS_PER_CONTEXT_TIMESTAMPS;
 			}
@@ -1253,6 +2197,7 @@
 				unsigned int sizebytes)
 {
 	int status = -EINVAL;
+	struct adreno_device *adreno_dev = ADRENO_DEVICE(device);
 
 	switch (type) {
 	case KGSL_PROP_PWRCTRL: {
@@ -1272,16 +2217,50 @@
 			if (enable) {
 				if (pdata->nap_allowed)
 					device->pwrctrl.nap_allowed = true;
-
+				adreno_dev->fast_hang_detect = 1;
 				kgsl_pwrscale_enable(device);
 			} else {
 				device->pwrctrl.nap_allowed = false;
+				adreno_dev->fast_hang_detect = 0;
 				kgsl_pwrscale_disable(device);
 			}
 
 			status = 0;
 		}
 		break;
+	case KGSL_PROP_FAULT_TOLERANCE: {
+			struct kgsl_ft_config ftd;
+
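+			/* Userspace may override the FT policy only when
+			 * ft_user_control is set; zero policy fields fall
+			 * back to the default policies */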
+			if (adreno_dev->ft_user_control == 0)
+				break;
+
+			if (sizebytes != sizeof(ftd))
+				break;
+
+			if (copy_from_user(&ftd, (void __user *) value,
+							   sizeof(ftd))) {
+				status = -EFAULT;
+				break;
+			}
+
+			if (ftd.ft_policy)
+				adreno_dev->ft_policy = ftd.ft_policy;
+			else
+				adreno_dev->ft_policy = KGSL_FT_DEFAULT_POLICY;
+
+			if (ftd.ft_pf_policy)
+				adreno_dev->ft_pf_policy = ftd.ft_pf_policy;
+			else
+				adreno_dev->ft_pf_policy =
+					KGSL_FT_PAGEFAULT_DEFAULT_POLICY;
+
+			if (ftd.ft_pm_dump)
+				device->pm_dump_enable = 1;
+			else
+				device->pm_dump_enable = 0;
+
+		}
+		break;
 	default:
 		break;
 	}
@@ -1289,12 +2268,6 @@
 	return status;
 }
 
-static inline void adreno_poke(struct kgsl_device *device)
-{
-	struct adreno_device *adreno_dev = ADRENO_DEVICE(device);
-	adreno_regwrite(device, REG_CP_RB_WPTR, adreno_dev->ringbuffer.wptr);
-}
-
 static int adreno_ringbuffer_drain(struct kgsl_device *device,
 	unsigned int *regs)
 {
@@ -1306,17 +2279,19 @@
 	if (!(rb->flags & KGSL_FLAGS_STARTED))
 		return 0;
 
+	/*
+	 * The first time into the loop, wait for 100 msecs and kick wptr again
+	 * to ensure that the hardware has updated correctly.  After that, kick
+	 * it periodically every KGSL_TIMEOUT_PART msecs until the timeout
+	 * expires
+	 */
 
 	wait = jiffies + msecs_to_jiffies(100);
 
-	adreno_poke(device);
-
 	do {
 		if (time_after(jiffies, wait)) {
-			adreno_poke(device);
-
-			
-			if (adreno_hang_detect(device, regs))
+			/* Check to see if the core is hung */
+			if (adreno_ft_detect(device, regs))
 				return -ETIMEDOUT;
 
 			wait = jiffies + msecs_to_jiffies(KGSL_TIMEOUT_PART);
@@ -1333,13 +2308,14 @@
 	return 0;
 }
 
+/* Caller must hold the device mutex. */
 int adreno_idle(struct kgsl_device *device)
 {
 	struct adreno_device *adreno_dev = ADRENO_DEVICE(device);
 	unsigned int rbbm_status;
 	unsigned long wait_time;
 	unsigned long wait_time_part;
-	unsigned int prev_reg_val[hang_detect_regs_count];
+	unsigned int prev_reg_val[ft_detect_regs_count];
 
 	memset(prev_reg_val, 0, sizeof(prev_reg_val));
 
@@ -1348,12 +2324,12 @@
 		0x00000000, 0x80000000);
 
 retry:
-	
+	/* First, wait for the ringbuffer to drain */
 	if (adreno_ringbuffer_drain(device, prev_reg_val))
 		goto err;
 
-	
-	wait_time = jiffies + ADRENO_IDLE_TIMEOUT;
+	/* now, wait for the GPU to finish its operations */
+	wait_time = jiffies + msecs_to_jiffies(ADRENO_IDLE_TIMEOUT);
 	wait_time_part = jiffies + msecs_to_jiffies(KGSL_TIMEOUT_PART);
 
 	while (time_before(jiffies, wait_time)) {
@@ -1367,10 +2343,12 @@
 				return 0;
 		}
 
+		/* Don't wait for the timeout, detect the hang faster.
+		 */
 		if (time_after(jiffies, wait_time_part)) {
 				wait_time_part = jiffies +
 					msecs_to_jiffies(KGSL_TIMEOUT_PART);
-				if ((adreno_hang_detect(device, prev_reg_val)))
+				if ((adreno_ft_detect(device, prev_reg_val)))
 					goto err;
 		}
 
@@ -1378,24 +2356,32 @@
 
 err:
 	KGSL_DRV_ERR(device, "spun too long waiting for RB to idle\n");
-	if (KGSL_STATE_DUMP_AND_RECOVER != device->state &&
-		!adreno_dump_and_recover(device)) {
+	if (KGSL_STATE_DUMP_AND_FT != device->state &&
+		!adreno_dump_and_exec_ft(device)) {
 		wait_time = jiffies + ADRENO_IDLE_TIMEOUT;
 		goto retry;
 	}
 	return -ETIMEDOUT;
 }
 
+/**
+ * is_adreno_rbbm_status_idle - Check if GPU core is idle by probing
+ * rbbm_status register
+ * @device - Pointer to the GPU device whose idle status is to be
+ * checked
+ * @returns - whether the core is idle (based on rbbm_status):
+ * false if the core is active, true if the core is idle
+ */
 static bool is_adreno_rbbm_status_idle(struct kgsl_device *device)
 {
 	unsigned int reg_rbbm_status;
 	bool status = false;
 	struct adreno_device *adreno_dev = ADRENO_DEVICE(device);
 
-	
+	/* Is the core idle? */
 	adreno_regread(device,
-	adreno_dev->gpudev->reg_rbbm_status,
-	&reg_rbbm_status);
+		adreno_dev->gpudev->reg_rbbm_status,
+		&reg_rbbm_status);
 
 	if (adreno_is_a2xx(adreno_dev)) {
 		if (reg_rbbm_status == 0x110)
@@ -1414,13 +2400,22 @@
 	struct adreno_ringbuffer *rb = &adreno_dev->ringbuffer;
 
 	WARN_ON(device->state == KGSL_STATE_INIT);
-	
+	/* If the device isn't active, don't force it on. */
 	if (device->state == KGSL_STATE_ACTIVE) {
-		
+		/* Is the ring buffer empty? */
 		GSL_RB_GET_READPTR(rb, &rb->rptr);
 		if (!device->active_cnt && (rb->rptr == rb->wptr)) {
-			
-			status = is_adreno_rbbm_status_idle(device);
+			/*
+			 * Are there interrupts pending? If so then pretend we
+			 * are not idle - this avoids the possibility that we go
+			 * to a lower power state without handling interrupts
+			 * first.
+			 */
+
+			if (!adreno_dev->gpudev->irq_pending(adreno_dev)) {
+				/* Is the core idle? */
+				status = is_adreno_rbbm_status_idle(device);
+			}
 		}
 	} else {
 		status = true;
@@ -1428,16 +2423,14 @@
 	return status;
 }
 
+/* Caller must hold the device mutex. */
 static int adreno_suspend_context(struct kgsl_device *device)
 {
 	int status = 0;
 	struct adreno_device *adreno_dev = ADRENO_DEVICE(device);
 
-	
+	/* switch to NULL ctxt */
 	if (adreno_dev->drawctxt_active != NULL) {
-#ifdef CONFIG_MSM_KGSL_GPU_USAGE
-	device->current_process_priv = NULL;
-#endif
 		adreno_drawctxt_switch(adreno_dev, NULL, 0);
 		status = adreno_idle(device);
 	}
@@ -1445,6 +2438,7 @@
 	return status;
 }
 
+/* Find a memory structure attached to an adreno context */
 
 struct kgsl_memdesc *adreno_find_ctxtmem(struct kgsl_device *device,
 	unsigned int pt_base, unsigned int gpuaddr, unsigned int size)
@@ -1460,7 +2454,8 @@
 
 		adreno_context = (struct adreno_context *)context->devctxt;
 
-		if (kgsl_mmu_pt_equal(adreno_context->pagetable, pt_base)) {
+		if (kgsl_mmu_pt_equal(&device->mmu, adreno_context->pagetable,
+					pt_base)) {
 			struct kgsl_memdesc *desc;
 
 			desc = &adreno_context->gpustate;
@@ -1499,7 +2494,7 @@
 					size))
 		return &device->mmu.setstate_memory;
 
-	entry = kgsl_get_mem_entry(pt_base, gpuaddr, size);
+	entry = kgsl_get_mem_entry(device, pt_base, gpuaddr, size);
 
 	if (entry)
 		return &entry->memdesc;
@@ -1527,6 +2522,8 @@
 	if (!in_interrupt())
 		kgsl_pre_hwaccess(device);
 
+	/* Ensure this read finishes before the next one.
+	 * i.e. act like normal readl() */
 	*value = __raw_readl(reg);
 	rmb();
 }
@@ -1544,14 +2541,10 @@
 	kgsl_cffdump_regwrite(device->id, offsetwords << 2, value);
 	reg = (unsigned int *)(device->reg_virt + (offsetwords << 2));
 
+	/* Ensure previous writes post before this one,
+	 * i.e. act like normal writel() */
 	wmb();
-	adreno_regwrite_footprint = 1;
-	adreno_regwrite_reg = reg;
-	adreno_regwrite_val = value;
-	dsb();
 	__raw_writel(value, reg);
-	adreno_regwrite_footprint = 0;
-	dsb();
 }
 
 static unsigned int _get_context_id(struct kgsl_context *k_ctxt)
@@ -1568,66 +2561,99 @@
 	return context_id;
 }
 
-static int kgsl_check_interrupt_timestamp(struct kgsl_device *device,
+static unsigned int adreno_check_hw_ts(struct kgsl_device *device,
 		struct kgsl_context *context, unsigned int timestamp)
 {
-	int status;
+	int status = 0;
 	unsigned int ref_ts, enableflag;
-	unsigned int context_id;
-	struct adreno_device *adreno_dev = ADRENO_DEVICE(device);
+	unsigned int context_id = _get_context_id(context);
 
-	mutex_lock(&device->mutex);
-	context_id = _get_context_id(context);
+	/*
+	 * If the context ID is invalid, we are in a race with
+	 * the context being destroyed by userspace so bail.
+	 */
 	if (context_id == KGSL_CONTEXT_INVALID) {
 		KGSL_DRV_WARN(device, "context was detached");
-		status = -EINVAL;
-		goto unlock;
+		return -EINVAL;
 	}
 
 	status = kgsl_check_timestamp(device, context, timestamp);
-	if (!status) {
-		kgsl_sharedmem_readl(&device->memstore, &enableflag,
-			KGSL_MEMSTORE_OFFSET(context_id, ts_cmp_enable));
-		mb();
+	if (status)
+		return status;
 
-		if (enableflag) {
-			kgsl_sharedmem_readl(&device->memstore, &ref_ts,
+	kgsl_sharedmem_readl(&device->memstore, &enableflag,
+			KGSL_MEMSTORE_OFFSET(context_id, ts_cmp_enable));
+	/*
+	 * Barrier is needed here to make sure the read from memstore
+	 * has posted
+	 */
+
+	mb();
+
+	if (enableflag) {
+		kgsl_sharedmem_readl(&device->memstore, &ref_ts,
 				KGSL_MEMSTORE_OFFSET(context_id,
 					ref_wait_ts));
-			mb();
-			if (timestamp_cmp(ref_ts, timestamp) >= 0) {
-				kgsl_sharedmem_writel(&device->memstore,
+
+		/* Make sure the memstore read has posted */
+		mb();
+		if (timestamp_cmp(ref_ts, timestamp) >= 0) {
+			kgsl_sharedmem_writel(&device->memstore,
+					KGSL_MEMSTORE_OFFSET(context_id,
+						ref_wait_ts), timestamp);
+			/* Make sure the memstore write is posted */
+			wmb();
+		}
+	} else {
+		kgsl_sharedmem_writel(&device->memstore,
 				KGSL_MEMSTORE_OFFSET(context_id,
 					ref_wait_ts), timestamp);
-				wmb();
-			}
-		} else {
-			unsigned int cmds[2];
-			kgsl_sharedmem_writel(&device->memstore,
-				KGSL_MEMSTORE_OFFSET(context_id,
-					ref_wait_ts), timestamp);
-			enableflag = 1;
-			kgsl_sharedmem_writel(&device->memstore,
+		enableflag = 1;
+		kgsl_sharedmem_writel(&device->memstore,
 				KGSL_MEMSTORE_OFFSET(context_id,
 					ts_cmp_enable), enableflag);
-			wmb();
-			cmds[0] = cp_type3_packet(CP_NOP, 1);
-			cmds[1] = 0;
+		/* Make sure the memstore write gets posted */
+		wmb();
 
-			if (adreno_dev->drawctxt_active)
-				adreno_ringbuffer_issuecmds(device,
-					adreno_dev->drawctxt_active,
-					KGSL_CMD_FLAGS_NONE, &cmds[0], 2);
-			else
-				BUG();
-		}
+		/*
+		 * Submit a dummy packet so that even if all
+		 * commands up to the timestamp get executed we will still
+		 * get an interrupt
+		 */
+
+		if (context && device->state != KGSL_STATE_SLUMBER)
+			adreno_ringbuffer_issuecmds(device, context->devctxt,
+					KGSL_CMD_FLAGS_NONE, NULL, 0);
 	}
-unlock:
+
+	return 0;
+}
+
+/* Return 1 if the event timestamp has already passed, 0 if it was marked */
+static int adreno_next_event(struct kgsl_device *device,
+		struct kgsl_event *event)
+{
+	return adreno_check_hw_ts(device, event->context, event->timestamp);
+}
+
+static int adreno_check_interrupt_timestamp(struct kgsl_device *device,
+		struct kgsl_context *context, unsigned int timestamp)
+{
+	int status;
+
+	mutex_lock(&device->mutex);
+	status = adreno_check_hw_ts(device, context, timestamp);
 	mutex_unlock(&device->mutex);
 
 	return status;
 }
 
+/*
+ * wait_event_interruptible_timeout checks for the exit condition before
+ * placing a process on the wait queue. For conditional interrupts we expect
+ * the process to already be on its wait queue when its exit condition
+ * checking function is called.
+ */
 #define kgsl_wait_event_interruptible_timeout(wq, condition, timeout, io)\
 ({									\
 	long __ret = timeout;						\
@@ -1640,134 +2666,405 @@
 
 
 
-unsigned int adreno_hang_detect(struct kgsl_device *device,
+unsigned int adreno_ft_detect(struct kgsl_device *device,
 						unsigned int *prev_reg_val)
 {
 	struct adreno_device *adreno_dev = ADRENO_DEVICE(device);
-	unsigned int curr_reg_val[hang_detect_regs_count];
-	unsigned int hang_detected = 1;
+	unsigned int curr_reg_val[ft_detect_regs_count];
+	unsigned int fast_hang_detected = 1;
+	unsigned int long_ib_detected = 1;
 	unsigned int i;
+	static unsigned long next_hang_detect_time;
+	static unsigned int prev_global_ts;
+	unsigned int curr_global_ts = 0;
+	unsigned int curr_context_id = 0;
+	static struct adreno_context *curr_context;
+	static struct kgsl_context *context;
 
 	if (!adreno_dev->fast_hang_detect)
-		return 0;
+		fast_hang_detected = 0;
 
-	if (is_adreno_rbbm_status_idle(device))
-		return 0;
+	if (!adreno_dev->long_ib_detect)
+		long_ib_detected = 0;
 
-	for (i = 0; i < hang_detect_regs_count; i++) {
-		adreno_regread(device, hang_detect_regs[i],
-					   &curr_reg_val[i]);
-		if (curr_reg_val[i] != prev_reg_val[i]) {
-			prev_reg_val[i] = curr_reg_val[i];
-			hang_detected = 0;
+	if (is_adreno_rbbm_status_idle(device)) {
+
+		/*
+		 * On A20X if the RPTR != WPTR and the device is idle, then
+		 * the last write to WPTR probably failed to latch so write it
+		 * again
+		 */
+
+		if (adreno_is_a2xx(adreno_dev)) {
+			unsigned int rptr;
+			adreno_regread(device, REG_CP_RB_RPTR, &rptr);
+			if (rptr != adreno_dev->ringbuffer.wptr)
+				adreno_regwrite(device, REG_CP_RB_WPTR,
+					adreno_dev->ringbuffer.wptr);
 		}
+
+		return 0;
 	}
 
-	return hang_detected;
+	/*
+	 * The interval between hang detection checks should be
+	 * KGSL_TIMEOUT_PART or more; if the next check is requested less than
+	 * KGSL_TIMEOUT_PART after the last one, do nothing.
+	 */
+	if ((next_hang_detect_time) &&
+		(time_before(jiffies, next_hang_detect_time)))
+			return 0;
+	else
+		next_hang_detect_time = (jiffies +
+			msecs_to_jiffies(KGSL_TIMEOUT_PART-1));
+
+	/* Read the current Hang detect reg values here */
+	for (i = 0; i < ft_detect_regs_count; i++) {
+		if (ft_detect_regs[i] == 0)
+			continue;
+		adreno_regread(device, ft_detect_regs[i],
+			&curr_reg_val[i]);
+	}
+
+	/* Read the current global timestamp here */
+	kgsl_sharedmem_readl(&device->memstore,
+			&curr_global_ts,
+			KGSL_MEMSTORE_OFFSET(KGSL_MEMSTORE_GLOBAL,
+			eoptimestamp));
+
+	mb();
+
+	if (curr_global_ts == prev_global_ts) {
+
+		/* Get the current context here */
+		if (context == NULL) {
+			kgsl_sharedmem_readl(&device->memstore,
+				&curr_context_id,
+				KGSL_MEMSTORE_OFFSET(KGSL_MEMSTORE_GLOBAL,
+				current_context));
+			context = idr_find(&device->context_idr,
+				curr_context_id);
+			if (context != NULL) {
+				curr_context = context->devctxt;
+				curr_context->ib_gpu_time_used = 0;
+			} else {
+				KGSL_DRV_ERR(device,
+					"Fault tolerance no context found\n");
+			}
+		}
+
+		mb();
+
+		if (curr_context != NULL) {
+
+			curr_context->ib_gpu_time_used += KGSL_TIMEOUT_PART;
+			KGSL_FT_INFO(device,
+			"Proc %s used GPU Time %d ms on timestamp 0x%X\n",
+			curr_context->pid_name, curr_context->ib_gpu_time_used,
+			curr_global_ts+1);
+
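+			/* If none of the watched registers moved since the
+			 * last check the GPU made no progress, so flag a fast
+			 * hang; the LONG_IB register range is tracked
+			 * separately to spot a core stuck in one long IB */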
+			for (i = 0; i < ft_detect_regs_count; i++) {
+				if (curr_reg_val[i] != prev_reg_val[i]) {
+					fast_hang_detected = 0;
+
+					/* Check for long IB here */
+					if ((i >=
+						LONG_IB_DETECT_REG_INDEX_START)
+						&&
+						(i <=
+						LONG_IB_DETECT_REG_INDEX_END))
+						long_ib_detected = 0;
+				}
+			}
+
+			if (fast_hang_detected) {
+				KGSL_FT_ERR(device,
+					"Proc %s, ctxt_id %d ts %d triggered fault tolerance"
+					" on global ts %d\n",
+					curr_context->pid_name, curr_context->id
+					, (kgsl_readtimestamp(device, context,
+					KGSL_TIMESTAMP_RETIRED)+1),
+					curr_global_ts+1);
+				return 1;
+			}
+
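+			/* A long IB is only flagged after the context has kept
+			 * the GPU busy on the same IB for more than
+			 * KGSL_TIMEOUT_LONG_IB_DETECTION and has not opted out
+			 * of fault tolerance */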
+			if ((long_ib_detected) &&
+				(!(curr_context->flags &
+				 CTXT_FLAGS_NO_FAULT_TOLERANCE))) {
+				curr_context->ib_gpu_time_used +=
+					KGSL_TIMEOUT_PART;
+				if (curr_context->ib_gpu_time_used >
+					KGSL_TIMEOUT_LONG_IB_DETECTION) {
+					if (adreno_dev->long_ib_ts !=
+						curr_global_ts) {
+						KGSL_FT_ERR(device,
+						"Proc %s, ctxt_id %d ts %d"
+						"used GPU for %d ms long ib "
+						"detected on global ts %d\n",
+						curr_context->pid_name,
+						curr_context->id,
+						(kgsl_readtimestamp(device,
+						context,
+						KGSL_TIMESTAMP_RETIRED)+1),
+						curr_context->ib_gpu_time_used,
+						curr_global_ts+1);
+						adreno_dev->long_ib = 1;
+						adreno_dev->long_ib_ts =
+								curr_global_ts;
+						curr_context->ib_gpu_time_used =
+								0;
+						return 1;
+					}
+				}
+			}
+		} else {
+			KGSL_FT_ERR(device,
+				"Last context unknown id:%d\n",
+				curr_context_id);
+		}
+	} else {
+		/* GPU is moving forward */
+		prev_global_ts = curr_global_ts;
+		context = NULL;
+		curr_context = NULL;
+		adreno_dev->long_ib = 0;
+		adreno_dev->long_ib_ts = 0;
+	}
+
+
+	/* If no hang is detected copy the current reg values
+	 * to the previous values and return no hang */
+	for (i = 0; i < ft_detect_regs_count; i++)
+			prev_reg_val[i] = curr_reg_val[i];
+	return 0;
 }
 
+/**
+ * adreno_handle_hang - Process a hang detected in adreno_waittimestamp
+ * @device - pointer to a KGSL device structure
+ * @context - pointer to the active KGSL context
+ * @timestamp - the timestamp that the process was waiting for
+ *
+ * Process a possible GPU hang and attempt fault tolerance to
+ * recover from it cleanly
+ */
+static int adreno_handle_hang(struct kgsl_device *device,
+	struct kgsl_context *context, unsigned int timestamp)
+{
+	struct adreno_device *adreno_dev = ADRENO_DEVICE(device);
+	unsigned int context_id = _get_context_id(context);
+	unsigned int ts_issued;
+	unsigned int rptr;
 
+	/* Do one last check to see if we somehow made it through */
+	if (kgsl_check_timestamp(device, context, timestamp))
+		return 0;
+
+	ts_issued = adreno_dev->ringbuffer.timestamp[context_id];
+
+	adreno_regread(device, REG_CP_RB_RPTR, &rptr);
+	mb();
+
+	KGSL_DRV_WARN(device,
+		     "Device hang detected while waiting for timestamp: "
+		     "<%d:0x%x>, last submitted timestamp: <%d:0x%x>, "
+		     "retired timestamp: <%d:0x%x>, wptr: 0x%x, rptr: 0x%x\n",
+		      context_id, timestamp, context_id, ts_issued, context_id,
+			kgsl_readtimestamp(device, context,
+			KGSL_TIMESTAMP_RETIRED),
+		      adreno_dev->ringbuffer.wptr, rptr);
+
+	/* Return 0 after a successful fault tolerance */
+	if (!adreno_dump_and_exec_ft(device))
+		return 0;
+
+	return -ETIMEDOUT;
+}
+
+static int _check_pending_timestamp(struct kgsl_device *device,
+		struct kgsl_context *context, unsigned int timestamp)
+{
+	struct adreno_device *adreno_dev = ADRENO_DEVICE(device);
+	unsigned int context_id = _get_context_id(context);
+	unsigned int ts_issued;
+
+	if (context_id == KGSL_CONTEXT_INVALID)
+		return -EINVAL;
+
+	ts_issued = adreno_dev->ringbuffer.timestamp[context_id];
+
+	if (timestamp_cmp(timestamp, ts_issued) <= 0)
+		return 0;
+
+	if (context && !context->wait_on_invalid_ts) {
+		KGSL_DRV_ERR(device, "Cannot wait for invalid ts <%d:0x%x>, last issued ts <%d:0x%x>\n",
+			context_id, timestamp, context_id, ts_issued);
+
+		/* Only print this message once */
+		context->wait_on_invalid_ts = true;
+	}
+
+	return -EINVAL;
+}
+
+/**
+ * adreno_waittimestamp - sleep while waiting for the specified timestamp
+ * @device - pointer to a KGSL device structure
+ * @context - pointer to the active kgsl context
+ * @timestamp - GPU timestamp to wait for
+ * @msecs - amount of time to wait (in milliseconds)
+ *
+ * Wait 'msecs' milliseconds for the specified timestamp to expire. Wake up
+ * every KGSL_TIMEOUT_PART milliseconds to check for a device hang and process
+ * one if it happened.  Otherwise, spend most of our time in an interruptible
+ * wait for the timestamp interrupt to be processed.  This function must be
+ * called with the mutex already held.
+ */
 static int adreno_waittimestamp(struct kgsl_device *device,
 				struct kgsl_context *context,
 				unsigned int timestamp,
 				unsigned int msecs)
 {
-	long status = 0;
-	uint io = 1;
-	static uint io_cnt;
-	struct adreno_device *adreno_dev = ADRENO_DEVICE(device);
+	static unsigned int io_cnt;
+	struct adreno_context *adreno_ctx = context ? context->devctxt : NULL;
 	struct kgsl_pwrctrl *pwr = &device->pwrctrl;
-	int retries = 0;
-	unsigned int ts_issued;
 	unsigned int context_id = _get_context_id(context);
+	unsigned int prev_reg_val[ft_detect_regs_count];
 	unsigned int time_elapsed = 0;
-	unsigned int prev_reg_val[hang_detect_regs_count];
 	unsigned int wait;
+	int ts_compare = 1;
+	int io, ret = -ETIMEDOUT;
 
-	memset(prev_reg_val, 0, sizeof(prev_reg_val));
+	/* Get out early if the context has already been destroyed */
 
-	ts_issued = adreno_dev->ringbuffer.timestamp[context_id];
+	if (context_id == KGSL_CONTEXT_INVALID) {
+		KGSL_DRV_WARN(device, "context was detached");
+		return -EINVAL;
+	}
 
-	
-	if (msecs == KGSL_TIMEOUT_DEFAULT)
-		msecs = adreno_dev->wait_timeout;
+	/*
+	 * Check to see if the requested timestamp is "newer" than the last
+	 * timestamp issued. If it is, complain once and return an error.  Only
+	 * print the message once per context so that badly behaving
+	 * applications don't spam the logs
+	 */
 
-	if (timestamp_cmp(timestamp, ts_issued) > 0) {
-		KGSL_DRV_ERR(device, "Cannot wait for invalid ts <%d:0x%x>, "
-			"last issued ts <%d:0x%x>\n",
-			context_id, timestamp, context_id, ts_issued);
-		status = -EINVAL;
-		goto done;
+	if (adreno_ctx && !(adreno_ctx->flags & CTXT_FLAGS_USER_GENERATED_TS)) {
+		if (_check_pending_timestamp(device, context, timestamp))
+			return -EINVAL;
+
+		/* Reset the invalid timestamp flag on a valid wait */
+		context->wait_on_invalid_ts = false;
 	}
 
 
-	if (msecs == 0 || msecs >= 100)
-		wait = 100;
-	else
-		wait = 20;
+	/* Clear the registers used for hang detection */
+	memset(prev_reg_val, 0, sizeof(prev_reg_val));
+
+	/*
+	 * On the first time through the loop only wait 100ms;
+	 * this gives enough time for the engine to start moving and oddly
+	 * provides better hang detection results than just going the full
+	 * KGSL_TIMEOUT_PART right off the bat. The exception to this rule
+	 * is if msecs happens to be < 100ms, in which case use the full timeout
+	 */
+
+	wait = 100;
 
 	do {
-		if (context_id == KGSL_CONTEXT_INVALID) {
-			KGSL_DRV_WARN(device, "context was detached");
-			status = -EINVAL;
-			goto done;
-		}
+		long status;
+
+		/*
+		 * if the timestamp happens while we're not
+		 * waiting, there's a chance that an interrupt
+		 * will not be generated and thus the timestamp
+		 * work needs to be queued.
+		 */
+
 		if (kgsl_check_timestamp(device, context, timestamp)) {
 			queue_work(device->work_queue, &device->ts_expired_ws);
-			status = 0;
-			goto done;
+			ret = 0;
+			break;
 		}
-		adreno_poke(device);
-		io_cnt = (io_cnt + 1) % 100;
-		if (io_cnt <
-		    pwr->pwrlevels[pwr->active_pwrlevel].io_fraction)
-			io = 0;
 
-		if ((retries > 0) &&
-			(adreno_hang_detect(device, prev_reg_val)))
-			goto hang_dump;
+		/* Check to see if the GPU is hung */
+		if (adreno_ft_detect(device, prev_reg_val)) {
+			ret = adreno_handle_hang(device, context, timestamp);
+			break;
+		}
+
+		/*
+		 * For proper power accounting sometimes we need to call
+		 * io_wait_interruptible_timeout and sometimes we need to call
+		 * plain old wait_interruptible_timeout. We call the regular
+		 * timeout N times out of 100, where N is a number specified by
+		 * the current power level
+		 */
+
+		io_cnt = (io_cnt + 1) % 100;
+		io = (io_cnt < pwr->pwrlevels[pwr->active_pwrlevel].io_fraction)
+			? 0 : 1;
 
 		mutex_unlock(&device->mutex);
+
+		/* Wait for a timestamp event */
 		status = kgsl_wait_event_interruptible_timeout(
-				device->wait_queue,
-				kgsl_check_interrupt_timestamp(device,
-					context, timestamp),
-				msecs_to_jiffies(wait), io);
+			device->wait_queue,
+			adreno_check_interrupt_timestamp(device, context,
+				timestamp), msecs_to_jiffies(wait), io);
 
 		mutex_lock(&device->mutex);
 
-		if (status > 0) {
-			
-			status = 0;
-			goto done;
-		} else if (status < 0) {
-			
-			goto done;
+		/*
+		 * If status is non-zero then either the condition was satisfied
+		 * or there was an error.  In either event, this is the end of
+		 * the line for us
+		 */
+
+		if (status != 0) {
+			ret = (status > 0) ? 0 : (int) status;
+			break;
 		}
-		
-
 		time_elapsed += wait;
-		wait = KGSL_TIMEOUT_PART;
 
-		retries++;
+		/* If user-specified timestamps are being used, wait at least
+		 * KGSL_SYNCOBJ_SERVER_TIMEOUT msecs for the user driver to
+		 * issue an IB for a timestamp before checking to see if the
+		 * current timestamp we are waiting for is valid or not
+		 */
+
+		if (ts_compare && (adreno_ctx &&
+			(adreno_ctx->flags & CTXT_FLAGS_USER_GENERATED_TS))) {
+			if (time_elapsed > KGSL_SYNCOBJ_SERVER_TIMEOUT) {
+				ret = _check_pending_timestamp(device, context,
+					timestamp);
+				if (ret)
+					break;
+
+				/* Don't do this check again */
+				ts_compare = 0;
+
+				/*
+				 * Reset the invalid timestamp flag on a valid
+				 * wait
+				 */
+				context->wait_on_invalid_ts = false;
+			}
+		}
+
+		/*
+		 * We want to wait the floor of KGSL_TIMEOUT_PART
+		 * and (msecs - time_elapsed).
+		 */
+
+		if (KGSL_TIMEOUT_PART < (msecs - time_elapsed))
+			wait = KGSL_TIMEOUT_PART;
+		else
+			wait = (msecs - time_elapsed);
 
 	} while (!msecs || time_elapsed < msecs);
 
-hang_dump:
-	if (kgsl_check_timestamp(device, context, timestamp))
-		goto done;
-	status = -ETIMEDOUT;
-	KGSL_DRV_ERR(device,
-		     "Device hang detected while waiting for timestamp: "
-		     "<%d:0x%x>, last submitted timestamp: <%d:0x%x>, "
-		     "wptr: 0x%x\n",
-		      context_id, timestamp, context_id, ts_issued,
-		      adreno_dev->ringbuffer.wptr);
-	if (!adreno_dump_and_recover(device)) {
-			status = 0;
-	}
-done:
-	return (int)status;
+	return ret;
 }
 
 static unsigned int adreno_readtimestamp(struct kgsl_device *device,
@@ -1776,6 +3073,10 @@
 	unsigned int timestamp = 0;
 	unsigned int context_id = _get_context_id(context);
 
+	/*
+	 * If the context ID is invalid, we are in a race with
+	 * the context being destroyed by userspace so bail.
+	 */
 	if (context_id == KGSL_CONTEXT_INVALID) {
 		KGSL_DRV_WARN(device, "context was detached");
 		return timestamp;
@@ -1849,11 +3150,13 @@
 	struct kgsl_pwrctrl *pwr = &device->pwrctrl;
 	unsigned int cycles;
 
-	
-	
+	/* Get the busy cycles counted since the counter was last reset */
+	/* Calling this function also resets and restarts the counter */
 
 	cycles = adreno_dev->gpudev->busy_cycles(adreno_dev);
 
+	/* In order to calculate idle you have to have run the algorithm
+	 * at least once to get a start time. */
 	if (pwr->time != 0) {
 		s64 tmp = ktime_to_us(ktime_get());
 		stats->total_time = tmp - pwr->time;
@@ -1861,27 +3164,6 @@
 		stats->busy_time = adreno_ticks_to_us(cycles, device->pwrctrl.
 				pwrlevels[device->pwrctrl.active_pwrlevel].
 				gpu_freq);
-
-		
-		stats->busy_time = (stats->busy_time > stats->total_time) ? stats->total_time : stats->busy_time;
-		device->gputime.total = device->gputime.total + stats->total_time;
-		device->gputime.busy = device->gputime.busy + stats->busy_time;
-		device->gputime_in_state[device->pwrctrl.active_pwrlevel].total
-			= device->gputime_in_state[device->pwrctrl.active_pwrlevel].total + stats->total_time;
-		device->gputime_in_state[device->pwrctrl.active_pwrlevel].busy
-			= device->gputime_in_state[device->pwrctrl.active_pwrlevel].busy + stats->busy_time;
-		
-#ifdef CONFIG_MSM_KGSL_GPU_USAGE
-		if(device->current_process_priv != NULL) {
-			device->current_process_priv->gputime.total = device->current_process_priv->gputime.total + stats->total_time;
-			device->current_process_priv->gputime.busy = device->current_process_priv->gputime.busy + stats->busy_time;
-			device->current_process_priv->gputime_in_state[device->pwrctrl.active_pwrlevel].total
-				= device->current_process_priv->gputime_in_state[device->pwrctrl.active_pwrlevel].total + stats->total_time;
-			device->current_process_priv->gputime_in_state[device->pwrctrl.active_pwrlevel].busy
-				= device->current_process_priv->gputime_in_state[device->pwrctrl.active_pwrlevel].busy + stats->busy_time;
-		} else
-			printk("curent_process_pirv = NULL, skip gpu usage recorde.\n");
-#endif
 	} else {
 		stats->total_time = 0;
 		stats->busy_time = 0;
@@ -1900,16 +3182,22 @@
 {
 	struct adreno_device *adreno_dev = ADRENO_DEVICE(device);
 
+	/* Some applications need to know the chip ID too, so pass
+	 * that as a parameter */
 
 	if (chipid != NULL)
 		*chipid = adreno_dev->chip_id;
 
+	/* Standard KGSL gpuid format:
+	 * Top word is 0x0002 for 2D or 0x0003 for 3D
+	 * Bottom word is a core-specific identifier
+	 */
 
 	return (0x0003 << 16) | ((int) adreno_dev->gpurev);
 }
 
 static const struct kgsl_functable adreno_functable = {
-	
+	/* Mandatory functions */
 	.regread = adreno_regread,
 	.regwrite = adreno_regwrite,
 	.idle = adreno_idle,
@@ -1929,19 +3217,15 @@
 	.gpuid = adreno_gpuid,
 	.snapshot = adreno_snapshot,
 	.irq_handler = adreno_irq_handler,
-	
+	/* Optional functions */
 	.setstate = adreno_setstate,
 	.drawctxt_create = adreno_drawctxt_create,
 	.drawctxt_destroy = adreno_drawctxt_destroy,
 	.setproperty = adreno_setproperty,
+	.postmortem_dump = adreno_dump,
+	.next_event = adreno_next_event,
 };
 
-static struct platform_device_id adreno_id_table[] = {
-	{ DEVICE_3D0_NAME, (kernel_ulong_t)&device_3d0.dev, },
-	{ },
-};
-MODULE_DEVICE_TABLE(platform, adreno_id_table);
-
 static struct platform_driver adreno_platform_driver = {
 	.probe = adreno_probe,
 	.remove = __devexit_p(adreno_remove),
@@ -1952,6 +3236,7 @@
 		.owner = THIS_MODULE,
 		.name = DEVICE_3D_NAME,
 		.pm = &kgsl_pm_ops,
+		.of_match_table = adreno_match_table,
 	}
 };
 
diff --git a/drivers/gpu/msm/adreno.h b/drivers/gpu/msm/adreno.h
index 3cfacd8..d319c98 100644
--- a/drivers/gpu/msm/adreno.h
+++ b/drivers/gpu/msm/adreno.h
@@ -1,4 +1,4 @@
-/* Copyright (c) 2008-2012, Code Aurora Forum. All rights reserved.
+/* Copyright (c) 2008-2012, The Linux Foundation. All rights reserved.
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License version 2 and
@@ -17,6 +17,7 @@
 #include "adreno_drawctxt.h"
 #include "adreno_ringbuffer.h"
 #include "kgsl_iommu.h"
+#include <mach/ocmem.h>
 
 #define DEVICE_3D_NAME "kgsl-3d"
 #define DEVICE_3D0_NAME "kgsl-3d0"
@@ -24,14 +25,24 @@
 #define ADRENO_DEVICE(device) \
 		KGSL_CONTAINER_OF(device, struct adreno_device, dev)
 
+#define ADRENO_CHIPID_CORE(_id) (((_id) >> 24) & 0xFF)
+#define ADRENO_CHIPID_MAJOR(_id) (((_id) >> 16) & 0xFF)
+#define ADRENO_CHIPID_MINOR(_id) (((_id) >> 8) & 0xFF)
+#define ADRENO_CHIPID_PATCH(_id) ((_id) & 0xFF)
+
+/* Flags to control command packet settings */
 #define KGSL_CMD_FLAGS_NONE             0x00000000
 #define KGSL_CMD_FLAGS_PMODE		0x00000001
-#define KGSL_CMD_FLAGS_NO_TS_CMP	0x00000002
+#define KGSL_CMD_FLAGS_INTERNAL_ISSUE	0x00000002
+#define KGSL_CMD_FLAGS_EOF	        0x00000100
 
+/* Command identifiers */
 #define KGSL_CONTEXT_TO_MEM_IDENTIFIER	0x2EADBEEF
 #define KGSL_CMD_IDENTIFIER		0x2EEDFACE
 #define KGSL_START_OF_IB_IDENTIFIER	0x2EADEABE
 #define KGSL_END_OF_IB_IDENTIFIER	0x2ABEDEAD
+#define KGSL_END_OF_FRAME_IDENTIFIER	0x2E0F2E0F
+#define KGSL_NOP_IB_IDENTIFIER	        0x20F20F20
 
 #ifdef CONFIG_MSM_SCM
 #define ADRENO_DEFAULT_PWRSCALE_POLICY  (&kgsl_pwrscale_policy_tz)
@@ -41,10 +52,15 @@
 #define ADRENO_DEFAULT_PWRSCALE_POLICY  NULL
 #endif
 
-#define ADRENO_ISTORE_START 0x5000 
+void adreno_debugfs_init(struct kgsl_device *device);
+
+#define ADRENO_ISTORE_START 0x5000 /* Istore offset */
 
 #define ADRENO_NUM_CTX_SWITCH_ALLOWED_BEFORE_DRAW	50
 
+/* One cannot wait forever for the core to idle, so set an upper limit to the
+ * amount of time to wait for the core to go idle
+ */
 
 #define ADRENO_IDLE_TIMEOUT (20 * 1000)
 
@@ -57,12 +73,13 @@
 	ADRENO_REV_A225 = 225,
 	ADRENO_REV_A305 = 305,
 	ADRENO_REV_A320 = 320,
+	ADRENO_REV_A330 = 330,
 };
 
 struct adreno_gpudev;
 
 struct adreno_device {
-	struct kgsl_device dev;    
+	struct kgsl_device dev;    /* Must be first field in this struct */
 	unsigned int chip_id;
 	enum adreno_gpurev gpurev;
 	unsigned long gmem_base;
@@ -85,17 +102,29 @@
 	unsigned int instruction_size;
 	unsigned int ib_check_level;
 	unsigned int fast_hang_detect;
+	unsigned int ft_policy;
+	unsigned int ft_user_control;
+	unsigned int long_ib_detect;
+	unsigned int long_ib;
+	unsigned int long_ib_ts;
+	unsigned int ft_pf_policy;
 	unsigned int gpulist_index;
+	struct ocmem_buf *ocmem_hdl;
+	unsigned int ocmem_base;
 };
 
 struct adreno_gpudev {
+	/*
+	 * These registers are in a different location on A3XX,  so define
+	 * them in the structure and use them as variables.
+	 */
 	unsigned int reg_rbbm_status;
 	unsigned int reg_cp_pfp_ucode_data;
 	unsigned int reg_cp_pfp_ucode_addr;
-	
+	/* keeps track of when we need to execute the draw workaround code */
 	int ctx_switches_since_last_draw;
 
-	
+	/* GPU specific function hooks */
 	int (*ctxt_create)(struct adreno_device *, struct adreno_context *);
 	void (*ctxt_save)(struct adreno_device *, struct adreno_context *);
 	void (*ctxt_restore)(struct adreno_device *, struct adreno_context *);
@@ -103,13 +132,36 @@
 					struct adreno_context *);
 	irqreturn_t (*irq_handler)(struct adreno_device *);
 	void (*irq_control)(struct adreno_device *, int);
+	unsigned int (*irq_pending)(struct adreno_device *);
 	void * (*snapshot)(struct adreno_device *, void *, int *, int);
 	void (*rb_init)(struct adreno_device *, struct adreno_ringbuffer *);
 	void (*start)(struct adreno_device *);
 	unsigned int (*busy_cycles)(struct adreno_device *);
 };
 
-struct adreno_recovery_data {
+/*
+ * struct adreno_ft_data - Structure that contains all information to
+ * perform gpu fault tolerance
+ * @ib1 - IB1 that the GPU was executing when hang happened
+ * @context_id - Context which caused the hang
+ * @global_eop - eoptimestamp at time of hang
+ * @rb_buffer - Buffer that holds the commands from good contexts
+ * @rb_size - Number of valid dwords in rb_buffer
+ * @bad_rb_buffer - Buffer that holds commands from the hanging context
+ * @bad_rb_size - Number of valid dwords in bad_rb_buffer
+ * @good_rb_buffer - Buffer that holds commands from good contexts
+ * @good_rb_size - Number of valid dwords in good_rb_buffer
+ * @last_valid_ctx_id - The last context from which commands were placed in
+ * ringbuffer before the GPU hung
+ * @status - Result of the fault tolerance attempt (nonzero on failure)
+ * @ft_policy - Fault tolerance policy to apply for this hang
+ * @err_code - Fault tolerance error code
+ * @start_of_replay_cmds - Offset in ringbuffer from where commands can be
+ * replayed during fault tolerance
+ * @replay_for_snapshot - Offset in ringbuffer where IB's can be saved for
+ * replaying with snapshot
+ */
+struct adreno_ft_data {
 	unsigned int ib1;
 	unsigned int context_id;
 	unsigned int global_eop;
@@ -117,12 +169,20 @@
 	unsigned int rb_size;
 	unsigned int *bad_rb_buffer;
 	unsigned int bad_rb_size;
+	unsigned int *good_rb_buffer;
+	unsigned int good_rb_size;
 	unsigned int last_valid_ctx_id;
+	unsigned int status;
+	unsigned int ft_policy;
+	unsigned int err_code;
+	unsigned int start_of_replay_cmds;
+	unsigned int replay_for_snapshot;
 };
 
 extern struct adreno_gpudev adreno_a2xx_gpudev;
 extern struct adreno_gpudev adreno_a3xx_gpudev;
 
+/* A2XX register sets defined in adreno_a2xx.c */
 extern const unsigned int a200_registers[];
 extern const unsigned int a220_registers[];
 extern const unsigned int a225_registers[];
@@ -130,14 +190,18 @@
 extern const unsigned int a220_registers_count;
 extern const unsigned int a225_registers_count;
 
+/* A3XX register set defined in adreno_a3xx.c */
 extern const unsigned int a3xx_registers[];
 extern const unsigned int a3xx_registers_count;
 
 extern const unsigned int a3xx_hlsq_registers[];
 extern const unsigned int a3xx_hlsq_registers_count;
 
-extern unsigned int hang_detect_regs[];
-extern const unsigned int hang_detect_regs_count;
+extern const unsigned int a330_registers[];
+extern const unsigned int a330_registers_count;
+
+extern unsigned int ft_detect_regs[];
+extern const unsigned int ft_detect_regs_count;
 
 
 int adreno_idle(struct kgsl_device *device);
@@ -146,6 +210,8 @@
 void adreno_regwrite(struct kgsl_device *device, unsigned int offsetwords,
 				unsigned int value);
 
+int adreno_dump(struct kgsl_device *device, int manual);
+
 struct kgsl_memdesc *adreno_find_region(struct kgsl_device *device,
 						unsigned int pt_base,
 						unsigned int gpuaddr,
@@ -160,9 +226,12 @@
 void *adreno_snapshot(struct kgsl_device *device, void *snapshot, int *remain,
 		int hang);
 
-int adreno_dump_and_recover(struct kgsl_device *device);
+int adreno_dump_and_exec_ft(struct kgsl_device *device);
 
-unsigned int adreno_hang_detect(struct kgsl_device *device,
+void adreno_dump_rb(struct kgsl_device *device, const void *buf,
+			 size_t len, int start, int size);
+
+unsigned int adreno_ft_detect(struct kgsl_device *device,
 						unsigned int *prev_reg_val);
 
 static inline int adreno_is_a200(struct adreno_device *adreno_dev)
@@ -221,15 +290,33 @@
 	return (adreno_dev->gpurev == ADRENO_REV_A320);
 }
 
+static inline int adreno_is_a330(struct adreno_device *adreno_dev)
+{
+	return (adreno_dev->gpurev == ADRENO_REV_A330);
+}
+
 static inline int adreno_rb_ctxtswitch(unsigned int *cmd)
 {
 	return (cmd[0] == cp_nop_packet(1) &&
 		cmd[1] == KGSL_CONTEXT_TO_MEM_IDENTIFIER);
 }
 
+/**
+ * adreno_encode_istore_size - encode istore size in CP format
+ * @adreno_dev - The 3D device.
+ *
+ * Encode the istore size into the format expected by the
+ * CP_SET_SHADER_BASES and CP_ME_INIT commands:
+ * bits 31:29 - istore size as encoded by this function
+ * bits 27:16 - vertex shader start offset in instructions
+ * bits 11:0 - pixel shader start offset in instructions.
+ */
 static inline int adreno_encode_istore_size(struct adreno_device *adreno_dev)
 {
 	unsigned int size;
+	/* in a225 the CP microcode multiplies the encoded
+	 * value by 3 while decoding.
+	 */
 	if (adreno_is_a225(adreno_dev))
 		size = adreno_dev->istore_size/3;
 	else
@@ -241,6 +328,10 @@
 static inline int __adreno_add_idle_indirect_cmds(unsigned int *cmds,
 						unsigned int nop_gpuaddr)
 {
+	/* Adding an indirect buffer ensures that the prefetch stalls until
+	 * the commands in the indirect buffer have completed. We need to
+	 * stall prefetch with a nop indirect buffer when updating pagetables
+	 * because it provides more stable synchronization */
 	*cmds++ = CP_HDR_INDIRECT_BUFFER_PFD;
 	*cmds++ = nop_gpuaddr;
 	*cmds++ = 2;
@@ -255,7 +346,6 @@
 {
 	unsigned int *start = cmds;
 
-	cmds += __adreno_add_idle_indirect_cmds(cmds, nop_gpuaddr);
 	*cmds++ = cp_type0_packet(MH_MMU_MPU_END, 1);
 	*cmds++ = new_phys_limit;
 	cmds += __adreno_add_idle_indirect_cmds(cmds, nop_gpuaddr);
@@ -268,13 +358,20 @@
 {
 	unsigned int *start = cmds;
 
-	cmds += __adreno_add_idle_indirect_cmds(cmds, nop_gpuaddr);
 	*cmds++ = cp_type0_packet(REG_CP_STATE_DEBUG_INDEX, 1);
 	*cmds++ = (cur_ctx_bank ? 0 : 0x20);
 	cmds += __adreno_add_idle_indirect_cmds(cmds, nop_gpuaddr);
 	return cmds - start;
 }
 
+/*
+ * adreno_add_read_cmds - Add pm4 packets to perform a read
+ * @device - Pointer to device structure
+ * @cmds - Pointer to memory where read commands need to be added
+ * @addr - gpu address of the read
+ * @val - The GPU will wait until the data at address addr becomes
+ * equal to val
+ */
 static inline int adreno_add_read_cmds(struct kgsl_device *device,
 				unsigned int *cmds, unsigned int addr,
 				unsigned int val, unsigned int nop_gpuaddr)
@@ -282,7 +379,7 @@
 	unsigned int *start = cmds;
 
 	*cmds++ = cp_type3_packet(CP_WAIT_REG_MEM, 5);
-	
+	/* MEM SPACE = memory, FUNCTION = equals */
 	*cmds++ = 0x13;
 	*cmds++ = addr;
 	*cmds++ = val;
@@ -292,6 +389,11 @@
 	return cmds - start;
 }
 
+/*
+ * adreno_add_idle_cmds - Add pm4 packets for GPU idle
+ * @adreno_dev - Pointer to device structure
+ * @cmds - Pointer to memory where idle commands need to be added
+ */
 static inline int adreno_add_idle_cmds(struct adreno_device *adreno_dev,
 							unsigned int *cmds)
 {
@@ -309,4 +411,4 @@
 	return cmds - start;
 }
 
-#endif 
+#endif /*__ADRENO_H */
diff --git a/drivers/gpu/msm/adreno_a2xx.c b/drivers/gpu/msm/adreno_a2xx.c
index d224a21..ba4e507 100644
--- a/drivers/gpu/msm/adreno_a2xx.c
+++ b/drivers/gpu/msm/adreno_a2xx.c
@@ -1,4 +1,4 @@
-/* Copyright (c) 2002,2007-2012, Code Aurora Forum. All rights reserved.
+/* Copyright (c) 2002,2007-2013, The Linux Foundation. All rights reserved.
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License version 2 and
@@ -21,7 +21,13 @@
 #include "adreno.h"
 #include "adreno_a2xx_trace.h"
 
+/*
+ * These are the registers that are dumped with GPU snapshot
+ * and postmortem.  The lists are dword offset pairs in the
+ * form of {start offset, end offset} inclusive.
+ */
 
+/* A200, A205 */
 const unsigned int a200_registers[] = {
 	0x0000, 0x0002, 0x0004, 0x000B, 0x003B, 0x003D, 0x0040, 0x0044,
 	0x0046, 0x0047, 0x01C0, 0x01C1, 0x01C3, 0x01C8, 0x01D5, 0x01D9,
@@ -114,29 +120,74 @@
 const unsigned int a220_registers_count = ARRAY_SIZE(a220_registers) / 2;
 const unsigned int a225_registers_count = ARRAY_SIZE(a225_registers) / 2;
 
+/*
+ *
+ *  Memory Map for Register, Constant & Instruction Shadow, and Command Buffers
+ *  (34.5KB)
+ *
+ *  +---------------------+------------+-------------+---+---------------------+
+ *  | ALU Constant Shadow | Reg Shadow | C&V Buffers |Tex| Shader Instr Shadow |
+ *  +---------------------+------------+-------------+---+---------------------+
+ *    ________________________________/               \____________________
+ *   /                                                                     |
+ *  +--------------+-----------+------+-----------+------------------------+
+ *  | Restore Regs | Save Regs | Quad | Gmem Save | Gmem Restore | unused  |
+ *  +--------------+-----------+------+-----------+------------------------+
+ *
+ *              8K - ALU Constant Shadow (8K aligned)
+ *              4K - H/W Register Shadow (8K aligned)
+ *              4K - Command and Vertex Buffers
+ *                         - Indirect command buffer : Const/Reg restore
+ *                               - includes Loop & Bool const shadows
+ *                         - Indirect command buffer : Const/Reg save
+ *                         - Quad vertices & texture coordinates
+ *                         - Indirect command buffer : Gmem save
+ *                         - Indirect command buffer : Gmem restore
+ *                         - Unused (padding to 8KB boundary)
+ *             <1K - Texture Constant Shadow (768 bytes) (8K aligned)
+ *       18K - Shader Instruction Shadow
+ *               - 6K vertex (32 byte aligned)
+ *               - 6K pixel  (32 byte aligned)
+ *               - 6K shared (32 byte aligned)
+ *
+ *  Note: Reading constants into a shadow, one at a time using REG_TO_MEM, takes
+ *  3 DWORDS per DWORD transferred, plus 1 DWORD for the shadow, for a total of
+ *  16 bytes per constant.  If the texture constants were transferred this way,
+ *  the Command & Vertex Buffers section would extend past the 16K boundary.
+ *  By moving the texture constant shadow area to start at 16KB boundary, we
+ *  only require approximately 40 bytes more memory, but are able to use the
+ *  LOAD_CONSTANT_CONTEXT shadowing feature for the textures, speeding up
+ *  context switching.
+ *
+ *  [Using LOAD_CONSTANT_CONTEXT shadowing feature for the Loop and/or Bool
+ *  constants would require an additional 8KB each, for alignment.]
+ *
+ */
 
+/* Constants */
 
-#define ALU_CONSTANTS	2048	
-#define NUM_REGISTERS	1024	
+#define ALU_CONSTANTS	2048	/* DWORDS */
+#define NUM_REGISTERS	1024	/* DWORDS */
 #ifdef CONFIG_MSM_KGSL_DISABLE_SHADOW_WRITES
-#define CMD_BUFFER_LEN	9216	
+#define CMD_BUFFER_LEN	9216	/* DWORDS */
 #else
-#define CMD_BUFFER_LEN	3072	
+#define CMD_BUFFER_LEN	3072	/* DWORDS */
 #endif
-#define TEX_CONSTANTS		(32*6)	
-#define BOOL_CONSTANTS		8	
-#define LOOP_CONSTANTS		56	
+#define TEX_CONSTANTS		(32*6)	/* DWORDS */
+#define BOOL_CONSTANTS		8	/* DWORDS */
+#define LOOP_CONSTANTS		56	/* DWORDS */
 
-#define LCC_SHADOW_SIZE		0x2000	
+/* LOAD_CONSTANT_CONTEXT shadow size */
+#define LCC_SHADOW_SIZE		0x2000	/* 8KB */
 
-#define ALU_SHADOW_SIZE		LCC_SHADOW_SIZE	
-#define REG_SHADOW_SIZE		0x1000	
+#define ALU_SHADOW_SIZE		LCC_SHADOW_SIZE	/* 8KB */
+#define REG_SHADOW_SIZE		0x1000	/* 4KB */
 #ifdef CONFIG_MSM_KGSL_DISABLE_SHADOW_WRITES
-#define CMD_BUFFER_SIZE		0x9000	
+#define CMD_BUFFER_SIZE		0x9000	/* 36KB */
 #else
-#define CMD_BUFFER_SIZE		0x3000	
+#define CMD_BUFFER_SIZE		0x3000	/* 12KB */
 #endif
-#define TEX_SHADOW_SIZE		(TEX_CONSTANTS*4)	
+#define TEX_SHADOW_SIZE		(TEX_CONSTANTS*4)	/* 768 bytes */
 
 #define REG_OFFSET		LCC_SHADOW_SIZE
 #define CMD_OFFSET		(REG_OFFSET + REG_SHADOW_SIZE)
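
The offset macros above chain the shadow sections back to back. Below is a minimal sketch (editor's illustration, not part of the patch) of where each section lands with the default, shadow-writes-enabled sizes; the texture shadow placement is assumed from the memory map comment, since TEX_OFFSET itself is defined outside this hunk.

	/*
	 * Editor's sketch only: layout of the gpustate shadow using the
	 * macros defined in this file.  TEX_OFFSET is an assumption.
	 */
	#include <stdio.h>

	#define LCC_SHADOW_SIZE		0x2000			/* 8KB */
	#define ALU_SHADOW_SIZE		LCC_SHADOW_SIZE		/* 8KB */
	#define REG_SHADOW_SIZE		0x1000			/* 4KB */
	#define CMD_BUFFER_SIZE		0x3000			/* 12KB, shadow writes enabled */
	#define TEX_CONSTANTS		(32*6)			/* DWORDS */
	#define TEX_SHADOW_SIZE		(TEX_CONSTANTS*4)	/* 768 bytes */

	#define REG_OFFSET		LCC_SHADOW_SIZE
	#define CMD_OFFSET		(REG_OFFSET + REG_SHADOW_SIZE)

	int main(void)
	{
		printf("ALU constant shadow: 0x0000, 0x%x bytes\n", ALU_SHADOW_SIZE);
		printf("register shadow:     0x%04x, 0x%x bytes\n", REG_OFFSET, REG_SHADOW_SIZE);
		printf("C&V buffer:          0x%04x, 0x%x bytes\n", CMD_OFFSET, CMD_BUFFER_SIZE);
		printf("texture shadow (assumed next): 0x%04x, %d bytes\n",
		       CMD_OFFSET + CMD_BUFFER_SIZE, TEX_SHADOW_SIZE);
		return 0;
	}
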
@@ -154,27 +205,40 @@
 	return SHADER_OFFSET + 3*_shader_shadow_size(adreno_dev);
 }
 
+/* A scratchpad used to build commands during context create */
 
 static struct tmp_ctx {
-	unsigned int *start;	
-	unsigned int *cmd;	
+	unsigned int *start;	/* Command & Vertex buffer start */
+	unsigned int *cmd;	/* Next available dword in C&V buffer */
 
-	
-	uint32_t bool_shadow;	
-	uint32_t loop_shadow;	
+	/* address of buffers, needed when creating IB1 command buffers. */
+	uint32_t bool_shadow;	/* bool constants */
+	uint32_t loop_shadow;	/* loop constants */
 
-	uint32_t shader_shared;	
-	uint32_t shader_vertex;	
-	uint32_t shader_pixel;	
+	uint32_t shader_shared;	/* shared shader instruction shadow */
+	uint32_t shader_vertex;	/* vertex shader instruction shadow */
+	uint32_t shader_pixel;	/* pixel shader instruction shadow */
 
+	/* Addresses in command buffer where separately handled registers
+	 * are saved
+	 */
 	uint32_t reg_values[33];
 	uint32_t chicken_restore;
 
-	uint32_t gmem_base;	
+	uint32_t gmem_base;	/* Base gpu address of GMEM */
 
 } tmp_ctx;
 
+/* context save (gmem -> sys) */
 
+/* pre-compiled vertex shader program
+*
+*  attribute vec4  P;
+*  void main(void)
+*  {
+*    gl_Position = P;
+*  }
+*/
 #define GMEM2SYS_VTX_PGM_LEN	0x12
 
 static unsigned int gmem2sys_vtx_pgm[GMEM2SYS_VTX_PGM_LEN] = {
@@ -186,6 +250,15 @@
 	0x14000000, 0x00000000, 0xe2000000
 };
 
+/* pre-compiled fragment shader program
+*
+*  precision highp float;
+*  uniform   vec4  clear_color;
+*  void main(void)
+*  {
+*     gl_FragColor = clear_color;
+*  }
+*/
 
 #define GMEM2SYS_FRAG_PGM_LEN	0x0c
 
@@ -196,6 +269,18 @@
 	0x14000000, 0x00000000, 0xe2000000
 };
 
+/* context restore (sys -> gmem) */
+/* pre-compiled vertex shader program
+*
+*  attribute vec4 position;
+*  attribute vec4 texcoord;
+*  varying   vec4 texcoord0;
+*  void main()
+*  {
+*     gl_Position = position;
+*     texcoord0 = texcoord;
+*  }
+*/
 
 #define SYS2GMEM_VTX_PGM_LEN	0x18
 
@@ -208,6 +293,16 @@
 	0xe2020200, 0x14000000, 0x00000000, 0xe2000000
 };
 
+/* pre-compiled fragment shader program
+*
+*  precision mediump   float;
+*  uniform   sampler2D tex0;
+*  varying   vec4      texcoord0;
+*  void main()
+*  {
+*     gl_FragColor = texture2D(tex0, texcoord0.xy);
+*  }
+*/
 
 #define SYS2GMEM_FRAG_PGM_LEN	0x0f
 
@@ -218,68 +313,85 @@
 	0x14000000, 0x00000000, 0xe2000000
 };
 
+/* shader texture constants (sysmem -> gmem)  */
 #define SYS2GMEM_TEX_CONST_LEN	6
 
 static unsigned int sys2gmem_tex_const[SYS2GMEM_TEX_CONST_LEN] = {
-	0x00000002,		
+	/* Texture, FormatXYZW=Unsigned, ClampXYZ=Wrap/Repeat,
+	 * RFMode=ZeroClamp-1, Dim=1:2d
+	 */
+	0x00000002,		/* Pitch = TBD */
 
-	0x00000800,		
+	/* Format=6:8888_WZYX, EndianSwap=0:None, ReqSize=0:256bit, DimHi=0,
+	 * NearestClamp=1:OGL Mode
+	 */
+	0x00000800,		/* Address[31:12] = TBD */
 
-	
-	0,			
+	/* Width, Height, EndianSwap=0:None */
+	0,			/* Width & Height = TBD */
 
+	/* NumFormat=0:RF, DstSelXYZW=XYZW, ExpAdj=0, MagFilt=MinFilt=0:Point,
+	 * Mip=2:BaseMap
+	 */
 	0 << 1 | 1 << 4 | 2 << 7 | 3 << 10 | 2 << 23,
 
+	/* VolMag=VolMin=0:Point, MinMipLvl=0, MaxMipLvl=1, LodBiasH=V=0,
+	 * Dim3d=0
+	 */
 	0,
 
-	1 << 9			
+	/* BorderColor=0:ABGRBlack, ForceBC=0:disable, TriJuice=0, Aniso=0,
+	 * Dim=1:2d, MipPacking=0
+	 */
+	1 << 9			/* Mip Address[31:12] = TBD */
 };
 
 #define NUM_COLOR_FORMATS   13
 
 static enum SURFACEFORMAT surface_format_table[NUM_COLOR_FORMATS] = {
-	FMT_4_4_4_4,		
-	FMT_1_5_5_5,		
-	FMT_5_6_5,		
-	FMT_8,			
-	FMT_8_8,		
-	FMT_8_8_8_8,		
-	FMT_8_8_8_8,		
-	FMT_16_FLOAT,		
-	FMT_16_16_FLOAT,	
-	FMT_16_16_16_16_FLOAT,	
-	FMT_32_FLOAT,		
-	FMT_32_32_FLOAT,	
-	FMT_32_32_32_32_FLOAT,	
+	FMT_4_4_4_4,		/* COLORX_4_4_4_4 */
+	FMT_1_5_5_5,		/* COLORX_1_5_5_5 */
+	FMT_5_6_5,		/* COLORX_5_6_5 */
+	FMT_8,			/* COLORX_8 */
+	FMT_8_8,		/* COLORX_8_8 */
+	FMT_8_8_8_8,		/* COLORX_8_8_8_8 */
+	FMT_8_8_8_8,		/* COLORX_S8_8_8_8 */
+	FMT_16_FLOAT,		/* COLORX_16_FLOAT */
+	FMT_16_16_FLOAT,	/* COLORX_16_16_FLOAT */
+	FMT_16_16_16_16_FLOAT,	/* COLORX_16_16_16_16_FLOAT */
+	FMT_32_FLOAT,		/* COLORX_32_FLOAT */
+	FMT_32_32_FLOAT,	/* COLORX_32_32_FLOAT */
+	FMT_32_32_32_32_FLOAT,	/* COLORX_32_32_32_32_FLOAT */
 };
 
 static unsigned int format2bytesperpixel[NUM_COLOR_FORMATS] = {
-	2,			
-	2,			
-	2,			
-	1,			
-	2,			
-	4,			
-	4,			
-	2,			
-	4,			
-	8,			
-	4,			
-	8,			
-	16,			
+	2,			/* COLORX_4_4_4_4 */
+	2,			/* COLORX_1_5_5_5 */
+	2,			/* COLORX_5_6_5 */
+	1,			/* COLORX_8 */
+	2,			/* COLORX_8_8 */
+	4,			/* COLORX_8_8_8_8 */
+	4,			/* COLORX_S8_8_8_8 */
+	2,			/* COLORX_16_FLOAT */
+	4,			/* COLORX_16_16_FLOAT */
+	8,			/* COLORX_16_16_16_16_FLOAT */
+	4,			/* COLORX_32_FLOAT */
+	8,			/* COLORX_32_32_FLOAT */
+	16,			/* COLORX_32_32_32_32_FLOAT */
 };
 
+/* shader linkage info */
 #define SHADER_CONST_ADDR	(11 * 6 + 3)
 
 
 static unsigned int *program_shader(unsigned int *cmds, int vtxfrag,
 				    unsigned int *shader_pgm, int dwords)
 {
-	
+	/* load the patched vertex shader stream */
 	*cmds++ = cp_type3_packet(CP_IM_LOAD_IMMEDIATE, 2 + dwords);
-	
+	/* 0=vertex shader, 1=fragment shader */
 	*cmds++ = vtxfrag;
-	
+	/* instruction start & size (in 32-bit words) */
 	*cmds++ = ((0 << 16) | dwords);
 
 	memcpy(cmds, shader_pgm, dwords << 2);
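
program_shader() above emits a single type-3 packet header followed by the raw shader dwords. For orientation, a rough sketch of the usual a2xx PM4 header encoding follows; the authoritative cp_type0_packet()/cp_type3_packet() macros live in the CP headers and are not part of this hunk, so treat the bit layout here as an assumption rather than the driver's definition.

	/*
	 * Editor's sketch of the typical a2xx PM4 header layout; the real
	 * macros elsewhere in the driver are the reference.
	 */
	#define PM4_TYPE0	0x00000000u	/* register write packet */
	#define PM4_TYPE3	0xC0000000u	/* opcode packet */

	static inline unsigned int sketch_type0_packet(unsigned int regindx,
						       unsigned int cnt)
	{
		/* count-1 in bits [29:16], starting register index in the low bits */
		return PM4_TYPE0 | ((cnt - 1) << 16) | (regindx & 0x7FFF);
	}

	static inline unsigned int sketch_type3_packet(unsigned int opcode,
						       unsigned int cnt)
	{
		/* count-1 in bits [29:16], opcode in bits [15:8] */
		return PM4_TYPE3 | ((cnt - 1) << 16) | ((opcode & 0xFF) << 8);
	}
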
@@ -320,6 +432,7 @@
 
 #endif
 
+/* chicken restore */
 static unsigned int *build_chicken_restore_cmds(
 					struct adreno_context *drawctxt)
 {
@@ -333,12 +446,15 @@
 	tmp_ctx.chicken_restore = virt2gpu(cmds, &drawctxt->gpustate);
 	*cmds++ = 0x00000000;
 
-	
+	/* create indirect buffer command for above command sequence */
 	create_ib1(drawctxt, drawctxt->chicken_restore, start, cmds);
 
 	return cmds;
 }
 
+/****************************************************************************/
+/* context save                                                             */
+/****************************************************************************/
 
 static const unsigned int register_ranges_a20x[] = {
 	REG_RB_SURFACE_INFO, REG_RB_DEPTH_INFO,
@@ -402,6 +518,9 @@
 };
 
 
+/* save h/w regs, alu constants, texture constants, etc. ...
+*  requires: bool_shadow_gpuaddr, loop_shadow_gpuaddr
+*/
 static void build_regsave_cmds(struct adreno_device *adreno_dev,
 			       struct adreno_context *drawctxt)
 {
@@ -412,6 +531,8 @@
 	*cmd++ = 0;
 
 #ifdef CONFIG_MSM_KGSL_DISABLE_SHADOW_WRITES
+	/* Make sure the HW context has the correct register values
+	 * before reading them. */
 	*cmd++ = cp_type3_packet(CP_CONTEXT_UPDATE, 1);
 	*cmd++ = 0;
 
@@ -420,7 +541,7 @@
 		unsigned int reg_array_size = 0;
 		const unsigned int *ptr_register_ranges;
 
-		
+		/* Based on chip id choose the register ranges */
 		if (adreno_is_a220(adreno_dev)) {
 			ptr_register_ranges = register_ranges_a220;
 			reg_array_size = ARRAY_SIZE(register_ranges_a220);
@@ -433,7 +554,7 @@
 		}
 
 
-		
+		/* Write HW registers into shadow */
 		for (i = 0; i < (reg_array_size/2) ; i++) {
 			build_reg_to_mem_range(ptr_register_ranges[i*2],
 					ptr_register_ranges[i*2+1],
@@ -441,38 +562,53 @@
 		}
 	}
 
-	
+	/* Copy ALU constants */
 	cmd =
 	    reg_to_mem(cmd, (drawctxt->gpustate.gpuaddr) & 0xFFFFE000,
 		       REG_SQ_CONSTANT_0, ALU_CONSTANTS);
 
-	
+	/* Copy Tex constants */
 	cmd =
 	    reg_to_mem(cmd,
 		       (drawctxt->gpustate.gpuaddr + TEX_OFFSET) & 0xFFFFE000,
 		       REG_SQ_FETCH_0, TEX_CONSTANTS);
 #else
 
+	/* Insert a wait for idle packet before reading the registers.
+	 * This is to fix a hang/reset seen during stress testing.  In this
+	 * hang, CP encountered a timeout reading SQ's boolean constant
+	 * register. There is logic in the HW that blocks reading of this
+	 * register when the SQ block is not idle, which we believe is
+	 * contributing to the hang. */
 	*cmd++ = cp_type3_packet(CP_WAIT_FOR_IDLE, 1);
 	*cmd++ = 0;
 
+	/* H/w registers are already shadowed; just need to disable shadowing
+	 * to prevent corruption.
+	 */
 	*cmd++ = cp_type3_packet(CP_LOAD_CONSTANT_CONTEXT, 3);
 	*cmd++ = (drawctxt->gpustate.gpuaddr + REG_OFFSET) & 0xFFFFE000;
-	*cmd++ = 4 << 16;	
-	*cmd++ = 0x0;		
+	*cmd++ = 4 << 16;	/* regs, start=0 */
+	*cmd++ = 0x0;		/* count = 0 */
 
+	/* ALU constants are already shadowed; just need to disable shadowing
+	 * to prevent corruption.
+	 */
 	*cmd++ = cp_type3_packet(CP_LOAD_CONSTANT_CONTEXT, 3);
 	*cmd++ = drawctxt->gpustate.gpuaddr & 0xFFFFE000;
-	*cmd++ = 0 << 16;	
-	*cmd++ = 0x0;		
+	*cmd++ = 0 << 16;	/* ALU, start=0 */
+	*cmd++ = 0x0;		/* count = 0 */
 
+	/* Tex constants are already shadowed; just need to disable shadowing
+	 *  to prevent corruption.
+	 */
 	*cmd++ = cp_type3_packet(CP_LOAD_CONSTANT_CONTEXT, 3);
 	*cmd++ = (drawctxt->gpustate.gpuaddr + TEX_OFFSET) & 0xFFFFE000;
-	*cmd++ = 1 << 16;	
-	*cmd++ = 0x0;		
+	*cmd++ = 1 << 16;	/* Tex, start=0 */
+	*cmd++ = 0x0;		/* count = 0 */
 #endif
 
-	
+	/* Need to handle some of the registers separately */
 	*cmd++ = cp_type3_packet(CP_REG_TO_MEM, 2);
 	*cmd++ = REG_SQ_GPR_MANAGEMENT;
 	*cmd++ = tmp_ctx.reg_values[0];
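
The register-range saves above rely on build_reg_to_mem_range() and reg_to_mem(), which are defined earlier in the file and not shown in this hunk. As a hedged sketch, a range save boils down to CP_REG_TO_MEM packets copying registers into the shadow buffer; the real helpers may use the looping form of the packet rather than unrolling one packet per register as below.

	/*
	 * Editor's sketch only: one CP_REG_TO_MEM packet per register in the
	 * range, writing each value to consecutive dwords of the shadow.
	 */
	static unsigned int *sketch_save_range(unsigned int *cmd,
					       unsigned int start_reg,
					       unsigned int end_reg,
					       unsigned int shadow_gpuaddr)
	{
		unsigned int reg;

		for (reg = start_reg; reg <= end_reg; reg++) {
			*cmd++ = cp_type3_packet(CP_REG_TO_MEM, 2);
			*cmd++ = reg;			/* source register */
			*cmd++ = shadow_gpuaddr;	/* destination dword */
			shadow_gpuaddr += 4;
		}
		return cmd;
	}
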
@@ -493,33 +629,34 @@
 		}
 	}
 
-	
+	/* Copy Boolean constants */
 	cmd = reg_to_mem(cmd, tmp_ctx.bool_shadow, REG_SQ_CF_BOOLEANS,
 			 BOOL_CONSTANTS);
 
-	
+	/* Copy Loop constants */
 	cmd = reg_to_mem(cmd, tmp_ctx.loop_shadow,
 		REG_SQ_CF_LOOP, LOOP_CONSTANTS);
 
-	
+	/* create indirect buffer command for above command sequence */
 	create_ib1(drawctxt, drawctxt->reg_save, start, cmd);
 
 	tmp_ctx.cmd = cmd;
 }
 
+/*copy colour, depth, & stencil buffers from graphics memory to system memory*/
 static unsigned int *build_gmem2sys_cmds(struct adreno_device *adreno_dev,
 					 struct adreno_context *drawctxt,
 					 struct gmem_shadow_t *shadow)
 {
 	unsigned int *cmds = shadow->gmem_save_commands;
 	unsigned int *start = cmds;
-	
+	/* Calculate the new offset based on the adjusted base */
 	unsigned int bytesperpixel = format2bytesperpixel[shadow->format];
 	unsigned int addr = shadow->gmemshadow.gpuaddr;
 	unsigned int offset = (addr - (addr & 0xfffff000)) / bytesperpixel;
 
 	if (!(drawctxt->flags & CTXT_FLAGS_PREAMBLE)) {
-		
+		/* Store TP0_CHICKEN register */
 		*cmds++ = cp_type3_packet(CP_REG_TO_MEM, 2);
 		*cmds++ = REG_TP0_CHICKEN;
 
@@ -529,48 +666,48 @@
 		*cmds++ = 0;
 	}
 
-	
+	/* Set TP0_CHICKEN to zero */
 	*cmds++ = cp_type0_packet(REG_TP0_CHICKEN, 1);
 	*cmds++ = 0x00000000;
 
-	
+	/* Set PA_SC_AA_CONFIG to 0 */
 	*cmds++ = cp_type0_packet(REG_PA_SC_AA_CONFIG, 1);
 	*cmds++ = 0x00000000;
 
-	
+	/* program shader */
 
-	
+	/* load shader vtx constants ... 5 dwords */
 	*cmds++ = cp_type3_packet(CP_SET_CONSTANT, 4);
 	*cmds++ = (0x1 << 16) | SHADER_CONST_ADDR;
 	*cmds++ = 0;
-	
+	/* valid(?) vtx constant flag & addr */
 	*cmds++ = shadow->quad_vertices.gpuaddr | 0x3;
-	
+	/* limit = 12 dwords */
 	*cmds++ = 0x00000030;
 
-	
+	/* Invalidate L2 cache to make sure vertices are updated */
 	*cmds++ = cp_type0_packet(REG_TC_CNTL_STATUS, 1);
 	*cmds++ = 0x1;
 
 	*cmds++ = cp_type3_packet(CP_SET_CONSTANT, 4);
 	*cmds++ = CP_REG(REG_VGT_MAX_VTX_INDX);
-	*cmds++ = 0x00ffffff;	
-	*cmds++ = 0x0;		
-	*cmds++ = 0x00000000;	
+	*cmds++ = 0x00ffffff;	/* REG_VGT_MAX_VTX_INDX */
+	*cmds++ = 0x0;		/* REG_VGT_MIN_VTX_INDX */
+	*cmds++ = 0x00000000;	/* REG_VGT_INDX_OFFSET */
 
 	*cmds++ = cp_type3_packet(CP_SET_CONSTANT, 2);
 	*cmds++ = CP_REG(REG_PA_SC_AA_MASK);
-	*cmds++ = 0x0000ffff;	
+	*cmds++ = 0x0000ffff;	/* REG_PA_SC_AA_MASK */
 
 	*cmds++ = cp_type3_packet(CP_SET_CONSTANT, 2);
 	*cmds++ = CP_REG(REG_RB_COLORCONTROL);
 	*cmds++ = 0x00000c20;
 
-	
+	/* Repartition shaders */
 	*cmds++ = cp_type0_packet(REG_SQ_INST_STORE_MANAGMENT, 1);
 	*cmds++ = adreno_dev->pix_shader_start;
 
-	
+	/* Invalidate Vertex & Pixel instruction code address and sizes */
 	*cmds++ = cp_type3_packet(CP_INVALIDATE_STATE, 1);
 	*cmds++ = 0x00003F00;
 
@@ -578,14 +715,14 @@
 	*cmds++ = adreno_encode_istore_size(adreno_dev)
 		  | adreno_dev->pix_shader_start;
 
-	
+	/* load the patched vertex shader stream */
 	cmds = program_shader(cmds, 0, gmem2sys_vtx_pgm, GMEM2SYS_VTX_PGM_LEN);
 
-	
+	/* Load the patched fragment shader stream */
 	cmds =
 	    program_shader(cmds, 1, gmem2sys_frag_pgm, GMEM2SYS_FRAG_PGM_LEN);
 
-	
+	/* SQ_PROGRAM_CNTL / SQ_CONTEXT_MISC */
 	*cmds++ = cp_type3_packet(CP_SET_CONSTANT, 3);
 	*cmds++ = CP_REG(REG_SQ_PROGRAM_CNTL);
 	if (adreno_is_a22x(adreno_dev))
@@ -594,26 +731,29 @@
 		*cmds++ = 0x10010001;
 	*cmds++ = 0x00000008;
 
-	
+	/* resolve */
 
-	
+	/* PA_CL_VTE_CNTL */
 	*cmds++ = cp_type3_packet(CP_SET_CONSTANT, 2);
 	*cmds++ = CP_REG(REG_PA_CL_VTE_CNTL);
-	
+	/* disable X/Y/Z transforms, X/Y/Z are premultiplied by W */
 	*cmds++ = 0x00000b00;
 
-	
+	/* program surface info */
 	*cmds++ = cp_type3_packet(CP_SET_CONSTANT, 3);
 	*cmds++ = CP_REG(REG_RB_SURFACE_INFO);
-	*cmds++ = shadow->gmem_pitch;	
+	*cmds++ = shadow->gmem_pitch;	/* pitch, MSAA = 1 */
 
-	
+	/* RB_COLOR_INFO Endian=none, Linear, Format=RGBA8888, Swap=0,
+	 *                Base=gmem_base
+	 */
+	/* gmem base assumed 4K aligned. */
 	BUG_ON(tmp_ctx.gmem_base & 0xFFF);
 	*cmds++ =
 	    (shadow->
 	     format << RB_COLOR_INFO__COLOR_FORMAT__SHIFT) | tmp_ctx.gmem_base;
 
-	
+	/* disable Z */
 	*cmds++ = cp_type3_packet(CP_SET_CONSTANT, 2);
 	*cmds++ = CP_REG(REG_RB_DEPTHCONTROL);
 	if (adreno_is_a22x(adreno_dev))
@@ -621,10 +761,17 @@
 	else
 		*cmds++ = 0;
 
+	/* set REG_PA_SU_SC_MODE_CNTL
+	 *              Front_ptype = draw triangles
+	 *              Back_ptype = draw triangles
+	 *              Provoking vertex = last
+	 */
 	*cmds++ = cp_type3_packet(CP_SET_CONSTANT, 2);
 	*cmds++ = CP_REG(REG_PA_SU_SC_MODE_CNTL);
 	*cmds++ = 0x00080240;
 
+	/* Use maximum scissor values -- quad vertices already have the
+	 * correct bounds */
 	*cmds++ = cp_type3_packet(CP_SET_CONSTANT, 3);
 	*cmds++ = CP_REG(REG_PA_SC_SCREEN_SCISSOR_TL);
 	*cmds++ = (0 << 16) | 0;
@@ -634,14 +781,17 @@
 	*cmds++ = (unsigned int)((1U << 31) | (0 << 16) | 0);
 	*cmds++ = (0x1fff << 16) | (0x1fff);
 
+	/* load the viewport so that z scale = clear depth and
+	 *  z offset = 0.0f
+	 */
 	*cmds++ = cp_type3_packet(CP_SET_CONSTANT, 3);
 	*cmds++ = CP_REG(REG_PA_CL_VPORT_ZSCALE);
-	*cmds++ = 0xbf800000;	
+	*cmds++ = 0xbf800000;	/* -1.0f */
 	*cmds++ = 0x0;
 
 	*cmds++ = cp_type3_packet(CP_SET_CONSTANT, 2);
 	*cmds++ = CP_REG(REG_RB_COLOR_MASK);
-	*cmds++ = 0x0000000f;	
+	*cmds++ = 0x0000000f;	/* R = G = B = 1:enabled */
 
 	*cmds++ = cp_type3_packet(CP_SET_CONSTANT, 2);
 	*cmds++ = CP_REG(REG_RB_COLOR_DEST_MASK);
@@ -652,23 +802,29 @@
 	*cmds++ = 0x00000000;
 	*cmds++ = 0x00000000;
 
+	/* load the stencil ref value
+	 * $AAM - do this later
+	 */
 
-	
+	/* load the COPY state */
 	*cmds++ = cp_type3_packet(CP_SET_CONSTANT, 6);
 	*cmds++ = CP_REG(REG_RB_COPY_CONTROL);
-	*cmds++ = 0;		
-	*cmds++ = addr & 0xfffff000;	
-	*cmds++ = shadow->pitch >> 5;	
+	*cmds++ = 0;		/* RB_COPY_CONTROL */
+	*cmds++ = addr & 0xfffff000;	/* RB_COPY_DEST_BASE */
+	*cmds++ = shadow->pitch >> 5;	/* RB_COPY_DEST_PITCH */
 
+	/* Endian=none, Linear, Format=RGBA8888,Swap=0,!Dither,
+	 *  MaskWrite:R=G=B=A=1
+	 */
 	*cmds++ = 0x0003c008 |
 	    (shadow->format << RB_COPY_DEST_INFO__COPY_DEST_FORMAT__SHIFT);
-	
+	/* Make sure we stay in offsetx field. */
 	BUG_ON(offset & 0xfffff000);
 	*cmds++ = offset;
 
 	*cmds++ = cp_type3_packet(CP_SET_CONSTANT, 2);
 	*cmds++ = CP_REG(REG_RB_MODECONTROL);
-	*cmds++ = 0x6;		
+	*cmds++ = 0x6;		/* EDRAM copy */
 
 	*cmds++ = cp_type3_packet(CP_SET_CONSTANT, 2);
 	*cmds++ = CP_REG(REG_PA_CL_CLIP_CNTL);
@@ -680,25 +836,27 @@
 		*cmds++ = 0x0000000;
 
 		*cmds++ = cp_type3_packet(CP_DRAW_INDX, 3);
-		*cmds++ = 0;           
-		
+		*cmds++ = 0;           /* viz query info. */
+		/* PrimType=RectList, SrcSel=AutoIndex, VisCullMode=Ignore*/
 		*cmds++ = 0x00004088;
-		*cmds++ = 3;	       
+		*cmds++ = 3;	       /* NumIndices=3 */
 	} else {
-		
+		/* queue the draw packet */
 		*cmds++ = cp_type3_packet(CP_DRAW_INDX, 2);
-		*cmds++ = 0;		
-		
+		*cmds++ = 0;		/* viz query info. */
+		/* PrimType=RectList, NumIndices=3, SrcSel=AutoIndex */
 		*cmds++ = 0x00030088;
 	}
 
-	
+	/* create indirect buffer command for above command sequence */
 	create_ib1(drawctxt, shadow->gmem_save, start, cmds);
 
 	return cmds;
 }
 
+/* context restore */
 
+/*copy colour, depth, & stencil buffers from system memory to graphics memory*/
 static unsigned int *build_sys2gmem_cmds(struct adreno_device *adreno_dev,
 					 struct adreno_context *drawctxt,
 					 struct gmem_shadow_t *shadow)
@@ -707,7 +865,7 @@
 	unsigned int *start = cmds;
 
 	if (!(drawctxt->flags & CTXT_FLAGS_PREAMBLE)) {
-		
+		/* Store TP0_CHICKEN register */
 		*cmds++ = cp_type3_packet(CP_REG_TO_MEM, 2);
 		*cmds++ = REG_TP0_CHICKEN;
 		*cmds++ = tmp_ctx.chicken_restore;
@@ -716,53 +874,53 @@
 		*cmds++ = 0;
 	}
 
-	
+	/* Set TP0_CHICKEN to zero */
 	*cmds++ = cp_type0_packet(REG_TP0_CHICKEN, 1);
 	*cmds++ = 0x00000000;
 
-	
+	/* Set PA_SC_AA_CONFIG to 0 */
 	*cmds++ = cp_type0_packet(REG_PA_SC_AA_CONFIG, 1);
 	*cmds++ = 0x00000000;
-	
+	/* shader constants */
 
-	
+	/* vertex buffer constants */
 	*cmds++ = cp_type3_packet(CP_SET_CONSTANT, 7);
 
 	*cmds++ = (0x1 << 16) | (9 * 6);
-	
+	/* valid(?) vtx constant flag & addr */
 	*cmds++ = shadow->quad_vertices.gpuaddr | 0x3;
-	
+	/* limit = 12 dwords */
 	*cmds++ = 0x00000030;
-	
+	/* valid(?) vtx constant flag & addr */
 	*cmds++ = shadow->quad_texcoords.gpuaddr | 0x3;
-	
+	/* limit = 8 dwords */
 	*cmds++ = 0x00000020;
 	*cmds++ = 0;
 	*cmds++ = 0;
 
-	
+	/* Invalidate L2 cache to make sure vertices are updated */
 	*cmds++ = cp_type0_packet(REG_TC_CNTL_STATUS, 1);
 	*cmds++ = 0x1;
 
 	cmds = program_shader(cmds, 0, sys2gmem_vtx_pgm, SYS2GMEM_VTX_PGM_LEN);
 
-	
+	/* Repartition shaders */
 	*cmds++ = cp_type0_packet(REG_SQ_INST_STORE_MANAGMENT, 1);
 	*cmds++ = adreno_dev->pix_shader_start;
 
-	
+	/* Invalidate Vertex & Pixel instruction code address and sizes */
 	*cmds++ = cp_type3_packet(CP_INVALIDATE_STATE, 1);
-	*cmds++ = 0x00000300; 
+	*cmds++ = 0x00000300; /* 0x100 = Vertex, 0x200 = Pixel */
 
 	*cmds++ = cp_type3_packet(CP_SET_SHADER_BASES, 1);
 	*cmds++ = adreno_encode_istore_size(adreno_dev)
 		  | adreno_dev->pix_shader_start;
 
-	
+	/* Load the patched fragment shader stream */
 	cmds =
 	    program_shader(cmds, 1, sys2gmem_frag_pgm, SYS2GMEM_FRAG_PGM_LEN);
 
-	
+	/* SQ_PROGRAM_CNTL / SQ_CONTEXT_MISC */
 	*cmds++ = cp_type3_packet(CP_SET_CONSTANT, 3);
 	*cmds++ = CP_REG(REG_SQ_PROGRAM_CNTL);
 	*cmds++ = 0x10030002;
@@ -770,44 +928,49 @@
 
 	*cmds++ = cp_type3_packet(CP_SET_CONSTANT, 2);
 	*cmds++ = CP_REG(REG_PA_SC_AA_MASK);
-	*cmds++ = 0x0000ffff;	
+	*cmds++ = 0x0000ffff;	/* REG_PA_SC_AA_MASK */
 
 	if (!adreno_is_a22x(adreno_dev)) {
-		
+		/* PA_SC_VIZ_QUERY */
 		*cmds++ = cp_type3_packet(CP_SET_CONSTANT, 2);
 		*cmds++ = CP_REG(REG_PA_SC_VIZ_QUERY);
-		*cmds++ = 0x0;		
+		*cmds++ = 0x0;		/*REG_PA_SC_VIZ_QUERY */
 	}
 
-	
+	/* RB_COLORCONTROL */
 	*cmds++ = cp_type3_packet(CP_SET_CONSTANT, 2);
 	*cmds++ = CP_REG(REG_RB_COLORCONTROL);
 	*cmds++ = 0x00000c20;
 
 	*cmds++ = cp_type3_packet(CP_SET_CONSTANT, 4);
 	*cmds++ = CP_REG(REG_VGT_MAX_VTX_INDX);
-	*cmds++ = 0x00ffffff;	
-	*cmds++ = 0x0;		
-	*cmds++ = 0x00000000;	
+	*cmds++ = 0x00ffffff;	/* mmVGT_MAX_VTX_INDX */
+	*cmds++ = 0x0;		/* mmVGT_MIN_VTX_INDX */
+	*cmds++ = 0x00000000;	/* mmVGT_INDX_OFFSET */
 
 	*cmds++ = cp_type3_packet(CP_SET_CONSTANT, 3);
 	*cmds++ = CP_REG(REG_VGT_VERTEX_REUSE_BLOCK_CNTL);
-	*cmds++ = 0x00000002;	
-	*cmds++ = 0x00000002;	
+	*cmds++ = 0x00000002;	/* mmVGT_VERTEX_REUSE_BLOCK_CNTL */
+	*cmds++ = 0x00000002;	/* mmVGT_OUT_DEALLOC_CNTL */
 
 	*cmds++ = cp_type3_packet(CP_SET_CONSTANT, 2);
 	*cmds++ = CP_REG(REG_SQ_INTERPOLATOR_CNTL);
-	*cmds++ = 0xffffffff;	
+	*cmds++ = 0xffffffff;	/* mmSQ_INTERPOLATOR_CNTL */
 
 	*cmds++ = cp_type3_packet(CP_SET_CONSTANT, 2);
 	*cmds++ = CP_REG(REG_PA_SC_AA_CONFIG);
-	*cmds++ = 0x00000000;	
+	*cmds++ = 0x00000000;	/* REG_PA_SC_AA_CONFIG */
 
+	/* set REG_PA_SU_SC_MODE_CNTL
+	 * Front_ptype = draw triangles
+	 * Back_ptype = draw triangles
+	 * Provoking vertex = last
+	 */
 	*cmds++ = cp_type3_packet(CP_SET_CONSTANT, 2);
 	*cmds++ = CP_REG(REG_PA_SU_SC_MODE_CNTL);
 	*cmds++ = 0x00080240;
 
-	
+	/* texture constants */
 	*cmds++ =
 	    cp_type3_packet(CP_SET_CONSTANT, (SYS2GMEM_TEX_CONST_LEN + 1));
 	*cmds++ = (0x1 << 16) | (0 * 6);
@@ -818,24 +981,29 @@
 	cmds[2] |= (shadow->width - 1) | (shadow->height - 1) << 13;
 	cmds += SYS2GMEM_TEX_CONST_LEN;
 
-	
+	/* program surface info */
 	*cmds++ = cp_type3_packet(CP_SET_CONSTANT, 3);
 	*cmds++ = CP_REG(REG_RB_SURFACE_INFO);
-	*cmds++ = shadow->gmem_pitch;	
+	*cmds++ = shadow->gmem_pitch;	/* pitch, MSAA = 1 */
 
+	/* RB_COLOR_INFO Endian=none, Linear, Format=RGBA8888, Swap=0,
+	 *                Base=gmem_base
+	 */
 	*cmds++ =
 	    (shadow->
 	     format << RB_COLOR_INFO__COLOR_FORMAT__SHIFT) | tmp_ctx.gmem_base;
 
-	
+	/* RB_DEPTHCONTROL */
 	*cmds++ = cp_type3_packet(CP_SET_CONSTANT, 2);
 	*cmds++ = CP_REG(REG_RB_DEPTHCONTROL);
 
 	if (adreno_is_a22x(adreno_dev))
-		*cmds++ = 8;		
+		*cmds++ = 8;		/* disable Z */
 	else
-		*cmds++ = 0;		
+		*cmds++ = 0;		/* disable Z */
 
+	/* Use maximum scissor values -- quad vertices already
+	 * have the correct bounds */
 	*cmds++ = cp_type3_packet(CP_SET_CONSTANT, 3);
 	*cmds++ = CP_REG(REG_PA_SC_SCREEN_SCISSOR_TL);
 	*cmds++ = (0 << 16) | 0;
@@ -847,10 +1015,10 @@
 
 	*cmds++ = cp_type3_packet(CP_SET_CONSTANT, 2);
 	*cmds++ = CP_REG(REG_PA_CL_VTE_CNTL);
-	
+	/* disable X/Y/Z transforms, X/Y/Z are premultiplied by W */
 	*cmds++ = 0x00000b00;
 
-	
+	/*load the viewport so that z scale = clear depth and z offset = 0.0f */
 	*cmds++ = cp_type3_packet(CP_SET_CONSTANT, 3);
 	*cmds++ = CP_REG(REG_PA_CL_VPORT_ZSCALE);
 	*cmds++ = 0xbf800000;
@@ -858,7 +1026,7 @@
 
 	*cmds++ = cp_type3_packet(CP_SET_CONSTANT, 2);
 	*cmds++ = CP_REG(REG_RB_COLOR_MASK);
-	*cmds++ = 0x0000000f;	
+	*cmds++ = 0x0000000f;	/* R = G = B = 1:enabled */
 
 	*cmds++ = cp_type3_packet(CP_SET_CONSTANT, 2);
 	*cmds++ = CP_REG(REG_RB_COLOR_DEST_MASK);
@@ -869,9 +1037,12 @@
 	*cmds++ = 0x00000000;
 	*cmds++ = 0x00000000;
 
+	/* load the stencil ref value
+	 *  $AAM - do this later
+	 */
 	*cmds++ = cp_type3_packet(CP_SET_CONSTANT, 2);
 	*cmds++ = CP_REG(REG_RB_MODECONTROL);
-	
+	/* draw pixels with color and depth/stencil component */
 	*cmds++ = 0x4;
 
 	*cmds++ = cp_type3_packet(CP_SET_CONSTANT, 2);
@@ -884,19 +1055,19 @@
 		*cmds++ = 0x0000000;
 
 		*cmds++ = cp_type3_packet(CP_DRAW_INDX, 3);
-		*cmds++ = 0;           
-		
+		*cmds++ = 0;           /* viz query info. */
+		/* PrimType=RectList, SrcSel=AutoIndex, VisCullMode=Ignore*/
 		*cmds++ = 0x00004088;
-		*cmds++ = 3;	       
+		*cmds++ = 3;	       /* NumIndices=3 */
 	} else {
-		
+		/* queue the draw packet */
 		*cmds++ = cp_type3_packet(CP_DRAW_INDX, 2);
-		*cmds++ = 0;		
-		
+		*cmds++ = 0;		/* viz query info. */
+		/* PrimType=RectList, NumIndices=3, SrcSel=AutoIndex */
 		*cmds++ = 0x00030088;
 	}
 
-	
+	/* create indirect buffer command for above command sequence */
 	create_ib1(drawctxt, shadow->gmem_restore, start, cmds);
 
 	return cmds;
@@ -915,17 +1086,17 @@
 	*cmd++ = cp_type3_packet(CP_WAIT_FOR_IDLE, 1);
 	*cmd++ = 0;
 
-	
-	
+	/* H/W Registers */
+	/* deferred cp_type3_packet(CP_LOAD_CONSTANT_CONTEXT, ???); */
 	cmd++;
 #ifdef CONFIG_MSM_KGSL_DISABLE_SHADOW_WRITES
-	
+	/* Force mismatch */
 	*cmd++ = ((drawctxt->gpustate.gpuaddr + REG_OFFSET) & 0xFFFFE000) | 1;
 #else
 	*cmd++ = (drawctxt->gpustate.gpuaddr + REG_OFFSET) & 0xFFFFE000;
 #endif
 
-	
+	/* Based on chip id choose the registers ranges*/
 	if (adreno_is_a220(adreno_dev)) {
 		ptr_register_ranges = register_ranges_a220;
 		reg_array_size = ARRAY_SIZE(register_ranges_a220);
@@ -943,16 +1114,19 @@
 				ptr_register_ranges[i*2+1]);
 	}
 
+	/* Now we know how many register blocks we have, we can compute command
+	 * length
+	 */
 	start[2] =
 	    cp_type3_packet(CP_LOAD_CONSTANT_CONTEXT, (cmd - start) - 3);
-	
+	/* Enable shadowing for the entire register block. */
 #ifdef CONFIG_MSM_KGSL_DISABLE_SHADOW_WRITES
-	start[4] |= (0 << 24) | (4 << 16);	
+	start[4] |= (0 << 24) | (4 << 16);	/* Disable shadowing. */
 #else
 	start[4] |= (1 << 24) | (4 << 16);
 #endif
 
-	
+	/* Need to handle some of the registers separately */
 	*cmd++ = cp_type0_packet(REG_SQ_GPR_MANAGEMENT, 1);
 	tmp_ctx.reg_values[0] = virt2gpu(cmd, &drawctxt->gpustate);
 	*cmd++ = 0x00040400;
@@ -976,42 +1150,48 @@
 		}
 	}
 
-	
+	/* ALU Constants */
 	*cmd++ = cp_type3_packet(CP_LOAD_CONSTANT_CONTEXT, 3);
 	*cmd++ = drawctxt->gpustate.gpuaddr & 0xFFFFE000;
 #ifdef CONFIG_MSM_KGSL_DISABLE_SHADOW_WRITES
-	*cmd++ = (0 << 24) | (0 << 16) | 0;	
+	*cmd++ = (0 << 24) | (0 << 16) | 0;	/* Disable shadowing */
 #else
 	*cmd++ = (1 << 24) | (0 << 16) | 0;
 #endif
 	*cmd++ = ALU_CONSTANTS;
 
-	
+	/* Texture Constants */
 	*cmd++ = cp_type3_packet(CP_LOAD_CONSTANT_CONTEXT, 3);
 	*cmd++ = (drawctxt->gpustate.gpuaddr + TEX_OFFSET) & 0xFFFFE000;
 #ifdef CONFIG_MSM_KGSL_DISABLE_SHADOW_WRITES
-	
+	/* Disable shadowing */
 	*cmd++ = (0 << 24) | (1 << 16) | 0;
 #else
 	*cmd++ = (1 << 24) | (1 << 16) | 0;
 #endif
 	*cmd++ = TEX_CONSTANTS;
 
-	
+	/* Boolean Constants */
 	*cmd++ = cp_type3_packet(CP_SET_CONSTANT, 1 + BOOL_CONSTANTS);
 	*cmd++ = (2 << 16) | 0;
 
+	/* the next BOOL_CONSTANTS dwords are the shadow area for
+	 * the boolean constants.
+	 */
 	tmp_ctx.bool_shadow = virt2gpu(cmd, &drawctxt->gpustate);
 	cmd += BOOL_CONSTANTS;
 
-	
+	/* Loop Constants */
 	*cmd++ = cp_type3_packet(CP_SET_CONSTANT, 1 + LOOP_CONSTANTS);
 	*cmd++ = (3 << 16) | 0;
 
+	/* the next LOOP_CONSTANTS dwords are the shadow area for
+	 * the loop constants.
+	 */
 	tmp_ctx.loop_shadow = virt2gpu(cmd, &drawctxt->gpustate);
 	cmd += LOOP_CONSTANTS;
 
-	
+	/* create indirect buffer command for above command sequence */
 	create_ib1(drawctxt, drawctxt->reg_restore, start, cmd);
 
 	tmp_ctx.cmd = cmd;
@@ -1027,45 +1207,45 @@
 	unsigned int *partition1;
 	unsigned int *shaderBases, *partition2;
 
-	
+	/* compute vertex, pixel and shared instruction shadow GPU addresses */
 	tmp_ctx.shader_vertex = drawctxt->gpustate.gpuaddr + SHADER_OFFSET;
 	tmp_ctx.shader_pixel = tmp_ctx.shader_vertex
 				+ _shader_shadow_size(adreno_dev);
 	tmp_ctx.shader_shared = tmp_ctx.shader_pixel
 				+  _shader_shadow_size(adreno_dev);
 
-	
+	/* restore shader partitioning and instructions */
 
-	restore = cmd;		
+	restore = cmd;		/* start address */
 
-	
+	/* Invalidate Vertex & Pixel instruction code address and sizes */
 	*cmd++ = cp_type3_packet(CP_INVALIDATE_STATE, 1);
-	*cmd++ = 0x00000300;	
+	*cmd++ = 0x00000300;	/* 0x100 = Vertex, 0x200 = Pixel */
 
-	
+	/* Restore previous shader vertex & pixel instruction bases. */
 	*cmd++ = cp_type3_packet(CP_SET_SHADER_BASES, 1);
-	shaderBases = cmd++;	
+	shaderBases = cmd++;	/* TBD #5: shader bases (from fixup) */
 
-	
+	/* write the shader partition information to a scratch register */
 	*cmd++ = cp_type0_packet(REG_SQ_INST_STORE_MANAGMENT, 1);
-	partition1 = cmd++;	
+	partition1 = cmd++;	/* TBD #4a: partition info (from save) */
 
-	
+	/* load vertex shader instructions from the shadow. */
 	*cmd++ = cp_type3_packet(CP_IM_LOAD, 2);
-	*cmd++ = tmp_ctx.shader_vertex + 0x0;	
-	startSizeVtx = cmd++;	
+	*cmd++ = tmp_ctx.shader_vertex + 0x0;	/* 0x0 = Vertex */
+	startSizeVtx = cmd++;	/* TBD #1: start/size (from save) */
 
-	
+	/* load pixel shader instructions from the shadow. */
 	*cmd++ = cp_type3_packet(CP_IM_LOAD, 2);
-	*cmd++ = tmp_ctx.shader_pixel + 0x1;	
-	startSizePix = cmd++;	
+	*cmd++ = tmp_ctx.shader_pixel + 0x1;	/* 0x1 = Pixel */
+	startSizePix = cmd++;	/* TBD #2: start/size (from save) */
 
-	
+	/* load shared shader instructions from the shadow. */
 	*cmd++ = cp_type3_packet(CP_IM_LOAD, 2);
-	*cmd++ = tmp_ctx.shader_shared + 0x2;	
-	startSizeShared = cmd++;	
+	*cmd++ = tmp_ctx.shader_shared + 0x2;	/* 0x2 = Shared */
+	startSizeShared = cmd++;	/* TBD #3: start/size (from save) */
 
-	
+	/* create indirect buffer command for above command sequence */
 	create_ib1(drawctxt, drawctxt->shader_restore, restore, cmd);
 
 	/*
@@ -1077,81 +1257,86 @@
 	 *  have been written.
 	 */
 
-	fixup = cmd;		
+	fixup = cmd;		/* start address */
 
-	
+	/* write the shader partition information to a scratch register */
 	*cmd++ = cp_type0_packet(REG_SCRATCH_REG2, 1);
-	partition2 = cmd++;	
+	partition2 = cmd++;	/* TBD #4b: partition info (from save) */
 
-	
+	/* mask off unused bits, then OR with shader instruction memory size */
 	*cmd++ = cp_type3_packet(CP_REG_RMW, 3);
 	*cmd++ = REG_SCRATCH_REG2;
-	
+	/* AND off invalid bits. */
 	*cmd++ = 0x0FFF0FFF;
-	
+	/* OR in instruction memory size.  */
 	*cmd++ = adreno_encode_istore_size(adreno_dev);
 
-	
+	/* write the computed value to the SET_SHADER_BASES data field */
 	*cmd++ = cp_type3_packet(CP_REG_TO_MEM, 2);
 	*cmd++ = REG_SCRATCH_REG2;
-	
+	/* TBD #5: shader bases (to restore) */
 	*cmd++ = virt2gpu(shaderBases, &drawctxt->gpustate);
 
-	
+	/* create indirect buffer command for above command sequence */
 	create_ib1(drawctxt, drawctxt->shader_fixup, fixup, cmd);
 
-	
+	/* save shader partitioning and instructions */
 
-	save = cmd;		
+	save = cmd;		/* start address */
 
 	*cmd++ = cp_type3_packet(CP_WAIT_FOR_IDLE, 1);
 	*cmd++ = 0;
 
+	/* fetch the SQ_INST_STORE_MANAGMENT register value,
+	 *  store the value in the data fields of the SET_CONSTANT commands
+	 *  above.
+	 */
 	*cmd++ = cp_type3_packet(CP_REG_TO_MEM, 2);
 	*cmd++ = REG_SQ_INST_STORE_MANAGMENT;
-	
+	/* TBD #4a: partition info (to restore) */
 	*cmd++ = virt2gpu(partition1, &drawctxt->gpustate);
 	*cmd++ = cp_type3_packet(CP_REG_TO_MEM, 2);
 	*cmd++ = REG_SQ_INST_STORE_MANAGMENT;
-	
+	/* TBD #4b: partition info (to fixup) */
 	*cmd++ = virt2gpu(partition2, &drawctxt->gpustate);
 
 
-	
+	/* store the vertex shader instructions */
 	*cmd++ = cp_type3_packet(CP_IM_STORE, 2);
-	*cmd++ = tmp_ctx.shader_vertex + 0x0;	
-	
+	*cmd++ = tmp_ctx.shader_vertex + 0x0;	/* 0x0 = Vertex */
+	/* TBD #1: start/size (to restore) */
 	*cmd++ = virt2gpu(startSizeVtx, &drawctxt->gpustate);
 
-	
+	/* store the pixel shader instructions */
 	*cmd++ = cp_type3_packet(CP_IM_STORE, 2);
-	*cmd++ = tmp_ctx.shader_pixel + 0x1;	
-	
+	*cmd++ = tmp_ctx.shader_pixel + 0x1;	/* 0x1 = Pixel */
+	/* TBD #2: start/size (to restore) */
 	*cmd++ = virt2gpu(startSizePix, &drawctxt->gpustate);
 
-	
+	/* store the shared shader instructions if vertex base is nonzero */
 
 	*cmd++ = cp_type3_packet(CP_IM_STORE, 2);
-	*cmd++ = tmp_ctx.shader_shared + 0x2;	
-	
+	*cmd++ = tmp_ctx.shader_shared + 0x2;	/* 0x2 = Shared */
+	/* TBD #3: start/size (to restore) */
 	*cmd++ = virt2gpu(startSizeShared, &drawctxt->gpustate);
 
 
 	*cmd++ = cp_type3_packet(CP_WAIT_FOR_IDLE, 1);
 	*cmd++ = 0;
 
-	
+	/* create indirect buffer command for above command sequence */
 	create_ib1(drawctxt, drawctxt->shader_save, save, cmd);
 
 	tmp_ctx.cmd = cmd;
 }
 
+/* create buffers for saving/restoring registers, constants, & GMEM */
 static int a2xx_create_gpustate_shadow(struct adreno_device *adreno_dev,
 			struct adreno_context *drawctxt)
 {
 	drawctxt->flags |= CTXT_FLAGS_STATE_SHADOW;
 
-	
+	/* build indirect command buffers to save & restore regs/constants */
 	build_regrestore_cmds(adreno_dev, drawctxt);
 	build_regsave_cmds(adreno_dev, drawctxt);
 
@@ -1160,6 +1345,7 @@
 	return 0;
 }
 
+/* create buffers for saving/restoring registers, constants, & GMEM */
 static int a2xx_create_gmem_shadow(struct adreno_device *adreno_dev,
 			struct adreno_context *drawctxt)
 {
@@ -1174,22 +1360,22 @@
 	if (result)
 		return result;
 
-	
+	/* set the gmem shadow flag for the context */
 	drawctxt->flags |= CTXT_FLAGS_GMEM_SHADOW;
 
-	
+	/* blank out gmem shadow. */
 	kgsl_sharedmem_set(&drawctxt->context_gmem_shadow.gmemshadow, 0, 0,
 			   drawctxt->context_gmem_shadow.size);
 
-	
+	/* build quad vertex buffer */
 	build_quad_vtxbuff(drawctxt, &drawctxt->context_gmem_shadow,
 		&tmp_ctx.cmd);
 
-	
+	/* build TP0_CHICKEN register restore command buffer */
 	if (!(drawctxt->flags & CTXT_FLAGS_PREAMBLE))
 		tmp_ctx.cmd = build_chicken_restore_cmds(drawctxt);
 
-	
+	/* build indirect command buffers to save & restore gmem */
 	drawctxt->context_gmem_shadow.gmem_save_commands = tmp_ctx.cmd;
 	tmp_ctx.cmd =
 	    build_gmem2sys_cmds(adreno_dev, drawctxt,
@@ -1215,6 +1401,12 @@
 {
 	int ret;
 
+	/*
+	 * Allocate memory for the GPU state and the context commands.
+	 * Despite the name, this is much more than just storage for
+	 * the gpustate.  It also contains command space for the gmem
+	 * save, plus texture and vertex buffer storage.
+	 */
 
 	ret = kgsl_allocate(&drawctxt->gpustate,
 		drawctxt->pagetable, _context_size(adreno_dev));
@@ -1242,7 +1434,7 @@
 			goto done;
 	}
 
-	
+	/* Flush and sync the gpustate memory */
 
 	kgsl_cache_range_op(&drawctxt->gpustate,
 			    KGSL_CACHE_OP_FLUSH);
@@ -1267,11 +1459,23 @@
 
 	if (adreno_is_a225(adreno_dev)) {
 		adreno_dev->gpudev->ctx_switches_since_last_draw++;
+		/* If there have been more than
+		 * ADRENO_NUM_CTX_SWITCH_ALLOWED_BEFORE_DRAW context
+		 * switches without gmem being saved, then we need to
+		 * execute this workaround */
 		if (adreno_dev->gpudev->ctx_switches_since_last_draw >
 				ADRENO_NUM_CTX_SWITCH_ALLOWED_BEFORE_DRAW)
 			adreno_dev->gpudev->ctx_switches_since_last_draw = 0;
 		else
 			return;
+		/*
+		 * Issue an empty draw call to avoid possible hangs due to
+		 * repeated idles without intervening draw calls.
+		 * On adreno 225 the PC block has a cache that is only
+		 * flushed on draw calls and repeated idles can make it
+		 * overflow. The gmem save path contains draw calls so
+		 * this workaround isn't needed there.
+		 */
 		*cmds++ = cp_type3_packet(CP_SET_CONSTANT, 2);
 		*cmds++ = (0x4 << 16) | (REG_PA_SU_SC_MODE_CNTL - 0x2000);
 		*cmds++ = 0;
@@ -1284,6 +1488,11 @@
 		*cmds++ = cp_type3_packet(CP_WAIT_FOR_IDLE, 1);
 		*cmds++ = 0x00000000;
 	} else {
+		/* On Adreno 20x/220, if the events for shader space reuse
+		 * get dropped, the CP block would wait indefinitely.
+		 * Sending CP_SET_SHADER_BASES packet unblocks the CP from
+		 * this wait.
+		 */
 		*cmds++ = cp_type3_packet(CP_SET_SHADER_BASES, 1);
 		*cmds++ = adreno_encode_istore_size(adreno_dev)
 					| adreno_dev->pix_shader_start;
@@ -1298,7 +1507,7 @@
 {
 	struct kgsl_device *device = &adreno_dev->dev;
 
-	if (context == NULL)
+	if (context == NULL || (context->flags & CTXT_FLAGS_BEING_DESTROYED))
 		return;
 
 	if (context->flags & CTXT_FLAGS_GPU_HANG)
@@ -1307,17 +1516,21 @@
 
 	if (!(context->flags & CTXT_FLAGS_PREAMBLE)) {
 
-		
+		/* save registers and constants. */
 		adreno_ringbuffer_issuecmds(device, context,
 			KGSL_CMD_FLAGS_NONE,
 			context->reg_save, 3);
 
 		if (context->flags & CTXT_FLAGS_SHADER_SAVE) {
-			
+			/* save shader partitioning and instructions. */
 			adreno_ringbuffer_issuecmds(device, context,
 				KGSL_CMD_FLAGS_PMODE,
 				context->shader_save, 3);
 
+			/*
+			 * fixup shader partitioning parameter for
+			 *  SET_SHADER_BASES.
+			 */
 			adreno_ringbuffer_issuecmds(device, context,
 				KGSL_CMD_FLAGS_NONE,
 				context->shader_fixup, 3);
@@ -1328,11 +1541,14 @@
 
 	if ((context->flags & CTXT_FLAGS_GMEM_SAVE) &&
 	    (context->flags & CTXT_FLAGS_GMEM_SHADOW)) {
+		/* save gmem.
+		 * (note: changes shader. shader must already be saved.)
+		 */
 		adreno_ringbuffer_issuecmds(device, context,
 			KGSL_CMD_FLAGS_PMODE,
 			context->context_gmem_shadow.gmem_save, 3);
 
-		
+		/* Restore TP0_CHICKEN */
 		if (!(context->flags & CTXT_FLAGS_PREAMBLE)) {
 			adreno_ringbuffer_issuecmds(device, context,
 				KGSL_CMD_FLAGS_NONE,
@@ -1352,7 +1568,7 @@
 	unsigned int cmds[5];
 
 	if (context == NULL) {
-		
+		/* No context - set the default pagetable and that's it */
 		kgsl_mmu_setstate(&device->mmu, device->mmu.defaultpagetable,
 				adreno_dev->drawctxt_active->id);
 		return;
@@ -1376,13 +1592,16 @@
 		REG_SHADOW_SIZE + CMD_BUFFER_SIZE + TEX_SHADOW_SIZE, false);
 #endif
 
+	/* restore gmem.
+	 *  (note: changes shader. shader must not already be restored.)
+	 */
 	if (context->flags & CTXT_FLAGS_GMEM_RESTORE) {
 		adreno_ringbuffer_issuecmds(device, context,
 			KGSL_CMD_FLAGS_PMODE,
 			context->context_gmem_shadow.gmem_restore, 3);
 
 		if (!(context->flags & CTXT_FLAGS_PREAMBLE)) {
-			
+			/* Restore TP0_CHICKEN */
 			adreno_ringbuffer_issuecmds(device, context,
 				KGSL_CMD_FLAGS_NONE,
 				context->chicken_restore, 3);
@@ -1393,11 +1612,11 @@
 
 	if (!(context->flags & CTXT_FLAGS_PREAMBLE)) {
 
-		
+		/* restore registers and constants. */
 		adreno_ringbuffer_issuecmds(device, context,
 			KGSL_CMD_FLAGS_NONE, context->reg_restore, 3);
 
-		
+		/* restore shader instructions & partitioning. */
 		if (context->flags & CTXT_FLAGS_SHADER_RESTORE) {
 			adreno_ringbuffer_issuecmds(device, context,
 				KGSL_CMD_FLAGS_NONE,
@@ -1413,6 +1632,16 @@
 	}
 }
 
+/*
+ * Interrupt management
+ *
+ * a2xx interrupt control is distributed among the various
+ * hardware components (RB, CP, MMU).  The main interrupt
+ * tells us which component fired the interrupt, but one needs
+ * to go to the individual component to find out why.  The
+ * following functions provide the broken out support for
+ * managing the interrupts
+ */
 
 #define RBBM_INT_MASK RBBM_INT_CNTL__RDERR_INT_MASK
 
@@ -1467,6 +1696,11 @@
 
 	if (!status) {
 		if (master_status & MASTER_INT_SIGNAL__CP_INT_STAT) {
+			/* This indicates that we could not read CP_INT_STAT.
+			 * As a precaution just wake up processes so
+			 * they can check their timestamps. Since we
+			 * did not ack any interrupts, this interrupt will
+			 * be generated again */
 			KGSL_DRV_WARN(device, "Unable to read CP_INT_STATUS\n");
 			wake_up_interruptible_all(&device->wait_queue);
 		} else
@@ -1474,32 +1708,21 @@
 		return;
 	}
 
-	if (status & CP_INT_CNTL__RB_INT_MASK) {
-		
-		unsigned int context_id;
-		kgsl_sharedmem_readl(&device->memstore,
-				&context_id,
-				KGSL_MEMSTORE_OFFSET(KGSL_MEMSTORE_GLOBAL,
-					current_context));
-		if (context_id < KGSL_MEMSTORE_MAX) {
-			kgsl_sharedmem_writel(&rb->device->memstore,
-					KGSL_MEMSTORE_OFFSET(context_id,
-						ts_cmp_enable), 0);
-			wmb();
-		}
-		KGSL_CMD_WARN(rb->device, "ringbuffer rb interrupt\n");
-	}
-
 	for (i = 0; i < ARRAY_SIZE(kgsl_cp_error_irqs); i++) {
 		if (status & kgsl_cp_error_irqs[i].mask) {
 			KGSL_CMD_CRIT(rb->device, "%s\n",
 				 kgsl_cp_error_irqs[i].message);
+			/*
+			 * on fatal errors, turn off the interrupts to
+			 * avoid storming. This has the side effect of
+			 * forcing a PM dump when the timestamp times out
+			 */
 
 			kgsl_pwrctrl_irq(rb->device, KGSL_PWRFLAGS_OFF);
 		}
 	}
 
-	
+	/* only ack bits we understand */
 	status &= CP_INT_MASK;
 	adreno_regwrite(device, REG_CP_INT_ACK, status);
 
@@ -1526,9 +1749,14 @@
 		adreno_regread(device, REG_RBBM_READ_ERROR, &rderr);
 		source = (rderr & RBBM_READ_ERROR_REQUESTER)
 			 ? "host" : "cp";
-		
+		/* convert to dword address */
 		addr = (rderr & RBBM_READ_ERROR_ADDRESS_MASK) >> 2;
 
+		/*
+		 * Log CP_INT_STATUS interrupts from the CP at a
+		 * lower level because they can happen frequently
+		 * and are worked around in a2xx_irq_handler.
+		 */
 		if (addr == REG_CP_INT_STATUS &&
 			rderr & RBBM_READ_ERROR_ERROR &&
 			rderr & RBBM_READ_ERROR_REQUESTER)
@@ -1586,25 +1814,38 @@
 		adreno_regwrite(device, MH_INTERRUPT_MASK, 0);
 	}
 
-	
+	/* Force the writes to post before touching the IRQ line */
 	wmb();
 }
 
+static unsigned int a2xx_irq_pending(struct adreno_device *adreno_dev)
+{
+	struct kgsl_device *device = &adreno_dev->dev;
+	unsigned int rbbm, cp, mh;
+
+	adreno_regread(device, REG_RBBM_INT_CNTL, &rbbm);
+	adreno_regread(device, REG_CP_INT_CNTL, &cp);
+	adreno_regread(device, MH_INTERRUPT_MASK, &mh);
+
+	return ((rbbm & RBBM_INT_MASK) || (cp & CP_INT_MASK) ||
+		(mh & kgsl_mmu_get_int_mask())) ? 1 : 0;
+}
+
 static void a2xx_rb_init(struct adreno_device *adreno_dev,
 			struct adreno_ringbuffer *rb)
 {
 	unsigned int *cmds, cmds_gpu;
 
-	
-	cmds = adreno_ringbuffer_allocspace(rb, 19);
+	/* ME_INIT */
+	cmds = adreno_ringbuffer_allocspace(rb, NULL, 19);
 	cmds_gpu = rb->buffer_desc.gpuaddr + sizeof(uint)*(rb->wptr-19);
 
 	GSL_RB_WRITE(cmds, cmds_gpu, cp_type3_packet(CP_ME_INIT, 18));
-	
+	/* All fields present (bits 9:0) */
 	GSL_RB_WRITE(cmds, cmds_gpu, 0x000003ff);
-	
+	/* Disable/Enable Real-Time Stream processing (present but ignored) */
 	GSL_RB_WRITE(cmds, cmds_gpu, 0x00000000);
-	
+	/* Enable (2D <-> 3D) implicit synchronization (present but ignored) */
 	GSL_RB_WRITE(cmds, cmds_gpu, 0x00000000);
 
 	GSL_RB_WRITE(cmds, cmds_gpu,
@@ -1624,23 +1865,28 @@
 	GSL_RB_WRITE(cmds, cmds_gpu,
 		SUBBLOCK_OFFSET(REG_PA_SU_POLY_OFFSET_FRONT_SCALE));
 
-	
+	/* Instruction memory size: */
 	GSL_RB_WRITE(cmds, cmds_gpu,
 		(adreno_encode_istore_size(adreno_dev)
 		| adreno_dev->pix_shader_start));
-	
+	/* Maximum Contexts */
 	GSL_RB_WRITE(cmds, cmds_gpu, 0x00000001);
+	/* Write confirm interval: the CP will wait
+	 * wait_interval * 16 clocks between polls */
 	GSL_RB_WRITE(cmds, cmds_gpu, 0x00000000);
 
-	
+	/* NQ and External Memory Swap */
 	GSL_RB_WRITE(cmds, cmds_gpu, 0x00000000);
+	/* Protected mode error checking.
+	 * If iommu is used, protection needs to be turned off
+	 * to enable context bank switching */
 	if (KGSL_MMU_TYPE_IOMMU == kgsl_mmu_get_mmutype())
 		GSL_RB_WRITE(cmds, cmds_gpu, 0);
 	else
 		GSL_RB_WRITE(cmds, cmds_gpu, GSL_RB_PROTECTED_MODE_CONTROL);
-	
+	/* Disable header dumping and Header dump address */
 	GSL_RB_WRITE(cmds, cmds_gpu, 0x00000000);
-	
+	/* Header dump size */
 	GSL_RB_WRITE(cmds, cmds_gpu, 0x00000000);
 
 	adreno_ringbuffer_submit(rb);
@@ -1651,18 +1897,18 @@
 	struct kgsl_device *device = &adreno_dev->dev;
 	unsigned int reg, val;
 
-	
+	/* Freeze the counter */
 	adreno_regwrite(device, REG_CP_PERFMON_CNTL,
 		REG_PERF_MODE_CNT | REG_PERF_STATE_FREEZE);
 
-	
+	/* Get the value */
 	adreno_regread(device, REG_RBBM_PERFCOUNTER1_LO, &val);
 
-	
+	/* Reset the counter */
 	adreno_regwrite(device, REG_CP_PERFMON_CNTL,
 		REG_PERF_MODE_CNT | REG_PERF_STATE_RESET);
 
-	
+	/* Re-Enable the performance monitors */
 	adreno_regread(device, REG_RBBM_PM_OVERRIDE2, &reg);
 	adreno_regwrite(device, REG_RBBM_PM_OVERRIDE2, (reg | 0x40));
 	adreno_regwrite(device, REG_RBBM_PERFCOUNTER1_SELECT, 0x1);
@@ -1679,7 +1925,7 @@
 	unsigned int gmem_size;
 	unsigned int edram_value = 0;
 
-	
+	/* get edram_size value equivalent */
 	gmem_size = (adreno_dev->gmem_size >> 14);
 	while (gmem_size >>= 1)
 		edram_value++;
@@ -1687,9 +1933,9 @@
 	rb_edram_info.val = 0;
 
 	rb_edram_info.f.edram_size = edram_value;
-	rb_edram_info.f.edram_mapping_mode = 0; 
+	rb_edram_info.f.edram_mapping_mode = 0; /* EDRAM_MAP_UPPER */
 
-	
+	/* must be aligned to size */
 	rb_edram_info.f.edram_range = (adreno_dev->gmem_base >> 14);
 
 	adreno_regwrite(device, REG_RB_EDRAM_INFO, rb_edram_info.val);
@@ -1699,9 +1945,18 @@
 {
 	struct kgsl_device *device = &adreno_dev->dev;
 
+	/*
+	 * We need to make sure all blocks are powered up and clocked
+	 * before issuing a soft reset.  The overrides will then be
+	 * turned off (set to 0)
+	 */
 	adreno_regwrite(device, REG_RBBM_PM_OVERRIDE1, 0xfffffffe);
 	adreno_regwrite(device, REG_RBBM_PM_OVERRIDE2, 0xffffffff);
 
+	/*
+	 * Only reset CP block if all blocks have previously been
+	 * reset
+	 */
 	if (!(device->flags & KGSL_FLAGS_SOFT_RESET) ||
 		!adreno_is_a22x(adreno_dev)) {
 		adreno_regwrite(device, REG_RBBM_SOFT_RESET,
@@ -1711,17 +1966,24 @@
 		adreno_regwrite(device, REG_RBBM_SOFT_RESET,
 			0x00000001);
 	}
+	/*
+	 * The core is in an indeterminate state until the reset
+	 * completes after 30ms.
+	 */
 	msleep(30);
 
 	adreno_regwrite(device, REG_RBBM_SOFT_RESET, 0x00000000);
 
 	if (adreno_is_a225(adreno_dev)) {
-		
+		/* Enable large instruction store for A225 */
 		adreno_regwrite(device, REG_SQ_FLOW_CONTROL,
 			0x18000000);
 	}
 
-	if (adreno_is_a203(adreno_dev))
+	if (adreno_is_a20x(adreno_dev))
+		/* For A20X based targets, increase the number of clocks
+		 * that RBBM will wait before de-asserting the Register
+		 * Clock Active signal */
 		adreno_regwrite(device, REG_RBBM_CNTL, 0x0000FFFF);
 	else
 		adreno_regwrite(device, REG_RBBM_CNTL, 0x00004442);
@@ -1741,7 +2003,7 @@
 
 	adreno_regwrite(device, REG_RBBM_DEBUG, 0x00080000);
 
-	
+	/* Make sure interrupts are disabled */
 	adreno_regwrite(device, REG_RBBM_INT_CNTL, 0);
 	adreno_regwrite(device, REG_CP_INT_CNTL, 0);
 	adreno_regwrite(device, REG_SQ_INT_CNTL, 0);
@@ -1749,6 +2011,7 @@
 	a2xx_gmeminit(adreno_dev);
 }
 
+/* Defined in adreno_a2xx_snapshot.c */
 void *a2xx_snapshot(struct adreno_device *adreno_dev, void *snapshot,
 	int *remain, int hang);
 
@@ -1763,6 +2026,7 @@
 	.ctxt_draw_workaround = a2xx_drawctxt_draw_workaround,
 	.irq_handler = a2xx_irq_handler,
 	.irq_control = a2xx_irq_control,
+	.irq_pending = a2xx_irq_pending,
 	.snapshot = a2xx_snapshot,
 	.rb_init = a2xx_rb_init,
 	.busy_cycles = a2xx_busy_cycles,
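
The new irq_pending entry added to a2xx_gpudev points at a2xx_irq_pending() above, which reports whether any enabled RBBM/CP/MH interrupt source is still asserted. A minimal usage sketch, assuming a caller in the core adreno code (the actual call site is outside this diff):

	/*
	 * Editor's sketch of how a caller might consult the new hook; the
	 * real call site in the core driver is not shown in this patch.
	 */
	static int sketch_check_pending_irq(struct adreno_device *adreno_dev)
	{
		if (adreno_dev->gpudev->irq_pending &&
		    adreno_dev->gpudev->irq_pending(adreno_dev))
			return -EBUSY;	/* an interrupt is still pending */
		return 0;
	}
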
diff --git a/drivers/gpu/msm/adreno_a2xx_snapshot.c b/drivers/gpu/msm/adreno_a2xx_snapshot.c
index e1cf325..75795b1 100644
--- a/drivers/gpu/msm/adreno_a2xx_snapshot.c
+++ b/drivers/gpu/msm/adreno_a2xx_snapshot.c
@@ -1,4 +1,4 @@
-/* Copyright (c) 2012, Code Aurora Forum. All rights reserved.
+/* Copyright (c) 2012, The Linux Foundation. All rights reserved.
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License version 2 and
@@ -18,6 +18,7 @@
 #define DEBUG_SECTION_SZ(_dwords) (((_dwords) * sizeof(unsigned int)) \
 		+ sizeof(struct kgsl_snapshot_debug))
 
+/* Dump the SX debug registers into a GPU snapshot debug section */
 
 #define SXDEBUG_COUNT 0x1B
 
@@ -73,6 +74,11 @@
 	return DEBUG_SECTION_SZ(CPDEBUG_COUNT);
 }
 
+/*
+ * The contents of the SQ debug sections are dword pairs:
+ * [register offset]:[value]
+ * This macro writes both dwords for the given register
+ */
 
 #define SQ_DEBUG_WRITE(_device, _reg, _data, _offset) \
 	do { _data[(_offset)++] = (_reg); \
@@ -218,15 +224,19 @@
 	return DEBUG_SECTION_SZ(MIUDEBUG_COUNT);
 }
 
+/* A2XX GPU snapshot function - this is where all of the A2XX specific
+ * bits and pieces are grabbed into the snapshot memory
+ */
 
 void *a2xx_snapshot(struct adreno_device *adreno_dev, void *snapshot,
 	int *remain, int hang)
 {
 	struct kgsl_device *device = &adreno_dev->dev;
+	struct kgsl_snapshot_registers_list list;
 	struct kgsl_snapshot_registers regs;
 	unsigned int pmoverride;
 
-	
+	/* Choose the register set to dump */
 
 	if (adreno_is_a20x(adreno_dev)) {
 		regs.regs = (unsigned int *) a200_registers;
@@ -239,85 +249,96 @@
 		regs.count = a225_registers_count;
 	}
 
-	
+	list.registers = &regs;
+	list.count = 1;
+
+	/* Master set of (non debug) registers */
 	snapshot = kgsl_snapshot_add_section(device,
 		KGSL_SNAPSHOT_SECTION_REGS, snapshot, remain,
-		kgsl_snapshot_dump_regs, &regs);
+		kgsl_snapshot_dump_regs, &list);
 
-	
+	/* CP_STATE_DEBUG indexed registers */
 	snapshot = kgsl_snapshot_indexed_registers(device, snapshot,
 			remain, REG_CP_STATE_DEBUG_INDEX,
 			REG_CP_STATE_DEBUG_DATA, 0x0, 0x14);
 
-	
+	/* CP_ME indexed registers */
 	snapshot = kgsl_snapshot_indexed_registers(device, snapshot,
 			remain, REG_CP_ME_CNTL, REG_CP_ME_STATUS,
 			64, 44);
 
+	/*
+	 * Need to temporarily turn off clock gating for the debug bus to
+	 * work
+	 */
 
 	adreno_regread(device, REG_RBBM_PM_OVERRIDE2, &pmoverride);
 	adreno_regwrite(device, REG_RBBM_PM_OVERRIDE2, 0xFF);
 
-	
+	/* SX debug registers */
 	snapshot = kgsl_snapshot_add_section(device,
 			KGSL_SNAPSHOT_SECTION_DEBUG, snapshot, remain,
 			a2xx_snapshot_sxdebug, NULL);
 
-	
+	/* SU debug indexed registers (only for < 470) */
 	if (!adreno_is_a22x(adreno_dev))
 		snapshot = kgsl_snapshot_indexed_registers(device, snapshot,
 				remain, REG_PA_SU_DEBUG_CNTL,
 				REG_PA_SU_DEBUG_DATA,
 				0, 0x1B);
 
-	
+	/* CP debug registers */
 	snapshot = kgsl_snapshot_add_section(device,
 			KGSL_SNAPSHOT_SECTION_DEBUG, snapshot, remain,
 			a2xx_snapshot_cpdebug, NULL);
 
-	
+	/* MH debug indexed registers */
 	snapshot = kgsl_snapshot_indexed_registers(device, snapshot,
 			remain, MH_DEBUG_CTRL, MH_DEBUG_DATA, 0x0, 0x40);
 
-	
+	/* Leia only register sets */
 	if (adreno_is_a22x(adreno_dev)) {
-		
+		/* RB DEBUG indexed registers */
 		snapshot = kgsl_snapshot_indexed_registers(device, snapshot,
 			remain, REG_RB_DEBUG_CNTL, REG_RB_DEBUG_DATA, 0, 8);
 
-		
+		/* RB DEBUG indexed registers bank 2 */
 		snapshot = kgsl_snapshot_indexed_registers(device, snapshot,
 			remain, REG_RB_DEBUG_CNTL, REG_RB_DEBUG_DATA + 0x1000,
 			0, 8);
 
-		
+		/* PC_DEBUG indexed registers */
 		snapshot = kgsl_snapshot_indexed_registers(device, snapshot,
 			remain, REG_PC_DEBUG_CNTL, REG_PC_DEBUG_DATA, 0, 8);
 
-		
+		/* GRAS_DEBUG indexed registers */
 		snapshot = kgsl_snapshot_indexed_registers(device, snapshot,
 			remain, REG_GRAS_DEBUG_CNTL, REG_GRAS_DEBUG_DATA, 0, 4);
 
-		
+		/* MIU debug registers */
 		snapshot = kgsl_snapshot_add_section(device,
 			KGSL_SNAPSHOT_SECTION_DEBUG, snapshot, remain,
 			a2xx_snapshot_miudebug, NULL);
 
-		
+		/* SQ debug registers */
 		snapshot = kgsl_snapshot_add_section(device,
 			KGSL_SNAPSHOT_SECTION_DEBUG, snapshot, remain,
 			a2xx_snapshot_sqdebug, NULL);
 
+		/*
+		 * Reading SQ THREAD causes bad things to happen on a running
+		 * system, so only read it if the GPU is already hung
+		 */
 
 		if (hang) {
-			
+			/* SQ THREAD debug registers */
 			snapshot = kgsl_snapshot_add_section(device,
 				KGSL_SNAPSHOT_SECTION_DEBUG, snapshot, remain,
 				a2xx_snapshot_sqthreaddebug, NULL);
 		}
 	}
 
-	
+	/* Reset the clock gating */
 	adreno_regwrite(device, REG_RBBM_PM_OVERRIDE2, pmoverride);
 
 	return snapshot;
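
The snapshot change wraps the single register set in a kgsl_snapshot_registers_list so that kgsl_snapshot_dump_regs() can dump more than one set per section. A minimal sketch of the multi-set case, using only the field names visible in this hunk; the second register set is hypothetical:

	/*
	 * Editor's sketch: populating the list with two register sets.
	 * Field names follow the usage shown above; the second set is a
	 * placeholder, not a real array from the driver.
	 */
	struct kgsl_snapshot_registers regs[2];
	struct kgsl_snapshot_registers_list list;

	regs[0].regs = (unsigned int *) a225_registers;
	regs[0].count = a225_registers_count;
	regs[1].regs = (unsigned int *) some_extra_ranges;	/* hypothetical */
	regs[1].count = some_extra_ranges_count;		/* hypothetical */

	list.registers = regs;
	list.count = 2;

	snapshot = kgsl_snapshot_add_section(device, KGSL_SNAPSHOT_SECTION_REGS,
			snapshot, remain, kgsl_snapshot_dump_regs, &list);
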
diff --git a/drivers/gpu/msm/adreno_a2xx_trace.c b/drivers/gpu/msm/adreno_a2xx_trace.c
index b398c74..87c930b 100644
--- a/drivers/gpu/msm/adreno_a2xx_trace.c
+++ b/drivers/gpu/msm/adreno_a2xx_trace.c
@@ -1,4 +1,4 @@
-/* Copyright (c) 2011, Code Aurora Forum. All rights reserved.
+/* Copyright (c) 2011, The Linux Foundation. All rights reserved.
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License version 2 and
@@ -14,5 +14,6 @@
 #include "kgsl.h"
 #include "adreno.h"
 
+/* Instantiate tracepoints */
 #define CREATE_TRACE_POINTS
 #include "adreno_a2xx_trace.h"
diff --git a/drivers/gpu/msm/adreno_a2xx_trace.h b/drivers/gpu/msm/adreno_a2xx_trace.h
index b4fb47d..af355d6 100644
--- a/drivers/gpu/msm/adreno_a2xx_trace.h
+++ b/drivers/gpu/msm/adreno_a2xx_trace.h
@@ -1,4 +1,4 @@
-/* Copyright (c) 2011, Code Aurora Forum. All rights reserved.
+/* Copyright (c) 2011, The Linux Foundation. All rights reserved.
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License version 2 and
@@ -25,6 +25,9 @@
 
 struct kgsl_device;
 
+/*
+ * Tracepoint for a2xx irq. Includes status info
+ */
 TRACE_EVENT(kgsl_a2xx_irq_status,
 
 	TP_PROTO(struct kgsl_device *device, unsigned int master_status,
@@ -69,6 +72,7 @@
 	)
 );
 
-#endif 
+#endif /* _ADRENO_A2XX_TRACE_H */
 
+/* This part must be outside protection */
 #include <trace/define_trace.h>
diff --git a/drivers/gpu/msm/adreno_a3xx.c b/drivers/gpu/msm/adreno_a3xx.c
index d550c62..3d9ec6d 100644
--- a/drivers/gpu/msm/adreno_a3xx.c
+++ b/drivers/gpu/msm/adreno_a3xx.c
@@ -1,4 +1,4 @@
-/* Copyright (c) 2012, Code Aurora Forum. All rights reserved.
+/* Copyright (c) 2012-2013, The Linux Foundation. All rights reserved.
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License version 2 and
@@ -22,6 +22,11 @@
 #include "a3xx_reg.h"
 #include "adreno_a3xx_trace.h"
 
+/*
+ * Set of registers to dump for A3XX on postmortem and snapshot.
+ * Registers in pairs - first value is the start offset, second
+ * is the stop offset (inclusive)
+ */
 
 const unsigned int a3xx_registers[] = {
 	0x0000, 0x0002, 0x0010, 0x0012, 0x0018, 0x0018, 0x0020, 0x0027,
@@ -64,6 +69,9 @@
 
 const unsigned int a3xx_registers_count = ARRAY_SIZE(a3xx_registers) / 2;
 
+/* The following HLSQ register ranges were removed from the set read during
+ * fault tolerance, since reading these registers may cause the device to hang:
+ */
 const unsigned int a3xx_hlsq_registers[] = {
 	0x0e00, 0x0e05, 0x0e0c, 0x0e0c, 0x0e22, 0x0e23,
 	0x2200, 0x2212, 0x2214, 0x2217, 0x221a, 0x221a,
@@ -73,26 +81,58 @@
 const unsigned int a3xx_hlsq_registers_count =
 			ARRAY_SIZE(a3xx_hlsq_registers) / 2;
 
+/* The set of additional registers to be dumped for A330 */
+
+const unsigned int a330_registers[] = {
+	0x1d0, 0x1d0, 0x1d4, 0x1d4, 0x453, 0x453,
+};
+
+const unsigned int a330_registers_count = ARRAY_SIZE(a330_registers) / 2;
+
+/* Simple macro to facilitate bit setting in the gmem2sys and sys2gmem
+ * functions.
+ */
 
 #define _SET(_shift, _val) ((_val) << (_shift))
 
+/*
+ ****************************************************************************
+ *
+ * Context state shadow structure:
+ *
+ * +---------------------+------------+-------------+---------------------+---+
+ * | ALU Constant Shadow | Reg Shadow | C&V Buffers | Shader Instr Shadow |Tex|
+ * +---------------------+------------+-------------+---------------------+---+
+ *
+ *		 8K - ALU Constant Shadow (8K aligned)
+ *		 4K - H/W Register Shadow (8K aligned)
+ *		 5K - Command and Vertex Buffers
+ *		 8K - Shader Instruction Shadow
+ *		 ~6K - Texture Constant Shadow
+ *
+ *
+ ***************************************************************************
+ */
 
-#define ALU_SHADOW_SIZE      (8*1024) 
-#define REG_SHADOW_SIZE      (4*1024) 
-#define CMD_BUFFER_SIZE      (5*1024) 
-#define TEX_SIZE_MEM_OBJECTS 896      
-#define TEX_SIZE_MIPMAP      1936     
-#define TEX_SIZE_SAMPLER_OBJ 256      
+/* Sizes of all sections in state shadow memory */
+#define ALU_SHADOW_SIZE      (8*1024) /* 8KB */
+#define REG_SHADOW_SIZE      (4*1024) /* 4KB */
+#define CMD_BUFFER_SIZE      (5*1024) /* 5KB */
+#define TEX_SIZE_MEM_OBJECTS 896      /* bytes */
+#define TEX_SIZE_MIPMAP      1936     /* bytes */
+#define TEX_SIZE_SAMPLER_OBJ 256      /* bytes */
 #define TEX_SHADOW_SIZE                            \
 	((TEX_SIZE_MEM_OBJECTS + TEX_SIZE_MIPMAP + \
-	TEX_SIZE_SAMPLER_OBJ)*2) 
-#define SHADER_SHADOW_SIZE   (8*1024) 
+	TEX_SIZE_SAMPLER_OBJ)*2) /* ~6KB */
+#define SHADER_SHADOW_SIZE   (8*1024) /* 8KB */
 
+/* Total context size, excluding GMEM shadow */
 #define CONTEXT_SIZE                         \
 	(ALU_SHADOW_SIZE+REG_SHADOW_SIZE +   \
 	CMD_BUFFER_SIZE+SHADER_SHADOW_SIZE + \
 	TEX_SHADOW_SIZE)
 
+/* Offsets to different sections in context shadow memory */
 #define REG_OFFSET ALU_SHADOW_SIZE
 #define CMD_OFFSET (REG_OFFSET+REG_SHADOW_SIZE)
 #define SHADER_OFFSET (CMD_OFFSET+CMD_BUFFER_SIZE)
@@ -105,12 +145,14 @@
 #define FS_TEX_OFFSET_MIPMAP (FS_TEX_OFFSET_MEM_OBJECTS+TEX_SIZE_MEM_OBJECTS)
 #define FS_TEX_OFFSET_SAMPLER_OBJ (FS_TEX_OFFSET_MIPMAP+TEX_SIZE_MIPMAP)
 
+/* The offset for fragment shader data in HLSQ context */
 #define SSIZE (16*1024)
 
 #define HLSQ_SAMPLER_OFFSET 0x000
 #define HLSQ_MEMOBJ_OFFSET  0x400
 #define HLSQ_MIPMAP_OFFSET  0x800
 
+/* Use shadow RAM */
 #define HLSQ_SHADOW_BASE		(0x10000+SSIZE*2)
 
 #define REG_TO_MEM_LOOP_COUNT_SHIFT	18
@@ -124,6 +166,10 @@
 	((vis_cull_mode)   << PC_DRAW_INITIATOR_VISIBILITY_CULLING_MODE) | \
 	(1                 << PC_DRAW_INITIATOR_PRE_DRAW_INITIATOR_ENABLE))
 
+/*
+ * List of context registers (starting from dword offset 0x2000).
+ * Each line contains start and end of a range of registers.
+ */
 static const unsigned int context_register_ranges[] = {
 	A3XX_GRAS_CL_CLIP_CNTL, A3XX_GRAS_CL_CLIP_CNTL,
 	A3XX_GRAS_CL_GB_CLIP_ADJ, A3XX_GRAS_CL_GB_CLIP_ADJ,
@@ -158,6 +204,7 @@
 	A3XX_VPC_ATTR, A3XX_VPC_VARY_CYLWRAP_ENABLE_1,
 };
 
+/* Global registers that need to be saved separately */
 static const unsigned int global_registers[] = {
 	A3XX_GRAS_CL_USER_PLANE_X0, A3XX_GRAS_CL_USER_PLANE_Y0,
 	A3XX_GRAS_CL_USER_PLANE_Z0, A3XX_GRAS_CL_USER_PLANE_W0,
@@ -189,31 +236,35 @@
 
 #define GLOBAL_REGISTER_COUNT ARRAY_SIZE(global_registers)
 
+/* A scratchpad used to build commands during context create */
 static struct tmp_ctx {
-	unsigned int *cmd; 
+	unsigned int *cmd; /* Next available dword in C&V buffer */
 
-	
+	/* Addresses in command buffer where registers are saved */
 	uint32_t reg_values[GLOBAL_REGISTER_COUNT];
-	uint32_t gmem_base; 
+	uint32_t gmem_base; /* Base GPU address of GMEM */
 } tmp_ctx;
 
 #ifndef GSL_CONTEXT_SWITCH_CPU_SYNC
+/*
+ * Function for executing dest = ( (reg & and) ROL rol ) | or
+ */
 static unsigned int *rmw_regtomem(unsigned int *cmd,
 				  unsigned int reg, unsigned int and,
 				  unsigned int rol, unsigned int or,
 				  unsigned int dest)
 {
-	
+	/* CP_SCRATCH_REG2 = (CP_SCRATCH_REG2 & 0x00000000) | reg */
 	*cmd++ = cp_type3_packet(CP_REG_RMW, 3);
 	*cmd++ = (1 << 30) | A3XX_CP_SCRATCH_REG2;
-	*cmd++ = 0x00000000;	
-	*cmd++ = reg;		
+	*cmd++ = 0x00000000;	/* AND value */
+	*cmd++ = reg;		/* OR address */
 
-	
+	/* CP_SCRATCH_REG2 = ( (CP_SCRATCH_REG2 & and) ROL rol ) |  or */
 	*cmd++ = cp_type3_packet(CP_REG_RMW, 3);
 	*cmd++ = (rol << 24) | A3XX_CP_SCRATCH_REG2;
-	*cmd++ = and;		
-	*cmd++ = or;		
+	*cmd++ = and;		/* AND value */
+	*cmd++ = or;		/* OR value */
 
 	*cmd++ = cp_type3_packet(CP_REG_TO_MEM, 2);
 	*cmd++ = A3XX_CP_SCRATCH_REG2;
@@ -241,15 +292,23 @@
 	*cmd++ = 0;
 
 #ifndef CONFIG_MSM_KGSL_DISABLE_SHADOW_WRITES
+	/*
+	 * Context registers are already shadowed; just need to
+	 * disable shadowing to prevent corruption.
+	 */
 
 	*cmd++ = cp_type3_packet(CP_LOAD_CONSTANT_CONTEXT, 3);
 	*cmd++ = (drawctxt->gpustate.gpuaddr + REG_OFFSET) & 0xFFFFE000;
-	*cmd++ = 4 << 16;	
-	*cmd++ = 0x0;		
+	*cmd++ = 4 << 16;	/* regs, start=0 */
+	*cmd++ = 0x0;		/* count = 0 */
 
 #else
+	/*
+	 * Make sure the HW context has the correct register values before
+	 * reading them.
+	 */
 
-	
+	/* Write context registers into shadow */
 	for (i = 0; i < ARRAY_SIZE(context_register_ranges) / 2; i++) {
 		unsigned int start = context_register_ranges[i * 2];
 		unsigned int end = context_register_ranges[i * 2 + 1];
@@ -261,42 +320,75 @@
 	}
 #endif
 
-	
+	/* Need to handle some of the global registers separately */
 	for (i = 0; i < ARRAY_SIZE(global_registers); i++) {
 		*cmd++ = cp_type3_packet(CP_REG_TO_MEM, 2);
 		*cmd++ = global_registers[i];
 		*cmd++ = tmp_ctx.reg_values[i];
 	}
 
-	
+	/* Save vertex shader constants */
 	*cmd++ = cp_type3_packet(CP_COND_EXEC, 4);
 	*cmd++ = drawctxt->cond_execs[2].gpuaddr >> 2;
 	*cmd++ = drawctxt->cond_execs[2].gpuaddr >> 2;
 	*cmd++ = 0x0000FFFF;
-	*cmd++ = 3; 
+	*cmd++ = 3; /* EXEC_COUNT */
 	*cmd++ = cp_type3_packet(CP_REG_TO_MEM, 2);
 	drawctxt->constant_save_commands[1].hostptr = cmd;
 	drawctxt->constant_save_commands[1].gpuaddr =
 	    virt2gpu(cmd, &drawctxt->gpustate);
-	*cmd++ = 0;	
-	
+	/*
+	   From fixup:
+
+	   dwords = SP_VS_CTRL_REG1.VSCONSTLENGTH / 4
+	   src = (HLSQ_SHADOW_BASE + 0x2000) / 4
+
+	   From register spec:
+	   SP_VS_CTRL_REG1.VSCONSTLENGTH [09:00]: 0-512, unit = 128bits.
+	 */
+	*cmd++ = 0;	/* (dwords << REG_TO_MEM_LOOP_COUNT_SHIFT) | src  */
+	/* ALU constant shadow base */
 	*cmd++ = drawctxt->gpustate.gpuaddr & 0xfffffffc;
 
-	
+	/* Save fragment shader constants */
 	*cmd++ = cp_type3_packet(CP_COND_EXEC, 4);
 	*cmd++ = drawctxt->cond_execs[3].gpuaddr >> 2;
 	*cmd++ = drawctxt->cond_execs[3].gpuaddr >> 2;
 	*cmd++ = 0x0000FFFF;
-	*cmd++ = 3; 
+	*cmd++ = 3; /* EXEC_COUNT */
 	*cmd++ = cp_type3_packet(CP_REG_TO_MEM, 2);
 	drawctxt->constant_save_commands[2].hostptr = cmd;
 	drawctxt->constant_save_commands[2].gpuaddr =
 	    virt2gpu(cmd, &drawctxt->gpustate);
-	*cmd++ = 0;	
+	/*
+	   From fixup:
 
-	*cmd++ = 0;		
+	   dwords = SP_FS_CTRL_REG1.FSCONSTLENGTH / 4
+	   src = (HLSQ_SHADOW_BASE + 0x2000 + SSIZE) / 4
 
-	
+	   From register spec:
+	   SP_FS_CTRL_REG1.FSCONSTLENGTH [09:00]: 0-512, unit = 128bits.
+	 */
+	*cmd++ = 0;	/* (dwords << REG_TO_MEM_LOOP_COUNT_SHIFT) | src  */
+
+	/*
+	   From fixup:
+
+	   base = drawctxt->gpustate.gpuaddr (ALU constant shadow base)
+	   offset = SP_FS_OBJ_OFFSET_REG.CONSTOBJECTSTARTOFFSET
+
+	   From register spec:
+	   SP_FS_OBJ_OFFSET_REG.CONSTOBJECTSTARTOFFSET [16:24]: Constant object
+	   start offset in on chip RAM,
+	   128bit aligned
+
+	   dst = base + offset
+	   Because of the base alignment we can use
+	   dst = base | offset
+	 */
+	*cmd++ = 0;		/* dst */
+
+	/* Save VS texture memory objects */
 	*cmd++ = cp_type3_packet(CP_REG_TO_MEM, 2);
 	*cmd++ =
 	    ((TEX_SIZE_MEM_OBJECTS / 4) << REG_TO_MEM_LOOP_COUNT_SHIFT) |
@@ -305,7 +397,7 @@
 	    (drawctxt->gpustate.gpuaddr +
 	     VS_TEX_OFFSET_MEM_OBJECTS) & 0xfffffffc;
 
-	
+	/* Save VS texture mipmap pointers */
 	*cmd++ = cp_type3_packet(CP_REG_TO_MEM, 2);
 	*cmd++ =
 	    ((TEX_SIZE_MIPMAP / 4) << REG_TO_MEM_LOOP_COUNT_SHIFT) |
@@ -313,7 +405,7 @@
 	*cmd++ =
 	    (drawctxt->gpustate.gpuaddr + VS_TEX_OFFSET_MIPMAP) & 0xfffffffc;
 
-	
+	/* Save VS texture sampler objects */
 	*cmd++ = cp_type3_packet(CP_REG_TO_MEM, 2);
 	*cmd++ = ((TEX_SIZE_SAMPLER_OBJ / 4) << REG_TO_MEM_LOOP_COUNT_SHIFT) |
 		((HLSQ_SHADOW_BASE + HLSQ_SAMPLER_OFFSET) / 4);
@@ -321,7 +413,7 @@
 	    (drawctxt->gpustate.gpuaddr +
 	     VS_TEX_OFFSET_SAMPLER_OBJ) & 0xfffffffc;
 
-	
+	/* Save FS texture memory objects */
 	*cmd++ = cp_type3_packet(CP_REG_TO_MEM, 2);
 	*cmd++ =
 	    ((TEX_SIZE_MEM_OBJECTS / 4) << REG_TO_MEM_LOOP_COUNT_SHIFT) |
@@ -330,7 +422,7 @@
 	    (drawctxt->gpustate.gpuaddr +
 	     FS_TEX_OFFSET_MEM_OBJECTS) & 0xfffffffc;
 
-	
+	/* Save FS texture mipmap pointers */
 	*cmd++ = cp_type3_packet(CP_REG_TO_MEM, 2);
 	*cmd++ =
 	    ((TEX_SIZE_MIPMAP / 4) << REG_TO_MEM_LOOP_COUNT_SHIFT) |
@@ -338,7 +430,7 @@
 	*cmd++ =
 	    (drawctxt->gpustate.gpuaddr + FS_TEX_OFFSET_MIPMAP) & 0xfffffffc;
 
-	
+	/* Save FS texture sampler objects */
 	*cmd++ = cp_type3_packet(CP_REG_TO_MEM, 2);
 	*cmd++ =
 	    ((TEX_SIZE_SAMPLER_OBJ / 4) << REG_TO_MEM_LOOP_COUNT_SHIFT) |
@@ -347,12 +439,13 @@
 	    (drawctxt->gpustate.gpuaddr +
 	     FS_TEX_OFFSET_SAMPLER_OBJ) & 0xfffffffc;
 
-	
+	/* Create indirect buffer command for above command sequence */
 	create_ib1(drawctxt, drawctxt->regconstant_save, start, cmd);
 
 	tmp_ctx.cmd = cmd;
 }
 
+/* Copy GMEM contents to system memory shadow. */
 static unsigned int *build_gmem2sys_cmds(struct adreno_device *adreno_dev,
 					 struct adreno_context *drawctxt,
 					 struct gmem_shadow_t *shadow)
@@ -366,28 +459,28 @@
 	*cmds++ = cp_type3_packet(CP_SET_CONSTANT, 3);
 	*cmds++ = CP_REG(A3XX_RB_MODE_CONTROL);
 
-	
+	/* RB_MODE_CONTROL */
 	*cmds++ = _SET(RB_MODECONTROL_RENDER_MODE, RB_RESOLVE_PASS) |
 		_SET(RB_MODECONTROL_MARB_CACHE_SPLIT_MODE, 1) |
 		_SET(RB_MODECONTROL_PACKER_TIMER_ENABLE, 1);
-	
+	/* RB_RENDER_CONTROL */
 	*cmds++ = _SET(RB_RENDERCONTROL_BIN_WIDTH, shadow->width >> 5) |
 		_SET(RB_RENDERCONTROL_DISABLE_COLOR_PIPE, 1);
 
 	*cmds++ = cp_type3_packet(CP_SET_CONSTANT, 5);
 	*cmds++ = CP_REG(A3XX_RB_COPY_CONTROL);
-	
+	/* RB_COPY_CONTROL */
 	*cmds++ = _SET(RB_COPYCONTROL_RESOLVE_CLEAR_MODE,
 		RB_CLEAR_MODE_RESOLVE) |
 		_SET(RB_COPYCONTROL_COPY_GMEM_BASE,
 		tmp_ctx.gmem_base >> 14);
-	
+	/* RB_COPY_DEST_BASE */
 	*cmds++ = _SET(RB_COPYDESTBASE_COPY_DEST_BASE,
 		shadow->gmemshadow.gpuaddr >> 5);
-	
+	/* RB_COPY_DEST_PITCH */
 	*cmds++ = _SET(RB_COPYDESTPITCH_COPY_DEST_PITCH,
 		(shadow->pitch * 4) / 32);
-	
+	/* RB_COPY_DEST_INFO */
 	*cmds++ = _SET(RB_COPYDESTINFO_COPY_DEST_TILE,
 		RB_TILINGMODE_LINEAR) |
 		_SET(RB_COPYDESTINFO_COPY_DEST_FORMAT, RB_R8G8B8A8_UNORM) |
@@ -396,34 +489,34 @@
 
 	*cmds++ = cp_type3_packet(CP_SET_CONSTANT, 2);
 	*cmds++ = CP_REG(A3XX_GRAS_SC_CONTROL);
-	
+	/* GRAS_SC_CONTROL */
 	*cmds++ = _SET(GRAS_SC_CONTROL_RENDER_MODE, 2);
 
 	*cmds++ = cp_type3_packet(CP_SET_CONSTANT, 3);
 	*cmds++ = CP_REG(A3XX_VFD_CONTROL_0);
-	
+	/* VFD_CONTROL_0 */
 	*cmds++ = _SET(VFD_CTRLREG0_TOTALATTRTOVS, 4) |
 		_SET(VFD_CTRLREG0_PACKETSIZE, 2) |
 		_SET(VFD_CTRLREG0_STRMDECINSTRCNT, 1) |
 		_SET(VFD_CTRLREG0_STRMFETCHINSTRCNT, 1);
-	
+	/* VFD_CONTROL_1 */
 	*cmds++ = _SET(VFD_CTRLREG1_MAXSTORAGE, 1) |
 		_SET(VFD_CTRLREG1_REGID4VTX,  252) |
 		_SET(VFD_CTRLREG1_REGID4INST,  252);
 
 	*cmds++ = cp_type3_packet(CP_SET_CONSTANT, 3);
 	*cmds++ = CP_REG(A3XX_VFD_FETCH_INSTR_0_0);
-	
+	/* VFD_FETCH_INSTR_0_0 */
 	*cmds++ = _SET(VFD_FETCHINSTRUCTIONS_FETCHSIZE, 11) |
 		_SET(VFD_FETCHINSTRUCTIONS_BUFSTRIDE, 12) |
 		_SET(VFD_FETCHINSTRUCTIONS_STEPRATE, 1);
-	
+	/* VFD_FETCH_INSTR_1_0 */
 	*cmds++ = _SET(VFD_BASEADDR_BASEADDR,
 		shadow->quad_vertices.gpuaddr);
 
 	*cmds++ = cp_type3_packet(CP_SET_CONSTANT, 2);
 	*cmds++ = CP_REG(A3XX_VFD_DECODE_INSTR_0);
-	
+	/* VFD_DECODE_INSTR_0 */
 	*cmds++ = _SET(VFD_DECODEINSTRUCTIONS_WRITEMASK, 0x0F) |
 		_SET(VFD_DECODEINSTRUCTIONS_CONSTFILL, 1) |
 		_SET(VFD_DECODEINSTRUCTIONS_FORMAT, 2) |
@@ -432,47 +525,47 @@
 
 	*cmds++ = cp_type3_packet(CP_SET_CONSTANT, 5);
 	*cmds++ = CP_REG(A3XX_HLSQ_CONTROL_0_REG);
-	
+	/* HLSQ_CONTROL_0_REG */
 	*cmds++ = _SET(HLSQ_CTRL0REG_FSTHREADSIZE, HLSQ_FOUR_PIX_QUADS) |
 		_SET(HLSQ_CTRL0REG_FSSUPERTHREADENABLE, 1) |
 		_SET(HLSQ_CTRL0REG_RESERVED2, 1) |
 		_SET(HLSQ_CTRL0REG_SPCONSTFULLUPDATE, 1);
-	
+	/* HLSQ_CONTROL_1_REG */
 	*cmds++ = _SET(HLSQ_CTRL1REG_VSTHREADSIZE, HLSQ_TWO_VTX_QUADS) |
 		_SET(HLSQ_CTRL1REG_VSSUPERTHREADENABLE, 1);
-	
+	/* HLSQ_CONTROL_2_REG */
 	*cmds++ = _SET(HLSQ_CTRL2REG_PRIMALLOCTHRESHOLD, 31);
-	
+	/* HLSQ_CONTROL_3_REG */
 	*cmds++ = 0x00000000;
 
 	*cmds++ = cp_type3_packet(CP_SET_CONSTANT, 5);
 	*cmds++ = CP_REG(A3XX_HLSQ_VS_CONTROL_REG);
-	
+	/* HLSQ_VS_CONTROL_REG */
 	*cmds++ = _SET(HLSQ_VSCTRLREG_VSINSTRLENGTH, 1);
-	
+	/* HLSQ_FS_CONTROL_REG */
 	*cmds++ = _SET(HLSQ_FSCTRLREG_FSCONSTLENGTH, 1) |
 		_SET(HLSQ_FSCTRLREG_FSCONSTSTARTOFFSET, 128) |
 		_SET(HLSQ_FSCTRLREG_FSINSTRLENGTH, 1);
-	
+	/* HLSQ_CONST_VSPRESV_RANGE_REG */
 	*cmds++ = 0x00000000;
-	
+	/* HLSQ_CONST_FSPRESV_RANGE_REG */
 	*cmds++ = _SET(HLSQ_CONSTFSPRESERVEDRANGEREG_STARTENTRY, 32) |
 		_SET(HLSQ_CONSTFSPRESERVEDRANGEREG_ENDENTRY, 32);
 
 	*cmds++ = cp_type3_packet(CP_SET_CONSTANT, 2);
 	*cmds++ = CP_REG(A3XX_SP_FS_LENGTH_REG);
-	
+	/* SP_FS_LENGTH_REG */
 	*cmds++ = _SET(SP_SHADERLENGTH_LEN, 1);
 
 	*cmds++ = cp_type3_packet(CP_SET_CONSTANT, 2);
 	*cmds++ = CP_REG(A3XX_SP_SP_CTRL_REG);
-	
+	/* SP_SP_CTRL_REG */
 	*cmds++ = _SET(SP_SPCTRLREG_SLEEPMODE, 1) |
 		_SET(SP_SPCTRLREG_LOMODE, 1);
 
 	*cmds++ = cp_type3_packet(CP_SET_CONSTANT, 12);
 	*cmds++ = CP_REG(A3XX_SP_VS_CTRL_REG0);
-	
+	/* SP_VS_CTRL_REG0 */
 	*cmds++ = _SET(SP_VSCTRLREG0_VSTHREADMODE, SP_MULTI) |
 		_SET(SP_VSCTRLREG0_VSINSTRBUFFERMODE, SP_BUFFER_MODE) |
 		_SET(SP_VSCTRLREG0_VSICACHEINVALID, 1) |
@@ -480,47 +573,47 @@
 		_SET(SP_VSCTRLREG0_VSTHREADSIZE, SP_TWO_VTX_QUADS) |
 		_SET(SP_VSCTRLREG0_VSSUPERTHREADMODE, 1) |
 		_SET(SP_VSCTRLREG0_VSLENGTH, 1);
-	
+	/* SP_VS_CTRL_REG1 */
 	*cmds++ = _SET(SP_VSCTRLREG1_VSINITIALOUTSTANDING, 4);
-	
+	/* SP_VS_PARAM_REG */
 	*cmds++ = _SET(SP_VSPARAMREG_PSIZEREGID, 252);
-	
+	/* SP_VS_OUT_REG_0 */
 	*cmds++ = 0x00000000;
-	
+	/* SP_VS_OUT_REG_1 */
 	*cmds++ = 0x00000000;
-	
+	/* SP_VS_OUT_REG_2 */
 	*cmds++ = 0x00000000;
-	
+	/* SP_VS_OUT_REG_3 */
 	*cmds++ = 0x00000000;
-	
+	/* SP_VS_OUT_REG_4 */
 	*cmds++ = 0x00000000;
-	
+	/* SP_VS_OUT_REG_5 */
 	*cmds++ = 0x00000000;
-	
+	/* SP_VS_OUT_REG_6 */
 	*cmds++ = 0x00000000;
-	
+	/* SP_VS_OUT_REG_7 */
 	*cmds++ = 0x00000000;
 
 	*cmds++ = cp_type3_packet(CP_SET_CONSTANT, 7);
 	*cmds++ = CP_REG(A3XX_SP_VS_VPC_DST_REG_0);
-	
+	/* SP_VS_VPC_DST_REG_0 */
 	*cmds++ = 0x00000000;
-	
+	/* SP_VS_VPC_DST_REG_1 */
 	*cmds++ = 0x00000000;
-	
+	/* SP_VS_VPC_DST_REG_2 */
 	*cmds++ = 0x00000000;
-	
+	/* SP_VS_VPC_DST_REG_3 */
 	*cmds++ = 0x00000000;
-	
+	/* SP_VS_OBJ_OFFSET_REG */
 	*cmds++ = 0x00000000;
-	
+	/* SP_VS_OBJ_START_REG */
 	*cmds++ = 0x00000000;
 
 	*cmds++ = cp_type3_packet(CP_SET_CONSTANT, 6);
 	*cmds++ = CP_REG(A3XX_SP_VS_LENGTH_REG);
-	
+	/* SP_VS_LENGTH_REG */
 	*cmds++ = _SET(SP_SHADERLENGTH_LEN, 1);
-	
+	/* SP_FS_CTRL_REG0 */
 	*cmds++ = _SET(SP_FSCTRLREG0_FSTHREADMODE, SP_MULTI) |
 		_SET(SP_FSCTRLREG0_FSINSTRBUFFERMODE, SP_BUFFER_MODE) |
 		_SET(SP_FSCTRLREG0_FSICACHEINVALID, 1) |
@@ -529,61 +622,61 @@
 		_SET(SP_FSCTRLREG0_FSTHREADSIZE, SP_FOUR_PIX_QUADS) |
 		_SET(SP_FSCTRLREG0_FSSUPERTHREADMODE, 1) |
 		_SET(SP_FSCTRLREG0_FSLENGTH, 1);
-	
+	/* SP_FS_CTRL_REG1 */
 	*cmds++ = _SET(SP_FSCTRLREG1_FSCONSTLENGTH, 1) |
 		_SET(SP_FSCTRLREG1_HALFPRECVAROFFSET, 63);
-	
+	/* SP_FS_OBJ_OFFSET_REG */
 	*cmds++ = _SET(SP_OBJOFFSETREG_CONSTOBJECTSTARTOFFSET, 128) |
 		_SET(SP_OBJOFFSETREG_SHADEROBJOFFSETINIC, 127);
-	
+	/* SP_FS_OBJ_START_REG */
 	*cmds++ = 0x00000000;
 
 	*cmds++ = cp_type3_packet(CP_SET_CONSTANT, 3);
 	*cmds++ = CP_REG(A3XX_SP_FS_FLAT_SHAD_MODE_REG_0);
-	
+	/* SP_FS_FLAT_SHAD_MODE_REG_0 */
 	*cmds++ = 0x00000000;
-	
+	/* SP_FS_FLAT_SHAD_MODE_REG_1 */
 	*cmds++ = 0x00000000;
 
 	*cmds++ = cp_type3_packet(CP_SET_CONSTANT, 2);
 	*cmds++ = CP_REG(A3XX_SP_FS_OUTPUT_REG);
-	
+	/* SP_FS_OUTPUT_REG */
 	*cmds++ = _SET(SP_IMAGEOUTPUTREG_DEPTHOUTMODE, SP_PIXEL_BASED);
 
 	*cmds++ = cp_type3_packet(CP_SET_CONSTANT, 5);
 	*cmds++ = CP_REG(A3XX_SP_FS_MRT_REG_0);
-	
+	/* SP_FS_MRT_REG_0 */
 	*cmds++ = _SET(SP_FSMRTREG_PRECISION, 1);
 
-	
+	/* SP_FS_MRT_REG_1 */
 	*cmds++ = 0x00000000;
-	
+	/* SP_FS_MRT_REG_2 */
 	*cmds++ = 0x00000000;
-	
+	/* SP_FS_MRT_REG_3 */
 	*cmds++ = 0x00000000;
 
 	*cmds++ = cp_type3_packet(CP_SET_CONSTANT, 11);
 	*cmds++ = CP_REG(A3XX_VPC_ATTR);
-	
+	/* VPC_ATTR */
 	*cmds++ = _SET(VPC_VPCATTR_THRHDASSIGN, 1) |
 		_SET(VPC_VPCATTR_LMSIZE, 1);
-	
+	/* VPC_PACK */
 	*cmds++ = 0x00000000;
-	
+	/* VPC_VARYING_INTERP_MODE_0 */
 	*cmds++ = 0x00000000;
-	
+	/* VPC_VARYING_INTERP_MODE_1 */
 	*cmds++ = 0x00000000;
-	
+	/* VPC_VARYING_INTERP_MODE_2 */
 	*cmds++ = 0x00000000;
-	
+	/* VPC_VARYING_INTERP_MODE_3 */
 	*cmds++ = 0x00000000;
-	
+	/* VPC_VARYING_PS_REPL_MODE_0 */
 	*cmds++ = 0x00000000;
-	
+	/* VPC_VARYING_PS_REPL_MODE_1 */
 	*cmds++ = 0x00000000;
-	
+	/* VPC_VARYING_PS_REPL_MODE_2 */
 	*cmds++ = 0x00000000;
-	
+	/* VPC_VARYING_PS_REPL_MODE_3 */
 	*cmds++ = 0x00000000;
 
 	*cmds++ = cp_type3_packet(CP_LOAD_STATE, 10);
@@ -594,13 +687,13 @@
 	*cmds++ = (HLSQ_SP_VS_INSTR << CP_LOADSTATE_STATETYPE_SHIFT)
 		| (0 << CP_LOADSTATE_EXTSRCADDR_SHIFT);
 
-	
+	/* (sy)(rpt3)mov.f32f32 r0.y, (r)r1.y; */
 	*cmds++ = 0x00000000; *cmds++ = 0x13001000;
-	
+	/* end; */
 	*cmds++ = 0x00000000; *cmds++ = 0x00000000;
-	
+	/* nop; */
 	*cmds++ = 0x00000000; *cmds++ = 0x00000000;
-	
+	/* nop; */
 	*cmds++ = 0x00000000; *cmds++ = 0x00000000;
 
 
@@ -618,13 +711,13 @@
 	*cmds++ = (HLSQ_SP_FS_INSTR << CP_LOADSTATE_STATETYPE_SHIFT)
 		| (0 << CP_LOADSTATE_EXTSRCADDR_SHIFT);
 
-	
+	/* (sy)(rpt3)mov.f32f32 r0.y, (r)c0.x; */
 	*cmds++ = 0x00000000; *cmds++ = 0x30201b00;
-	
+	/* end; */
 	*cmds++ = 0x00000000; *cmds++ = 0x03000000;
-	
+	/* nop; */
 	*cmds++ = 0x00000000; *cmds++ = 0x00000000;
-	
+	/* nop; */
 	*cmds++ = 0x00000000; *cmds++ = 0x00000000;
 
 
@@ -638,18 +731,18 @@
 
 	*cmds++ = cp_type3_packet(CP_SET_CONSTANT, 2);
 	*cmds++ = CP_REG(A3XX_RB_MSAA_CONTROL);
-	
+	/* RB_MSAA_CONTROL */
 	*cmds++ = _SET(RB_MSAACONTROL_MSAA_DISABLE, 1) |
 		_SET(RB_MSAACONTROL_SAMPLE_MASK, 0xFFFF);
 
 	*cmds++ = cp_type3_packet(CP_SET_CONSTANT, 2);
 	*cmds++ = CP_REG(A3XX_RB_DEPTH_CONTROL);
-	
+	/* RB_DEPTH_CONTROL */
 	*cmds++ = _SET(RB_DEPTHCONTROL_Z_TEST_FUNC, RB_FRAG_NEVER);
 
 	*cmds++ = cp_type3_packet(CP_SET_CONSTANT, 2);
 	*cmds++ = CP_REG(A3XX_RB_STENCIL_CONTROL);
-	
+	/* RB_STENCIL_CONTROL */
 	*cmds++ = _SET(RB_STENCILCONTROL_STENCIL_FUNC, RB_REF_NEVER) |
 		_SET(RB_STENCILCONTROL_STENCIL_FAIL, RB_STENCIL_KEEP) |
 		_SET(RB_STENCILCONTROL_STENCIL_ZPASS, RB_STENCIL_KEEP) |
@@ -661,12 +754,12 @@
 
 	*cmds++ = cp_type3_packet(CP_SET_CONSTANT, 2);
 	*cmds++ = CP_REG(A3XX_GRAS_SU_MODE_CONTROL);
-	
+	/* GRAS_SU_MODE_CONTROL */
 	*cmds++ = 0x00000000;
 
 	*cmds++ = cp_type3_packet(CP_SET_CONSTANT, 2);
 	*cmds++ = CP_REG(A3XX_RB_MRT_CONTROL0);
-	
+	/* RB_MRT_CONTROL0 */
 	*cmds++ = _SET(RB_MRTCONTROL_READ_DEST_ENABLE, 1) |
 		_SET(RB_MRTCONTROL_ROP_CODE, 12) |
 		_SET(RB_MRTCONTROL_DITHER_MODE, RB_DITHER_ALWAYS) |
@@ -674,49 +767,49 @@
 
 	*cmds++ = cp_type3_packet(CP_SET_CONSTANT, 3);
 	*cmds++ = CP_REG(A3XX_RB_MRT_BLEND_CONTROL0);
-	
+	/* RB_MRT_BLEND_CONTROL0 */
 	*cmds++ = _SET(RB_MRTBLENDCONTROL_RGB_SRC_FACTOR, RB_FACTOR_ONE) |
 		_SET(RB_MRTBLENDCONTROL_RGB_BLEND_OPCODE, RB_BLEND_OP_ADD) |
 		_SET(RB_MRTBLENDCONTROL_RGB_DEST_FACTOR, RB_FACTOR_ZERO) |
 		_SET(RB_MRTBLENDCONTROL_ALPHA_SRC_FACTOR, RB_FACTOR_ONE) |
 		_SET(RB_MRTBLENDCONTROL_ALPHA_DEST_FACTOR, RB_FACTOR_ZERO) |
 		_SET(RB_MRTBLENDCONTROL_CLAMP_ENABLE, 1);
-	
+	/* RB_MRT_CONTROL1 */
 	*cmds++ = _SET(RB_MRTCONTROL_READ_DEST_ENABLE, 1) |
 		_SET(RB_MRTCONTROL_DITHER_MODE, RB_DITHER_DISABLE) |
 		_SET(RB_MRTCONTROL_COMPONENT_ENABLE, 0xF);
 
 	*cmds++ = cp_type3_packet(CP_SET_CONSTANT, 3);
 	*cmds++ = CP_REG(A3XX_RB_MRT_BLEND_CONTROL1);
-	
+	/* RB_MRT_BLEND_CONTROL1 */
 	*cmds++ = _SET(RB_MRTBLENDCONTROL_RGB_SRC_FACTOR, RB_FACTOR_ONE) |
 		_SET(RB_MRTBLENDCONTROL_RGB_BLEND_OPCODE, RB_BLEND_OP_ADD) |
 		_SET(RB_MRTBLENDCONTROL_RGB_DEST_FACTOR, RB_FACTOR_ZERO) |
 		_SET(RB_MRTBLENDCONTROL_ALPHA_SRC_FACTOR, RB_FACTOR_ONE) |
 		_SET(RB_MRTBLENDCONTROL_ALPHA_DEST_FACTOR, RB_FACTOR_ZERO) |
 		_SET(RB_MRTBLENDCONTROL_CLAMP_ENABLE, 1);
-	
+	/* RB_MRT_CONTROL2 */
 	*cmds++ = _SET(RB_MRTCONTROL_READ_DEST_ENABLE, 1) |
 		_SET(RB_MRTCONTROL_DITHER_MODE, RB_DITHER_DISABLE) |
 		_SET(RB_MRTCONTROL_COMPONENT_ENABLE, 0xF);
 
 	*cmds++ = cp_type3_packet(CP_SET_CONSTANT, 3);
 	*cmds++ = CP_REG(A3XX_RB_MRT_BLEND_CONTROL2);
-	
+	/* RB_MRT_BLEND_CONTROL2 */
 	*cmds++ = _SET(RB_MRTBLENDCONTROL_RGB_SRC_FACTOR, RB_FACTOR_ONE) |
 		_SET(RB_MRTBLENDCONTROL_RGB_BLEND_OPCODE, RB_BLEND_OP_ADD) |
 		_SET(RB_MRTBLENDCONTROL_RGB_DEST_FACTOR, RB_FACTOR_ZERO) |
 		_SET(RB_MRTBLENDCONTROL_ALPHA_SRC_FACTOR, RB_FACTOR_ONE) |
 		_SET(RB_MRTBLENDCONTROL_ALPHA_DEST_FACTOR, RB_FACTOR_ZERO) |
 		_SET(RB_MRTBLENDCONTROL_CLAMP_ENABLE, 1);
-	
+	/* RB_MRT_CONTROL3 */
 	*cmds++ = _SET(RB_MRTCONTROL_READ_DEST_ENABLE, 1) |
 		_SET(RB_MRTCONTROL_DITHER_MODE, RB_DITHER_DISABLE) |
 		_SET(RB_MRTCONTROL_COMPONENT_ENABLE, 0xF);
 
 	*cmds++ = cp_type3_packet(CP_SET_CONSTANT, 2);
 	*cmds++ = CP_REG(A3XX_RB_MRT_BLEND_CONTROL3);
-	
+	/* RB_MRT_BLEND_CONTROL3 */
 	*cmds++ = _SET(RB_MRTBLENDCONTROL_RGB_SRC_FACTOR, RB_FACTOR_ONE) |
 		_SET(RB_MRTBLENDCONTROL_RGB_BLEND_OPCODE, RB_BLEND_OP_ADD) |
 		_SET(RB_MRTBLENDCONTROL_RGB_DEST_FACTOR, RB_FACTOR_ZERO) |
@@ -726,36 +819,36 @@
 
 	*cmds++ = cp_type3_packet(CP_SET_CONSTANT, 5);
 	*cmds++ = CP_REG(A3XX_VFD_INDEX_MIN);
-	
+	/* VFD_INDEX_MIN */
 	*cmds++ = 0x00000000;
-	
+	/* VFD_INDEX_MAX */
 	*cmds++ = 0x155;
-	
+	/* VFD_INSTANCEID_OFFSET */
 	*cmds++ = 0x00000000;
-	
+	/* VFD_INDEX_OFFSET */
 	*cmds++ = 0x00000000;
 
 	*cmds++ = cp_type3_packet(CP_SET_CONSTANT, 2);
 	*cmds++ = CP_REG(A3XX_VFD_VS_THREADING_THRESHOLD);
-	
+	/* VFD_VS_THREADING_THRESHOLD */
 	*cmds++ = _SET(VFD_THREADINGTHRESHOLD_REGID_THRESHOLD, 15) |
 		_SET(VFD_THREADINGTHRESHOLD_REGID_VTXCNT, 252);
 
 	*cmds++ = cp_type3_packet(CP_SET_CONSTANT, 2);
 	*cmds++ = CP_REG(A3XX_TPL1_TP_VS_TEX_OFFSET);
-	
+	/* TPL1_TP_VS_TEX_OFFSET */
 	*cmds++ = 0;
 
 	*cmds++ = cp_type3_packet(CP_SET_CONSTANT, 2);
 	*cmds++ = CP_REG(A3XX_TPL1_TP_FS_TEX_OFFSET);
-	
+	/* TPL1_TP_FS_TEX_OFFSET */
 	*cmds++ = _SET(TPL1_TPTEXOFFSETREG_SAMPLEROFFSET, 16) |
 		_SET(TPL1_TPTEXOFFSETREG_MEMOBJOFFSET, 16) |
 		_SET(TPL1_TPTEXOFFSETREG_BASETABLEPTR, 224);
 
 	*cmds++ = cp_type3_packet(CP_SET_CONSTANT, 2);
 	*cmds++ = CP_REG(A3XX_PC_PRIM_VTX_CNTL);
-	
+	/* PC_PRIM_VTX_CNTL */
 	*cmds++ = _SET(PC_PRIM_VTX_CONTROL_POLYMODE_FRONT_PTYPE,
 		PC_DRAW_TRIANGLES) |
 		_SET(PC_PRIM_VTX_CONTROL_POLYMODE_BACK_PTYPE,
@@ -764,41 +857,41 @@
 
 	*cmds++ = cp_type3_packet(CP_SET_CONSTANT, 3);
 	*cmds++ = CP_REG(A3XX_GRAS_SC_WINDOW_SCISSOR_TL);
-	
+	/* GRAS_SC_WINDOW_SCISSOR_TL */
 	*cmds++ = 0x00000000;
-	
+	/* GRAS_SC_WINDOW_SCISSOR_BR */
 	*cmds++ = _SET(GRAS_SC_WINDOW_SCISSOR_BR_BR_X, shadow->width - 1) |
 		_SET(GRAS_SC_WINDOW_SCISSOR_BR_BR_Y, shadow->height - 1);
 
 	*cmds++ = cp_type3_packet(CP_SET_CONSTANT, 3);
 	*cmds++ = CP_REG(A3XX_GRAS_SC_SCREEN_SCISSOR_TL);
-	
+	/* GRAS_SC_SCREEN_SCISSOR_TL */
 	*cmds++ = 0x00000000;
-	
+	/* GRAS_SC_SCREEN_SCISSOR_BR */
 	*cmds++ = _SET(GRAS_SC_SCREEN_SCISSOR_BR_BR_X, shadow->width - 1) |
 		_SET(GRAS_SC_SCREEN_SCISSOR_BR_BR_Y, shadow->height - 1);
 
 	*cmds++ = cp_type3_packet(CP_SET_CONSTANT, 5);
 	*cmds++ = CP_REG(A3XX_GRAS_CL_VPORT_XOFFSET);
-	
+	/* GRAS_CL_VPORT_XOFFSET */
 	*cmds++ = 0x00000000;
-	
+	/* GRAS_CL_VPORT_XSCALE */
 	*cmds++ = _SET(GRAS_CL_VPORT_XSCALE_VPORT_XSCALE, 0x3f800000);
-	
+	/* GRAS_CL_VPORT_YOFFSET */
 	*cmds++ = 0x00000000;
-	
+	/* GRAS_CL_VPORT_YSCALE */
 	*cmds++ = _SET(GRAS_CL_VPORT_YSCALE_VPORT_YSCALE, 0x3f800000);
 
 	*cmds++ = cp_type3_packet(CP_SET_CONSTANT, 3);
 	*cmds++ = CP_REG(A3XX_GRAS_CL_VPORT_ZOFFSET);
-	
+	/* GRAS_CL_VPORT_ZOFFSET */
 	*cmds++ = 0x00000000;
-	
+	/* GRAS_CL_VPORT_ZSCALE */
 	*cmds++ = _SET(GRAS_CL_VPORT_ZSCALE_VPORT_ZSCALE, 0x3f800000);
 
 	*cmds++ = cp_type3_packet(CP_SET_CONSTANT, 2);
 	*cmds++ = CP_REG(A3XX_GRAS_CL_CLIP_CNTL);
-	
+	/* GRAS_CL_CLIP_CNTL */
 	*cmds++ = _SET(GRAS_CL_CLIP_CNTL_CLIP_DISABLE, 1) |
 		_SET(GRAS_CL_CLIP_CNTL_ZFAR_CLIP_DISABLE, 1) |
 		_SET(GRAS_CL_CLIP_CNTL_VP_CLIP_CODE_IGNORE, 1) |
@@ -807,14 +900,14 @@
 
 	*cmds++ = cp_type3_packet(CP_SET_CONSTANT, 2);
 	*cmds++ = CP_REG(A3XX_GRAS_CL_GB_CLIP_ADJ);
-	
+	/* GRAS_CL_GB_CLIP_ADJ */
 	*cmds++ = 0x00000000;
 
 	*cmds++ = cp_type3_packet(CP_WAIT_FOR_IDLE, 1);
 	*cmds++ = 0x00000000;
 
 
-	
+	/* oxili_generate_context_roll_packets */
 	*cmds++ = cp_type0_packet(A3XX_SP_VS_CTRL_REG0, 1);
 	*cmds++ = 0x00000400;
 
@@ -822,12 +915,12 @@
 	*cmds++ = 0x00000400;
 
 	*cmds++ = cp_type0_packet(A3XX_SP_VS_PVT_MEM_SIZE_REG, 1);
-	*cmds++ = 0x00008000; 
+	*cmds++ = 0x00008000; /* SP_VS_MEM_SIZE_REG */
 
 	*cmds++ = cp_type0_packet(A3XX_SP_FS_PVT_MEM_SIZE_REG, 1);
-	*cmds++ = 0x00008000; 
+	*cmds++ = 0x00008000; /* SP_FS_MEM_SIZE_REG */
 
-	
+	/* Clear cache invalidate bit when re-loading the shader control regs */
 	*cmds++ = cp_type0_packet(A3XX_SP_VS_CTRL_REG0, 1);
 	*cmds++ = _SET(SP_VSCTRLREG0_VSTHREADMODE, SP_MULTI) |
 		_SET(SP_VSCTRLREG0_VSINSTRBUFFERMODE, SP_BUFFER_MODE) |
@@ -846,38 +939,43 @@
 		_SET(SP_FSCTRLREG0_FSLENGTH, 1);
 
 	*cmds++ = cp_type0_packet(A3XX_SP_VS_PVT_MEM_SIZE_REG, 1);
-	*cmds++ = 0x00000000;		 
+	*cmds++ = 0x00000000;		 /* SP_VS_MEM_SIZE_REG */
 
 	*cmds++ = cp_type0_packet(A3XX_SP_FS_PVT_MEM_SIZE_REG, 1);
-	*cmds++ = 0x00000000;		 
+	*cmds++ = 0x00000000;		 /* SP_FS_MEM_SIZE_REG */
 
-	
+	/* end oxili_generate_context_roll_packets */
 
+	/*
+	 * Resolve using two draw calls with a dummy register
+ * write in between. This is an HLM workaround
+	 * that should be removed later.
+	 */
 	*cmds++ = cp_type3_packet(CP_DRAW_INDX_2, 6);
-	*cmds++ = 0x00000000; 
+	*cmds++ = 0x00000000; /* Viz query info */
 	*cmds++ = BUILD_PC_DRAW_INITIATOR(PC_DI_PT_TRILIST,
 					  PC_DI_SRC_SEL_IMMEDIATE,
 					  PC_DI_INDEX_SIZE_32_BIT,
 					  PC_DI_IGNORE_VISIBILITY);
-	*cmds++ = 0x00000003; 
-	*cmds++ = 0x00000000; 
-	*cmds++ = 0x00000001; 
-	*cmds++ = 0x00000002; 
+	*cmds++ = 0x00000003; /* Num indices */
+	*cmds++ = 0x00000000; /* Index 0 */
+	*cmds++ = 0x00000001; /* Index 1 */
+	*cmds++ = 0x00000002; /* Index 2 */
 
 	*cmds++ = cp_type3_packet(CP_SET_CONSTANT, 2);
 	*cmds++ = CP_REG(A3XX_HLSQ_CL_CONTROL_0_REG);
 	*cmds++ = 0x00000000;
 
 	*cmds++ = cp_type3_packet(CP_DRAW_INDX_2, 6);
-	*cmds++ = 0x00000000; 
+	*cmds++ = 0x00000000; /* Viz query info */
 	*cmds++ = BUILD_PC_DRAW_INITIATOR(PC_DI_PT_TRILIST,
 					  PC_DI_SRC_SEL_IMMEDIATE,
 					  PC_DI_INDEX_SIZE_32_BIT,
 					  PC_DI_IGNORE_VISIBILITY);
-	*cmds++ = 0x00000003; 
-	*cmds++ = 0x00000002; 
-	*cmds++ = 0x00000001; 
-	*cmds++ = 0x00000003; 
+	*cmds++ = 0x00000003; /* Num indices */
+	*cmds++ = 0x00000002; /* Index 0 */
+	*cmds++ = 0x00000001; /* Index 1 */
+	*cmds++ = 0x00000003; /* Index 2 */
 
 	*cmds++ = cp_type3_packet(CP_SET_CONSTANT, 2);
 	*cmds++ = CP_REG(A3XX_HLSQ_CL_CONTROL_0_REG);
@@ -886,7 +984,7 @@
 	*cmds++ = cp_type3_packet(CP_WAIT_FOR_IDLE, 1);
 	*cmds++ = 0x00000000;
 
-	
+	/* Create indirect buffer command for above command sequence */
 	create_ib1(drawctxt, shadow->gmem_save, start, cmds);
 
 	return cmds;
@@ -897,7 +995,7 @@
 	unsigned int *cmd = tmp_ctx.cmd;
 	unsigned int *start;
 
-	
+	/* Reserve space for boolean values used for COND_EXEC packet */
 	drawctxt->cond_execs[0].hostptr = cmd;
 	drawctxt->cond_execs[0].gpuaddr = virt2gpu(cmd, &drawctxt->gpustate);
 	*cmd++ = 0;
@@ -916,42 +1014,78 @@
 
 	start = cmd;
 
-	
+	/* Save vertex shader */
 
 	*cmd++ = cp_type3_packet(CP_COND_EXEC, 4);
 	*cmd++ = drawctxt->cond_execs[0].gpuaddr >> 2;
 	*cmd++ = drawctxt->cond_execs[0].gpuaddr >> 2;
 	*cmd++ = 0x0000FFFF;
-	*cmd++ = 3;		
+	*cmd++ = 3;		/* EXEC_COUNT */
 
 	*cmd++ = cp_type3_packet(CP_REG_TO_MEM, 2);
 	drawctxt->shader_save_commands[2].hostptr = cmd;
 	drawctxt->shader_save_commands[2].gpuaddr =
 	    virt2gpu(cmd, &drawctxt->gpustate);
-	*cmd++ = 0;	
+	/*
+	   From fixup:
+
+	   dwords = SP_VS_CTRL_REG0.VS_LENGTH * 8
+
+	   From regspec:
+	   SP_VS_CTRL_REG0.VS_LENGTH [31:24]: VS length, unit = 256bits.
+	   If bit31 is 1, it means overflow
+	   or any long shader.
+
+	   src = (HLSQ_SHADOW_BASE + 0x1000)/4
+	 */
+	*cmd++ = 0;	/*(dwords << REG_TO_MEM_LOOP_COUNT_SHIFT) | src */
 	*cmd++ = (drawctxt->gpustate.gpuaddr + SHADER_OFFSET) & 0xfffffffc;
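+	/*
+	 * Illustration only: for a hypothetical VS_LENGTH of 2 (two 256-bit
+	 * groups, i.e. 16 dwords) the fixup would patch the placeholder above
+	 * to (16 << 18) | ((HLSQ_SHADOW_BASE + 0x1000) / 4) = 0x00406400.
+	 * The real value is written at runtime by the save fixup IB.
+	 */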
 
-	
+	/* Save fragment shader */
 	*cmd++ = cp_type3_packet(CP_COND_EXEC, 4);
 	*cmd++ = drawctxt->cond_execs[1].gpuaddr >> 2;
 	*cmd++ = drawctxt->cond_execs[1].gpuaddr >> 2;
 	*cmd++ = 0x0000FFFF;
-	*cmd++ = 3;		
+	*cmd++ = 3;		/* EXEC_COUNT */
 
 	*cmd++ = cp_type3_packet(CP_REG_TO_MEM, 2);
 	drawctxt->shader_save_commands[3].hostptr = cmd;
 	drawctxt->shader_save_commands[3].gpuaddr =
 	    virt2gpu(cmd, &drawctxt->gpustate);
-	*cmd++ = 0;	
+	/*
+	   From fixup:
+
+	   dwords = SP_FS_CTRL_REG0.FS_LENGTH * 8
+
+	   From regspec:
+	   SP_FS_CTRL_REG0.FS_LENGTH [31:24]: FS length, unit = 256bits.
+	   If bit31 is 1, it means overflow
+	   or any long shader.
+
+	   fs_offset = SP_FS_OBJ_OFFSET_REG.SHADEROBJOFFSETINIC * 32
+	   From regspec:
+
+	   SP_FS_OBJ_OFFSET_REG.SHADEROBJOFFSETINIC [31:25]:
+	   First instruction of the whole shader will be stored from
+	   the offset in instruction cache, unit = 256bits, a cache line.
+	   It can start from 0 if no VS available.
+
+	   src = (HLSQ_SHADOW_BASE + 0x1000 + SSIZE + fs_offset)/4
+	 */
+	*cmd++ = 0;	/*(dwords << REG_TO_MEM_LOOP_COUNT_SHIFT) | src */
 	*cmd++ = (drawctxt->gpustate.gpuaddr + SHADER_OFFSET
 		  + (SHADER_SHADOW_SIZE / 2)) & 0xfffffffc;
 
-	
+	/* Create indirect buffer command for above command sequence */
 	create_ib1(drawctxt, drawctxt->shader_save, start, cmd);
 
 	tmp_ctx.cmd = cmd;
 }
 
+/*
+ * Make an IB to modify context save IBs with the correct shader instruction
+ * and constant sizes and offsets.
+ */
 
 static void build_save_fixup_cmds(struct adreno_device *adreno_dev,
 				  struct adreno_context *drawctxt)
@@ -959,26 +1093,26 @@
 	unsigned int *cmd = tmp_ctx.cmd;
 	unsigned int *start = cmd;
 
-	
+	/* Flush HLSQ lazy updates */
 	*cmd++ = cp_type3_packet(CP_EVENT_WRITE, 1);
-	*cmd++ = 0x7;		
+	*cmd++ = 0x7;		/* HLSQ_FLUSH */
 	*cmd++ = cp_type3_packet(CP_WAIT_FOR_IDLE, 1);
 	*cmd++ = 0;
 
 	*cmd++ = cp_type0_packet(A3XX_UCHE_CACHE_INVALIDATE0_REG, 2);
-	*cmd++ = 0x00000000; 
+	*cmd++ = 0x00000000; /* No start addr for full invalidate */
 	*cmd++ = (unsigned int)
 		UCHE_ENTIRE_CACHE << UCHE_INVALIDATE1REG_ALLORPORTION |
 		UCHE_OP_INVALIDATE << UCHE_INVALIDATE1REG_OPCODE |
-		0; 
+		0; /* No end addr for full invalidate */
 
-	
+	/* Make sure registers are flushed */
 	*cmd++ = cp_type3_packet(CP_CONTEXT_UPDATE, 1);
 	*cmd++ = 0;
 
 #ifdef GSL_CONTEXT_SWITCH_CPU_SYNC
 
-	
+	/* Save shader sizes */
 	*cmd++ = cp_type3_packet(CP_REG_TO_MEM, 2);
 	*cmd++ = A3XX_SP_VS_CTRL_REG0;
 	*cmd++ = drawctxt->shader_save_commands[2].gpuaddr;
@@ -987,12 +1121,12 @@
 	*cmd++ = A3XX_SP_FS_CTRL_REG0;
 	*cmd++ = drawctxt->shader_save_commands[3].gpuaddr;
 
-	
+	/* Save shader offsets */
 	*cmd++ = cp_type3_packet(CP_REG_TO_MEM, 2);
 	*cmd++ = A3XX_SP_FS_OBJ_OFFSET_REG;
 	*cmd++ = drawctxt->shader_save_commands[1].gpuaddr;
 
-	
+	/* Save constant sizes */
 	*cmd++ = cp_type3_packet(CP_REG_TO_MEM, 2);
 	*cmd++ = A3XX_SP_VS_CTRL_REG1;
 	*cmd++ = drawctxt->constant_save_commands[1].gpuaddr;
@@ -1000,59 +1134,72 @@
 	*cmd++ = A3XX_SP_FS_CTRL_REG1;
 	*cmd++ = drawctxt->constant_save_commands[2].gpuaddr;
 
-	
+	/* Save FS constant offset */
 	*cmd++ = cp_type3_packet(CP_REG_TO_MEM, 2);
 	*cmd++ = A3XX_SP_FS_OBJ_OFFSET_REG;
 	*cmd++ = drawctxt->constant_save_commands[0].gpuaddr;
 
 
-	
+	/* Save VS instruction store mode */
 	*cmd++ = cp_type3_packet(CP_REG_TO_MEM, 2);
 	*cmd++ = A3XX_SP_VS_CTRL_REG0;
 	*cmd++ = drawctxt->cond_execs[0].gpuaddr;
 
-	
+	/* Save FS instruction store mode */
 	*cmd++ = cp_type3_packet(CP_REG_TO_MEM, 2);
 	*cmd++ = A3XX_SP_FS_CTRL_REG0;
 	*cmd++ = drawctxt->cond_execs[1].gpuaddr;
 #else
 
-	
+	/* Shader save */
 	cmd = rmw_regtomem(cmd, A3XX_SP_VS_CTRL_REG0, 0x7f000000,
 			11+REG_TO_MEM_LOOP_COUNT_SHIFT,
 			(HLSQ_SHADOW_BASE + 0x1000) / 4,
 			drawctxt->shader_save_commands[2].gpuaddr);
 
-	
+	/* CP_SCRATCH_REG2 = (CP_SCRATCH_REG2 & 0x00000000) | SP_FS_CTRL_REG0 */
 	*cmd++ = cp_type3_packet(CP_REG_RMW, 3);
 	*cmd++ = (1 << 30) | A3XX_CP_SCRATCH_REG2;
-	*cmd++ = 0x00000000;	
-	*cmd++ = A3XX_SP_FS_CTRL_REG0;	
+	*cmd++ = 0x00000000;	/* AND value */
+	*cmd++ = A3XX_SP_FS_CTRL_REG0;	/* OR address */
+	/* CP_SCRATCH_REG2 = ( (CP_SCRATCH_REG2 & 0x7f000000) >> 21 )
+	   |  ((HLSQ_SHADOW_BASE+0x1000+SSIZE)/4) */
 	*cmd++ = cp_type3_packet(CP_REG_RMW, 3);
 	*cmd++ = ((11 + REG_TO_MEM_LOOP_COUNT_SHIFT) << 24) |
 		A3XX_CP_SCRATCH_REG2;
-	*cmd++ = 0x7f000000;	
-	*cmd++ = (HLSQ_SHADOW_BASE + 0x1000 + SSIZE) / 4;	
+	*cmd++ = 0x7f000000;	/* AND value */
+	*cmd++ = (HLSQ_SHADOW_BASE + 0x1000 + SSIZE) / 4;	/* OR value */
 
+	/*
+	 * CP_SCRATCH_REG3 = (CP_SCRATCH_REG3 & 0x00000000) |
+	 * SP_FS_OBJ_OFFSET_REG
+	 */
 
 	*cmd++ = cp_type3_packet(CP_REG_RMW, 3);
 	*cmd++ = (1 << 30) | A3XX_CP_SCRATCH_REG3;
-	*cmd++ = 0x00000000;	
-	*cmd++ = A3XX_SP_FS_OBJ_OFFSET_REG;	
+	*cmd++ = 0x00000000;	/* AND value */
+	*cmd++ = A3XX_SP_FS_OBJ_OFFSET_REG;	/* OR address */
+	/*
+	 * CP_SCRATCH_REG3 = ( (CP_SCRATCH_REG3 & 0xfe000000) >> 25 ) |
+	 * 0x00000000
+	 */
 	*cmd++ = cp_type3_packet(CP_REG_RMW, 3);
 	*cmd++ = A3XX_CP_SCRATCH_REG3;
-	*cmd++ = 0xfe000000;	
-	*cmd++ = 0x00000000;	
+	*cmd++ = 0xfe000000;	/* AND value */
+	*cmd++ = 0x00000000;	/* OR value */
+	/*
+	 * CP_SCRATCH_REG2 =  (CP_SCRATCH_REG2 & 0xffffffff) | CP_SCRATCH_REG3
+	 */
 	*cmd++ = cp_type3_packet(CP_REG_RMW, 3);
 	*cmd++ = (1 << 30) | A3XX_CP_SCRATCH_REG2;
-	*cmd++ = 0xffffffff;	
-	*cmd++ = A3XX_CP_SCRATCH_REG3;	
+	*cmd++ = 0xffffffff;	/* AND value */
+	*cmd++ = A3XX_CP_SCRATCH_REG3;	/* OR address */
 
 	*cmd++ = cp_type3_packet(CP_REG_TO_MEM, 2);
 	*cmd++ = A3XX_CP_SCRATCH_REG2;
 	*cmd++ = drawctxt->shader_save_commands[3].gpuaddr;
 
-	
+	/* Constant save */
 	cmd = rmw_regtomem(cmd, A3XX_SP_VS_CTRL_REG1, 0x000003ff,
 			   2 + REG_TO_MEM_LOOP_COUNT_SHIFT,
 			   (HLSQ_SHADOW_BASE + 0x2000) / 4,
@@ -1068,19 +1215,19 @@
 			   drawctxt->constant_save_commands[2].gpuaddr
 			   + sizeof(unsigned int));
 
-	
+	/* Modify constant save conditionals */
 	cmd = rmw_regtomem(cmd, A3XX_SP_VS_CTRL_REG1, 0x000003ff,
 		0, 0, drawctxt->cond_execs[2].gpuaddr);
 
 	cmd = rmw_regtomem(cmd, A3XX_SP_FS_CTRL_REG1, 0x000003ff,
 		0, 0, drawctxt->cond_execs[3].gpuaddr);
 
-	
+	/* Save VS instruction store mode */
 
 	cmd = rmw_regtomem(cmd, A3XX_SP_VS_CTRL_REG0, 0x00000002,
 			   31, 0, drawctxt->cond_execs[0].gpuaddr);
 
-	
+	/* Save FS instruction store mode */
 	cmd = rmw_regtomem(cmd, A3XX_SP_FS_CTRL_REG0, 0x00000002,
 			   31, 0, drawctxt->cond_execs[1].gpuaddr);
 
@@ -1091,6 +1238,9 @@
 	tmp_ctx.cmd = cmd;
 }
 
+/****************************************************************************/
+/* Functions to build context restore IBs                                   */
+/****************************************************************************/
 
 static unsigned int *build_sys2gmem_cmds(struct adreno_device *adreno_dev,
 					 struct adreno_context *drawctxt,
@@ -1104,31 +1254,31 @@
 
 	*cmds++ = cp_type3_packet(CP_SET_CONSTANT, 5);
 	*cmds++ = CP_REG(A3XX_HLSQ_CONTROL_0_REG);
-	
+	/* HLSQ_CONTROL_0_REG */
 	*cmds++ = _SET(HLSQ_CTRL0REG_FSTHREADSIZE, HLSQ_FOUR_PIX_QUADS) |
 		_SET(HLSQ_CTRL0REG_FSSUPERTHREADENABLE, 1) |
 		_SET(HLSQ_CTRL0REG_SPSHADERRESTART, 1) |
 		_SET(HLSQ_CTRL0REG_CHUNKDISABLE, 1) |
 		_SET(HLSQ_CTRL0REG_SPCONSTFULLUPDATE, 1);
-	
+	/* HLSQ_CONTROL_1_REG */
 	*cmds++ = _SET(HLSQ_CTRL1REG_VSTHREADSIZE, HLSQ_TWO_VTX_QUADS) |
 		_SET(HLSQ_CTRL1REG_VSSUPERTHREADENABLE, 1);
-	
+	/* HLSQ_CONTROL_2_REG */
 	*cmds++ = _SET(HLSQ_CTRL2REG_PRIMALLOCTHRESHOLD, 31);
-	
+	/* HLSQ_CONTROL_3_REG */
 	*cmds++ = 0x00000000;
 
 	*cmds++ = cp_type3_packet(CP_SET_CONSTANT, 3);
 	*cmds++ = CP_REG(A3XX_RB_MRT_BUF_INFO0);
-	
+	/* RB_MRT_BUF_INFO0 */
 	*cmds++ = _SET(RB_MRTBUFINFO_COLOR_FORMAT, RB_R8G8B8A8_UNORM) |
 		_SET(RB_MRTBUFINFO_COLOR_TILE_MODE, RB_TILINGMODE_32X32) |
 		_SET(RB_MRTBUFINFO_COLOR_BUF_PITCH,
 		(shadow->gmem_pitch * 4 * 8) / 256);
-	
+	/* RB_MRT_BUF_BASE0 */
 	*cmds++ = _SET(RB_MRTBUFBASE_COLOR_BUF_BASE, tmp_ctx.gmem_base >> 5);
 
-	
+	/* Texture samplers */
 	*cmds++ = cp_type3_packet(CP_LOAD_STATE, 4);
 	*cmds++ = (16 << CP_LOADSTATE_DSTOFFSET_SHIFT)
 		| (HLSQ_DIRECT << CP_LOADSTATE_STATESRC_SHIFT)
@@ -1142,7 +1292,7 @@
 	*cmds++ = cp_type0_packet(A3XX_VFD_PERFCOUNTER0_SELECT, 1);
 	*cmds++ = 0x00000000;
 
-	
+	/* Texture memobjs */
 	*cmds++ = cp_type3_packet(CP_LOAD_STATE, 6);
 	*cmds++ = (16 << CP_LOADSTATE_DSTOFFSET_SHIFT)
 		| (HLSQ_DIRECT << CP_LOADSTATE_STATESRC_SHIFT)
@@ -1158,7 +1308,7 @@
 	*cmds++ = cp_type0_packet(A3XX_VFD_PERFCOUNTER0_SELECT, 1);
 	*cmds++ = 0x00000000;
 
-	
+	/* Mipmap bases */
 	*cmds++ = cp_type3_packet(CP_LOAD_STATE, 16);
 	*cmds++ = (224 << CP_LOADSTATE_DSTOFFSET_SHIFT)
 		| (HLSQ_DIRECT << CP_LOADSTATE_STATESRC_SHIFT)
@@ -1186,74 +1336,74 @@
 
 	*cmds++ = cp_type3_packet(CP_SET_CONSTANT, 5);
 	*cmds++ = CP_REG(A3XX_HLSQ_VS_CONTROL_REG);
-	
+	/* HLSQ_VS_CONTROL_REG */
 	*cmds++ = _SET(HLSQ_VSCTRLREG_VSINSTRLENGTH, 1);
-	
+	/* HLSQ_FS_CONTROL_REG */
 	*cmds++ = _SET(HLSQ_FSCTRLREG_FSCONSTLENGTH, 1) |
 		_SET(HLSQ_FSCTRLREG_FSCONSTSTARTOFFSET, 128) |
 		_SET(HLSQ_FSCTRLREG_FSINSTRLENGTH, 2);
-	
+	/* HLSQ_CONST_VSPRESV_RANGE_REG */
 	*cmds++ = 0x00000000;
-	
+	/* HLSQ_CONST_FSPRESV_RANGE_REG */
 	*cmds++ = 0x00000000;
 
 	*cmds++ = cp_type3_packet(CP_SET_CONSTANT, 2);
 	*cmds++ = CP_REG(A3XX_SP_FS_LENGTH_REG);
-	
+	/* SP_FS_LENGTH_REG */
 	*cmds++ = _SET(SP_SHADERLENGTH_LEN, 2);
 
 	*cmds++ = cp_type3_packet(CP_SET_CONSTANT, 12);
 	*cmds++ = CP_REG(A3XX_SP_VS_CTRL_REG0);
-	
+	/* SP_VS_CTRL_REG0 */
 	*cmds++ = _SET(SP_VSCTRLREG0_VSTHREADMODE, SP_MULTI) |
 		_SET(SP_VSCTRLREG0_VSINSTRBUFFERMODE, SP_BUFFER_MODE) |
 		_SET(SP_VSCTRLREG0_VSICACHEINVALID, 1) |
 		_SET(SP_VSCTRLREG0_VSFULLREGFOOTPRINT, 2) |
 		_SET(SP_VSCTRLREG0_VSTHREADSIZE, SP_TWO_VTX_QUADS) |
 		_SET(SP_VSCTRLREG0_VSLENGTH, 1);
-	
+	/* SP_VS_CTRL_REG1 */
 	*cmds++ = _SET(SP_VSCTRLREG1_VSINITIALOUTSTANDING, 8);
-	
+	/* SP_VS_PARAM_REG */
 	*cmds++ = _SET(SP_VSPARAMREG_POSREGID, 4) |
 		_SET(SP_VSPARAMREG_PSIZEREGID, 252) |
 		_SET(SP_VSPARAMREG_TOTALVSOUTVAR, 1);
-	
+	/* SP_VS_OUT_REG0 */
 	*cmds++ = _SET(SP_VSOUTREG_COMPMASK0, 3);
-	
+	/* SP_VS_OUT_REG1 */
 	*cmds++ = 0x00000000;
-	
+	/* SP_VS_OUT_REG2 */
 	*cmds++ = 0x00000000;
-	
+	/* SP_VS_OUT_REG3 */
 	*cmds++ = 0x00000000;
-	
+	/* SP_VS_OUT_REG4 */
 	*cmds++ = 0x00000000;
-	
+	/* SP_VS_OUT_REG5 */
 	*cmds++ = 0x00000000;
-	
+	/* SP_VS_OUT_REG6 */
 	*cmds++ = 0x00000000;
-	
+	/* SP_VS_OUT_REG7 */
 	*cmds++ = 0x00000000;
 
 	*cmds++ = cp_type3_packet(CP_SET_CONSTANT, 7);
 	*cmds++ = CP_REG(A3XX_SP_VS_VPC_DST_REG_0);
-	
+	/* SP_VS_VPC_DST_REG0 */
 	*cmds++ = _SET(SP_VSVPCDSTREG_OUTLOC0, 8);
-	
+	/* SP_VS_VPC_DST_REG1 */
 	*cmds++ = 0x00000000;
-	
+	/* SP_VS_VPC_DST_REG2 */
 	*cmds++ = 0x00000000;
-	
+	/* SP_VS_VPC_DST_REG3 */
 	*cmds++ = 0x00000000;
-	
+	/* SP_VS_OBJ_OFFSET_REG */
 	*cmds++ = 0x00000000;
-	
+	/* SP_VS_OBJ_START_REG */
 	*cmds++ = 0x00000000;
 
 	*cmds++ = cp_type3_packet(CP_SET_CONSTANT, 6);
 	*cmds++ = CP_REG(A3XX_SP_VS_LENGTH_REG);
-	
+	/* SP_VS_LENGTH_REG */
 	*cmds++ = _SET(SP_SHADERLENGTH_LEN, 1);
-	
+	/* SP_FS_CTRL_REG0 */
 	*cmds++ = _SET(SP_FSCTRLREG0_FSTHREADMODE, SP_MULTI) |
 		_SET(SP_FSCTRLREG0_FSINSTRBUFFERMODE, SP_BUFFER_MODE) |
 		_SET(SP_FSCTRLREG0_FSICACHEINVALID, 1) |
@@ -1264,57 +1414,57 @@
 		_SET(SP_FSCTRLREG0_FSSUPERTHREADMODE, 1) |
 		_SET(SP_FSCTRLREG0_PIXLODENABLE, 1) |
 		_SET(SP_FSCTRLREG0_FSLENGTH, 2);
-	
+	/* SP_FS_CTRL_REG1 */
 	*cmds++ = _SET(SP_FSCTRLREG1_FSCONSTLENGTH, 1) |
 		_SET(SP_FSCTRLREG1_FSINITIALOUTSTANDING, 2) |
 		_SET(SP_FSCTRLREG1_HALFPRECVAROFFSET, 63);
-	
+	/* SP_FS_OBJ_OFFSET_REG */
 	*cmds++ = _SET(SP_OBJOFFSETREG_CONSTOBJECTSTARTOFFSET, 128) |
 		_SET(SP_OBJOFFSETREG_SHADEROBJOFFSETINIC, 126);
-	
+	/* SP_FS_OBJ_START_REG */
 	*cmds++ = 0x00000000;
 
 	*cmds++ = cp_type3_packet(CP_SET_CONSTANT, 3);
 	*cmds++ = CP_REG(A3XX_SP_FS_FLAT_SHAD_MODE_REG_0);
-	
+	/* SP_FS_FLAT_SHAD_MODE_REG0 */
 	*cmds++ = 0x00000000;
-	
+	/* SP_FS_FLAT_SHAD_MODE_REG1 */
 	*cmds++ = 0x00000000;
 
 	*cmds++ = cp_type3_packet(CP_SET_CONSTANT, 2);
 	*cmds++ = CP_REG(A3XX_SP_FS_OUTPUT_REG);
-	
+	/* SP_FS_OUTPUT_REG */
 	*cmds++ = _SET(SP_FSOUTREG_PAD0, SP_PIXEL_BASED);
 
 	*cmds++ = cp_type3_packet(CP_SET_CONSTANT, 5);
 	*cmds++ = CP_REG(A3XX_SP_FS_MRT_REG_0);
-	
+	/* SP_FS_MRT_REG0 */
 	*cmds++ = _SET(SP_FSMRTREG_PRECISION, 1);
-	
+	/* SP_FS_MRT_REG1 */
 	*cmds++ = 0;
-	
+	/* SP_FS_MRT_REG2 */
 	*cmds++ = 0;
-	
+	/* SP_FS_MRT_REG3 */
 	*cmds++ = 0;
 
 	*cmds++ = cp_type3_packet(CP_SET_CONSTANT, 11);
 	*cmds++ = CP_REG(A3XX_VPC_ATTR);
-	
+	/* VPC_ATTR */
 	*cmds++ = _SET(VPC_VPCATTR_TOTALATTR, 2) |
 		_SET(VPC_VPCATTR_THRHDASSIGN, 1) |
 		_SET(VPC_VPCATTR_LMSIZE, 1);
-	
+	/* VPC_PACK */
 	*cmds++ = _SET(VPC_VPCPACK_NUMFPNONPOSVAR, 2) |
 		_SET(VPC_VPCPACK_NUMNONPOSVSVAR, 2);
-	
+	/* VPC_VARYING_INTERP_MODE_0 */
 	*cmds++ = 0x00000000;
-	
+	/* VPC_VARYING_INTERP_MODE1 */
 	*cmds++ = 0x00000000;
-	
+	/* VPC_VARYING_INTERP_MODE2 */
 	*cmds++ = 0x00000000;
-	
+	/* VPC_VARYING_INTERP_MODE3 */
 	*cmds++ = 0x00000000;
-	
+	/* VPC_VARYING_PS_REPL_MODE_0 */
 	*cmds++ = _SET(VPC_VPCVARPSREPLMODE_COMPONENT08, 1) |
 		_SET(VPC_VPCVARPSREPLMODE_COMPONENT09, 2) |
 		_SET(VPC_VPCVARPSREPLMODE_COMPONENT0A,	1) |
@@ -1331,7 +1481,7 @@
 		_SET(VPC_VPCVARPSREPLMODE_COMPONENT15, 2) |
 		_SET(VPC_VPCVARPSREPLMODE_COMPONENT16, 1) |
 		_SET(VPC_VPCVARPSREPLMODE_COMPONENT17, 2);
-	
+	/* VPC_VARYING_PS_REPL_MODE_1 */
 	*cmds++ = _SET(VPC_VPCVARPSREPLMODE_COMPONENT08, 1) |
 		_SET(VPC_VPCVARPSREPLMODE_COMPONENT09, 2) |
 		_SET(VPC_VPCVARPSREPLMODE_COMPONENT0A,	1) |
@@ -1348,7 +1498,7 @@
 		_SET(VPC_VPCVARPSREPLMODE_COMPONENT15, 2) |
 		_SET(VPC_VPCVARPSREPLMODE_COMPONENT16, 1) |
 		_SET(VPC_VPCVARPSREPLMODE_COMPONENT17, 2);
-	
+	/* VPC_VARYING_PS_REPL_MODE_2 */
 	*cmds++ = _SET(VPC_VPCVARPSREPLMODE_COMPONENT08, 1) |
 		_SET(VPC_VPCVARPSREPLMODE_COMPONENT09, 2) |
 		_SET(VPC_VPCVARPSREPLMODE_COMPONENT0A,	1) |
@@ -1365,7 +1515,7 @@
 		_SET(VPC_VPCVARPSREPLMODE_COMPONENT15, 2) |
 		_SET(VPC_VPCVARPSREPLMODE_COMPONENT16, 1) |
 		_SET(VPC_VPCVARPSREPLMODE_COMPONENT17, 2);
-	
+	/* VPC_VARYING_PS_REPL_MODE_3 */
 	*cmds++ = _SET(VPC_VPCVARPSREPLMODE_COMPONENT08, 1) |
 		_SET(VPC_VPCVARPSREPLMODE_COMPONENT09, 2) |
 		_SET(VPC_VPCVARPSREPLMODE_COMPONENT0A,	1) |
@@ -1385,11 +1535,11 @@
 
 	*cmds++ = cp_type3_packet(CP_SET_CONSTANT, 2);
 	*cmds++ = CP_REG(A3XX_SP_SP_CTRL_REG);
-	
+	/* SP_SP_CTRL_REG */
 	*cmds++ = _SET(SP_SPCTRLREG_SLEEPMODE, 1) |
 		_SET(SP_SPCTRLREG_LOMODE, 1);
 
-	
+	/* Load vertex shader */
 	*cmds++ = cp_type3_packet(CP_LOAD_STATE, 10);
 	*cmds++ = (0 << CP_LOADSTATE_DSTOFFSET_SHIFT)
 		| (HLSQ_DIRECT << CP_LOADSTATE_STATESRC_SHIFT)
@@ -1397,13 +1547,13 @@
 		| (1 << CP_LOADSTATE_NUMOFUNITS_SHIFT);
 	*cmds++ = (HLSQ_SP_VS_INSTR << CP_LOADSTATE_STATETYPE_SHIFT)
 		| (0 << CP_LOADSTATE_EXTSRCADDR_SHIFT);
-	
+	/* (sy)end; */
 	*cmds++ = 0x00000000; *cmds++ = 0x13001000;
-	
+	/* nop; */
 	*cmds++ = 0x00000000; *cmds++ = 0x00000000;
-	
+	/* nop; */
 	*cmds++ = 0x00000000; *cmds++ = 0x00000000;
-	
+	/* nop; */
 	*cmds++ = 0x00000000; *cmds++ = 0x00000000;
 
 	*cmds++ = cp_type0_packet(A3XX_VFD_PERFCOUNTER0_SELECT, 1);
@@ -1413,7 +1563,7 @@
 	*cmds++ = 0x00000000;
 
 
-	
+	/* Load fragment shader */
 	*cmds++ = cp_type3_packet(CP_LOAD_STATE, 18);
 	*cmds++ = (0 << CP_LOADSTATE_DSTOFFSET_SHIFT)
 		| (HLSQ_DIRECT << CP_LOADSTATE_STATESRC_SHIFT)
@@ -1421,21 +1571,21 @@
 		| (2 << CP_LOADSTATE_NUMOFUNITS_SHIFT);
 	*cmds++ = (HLSQ_SP_FS_INSTR << CP_LOADSTATE_STATETYPE_SHIFT)
 		| (0 << CP_LOADSTATE_EXTSRCADDR_SHIFT);
-	
+	/* (sy)(rpt1)bary.f (ei)r0.z, (r)0, r0.x; */
 	*cmds++ = 0x00002000; *cmds++ = 0x57309902;
-	
+	/* (rpt5)nop; */
 	*cmds++ = 0x00000000; *cmds++ = 0x00000500;
-	
+	/* sam (f32)r0.xyzw, r0.z, s#0, t#0; */
 	*cmds++ = 0x00000005; *cmds++ = 0xa0c01f00;
-	
+	/* (sy)mov.f32f32 r1.x, r0.x; */
 	*cmds++ = 0x00000000; *cmds++ = 0x30040b00;
-	
+	/* mov.f32f32 r1.y, r0.y; */
 	*cmds++ = 0x00000000; *cmds++ = 0x03000000;
-	
+	/* mov.f32f32 r1.z, r0.z; */
 	*cmds++ = 0x00000000; *cmds++ = 0x00000000;
-	
+	/* mov.f32f32 r1.w, r0.w; */
 	*cmds++ = 0x00000000; *cmds++ = 0x00000000;
-	
+	/* end; */
 	*cmds++ = 0x00000000; *cmds++ = 0x00000000;
 
 	*cmds++ = cp_type0_packet(A3XX_VFD_PERFCOUNTER0_SELECT, 1);
@@ -1446,45 +1596,45 @@
 
 	*cmds++ = cp_type3_packet(CP_SET_CONSTANT, 3);
 	*cmds++ = CP_REG(A3XX_VFD_CONTROL_0);
-	
+	/* VFD_CONTROL_0 */
 	*cmds++ = _SET(VFD_CTRLREG0_TOTALATTRTOVS, 8) |
 		_SET(VFD_CTRLREG0_PACKETSIZE, 2) |
 		_SET(VFD_CTRLREG0_STRMDECINSTRCNT, 2) |
 		_SET(VFD_CTRLREG0_STRMFETCHINSTRCNT, 2);
-	
+	/* VFD_CONTROL_1 */
 	*cmds++ =  _SET(VFD_CTRLREG1_MAXSTORAGE, 2) |
 		_SET(VFD_CTRLREG1_REGID4VTX, 252) |
 		_SET(VFD_CTRLREG1_REGID4INST, 252);
 
 	*cmds++ = cp_type3_packet(CP_SET_CONSTANT, 5);
 	*cmds++ = CP_REG(A3XX_VFD_FETCH_INSTR_0_0);
-	
+	/* VFD_FETCH_INSTR_0_0 */
 	*cmds++ = _SET(VFD_FETCHINSTRUCTIONS_FETCHSIZE, 7) |
 		_SET(VFD_FETCHINSTRUCTIONS_BUFSTRIDE, 8) |
 		_SET(VFD_FETCHINSTRUCTIONS_SWITCHNEXT, 1) |
 		_SET(VFD_FETCHINSTRUCTIONS_STEPRATE, 1);
-	
+	/* VFD_FETCH_INSTR_1_0 */
 	*cmds++ = _SET(VFD_BASEADDR_BASEADDR,
 		shadow->quad_vertices_restore.gpuaddr);
-	
+	/* VFD_FETCH_INSTR_0_1 */
 	*cmds++ = _SET(VFD_FETCHINSTRUCTIONS_FETCHSIZE, 11) |
 		_SET(VFD_FETCHINSTRUCTIONS_BUFSTRIDE, 12) |
 		_SET(VFD_FETCHINSTRUCTIONS_INDEXDECODE, 1) |
 		_SET(VFD_FETCHINSTRUCTIONS_STEPRATE, 1);
-	
+	/* VFD_FETCH_INSTR_1_1 */
 	*cmds++ = _SET(VFD_BASEADDR_BASEADDR,
 		shadow->quad_vertices_restore.gpuaddr + 16);
 
 	*cmds++ = cp_type3_packet(CP_SET_CONSTANT, 3);
 	*cmds++ = CP_REG(A3XX_VFD_DECODE_INSTR_0);
-	
+	/* VFD_DECODE_INSTR_0 */
 	*cmds++ = _SET(VFD_DECODEINSTRUCTIONS_WRITEMASK, 0x0F) |
 		_SET(VFD_DECODEINSTRUCTIONS_CONSTFILL, 1) |
 		_SET(VFD_DECODEINSTRUCTIONS_FORMAT, 1) |
 		_SET(VFD_DECODEINSTRUCTIONS_SHIFTCNT, 8) |
 		_SET(VFD_DECODEINSTRUCTIONS_LASTCOMPVALID, 1) |
 		_SET(VFD_DECODEINSTRUCTIONS_SWITCHNEXT, 1);
-	
+	/* VFD_DECODE_INSTR_1 */
 	*cmds++ = _SET(VFD_DECODEINSTRUCTIONS_WRITEMASK, 0x0F) |
 		_SET(VFD_DECODEINSTRUCTIONS_CONSTFILL, 1) |
 		_SET(VFD_DECODEINSTRUCTIONS_FORMAT, 2) |
@@ -1494,12 +1644,12 @@
 
 	*cmds++ = cp_type3_packet(CP_SET_CONSTANT, 2);
 	*cmds++ = CP_REG(A3XX_RB_DEPTH_CONTROL);
-	
+	/* RB_DEPTH_CONTROL */
 	*cmds++ = _SET(RB_DEPTHCONTROL_Z_TEST_FUNC, RB_FRAG_LESS);
 
 	*cmds++ = cp_type3_packet(CP_SET_CONSTANT, 2);
 	*cmds++ = CP_REG(A3XX_RB_STENCIL_CONTROL);
-	
+	/* RB_STENCIL_CONTROL */
 	*cmds++ = _SET(RB_STENCILCONTROL_STENCIL_FUNC, RB_REF_ALWAYS) |
 		_SET(RB_STENCILCONTROL_STENCIL_FAIL, RB_STENCIL_KEEP) |
 		_SET(RB_STENCILCONTROL_STENCIL_ZPASS, RB_STENCIL_KEEP) |
@@ -1511,32 +1661,32 @@
 
 	*cmds++ = cp_type3_packet(CP_SET_CONSTANT, 2);
 	*cmds++ = CP_REG(A3XX_RB_MODE_CONTROL);
-	
+	/* RB_MODE_CONTROL */
 	*cmds++ = _SET(RB_MODECONTROL_RENDER_MODE, RB_RENDERING_PASS) |
 		_SET(RB_MODECONTROL_MARB_CACHE_SPLIT_MODE, 1);
 
 	*cmds++ = cp_type3_packet(CP_SET_CONSTANT, 2);
 	*cmds++ = CP_REG(A3XX_RB_RENDER_CONTROL);
-	
+	/* RB_RENDER_CONTROL */
 	*cmds++ = _SET(RB_RENDERCONTROL_BIN_WIDTH, shadow->width >> 5) |
 		_SET(RB_RENDERCONTROL_ALPHA_TEST_FUNC, 7);
 
 	*cmds++ = cp_type3_packet(CP_SET_CONSTANT, 2);
 	*cmds++ = CP_REG(A3XX_RB_MSAA_CONTROL);
-	
+	/* RB_MSAA_CONTROL */
 	*cmds++ = _SET(RB_MSAACONTROL_MSAA_DISABLE, 1) |
 		_SET(RB_MSAACONTROL_SAMPLE_MASK, 0xFFFF);
 
 	*cmds++ = cp_type3_packet(CP_SET_CONSTANT, 2);
 	*cmds++ = CP_REG(A3XX_RB_MRT_CONTROL0);
-	
+	/* RB_MRT_CONTROL0 */
 	*cmds++ = _SET(RB_MRTCONTROL_ROP_CODE, 12) |
 		_SET(RB_MRTCONTROL_DITHER_MODE, RB_DITHER_DISABLE) |
 		_SET(RB_MRTCONTROL_COMPONENT_ENABLE, 0xF);
 
 	*cmds++ = cp_type3_packet(CP_SET_CONSTANT, 3);
 	*cmds++ = CP_REG(A3XX_RB_MRT_BLEND_CONTROL0);
-	
+	/* RB_MRT_BLENDCONTROL0 */
 	*cmds++ = _SET(RB_MRTBLENDCONTROL_RGB_SRC_FACTOR, RB_FACTOR_ONE) |
 		_SET(RB_MRTBLENDCONTROL_RGB_BLEND_OPCODE, RB_BLEND_OP_ADD) |
 		_SET(RB_MRTBLENDCONTROL_RGB_DEST_FACTOR, RB_FACTOR_ZERO) |
@@ -1544,7 +1694,7 @@
 		_SET(RB_MRTBLENDCONTROL_ALPHA_BLEND_OPCODE, RB_BLEND_OP_ADD) |
 		_SET(RB_MRTBLENDCONTROL_ALPHA_DEST_FACTOR, RB_FACTOR_ZERO) |
 		_SET(RB_MRTBLENDCONTROL_CLAMP_ENABLE, 1);
-	
+	/* RB_MRT_CONTROL1 */
 	*cmds++ = _SET(RB_MRTCONTROL_READ_DEST_ENABLE, 1) |
 		_SET(RB_MRTCONTROL_ROP_CODE, 12) |
 		_SET(RB_MRTCONTROL_DITHER_MODE, RB_DITHER_ALWAYS) |
@@ -1552,7 +1702,7 @@
 
 	*cmds++ = cp_type3_packet(CP_SET_CONSTANT, 3);
 	*cmds++ = CP_REG(A3XX_RB_MRT_BLEND_CONTROL1);
-	
+	/* RB_MRT_BLENDCONTROL1 */
 	*cmds++ = _SET(RB_MRTBLENDCONTROL_RGB_SRC_FACTOR, RB_FACTOR_ONE) |
 		_SET(RB_MRTBLENDCONTROL_RGB_BLEND_OPCODE, RB_BLEND_OP_ADD) |
 		_SET(RB_MRTBLENDCONTROL_RGB_DEST_FACTOR, RB_FACTOR_ZERO) |
@@ -1560,7 +1710,7 @@
 		_SET(RB_MRTBLENDCONTROL_ALPHA_BLEND_OPCODE, RB_BLEND_OP_ADD) |
 		_SET(RB_MRTBLENDCONTROL_ALPHA_DEST_FACTOR, RB_FACTOR_ZERO) |
 		_SET(RB_MRTBLENDCONTROL_CLAMP_ENABLE, 1);
-	
+	/* RB_MRT_CONTROL2 */
 	*cmds++ = _SET(RB_MRTCONTROL_READ_DEST_ENABLE, 1) |
 		_SET(RB_MRTCONTROL_ROP_CODE, 12) |
 		_SET(RB_MRTCONTROL_DITHER_MODE, RB_DITHER_ALWAYS) |
@@ -1568,7 +1718,7 @@
 
 	*cmds++ = cp_type3_packet(CP_SET_CONSTANT, 3);
 	*cmds++ = CP_REG(A3XX_RB_MRT_BLEND_CONTROL2);
-	
+	/* RB_MRT_BLENDCONTROL2 */
 	*cmds++ = _SET(RB_MRTBLENDCONTROL_RGB_SRC_FACTOR, RB_FACTOR_ONE) |
 		_SET(RB_MRTBLENDCONTROL_RGB_BLEND_OPCODE, RB_BLEND_OP_ADD) |
 		_SET(RB_MRTBLENDCONTROL_RGB_DEST_FACTOR, RB_FACTOR_ZERO) |
@@ -1576,7 +1726,7 @@
 		_SET(RB_MRTBLENDCONTROL_ALPHA_BLEND_OPCODE, RB_BLEND_OP_ADD) |
 		_SET(RB_MRTBLENDCONTROL_ALPHA_DEST_FACTOR, RB_FACTOR_ZERO) |
 		_SET(RB_MRTBLENDCONTROL_CLAMP_ENABLE, 1);
-	
+	/* RB_MRT_CONTROL3 */
 	*cmds++ = _SET(RB_MRTCONTROL_READ_DEST_ENABLE, 1) |
 		_SET(RB_MRTCONTROL_ROP_CODE, 12) |
 		_SET(RB_MRTCONTROL_DITHER_MODE, RB_DITHER_ALWAYS) |
@@ -1584,7 +1734,7 @@
 
 	*cmds++ = cp_type3_packet(CP_SET_CONSTANT, 2);
 	*cmds++ = CP_REG(A3XX_RB_MRT_BLEND_CONTROL3);
-	
+	/* RB_MRT_BLENDCONTROL3 */
 	*cmds++ = _SET(RB_MRTBLENDCONTROL_RGB_SRC_FACTOR, RB_FACTOR_ONE) |
 		_SET(RB_MRTBLENDCONTROL_RGB_BLEND_OPCODE, RB_BLEND_OP_ADD) |
 		_SET(RB_MRTBLENDCONTROL_RGB_DEST_FACTOR, RB_FACTOR_ZERO) |
@@ -1595,90 +1745,92 @@
 
 	*cmds++ = cp_type3_packet(CP_SET_CONSTANT, 5);
 	*cmds++ = CP_REG(A3XX_VFD_INDEX_MIN);
-	
+	/* VFD_INDEX_MIN */
 	*cmds++ = 0x00000000;
-	
+	/* VFD_INDEX_MAX */
 	*cmds++ = 340;
-	
+	/* VFD_INSTANCEID_OFFSET */
 	*cmds++ = 0x00000000;
-	
+	/* VFD_INDEX_OFFSET */
 	*cmds++ = 0x00000000;
 
 	*cmds++ = cp_type3_packet(CP_SET_CONSTANT, 2);
 	*cmds++ = CP_REG(A3XX_VFD_VS_THREADING_THRESHOLD);
-	
+	/* VFD_VS_THREADING_THRESHOLD */
 	*cmds++ = _SET(VFD_THREADINGTHRESHOLD_REGID_THRESHOLD, 15) |
 		_SET(VFD_THREADINGTHRESHOLD_REGID_VTXCNT, 252);
 
 	*cmds++ = cp_type3_packet(CP_SET_CONSTANT, 2);
 	*cmds++ = CP_REG(A3XX_TPL1_TP_VS_TEX_OFFSET);
-	
+	/* TPL1_TP_VS_TEX_OFFSET */
 	*cmds++ = 0x00000000;
 
 	*cmds++ = cp_type3_packet(CP_SET_CONSTANT, 2);
 	*cmds++ = CP_REG(A3XX_TPL1_TP_FS_TEX_OFFSET);
-	
+	/* TPL1_TP_FS_TEX_OFFSET */
 	*cmds++ = _SET(TPL1_TPTEXOFFSETREG_SAMPLEROFFSET, 16) |
 		_SET(TPL1_TPTEXOFFSETREG_MEMOBJOFFSET, 16) |
 		_SET(TPL1_TPTEXOFFSETREG_BASETABLEPTR, 224);
 
 	*cmds++ = cp_type3_packet(CP_SET_CONSTANT, 2);
 	*cmds++ = CP_REG(A3XX_GRAS_SC_CONTROL);
-	
+	/* GRAS_SC_CONTROL */
+	/*cmds++ = _SET(GRAS_SC_CONTROL_RASTER_MODE, 1);
+		*cmds++ = _SET(GRAS_SC_CONTROL_RASTER_MODE, 1) |*/
 	*cmds++ = 0x04001000;
 
 	*cmds++ = cp_type3_packet(CP_SET_CONSTANT, 2);
 	*cmds++ = CP_REG(A3XX_GRAS_SU_MODE_CONTROL);
-	
+	/* GRAS_SU_MODE_CONTROL */
 	*cmds++ = _SET(GRAS_SU_CTRLMODE_LINEHALFWIDTH, 2);
 
 	*cmds++ = cp_type3_packet(CP_SET_CONSTANT, 3);
 	*cmds++ = CP_REG(A3XX_GRAS_SC_WINDOW_SCISSOR_TL);
-	
+	/* GRAS_SC_WINDOW_SCISSOR_TL */
 	*cmds++ = 0x00000000;
-	
+	/* GRAS_SC_WINDOW_SCISSOR_BR */
 	*cmds++ = _SET(GRAS_SC_WINDOW_SCISSOR_BR_BR_X, shadow->width - 1) |
 		_SET(GRAS_SC_WINDOW_SCISSOR_BR_BR_Y, shadow->height - 1);
 
 	*cmds++ = cp_type3_packet(CP_SET_CONSTANT, 3);
 	*cmds++ = CP_REG(A3XX_GRAS_SC_SCREEN_SCISSOR_TL);
-	
+	/* GRAS_SC_SCREEN_SCISSOR_TL */
 	*cmds++ = 0x00000000;
-	
+	/* GRAS_SC_SCREEN_SCISSOR_BR */
 	*cmds++ = _SET(GRAS_SC_SCREEN_SCISSOR_BR_BR_X, shadow->width - 1) |
 		_SET(GRAS_SC_SCREEN_SCISSOR_BR_BR_Y, shadow->height - 1);
 
 	*cmds++ = cp_type3_packet(CP_SET_CONSTANT, 5);
 	*cmds++ = CP_REG(A3XX_GRAS_CL_VPORT_XOFFSET);
-	
+	/* GRAS_CL_VPORT_XOFFSET */
 	*cmds++ = 0x00000000;
-	
+	/* GRAS_CL_VPORT_XSCALE */
 	*cmds++ = _SET(GRAS_CL_VPORT_XSCALE_VPORT_XSCALE, 0x3F800000);
-	
+	/* GRAS_CL_VPORT_YOFFSET */
 	*cmds++ = 0x00000000;
-	
+	/* GRAS_CL_VPORT_YSCALE */
 	*cmds++ = _SET(GRAS_CL_VPORT_YSCALE_VPORT_YSCALE, 0x3F800000);
 
 	*cmds++ = cp_type3_packet(CP_SET_CONSTANT, 3);
 	*cmds++ = CP_REG(A3XX_GRAS_CL_VPORT_ZOFFSET);
-	
+	/* GRAS_CL_VPORT_ZOFFSET */
 	*cmds++ = 0x00000000;
-	
+	/* GRAS_CL_VPORT_ZSCALE */
 	*cmds++ = _SET(GRAS_CL_VPORT_ZSCALE_VPORT_ZSCALE, 0x3F800000);
 
 	*cmds++ = cp_type3_packet(CP_SET_CONSTANT, 2);
 	*cmds++ = CP_REG(A3XX_GRAS_CL_CLIP_CNTL);
-	
+	/* GRAS_CL_CLIP_CNTL */
 	*cmds++ = _SET(GRAS_CL_CLIP_CNTL_IJ_PERSP_CENTER, 1);
 
 	*cmds++ = cp_type3_packet(CP_SET_CONSTANT, 2);
 	*cmds++ = CP_REG(A3XX_SP_FS_IMAGE_OUTPUT_REG_0);
-	
+	/* SP_FS_IMAGE_OUTPUT_REG_0 */
 	*cmds++ = _SET(SP_IMAGEOUTPUTREG_MRTFORMAT, SP_R8G8B8A8_UNORM);
 
 	*cmds++ = cp_type3_packet(CP_SET_CONSTANT, 2);
 	*cmds++ = CP_REG(A3XX_PC_PRIM_VTX_CNTL);
-	
+	/* PC_PRIM_VTX_CONTROL */
 	*cmds++ = _SET(PC_PRIM_VTX_CONTROL_STRIDE_IN_VPC, 2) |
 		_SET(PC_PRIM_VTX_CONTROL_POLYMODE_FRONT_PTYPE,
 		PC_DRAW_TRIANGLES) |
@@ -1687,7 +1839,7 @@
 		_SET(PC_PRIM_VTX_CONTROL_PROVOKING_VTX_LAST, 1);
 
 
-	
+	/* oxili_generate_context_roll_packets */
 	*cmds++ = cp_type0_packet(A3XX_SP_VS_CTRL_REG0, 1);
 	*cmds++ = 0x00000400;
 
@@ -1695,12 +1847,12 @@
 	*cmds++ = 0x00000400;
 
 	*cmds++ = cp_type0_packet(A3XX_SP_VS_PVT_MEM_SIZE_REG, 1);
-	*cmds++ = 0x00008000; 
+	*cmds++ = 0x00008000; /* SP_VS_MEM_SIZE_REG */
 
 	*cmds++ = cp_type0_packet(A3XX_SP_FS_PVT_MEM_SIZE_REG, 1);
-	*cmds++ = 0x00008000; 
+	*cmds++ = 0x00008000; /* SP_FS_MEM_SIZE_REG */
 
-	
+	/* Clear cache invalidate bit when re-loading the shader control regs */
 	*cmds++ = cp_type0_packet(A3XX_SP_VS_CTRL_REG0, 1);
 	*cmds++ = _SET(SP_VSCTRLREG0_VSTHREADMODE, SP_MULTI) |
 		_SET(SP_VSCTRLREG0_VSINSTRBUFFERMODE, SP_BUFFER_MODE) |
@@ -1719,22 +1871,22 @@
 		_SET(SP_FSCTRLREG0_FSLENGTH, 2);
 
 	*cmds++ = cp_type0_packet(A3XX_SP_VS_PVT_MEM_SIZE_REG, 1);
-	*cmds++ = 0x00000000;		 
+	*cmds++ = 0x00000000;		 /* SP_VS_MEM_SIZE_REG */
 
 	*cmds++ = cp_type0_packet(A3XX_SP_FS_PVT_MEM_SIZE_REG, 1);
-	*cmds++ = 0x00000000;		 
+	*cmds++ = 0x00000000;		 /* SP_FS_MEM_SIZE_REG */
 
-	
+	/* end oxili_generate_context_roll_packets */
 
 	*cmds++ = cp_type3_packet(CP_DRAW_INDX, 3);
-	*cmds++ = 0x00000000; 
+	*cmds++ = 0x00000000; /* Viz query info */
 	*cmds++ = BUILD_PC_DRAW_INITIATOR(PC_DI_PT_RECTLIST,
 					  PC_DI_SRC_SEL_AUTO_INDEX,
 					  PC_DI_INDEX_SIZE_16_BIT,
 					  PC_DI_IGNORE_VISIBILITY);
-	*cmds++ = 0x00000002; 
+	*cmds++ = 0x00000002; /* Num indices */
 
-	
+	/* Create indirect buffer command for above command sequence */
 	create_ib1(drawctxt, shadow->gmem_restore, start, cmds);
 
 	return cmds;
@@ -1750,26 +1902,26 @@
 
 	int i;
 
-	
+	/* Flush HLSQ lazy updates */
 	*cmd++ = cp_type3_packet(CP_EVENT_WRITE, 1);
-	*cmd++ = 0x7;		
+	*cmd++ = 0x7;		/* HLSQ_FLUSH */
 	*cmd++ = cp_type3_packet(CP_WAIT_FOR_IDLE, 1);
 	*cmd++ = 0;
 
 	*cmd++ = cp_type0_packet(A3XX_UCHE_CACHE_INVALIDATE0_REG, 2);
-	*cmd++ = 0x00000000;    
+	*cmd++ = 0x00000000;    /* No start addr for full invalidate */
 	*cmd++ = (unsigned int)
 		UCHE_ENTIRE_CACHE << UCHE_INVALIDATE1REG_ALLORPORTION |
 		UCHE_OP_INVALIDATE << UCHE_INVALIDATE1REG_OPCODE |
-		0;  
+		0;  /* No end addr for full invalidate */
 
 	lcc_start = cmd;
 
-	
+	/* deferred cp_type3_packet(CP_LOAD_CONSTANT_CONTEXT, ???); */
 	cmd++;
 
 #ifdef CONFIG_MSM_KGSL_DISABLE_SHADOW_WRITES
-	
+	/* Force mismatch */
 	*cmd++ = ((drawctxt->gpustate.gpuaddr + REG_OFFSET) & 0xFFFFE000) | 1;
 #else
 	*cmd++ = (drawctxt->gpustate.gpuaddr + REG_OFFSET) & 0xFFFFE000;
@@ -1784,7 +1936,7 @@
 					(cmd - lcc_start) - 1);
 
 #ifdef CONFIG_MSM_KGSL_DISABLE_SHADOW_WRITES
-	lcc_start[2] |= (0 << 24) | (4 << 16);	
+	lcc_start[2] |= (0 << 24) | (4 << 16);	/* Disable shadowing. */
 #else
 	lcc_start[2] |= (1 << 24) | (4 << 16);
 #endif
@@ -1804,7 +1956,7 @@
 {
 	unsigned int *cmd = tmp_ctx.cmd;
 	unsigned int *start = cmd;
-	unsigned int mode = 4;	
+	unsigned int mode = 4;	/* Indirect mode */
 	unsigned int stateblock;
 	unsigned int numunits;
 	unsigned int statetype;
@@ -1822,52 +1974,90 @@
 	*cmd++ = 4 << 16;
 	*cmd++ = 0x0;
 #endif
-	
+	/* HLSQ full update */
 	*cmd++ = cp_type3_packet(CP_SET_CONSTANT, 2);
 	*cmd++ = CP_REG(A3XX_HLSQ_CONTROL_0_REG);
-	*cmd++ = 0x68000240;	
+	*cmd++ = 0x68000240;	/* A3XX_HLSQ_CONTROL_0_REG */
 
 #ifndef CONFIG_MSM_KGSL_DISABLE_SHADOW_WRITES
-	
+	/* Re-enable shadowing */
 	*cmd++ = cp_type3_packet(CP_LOAD_CONSTANT_CONTEXT, 3);
 	*cmd++ = (drawctxt->gpustate.gpuaddr + REG_OFFSET) & 0xFFFFE000;
 	*cmd++ = (4 << 16) | (1 << 24);
 	*cmd++ = 0x0;
 #endif
 
-	
+	/* Load vertex shader constants */
 	*cmd++ = cp_type3_packet(CP_COND_EXEC, 4);
 	*cmd++ = drawctxt->cond_execs[2].gpuaddr >> 2;
 	*cmd++ = drawctxt->cond_execs[2].gpuaddr >> 2;
 	*cmd++ = 0x0000ffff;
-	*cmd++ = 3; 
+	*cmd++ = 3; /* EXEC_COUNT */
 	*cmd++ = cp_type3_packet(CP_LOAD_STATE, 2);
 	drawctxt->constant_load_commands[0].hostptr = cmd;
 	drawctxt->constant_load_commands[0].gpuaddr = virt2gpu(cmd,
 		&drawctxt->gpustate);
 
+	/*
+	   From fixup:
 
-	*cmd++ = 0;		
+	   mode = 4 (indirect)
+	   stateblock = 4 (Vertex constants)
+	   numunits = SP_VS_CTRL_REG1.VSCONSTLENGTH * 2; (256bit units)
+
+	   From register spec:
+	   SP_VS_CTRL_REG1.VSCONSTLENGTH [09:00]: 0-512, unit = 128bits.
+
+	   ord1 = (numunits<<22) | (stateblock<<19) | (mode<<16);
+	 */
+
+	*cmd++ = 0;		/* ord1 */
 	*cmd++ = ((drawctxt->gpustate.gpuaddr) & 0xfffffffc) | 1;
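+	/*
+	 * Illustration only: for a hypothetical VSCONSTLENGTH of 16
+	 * (32 256-bit units) the fixup would patch ord1 above to
+	 * (32 << 22) | (4 << 19) | (4 << 16) = 0x08240000.
+	 */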
 
-	
+	/* Load fragment shader constants */
 	*cmd++ = cp_type3_packet(CP_COND_EXEC, 4);
 	*cmd++ = drawctxt->cond_execs[3].gpuaddr >> 2;
 	*cmd++ = drawctxt->cond_execs[3].gpuaddr >> 2;
 	*cmd++ = 0x0000ffff;
-	*cmd++ = 3; 
+	*cmd++ = 3; /* EXEC_COUNT */
 	*cmd++ = cp_type3_packet(CP_LOAD_STATE, 2);
 	drawctxt->constant_load_commands[1].hostptr = cmd;
 	drawctxt->constant_load_commands[1].gpuaddr =
 	    virt2gpu(cmd, &drawctxt->gpustate);
+	/*
+	   From fixup:
 
-	*cmd++ = 0;		
+	   mode = 4 (indirect)
+	   stateblock = 6 (Fragment constants)
+	   numunits = SP_FS_CTRL_REG1.FSCONSTLENGTH * 2; (256bit units)
+
+	   From register spec:
+	   SP_FS_CTRL_REG1.FSCONSTLENGTH [09:00]: 0-512, unit = 128bits.
+
+	   ord1 = (numunits<<22) | (stateblock<<19) | (mode<<16);
+	 */
+
+	*cmd++ = 0;		/* ord1 */
 	drawctxt->constant_load_commands[2].hostptr = cmd;
 	drawctxt->constant_load_commands[2].gpuaddr =
 	    virt2gpu(cmd, &drawctxt->gpustate);
-	*cmd++ = 0;		
+	/*
+	   From fixup:
+	   base = drawctxt->gpustate.gpuaddr (ALU constant shadow base)
+	   offset = SP_FS_OBJ_OFFSET_REG.CONSTOBJECTSTARTOFFSET
 
-	
+	   From register spec:
+	   SP_FS_OBJ_OFFSET_REG.CONSTOBJECTSTARTOFFSET [16:24]: Constant object
+	   start offset in on chip RAM,
+	   128bit aligned
+
+	   ord2 = base + offset | 1
+	   Because of the base alignment we can use
+	   ord2 = base | offset | 1
+	 */
+	*cmd++ = 0;		/* ord2 */
+
+	/* Restore VS texture memory objects */
 	stateblock = 0;
 	statetype = 1;
 	numunits = (TEX_SIZE_MEM_OBJECTS / 7) / 4;
@@ -1877,7 +2067,7 @@
 	*cmd++ = ((drawctxt->gpustate.gpuaddr + VS_TEX_OFFSET_MEM_OBJECTS)
 	    & 0xfffffffc) | statetype;
 
-	
+	/* Restore VS texture mipmap addresses */
 	stateblock = 1;
 	statetype = 1;
 	numunits = TEX_SIZE_MIPMAP / 4;
@@ -1886,7 +2076,7 @@
 	*cmd++ = ((drawctxt->gpustate.gpuaddr + VS_TEX_OFFSET_MIPMAP)
 	    & 0xfffffffc) | statetype;
 
-	
+	/* Restore VS texture sampler objects */
 	stateblock = 0;
 	statetype = 0;
 	numunits = (TEX_SIZE_SAMPLER_OBJ / 2) / 4;
@@ -1895,7 +2085,7 @@
 	*cmd++ = ((drawctxt->gpustate.gpuaddr + VS_TEX_OFFSET_SAMPLER_OBJ)
 	    & 0xfffffffc) | statetype;
 
-	
+	/* Restore FS texture memory objects */
 	stateblock = 2;
 	statetype = 1;
 	numunits = (TEX_SIZE_MEM_OBJECTS / 7) / 4;
@@ -1904,7 +2094,7 @@
 	*cmd++ = ((drawctxt->gpustate.gpuaddr + FS_TEX_OFFSET_MEM_OBJECTS)
 	    & 0xfffffffc) | statetype;
 
-	
+	/* Restore FS texture mipmap addresses */
 	stateblock = 3;
 	statetype = 1;
 	numunits = TEX_SIZE_MIPMAP / 4;
@@ -1913,7 +2103,7 @@
 	*cmd++ = ((drawctxt->gpustate.gpuaddr + FS_TEX_OFFSET_MIPMAP)
 	    & 0xfffffffc) | statetype;
 
-	
+	/* Restore FS texture sampler objects */
 	stateblock = 2;
 	statetype = 0;
 	numunits = (TEX_SIZE_SAMPLER_OBJ / 2) / 4;
@@ -1932,32 +2122,60 @@
 	unsigned int *cmd = tmp_ctx.cmd;
 	unsigned int *start = cmd;
 
-	
+	/* Vertex shader */
 	*cmd++ = cp_type3_packet(CP_COND_EXEC, 4);
 	*cmd++ = drawctxt->cond_execs[0].gpuaddr >> 2;
 	*cmd++ = drawctxt->cond_execs[0].gpuaddr >> 2;
 	*cmd++ = 1;
-	*cmd++ = 3;		
+	*cmd++ = 3;		/* EXEC_COUNT */
 
 	*cmd++ = cp_type3_packet(CP_LOAD_STATE, 2);
 	drawctxt->shader_load_commands[0].hostptr = cmd;
 	drawctxt->shader_load_commands[0].gpuaddr =
 	    virt2gpu(cmd, &drawctxt->gpustate);
-	*cmd++ = 0;		
+	/*
+	   From fixup:
+
+	   mode = 4 (indirect)
+	   stateblock = 4 (Vertex shader)
+	   numunits = SP_VS_CTRL_REG0.VS_LENGTH
+
+	   From regspec:
+	   SP_VS_CTRL_REG0.VS_LENGTH [31:24]: VS length, unit = 256bits.
+	   If bit31 is 1, it means overflow
+	   or any long shader.
+
+	   ord1 = (numunits<<22) | (stateblock<<19) | (mode<<16)
+	 */
+	*cmd++ = 0;		/*ord1 */
 	*cmd++ = (drawctxt->gpustate.gpuaddr + SHADER_OFFSET) & 0xfffffffc;
 
-	
+	/* Fragment shader */
 	*cmd++ = cp_type3_packet(CP_COND_EXEC, 4);
 	*cmd++ = drawctxt->cond_execs[1].gpuaddr >> 2;
 	*cmd++ = drawctxt->cond_execs[1].gpuaddr >> 2;
 	*cmd++ = 1;
-	*cmd++ = 3;		
+	*cmd++ = 3;		/* EXEC_COUNT */
 
 	*cmd++ = cp_type3_packet(CP_LOAD_STATE, 2);
 	drawctxt->shader_load_commands[1].hostptr = cmd;
 	drawctxt->shader_load_commands[1].gpuaddr =
 	    virt2gpu(cmd, &drawctxt->gpustate);
-	*cmd++ = 0;		
+	/*
+	   From fixup:
+
+	   mode = 4 (indirect)
+	   stateblock = 6 (Fragment shader)
+	   numunits = SP_FS_CTRL_REG0.FS_LENGTH
+
+	   From regspec:
+	   SP_FS_CTRL_REG0.FS_LENGTH [31:24]: FS length, unit = 256bits.
+	   If bit31 is 1, it means overflow
+	   or any long shader.
+
+	   ord1 = (numunits<<22) | (stateblock<<19) | (mode<<16)
+	 */
+	*cmd++ = 0;		/*ord1 */
 	*cmd++ = (drawctxt->gpustate.gpuaddr + SHADER_OFFSET
 		  + (SHADER_SHADOW_SIZE / 2)) & 0xfffffffc;
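A minimal illustration (not part of the patch) of the ord1 packing that the CP_LOAD_STATE comments above describe; the helper name and the example VS_LENGTH value are made up for this sketch:

/* Illustrative only: pack a CP_LOAD_STATE ord1 word as described above. */
static unsigned int pack_load_state_ord1(unsigned int numunits,
					 unsigned int stateblock,
					 unsigned int mode)
{
	return (numunits << 22) | (stateblock << 19) | (mode << 16);
}

/*
 * Example: SP_VS_CTRL_REG0.VS_LENGTH = 2 (two 256-bit units),
 * stateblock = 4 (vertex shader), mode = 4 (indirect):
 * (2 << 22) | (4 << 19) | (4 << 16) = 0x00a40000
 */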
 
@@ -1978,12 +2196,13 @@
 	    = virt2gpu(cmd, &drawctxt->gpustate);
 	*cmd++ = 0;
 
-	
+	/* Create indirect buffer command for above command sequence */
 	create_ib1(drawctxt, drawctxt->hlsqcontrol_restore, start, cmd);
 
 	tmp_ctx.cmd = cmd;
 }
 
+/* IB that modifies the shader and constant sizes and offsets in restore IBs. */
 static void build_restore_fixup_cmds(struct adreno_device *adreno_dev,
 				     struct adreno_context *drawctxt)
 {
@@ -1991,7 +2210,7 @@
 	unsigned int *start = cmd;
 
 #ifdef GSL_CONTEXT_SWITCH_CPU_SYNC
-	
+	/* Save shader sizes */
 	*cmd++ = cp_type3_packet(CP_REG_TO_MEM, 2);
 	*cmd++ = A3XX_SP_VS_CTRL_REG0;
 	*cmd++ = drawctxt->shader_load_commands[0].gpuaddr;
@@ -2000,7 +2219,7 @@
 	*cmd++ = A3XX_SP_FS_CTRL_REG0;
 	*cmd++ = drawctxt->shader_load_commands[1].gpuaddr;
 
-	
+	/* Save constant sizes */
 	*cmd++ = cp_type3_packet(CP_REG_TO_MEM, 2);
 	*cmd++ = A3XX_SP_VS_CTRL_REG1;
 	*cmd++ = drawctxt->constant_load_commands[0].gpuaddr;
@@ -2009,12 +2228,12 @@
 	*cmd++ = A3XX_SP_FS_CTRL_REG1;
 	*cmd++ = drawctxt->constant_load_commands[1].gpuaddr;
 
-	
+	/* Save constant offsets */
 	*cmd++ = cp_type3_packet(CP_REG_TO_MEM, 2);
 	*cmd++ = A3XX_SP_FS_OBJ_OFFSET_REG;
 	*cmd++ = drawctxt->constant_load_commands[2].gpuaddr;
 #else
-	
+	/* Save shader sizes */
 	cmd = rmw_regtomem(cmd, A3XX_SP_VS_CTRL_REG0, 0x7f000000,
 			   30, (4 << 19) | (4 << 16),
 			   drawctxt->shader_load_commands[0].gpuaddr);
@@ -2023,7 +2242,7 @@
 			   30, (6 << 19) | (4 << 16),
 			   drawctxt->shader_load_commands[1].gpuaddr);
 
-	
+	/* Save constant sizes */
 	cmd = rmw_regtomem(cmd, A3XX_SP_VS_CTRL_REG1, 0x000003ff,
 			   23, (4 << 19) | (4 << 16),
 			   drawctxt->constant_load_commands[0].gpuaddr);
@@ -2032,19 +2251,21 @@
 			   23, (6 << 19) | (4 << 16),
 			   drawctxt->constant_load_commands[1].gpuaddr);
 
-	
+	/* Modify constant restore conditionals */
 	cmd = rmw_regtomem(cmd, A3XX_SP_VS_CTRL_REG1, 0x000003ff,
 			0, 0, drawctxt->cond_execs[2].gpuaddr);
 
 	cmd = rmw_regtomem(cmd, A3XX_SP_FS_CTRL_REG1, 0x000003ff,
 			0, 0, drawctxt->cond_execs[3].gpuaddr);
 
-	
+	/* Save fragment constant shadow offset */
 	cmd = rmw_regtomem(cmd, A3XX_SP_FS_OBJ_OFFSET_REG, 0x00ff0000,
 			   18, (drawctxt->gpustate.gpuaddr & 0xfffffe00) | 1,
 			   drawctxt->constant_load_commands[2].gpuaddr);
 #endif
 
+	/* Use mask value to avoid flushing HLSQ which would cause the HW to
+	   discard all the shader data */
 
 	cmd = rmw_regtomem(cmd,  A3XX_HLSQ_CONTROL_0_REG, 0x9ffffdff,
 		0, 0, drawctxt->hlsqcontrol_restore_commands[0].gpuaddr);
@@ -2071,6 +2292,7 @@
 	return 0;
 }
 
+/* create buffers for saving/restoring registers, constants, & GMEM */
 static int a3xx_create_gmem_shadow(struct adreno_device *adreno_dev,
 				 struct adreno_context *drawctxt)
 {
@@ -2106,6 +2328,12 @@
 {
 	int ret;
 
+	/*
+	 * Allocate memory for the GPU state and the context commands.
+	 * Despite the name, this is much more than just storage for
+	 * the gpustate.  It also contains command space for the GMEM save
+	 * and for texture and vertex buffer storage.
+	 */
 
 	ret = kgsl_allocate(&drawctxt->gpustate,
 		drawctxt->pagetable, CONTEXT_SIZE);
@@ -2139,7 +2367,7 @@
 {
 	struct kgsl_device *device = &adreno_dev->dev;
 
-	if (context == NULL)
+	if (context == NULL || (context->flags & CTXT_FLAGS_BEING_DESTROYED))
 		return;
 
 	if (context->flags & CTXT_FLAGS_GPU_HANG)
@@ -2147,17 +2375,17 @@
 			       "Current active context has caused gpu hang\n");
 
 	if (!(context->flags & CTXT_FLAGS_PREAMBLE)) {
-		
+		/* Fixup self modifying IBs for save operations */
 		adreno_ringbuffer_issuecmds(device, context,
 			KGSL_CMD_FLAGS_NONE, context->save_fixup, 3);
 
-		
+		/* save registers and constants. */
 		adreno_ringbuffer_issuecmds(device, context,
 			KGSL_CMD_FLAGS_NONE,
 			context->regconstant_save, 3);
 
 		if (context->flags & CTXT_FLAGS_SHADER_SAVE) {
-			
+			/* Save shader instructions */
 			adreno_ringbuffer_issuecmds(device, context,
 				KGSL_CMD_FLAGS_PMODE, context->shader_save, 3);
 
@@ -2167,6 +2395,10 @@
 
 	if ((context->flags & CTXT_FLAGS_GMEM_SAVE) &&
 	    (context->flags & CTXT_FLAGS_GMEM_SHADOW)) {
+		/*
+		 * Save GMEM (note: changes shader. shader must
+		 * already be saved.)
+		 */
 
 		adreno_ringbuffer_issuecmds(device, context,
 					KGSL_CMD_FLAGS_PMODE,
@@ -2183,7 +2415,7 @@
 	unsigned int cmds[5];
 
 	if (context == NULL) {
-		
+		/* No context - set the default pagetable and thats it */
 		kgsl_mmu_setstate(&device->mmu, device->mmu.defaultpagetable,
 				adreno_dev->drawctxt_active->id);
 		return;
@@ -2201,6 +2433,10 @@
 					cmds, 5);
 	kgsl_mmu_setstate(&device->mmu, context->pagetable, context->id);
 
+	/*
+	 * Restore GMEM.  (note: changes shader.
+	 * Shader must not already be restored.)
+	 */
 
 	if (context->flags & CTXT_FLAGS_GMEM_RESTORE) {
 		adreno_ringbuffer_issuecmds(device, context,
@@ -2214,7 +2450,7 @@
 		adreno_ringbuffer_issuecmds(device, context,
 			KGSL_CMD_FLAGS_NONE, context->reg_restore, 3);
 
-		
+		/* Fixup self modifying IBs for restore operations */
 		adreno_ringbuffer_issuecmds(device, context,
 			KGSL_CMD_FLAGS_NONE,
 			context->restore_fixup, 3);
@@ -2228,7 +2464,7 @@
 				KGSL_CMD_FLAGS_NONE,
 				context->shader_restore, 3);
 
-		
+		/* Restore HLSQ_CONTROL_0 register */
 		adreno_ringbuffer_issuecmds(device, context,
 			KGSL_CMD_FLAGS_NONE,
 			context->hlsqcontrol_restore, 3);
@@ -2239,7 +2475,7 @@
 			 struct adreno_ringbuffer *rb)
 {
 	unsigned int *cmds, cmds_gpu;
-	cmds = adreno_ringbuffer_allocspace(rb, 18);
+	cmds = adreno_ringbuffer_allocspace(rb, NULL, 18);
 	cmds_gpu = rb->buffer_desc.gpuaddr + sizeof(uint) * (rb->wptr - 18);
 
 	GSL_RB_WRITE(cmds, cmds_gpu, cp_type3_packet(CP_ME_INIT, 17));
@@ -2257,7 +2493,7 @@
 	GSL_RB_WRITE(cmds, cmds_gpu, 0x00000001);
 	GSL_RB_WRITE(cmds, cmds_gpu, 0x00000000);
 	GSL_RB_WRITE(cmds, cmds_gpu, 0x00000000);
-	
+	/* Protected mode control - turned off for A3XX */
 	GSL_RB_WRITE(cmds, cmds_gpu, 0x00000000);
 	GSL_RB_WRITE(cmds, cmds_gpu, 0x00000000);
 	GSL_RB_WRITE(cmds, cmds_gpu, 0x00000000);
@@ -2276,6 +2512,10 @@
 
 		adreno_regread(device, A3XX_RBBM_AHB_ERROR_STATUS, &reg);
 
+		/*
+		 * Return the word address of the erroring register so that it
+		 * matches the register specification
+		 */
 
 		KGSL_DRV_CRIT(device,
 			"RBBM | AHB bus error | %s | addr=%x | ports=%x:%x\n",
@@ -2283,7 +2523,7 @@
 			(reg & 0xFFFFF) >> 2, (reg >> 20) & 0x3,
 			(reg >> 24) & 0x3);
 
-		
+		/* Clear the error */
 		adreno_regwrite(device, A3XX_RBBM_AHB_CMD, (1 << 3));
 		return;
 	}
@@ -2336,23 +2576,10 @@
 {
 	struct kgsl_device *device = &adreno_dev->dev;
 
-	if (irq == A3XX_INT_CP_RB_INT) {
-		unsigned int context_id;
-		kgsl_sharedmem_readl(&device->memstore, &context_id,
-				KGSL_MEMSTORE_OFFSET(KGSL_MEMSTORE_GLOBAL,
-					current_context));
-		if (context_id < KGSL_MEMSTORE_MAX) {
-			kgsl_sharedmem_writel(&device->memstore,
-					KGSL_MEMSTORE_OFFSET(context_id,
-						ts_cmp_enable), 0);
-			wmb();
-		}
-		KGSL_CMD_WARN(device, "ringbuffer rb interrupt\n");
-	}
-
+	/* Wake up everybody waiting for the interrupt */
 	wake_up_interruptible_all(&device->wait_queue);
 
-	
+	/* Schedule work to free mem and issue ibs */
 	queue_work(device->work_queue, &device->ts_expired_ws);
 
 	atomic_notifier_call_chain(&device->ts_notifier_list,
@@ -2378,33 +2605,33 @@
 static struct {
 	void (*func)(struct adreno_device *, int);
 } a3xx_irq_funcs[] = {
-	A3XX_IRQ_CALLBACK(NULL),               
-	A3XX_IRQ_CALLBACK(a3xx_err_callback),  
-	A3XX_IRQ_CALLBACK(a3xx_err_callback),  
-	A3XX_IRQ_CALLBACK(a3xx_err_callback),  
-	A3XX_IRQ_CALLBACK(a3xx_err_callback),  
-	A3XX_IRQ_CALLBACK(a3xx_err_callback),  
-	A3XX_IRQ_CALLBACK(a3xx_err_callback),  
-	A3XX_IRQ_CALLBACK(NULL),	       
-	A3XX_IRQ_CALLBACK(a3xx_err_callback),  
-	A3XX_IRQ_CALLBACK(a3xx_err_callback),  
-	A3XX_IRQ_CALLBACK(a3xx_err_callback),  
-	A3XX_IRQ_CALLBACK(a3xx_err_callback),  
-	A3XX_IRQ_CALLBACK(NULL),	       
-	A3XX_IRQ_CALLBACK(a3xx_cp_callback),   
-	A3XX_IRQ_CALLBACK(a3xx_cp_callback),   
-	A3XX_IRQ_CALLBACK(a3xx_cp_callback),   
-	A3XX_IRQ_CALLBACK(a3xx_err_callback),  
-	A3XX_IRQ_CALLBACK(NULL),	       
-	A3XX_IRQ_CALLBACK(NULL),	       
-	A3XX_IRQ_CALLBACK(NULL),	       
-	A3XX_IRQ_CALLBACK(NULL),	       
-	A3XX_IRQ_CALLBACK(a3xx_err_callback),  
-	A3XX_IRQ_CALLBACK(NULL),	       
-	A3XX_IRQ_CALLBACK(NULL),	       
-	A3XX_IRQ_CALLBACK(NULL),	       
-	A3XX_IRQ_CALLBACK(a3xx_err_callback),  
-	
+	A3XX_IRQ_CALLBACK(NULL),               /* 0 - RBBM_GPU_IDLE */
+	A3XX_IRQ_CALLBACK(a3xx_err_callback),  /* 1 - RBBM_AHB_ERROR */
+	A3XX_IRQ_CALLBACK(a3xx_err_callback),  /* 2 - RBBM_REG_TIMEOUT */
+	A3XX_IRQ_CALLBACK(a3xx_err_callback),  /* 3 - RBBM_ME_MS_TIMEOUT */
+	A3XX_IRQ_CALLBACK(a3xx_err_callback),  /* 4 - RBBM_PFP_MS_TIMEOUT */
+	A3XX_IRQ_CALLBACK(a3xx_err_callback),  /* 5 - RBBM_ATB_BUS_OVERFLOW */
+	A3XX_IRQ_CALLBACK(a3xx_err_callback),  /* 6 - RBBM_VFD_ERROR */
+	A3XX_IRQ_CALLBACK(NULL),	       /* 7 - CP_SW */
+	A3XX_IRQ_CALLBACK(a3xx_err_callback),  /* 8 - CP_T0_PACKET_IN_IB */
+	A3XX_IRQ_CALLBACK(a3xx_err_callback),  /* 9 - CP_OPCODE_ERROR */
+	A3XX_IRQ_CALLBACK(a3xx_err_callback),  /* 10 - CP_RESERVED_BIT_ERROR */
+	A3XX_IRQ_CALLBACK(a3xx_err_callback),  /* 11 - CP_HW_FAULT */
+	A3XX_IRQ_CALLBACK(NULL),	       /* 12 - CP_DMA */
+	A3XX_IRQ_CALLBACK(a3xx_cp_callback),   /* 13 - CP_IB2_INT */
+	A3XX_IRQ_CALLBACK(a3xx_cp_callback),   /* 14 - CP_IB1_INT */
+	A3XX_IRQ_CALLBACK(a3xx_cp_callback),   /* 15 - CP_RB_INT */
+	A3XX_IRQ_CALLBACK(a3xx_err_callback),  /* 16 - CP_REG_PROTECT_FAULT */
+	A3XX_IRQ_CALLBACK(NULL),	       /* 17 - CP_RB_DONE_TS */
+	A3XX_IRQ_CALLBACK(NULL),	       /* 18 - CP_VS_DONE_TS */
+	A3XX_IRQ_CALLBACK(NULL),	       /* 19 - CP_PS_DONE_TS */
+	A3XX_IRQ_CALLBACK(NULL),	       /* 20 - CP_CACHE_FLUSH_TS */
+	A3XX_IRQ_CALLBACK(a3xx_err_callback),  /* 21 - CP_AHB_ERROR_FAULT */
+	A3XX_IRQ_CALLBACK(NULL),	       /* 22 - Unused */
+	A3XX_IRQ_CALLBACK(NULL),	       /* 23 - Unused */
+	A3XX_IRQ_CALLBACK(NULL),	       /* 24 - MISC_HANG_DETECT */
+	A3XX_IRQ_CALLBACK(a3xx_err_callback),  /* 25 - UCHE_OOB_ACCESS */
+	/* 26 to 31 - Unused */
 };
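The table above pairs one callback (or NULL) with each bit of A3XX_RBBM_INT_0_STATUS. A hedged sketch of how such a table is typically walked; this loop is illustrative and is not the body of the driver's actual a3xx_irq_handler():

/* Illustrative sketch of table-driven IRQ dispatch. */
static void a3xx_dispatch_irq_bits(struct adreno_device *adreno_dev,
				   unsigned int status)
{
	int i;

	for (i = 0; i < ARRAY_SIZE(a3xx_irq_funcs); i++) {
		if ((status & (1 << i)) && a3xx_irq_funcs[i].func)
			a3xx_irq_funcs[i].func(adreno_dev, i);
	}
}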
 
 static irqreturn_t a3xx_irq_handler(struct adreno_device *adreno_dev)
@@ -2448,24 +2675,33 @@
 		adreno_regwrite(device, A3XX_RBBM_INT_0_MASK, 0);
 }
 
+static unsigned int a3xx_irq_pending(struct adreno_device *adreno_dev)
+{
+	unsigned int status;
+
+	adreno_regread(&adreno_dev->dev, A3XX_RBBM_INT_0_STATUS, &status);
+
+	return (status & A3XX_INT_MASK) ? 1 : 0;
+}
+
 static unsigned int a3xx_busy_cycles(struct adreno_device *adreno_dev)
 {
 	struct kgsl_device *device = &adreno_dev->dev;
 	unsigned int reg, val;
 
-	
+	/* Freeze the counter */
 	adreno_regread(device, A3XX_RBBM_RBBM_CTL, &reg);
 	reg &= ~RBBM_RBBM_CTL_ENABLE_PWR_CTR1;
 	adreno_regwrite(device, A3XX_RBBM_RBBM_CTL, reg);
 
-	
+	/* Read the value */
 	adreno_regread(device, A3XX_RBBM_PERFCTR_PWR_1_LO, &val);
 
-	
+	/* Reset the counter */
 	reg |= RBBM_RBBM_CTL_RESET_PWR_CTR1;
 	adreno_regwrite(device, A3XX_RBBM_RBBM_CTL, reg);
 
-	
+	/* Re-enable the counter */
 	reg &= ~RBBM_RBBM_CTL_RESET_PWR_CTR1;
 	reg |= RBBM_RBBM_CTL_ENABLE_PWR_CTR1;
 	adreno_regwrite(device, A3XX_RBBM_RBBM_CTL, reg);
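a3xx_busy_cycles() freezes power counter 1, samples it, then resets and re-enables it, so each call returns the cycles accumulated since the previous call. One plausible, purely illustrative way a caller could turn a sample into a utilization estimate; the total-cycle figure is an assumption supplied by the caller, not something this patch computes:

/*
 * Illustrative only: busy is the value returned by busy_cycles(), total is
 * the number of GPU clock cycles that elapsed over the same interval.
 */
static unsigned int gpu_busy_percent(unsigned int busy, unsigned int total)
{
	if (total == 0)
		return 0;
	return (unsigned int)(((unsigned long long)busy * 100) / total);
}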
@@ -2478,8 +2714,9 @@
 	unsigned int val;
 };
 
+/* VBIF registers start after 0x3000 so use 0x0 as end of list marker */
 static struct a3xx_vbif_data a305_vbif[] = {
-	
+	/* Set up 16 deep read/write request queues */
 	{ A3XX_VBIF_IN_RD_LIM_CONF0, 0x10101010 },
 	{ A3XX_VBIF_IN_RD_LIM_CONF1, 0x10101010 },
 	{ A3XX_VBIF_OUT_RD_LIM_CONF0, 0x10101010 },
@@ -2487,18 +2724,18 @@
 	{ A3XX_VBIF_DDR_OUT_MAX_BURST, 0x0000303 },
 	{ A3XX_VBIF_IN_WR_LIM_CONF0, 0x10101010 },
 	{ A3XX_VBIF_IN_WR_LIM_CONF1, 0x10101010 },
-	
+	/* Enable WR-REQ */
 	{ A3XX_VBIF_GATE_OFF_WRREQ_EN, 0x0000FF },
-	
+	/* Set up round robin arbitration between both AXI ports */
 	{ A3XX_VBIF_ARB_CTL, 0x00000030 },
-	
+	/* Set up AOOO */
 	{ A3XX_VBIF_OUT_AXI_AOOO_EN, 0x0000003C },
 	{ A3XX_VBIF_OUT_AXI_AOOO, 0x003C003C },
 	{0, 0},
 };
 
 static struct a3xx_vbif_data a320_vbif[] = {
-	
+	/* Set up 16 deep read/write request queues */
 	{ A3XX_VBIF_IN_RD_LIM_CONF0, 0x10101010 },
 	{ A3XX_VBIF_IN_RD_LIM_CONF1, 0x10101010 },
 	{ A3XX_VBIF_OUT_RD_LIM_CONF0, 0x10101010 },
@@ -2506,19 +2743,47 @@
 	{ A3XX_VBIF_DDR_OUT_MAX_BURST, 0x0000303 },
 	{ A3XX_VBIF_IN_WR_LIM_CONF0, 0x10101010 },
 	{ A3XX_VBIF_IN_WR_LIM_CONF1, 0x10101010 },
-	
+	/* Enable WR-REQ */
 	{ A3XX_VBIF_GATE_OFF_WRREQ_EN, 0x0000FF },
-	
+	/* Set up round robin arbitration between both AXI ports */
 	{ A3XX_VBIF_ARB_CTL, 0x00000030 },
-	
+	/* Set up AOOO */
 	{ A3XX_VBIF_OUT_AXI_AOOO_EN, 0x0000003C },
 	{ A3XX_VBIF_OUT_AXI_AOOO, 0x003C003C },
-	
+	/* Enable 1K sort */
 	{ A3XX_VBIF_ABIT_SORT, 0x000000FF },
 	{ A3XX_VBIF_ABIT_SORT_CONF, 0x000000A4 },
 	{0, 0},
 };
 
+static struct a3xx_vbif_data a330_vbif[] = {
+	/* Set up 16 deep read/write request queues */
+	{ A3XX_VBIF_IN_RD_LIM_CONF0, 0x18181818 },
+	{ A3XX_VBIF_IN_RD_LIM_CONF1, 0x00001818 },
+	{ A3XX_VBIF_OUT_RD_LIM_CONF0, 0x00001818 },
+	{ A3XX_VBIF_OUT_WR_LIM_CONF0, 0x00001818 },
+	{ A3XX_VBIF_DDR_OUT_MAX_BURST, 0x0000303 },
+	{ A3XX_VBIF_IN_WR_LIM_CONF0, 0x18181818 },
+	{ A3XX_VBIF_IN_WR_LIM_CONF1, 0x00001818 },
+	/* Enable WR-REQ */
+	{ A3XX_VBIF_GATE_OFF_WRREQ_EN, 0x00003F },
+	/* Set up round robin arbitration between both AXI ports */
+	{ A3XX_VBIF_ARB_CTL, 0x00000030 },
+	/* Set up VBIF_ROUND_ROBIN_QOS_ARB */
+	{ A3XX_VBIF_ROUND_ROBIN_QOS_ARB, 0x0001 },
+	/* Set up AOOO */
+	{ A3XX_VBIF_OUT_AXI_AOOO_EN, 0x0000003F },
+	{ A3XX_VBIF_OUT_AXI_AOOO, 0x003F003F },
+	/* Enable 1K sort */
+	{ A3XX_VBIF_ABIT_SORT, 0x0001003F },
+	{ A3XX_VBIF_ABIT_SORT_CONF, 0x000000A4 },
+	/* Disable VBIF clock gating. This allows the AXI interface to run
+	 * at a higher frequency than the GPU.
+	 */
+	{ A3XX_VBIF_CLKON, 1 },
+	{0, 0},
+};
+
 static void a3xx_start(struct adreno_device *adreno_dev)
 {
 	struct kgsl_device *device = &adreno_dev->dev;
@@ -2528,6 +2793,8 @@
 		vbif = a305_vbif;
 	else if (adreno_is_a320(adreno_dev))
 		vbif = a320_vbif;
+	else if (adreno_is_a330(adreno_dev))
+		vbif = a330_vbif;
 
 	BUG_ON(vbif == NULL);
 
@@ -2536,32 +2803,66 @@
 		vbif++;
 	}
 
-	
+	/* Make all blocks contribute to the GPU BUSY perf counter */
 	adreno_regwrite(device, A3XX_RBBM_GPU_BUSY_MASKED, 0xFFFFFFFF);
 
-	
+	/* Tune the hysteresis counters for SP and CP idle detection */
 	adreno_regwrite(device, A3XX_RBBM_SP_HYST_CNT, 0x10);
 	adreno_regwrite(device, A3XX_RBBM_WAIT_IDLE_CLOCKS_CTL, 0x10);
 
+	/* Enable the RBBM error reporting bits.  This lets us get
+	   useful information on failure */
 
 	adreno_regwrite(device, A3XX_RBBM_AHB_CTL0, 0x00000001);
 
-	
+	/* Enable AHB error reporting */
 	adreno_regwrite(device, A3XX_RBBM_AHB_CTL1, 0xA6FFFFFF);
 
-	
+	/* Turn on the power counters */
 	adreno_regwrite(device, A3XX_RBBM_RBBM_CTL, 0x00030000);
 
+	/* Turn on hang detection - this spews a lot of useful information
+	 * into the RBBM registers on a hang */
 
 	adreno_regwrite(device, A3XX_RBBM_INTERFACE_HANG_INT_CTL,
 			(1 << 16) | 0xFFF);
 
-	
+	/* Enable 64-byte cacheline size. HW Default is 32-byte (0x000000E0). */
+	adreno_regwrite(device, A3XX_UCHE_CACHE_MODE_CONTROL_REG, 0x00000001);
+
+	/* Enable Clock gating */
 	adreno_regwrite(device, A3XX_RBBM_CLOCK_CTL,
 			A3XX_RBBM_CLOCK_CTL_DEFAULT);
 
+	/* Set the OCMEM base address for A330 */
+	if (adreno_is_a330(adreno_dev)) {
+		adreno_regwrite(device, A3XX_RB_GMEM_BASE_ADDR,
+			(unsigned int)(adreno_dev->ocmem_base >> 14));
+	}
+
+	/* Turn on performance counters */
+	adreno_regwrite(device, A3XX_RBBM_PERFCTR_CTL, 0x01);
+
+	/*
+	 * Set SP perfcounter 5 to count SP_ALU_ACTIVE_CYCLES; it includes
+	 * all ALU instruction execution regardless of precision or shader ID.
+	 * Set SP perfcounter 6 to count SP0_ICL1_MISSES; it counts
+	 * USP L1 instruction miss requests.
+	 * Set SP perfcounter 7 to count SP_FS_CFLOW_INSTRUCTIONS; it
+	 * counts USP flow control instruction execution.
+	 * We will use these to augment our hang detection.
+	 */
+	if (adreno_dev->fast_hang_detect) {
+		adreno_regwrite(device, A3XX_SP_PERFCOUNTER5_SELECT,
+			SP_ALU_ACTIVE_CYCLES);
+		adreno_regwrite(device, A3XX_SP_PERFCOUNTER6_SELECT,
+			SP0_ICL1_MISSES);
+		adreno_regwrite(device, A3XX_SP_PERFCOUNTER7_SELECT,
+			SP_FS_CFLOW_INSTRUCTIONS);
+	}
 }
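With fast_hang_detect enabled, the three SP counters programmed above should keep advancing while shaders make progress. A simplified, purely illustrative sketch of the sort of comparison a hang heuristic could apply to periodic samples (the structure and function below are not the driver's actual hang-detect code):

/* Illustrative: counters that stop moving between samples suggest a hang. */
struct sp_hang_sample {
	unsigned int alu_active;	/* SP_ALU_ACTIVE_CYCLES */
	unsigned int icl1_misses;	/* SP0_ICL1_MISSES */
	unsigned int fs_cflow;		/* SP_FS_CFLOW_INSTRUCTIONS */
};

static int sp_counters_stalled(const struct sp_hang_sample *prev,
			       const struct sp_hang_sample *cur)
{
	return prev->alu_active == cur->alu_active &&
	       prev->icl1_misses == cur->icl1_misses &&
	       prev->fs_cflow == cur->fs_cflow;
}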
 
+/* Defined in adreno_a3xx_snapshot.c */
 void *a3xx_snapshot(struct adreno_device *adreno_dev, void *snapshot,
 	int *remain, int hang);
 
@@ -2577,6 +2878,7 @@
 	.rb_init = a3xx_rb_init,
 	.irq_control = a3xx_irq_control,
 	.irq_handler = a3xx_irq_handler,
+	.irq_pending = a3xx_irq_pending,
 	.busy_cycles = a3xx_busy_cycles,
 	.start = a3xx_start,
 	.snapshot = a3xx_snapshot,
diff --git a/drivers/gpu/msm/adreno_a3xx_snapshot.c b/drivers/gpu/msm/adreno_a3xx_snapshot.c
index 14cdaaa..d9d5ec8 100644
--- a/drivers/gpu/msm/adreno_a3xx_snapshot.c
+++ b/drivers/gpu/msm/adreno_a3xx_snapshot.c
@@ -1,4 +1,4 @@
-/* Copyright (c) 2012, Code Aurora Forum. All rights reserved.
+/* Copyright (c) 2012, The Linux Foundation. All rights reserved.
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License version 2 and
@@ -113,6 +113,12 @@
 	header->type = SNAPSHOT_DEBUG_CP_PM4_RAM;
 	header->size = size;
 
+	/*
+	 * Read the firmware from the GPU rather than use our cache in order to
+	 * try to catch mis-programming or corruption in the hardware.  We do
+	 * use the cached version of the size, however, instead of trying to
+	 * maintain ever-changing hardcoded constants.
+	 */
 
 	adreno_regwrite(device, REG_CP_ME_RAM_RADDR, 0x0);
 	for (i = 0; i < size; i++)
@@ -137,6 +143,12 @@
 	header->type = SNAPSHOT_DEBUG_CP_PFP_RAM;
 	header->size = size;
 
+	/*
+	 * Read the firmware from the GPU rather than use our cache in order to
+	 * try to catch mis-programming or corruption in the hardware.  We do
+	 * use the cached version of the size, however, instead of trying to
+	 * maintain ever-changing hardcoded constants.
+	 */
 	kgsl_regwrite(device, A3XX_CP_PFP_UCODE_ADDR, 0x0);
 	for (i = 0; i < size; i++)
 		adreno_regread(device, A3XX_CP_PFP_UCODE_DATA, &data[i]);
@@ -144,28 +156,68 @@
 	return DEBUG_SECTION_SZ(size);
 }
 
-#define CP_ROQ_SIZE 128
+/* This is the ROQ buffer size on both the A305 and A320 */
+#define A320_CP_ROQ_SIZE 128
+/* This is the ROQ buffer size on the A330 */
+#define A330_CP_ROQ_SIZE 512
 
 static int a3xx_snapshot_cp_roq(struct kgsl_device *device, void *snapshot,
 		int remain, void *priv)
 {
+	struct adreno_device *adreno_dev = ADRENO_DEVICE(device);
 	struct kgsl_snapshot_debug *header = snapshot;
 	unsigned int *data = snapshot + sizeof(*header);
-	int i;
+	int i, size;
 
-	if (remain < DEBUG_SECTION_SZ(CP_ROQ_SIZE)) {
+	/* The size of the ROQ buffer is core dependent */
+	size = adreno_is_a330(adreno_dev) ?
+		A330_CP_ROQ_SIZE : A320_CP_ROQ_SIZE;
+
+	if (remain < DEBUG_SECTION_SZ(size)) {
 		SNAPSHOT_ERR_NOMEM(device, "CP ROQ DEBUG");
 		return 0;
 	}
 
 	header->type = SNAPSHOT_DEBUG_CP_ROQ;
-	header->size = CP_ROQ_SIZE;
+	header->size = size;
 
 	adreno_regwrite(device, A3XX_CP_ROQ_ADDR, 0x0);
-	for (i = 0; i < CP_ROQ_SIZE; i++)
+	for (i = 0; i < size; i++)
 		adreno_regread(device, A3XX_CP_ROQ_DATA, &data[i]);
 
-	return DEBUG_SECTION_SZ(CP_ROQ_SIZE);
+	return DEBUG_SECTION_SZ(size);
+}
+
+#define A330_CP_MERCIU_QUEUE_SIZE 32
+
+static int a330_snapshot_cp_merciu(struct kgsl_device *device, void *snapshot,
+		int remain, void *priv)
+{
+	struct kgsl_snapshot_debug *header = snapshot;
+	unsigned int *data = snapshot + sizeof(*header);
+	int i, size;
+
+	/* The MERCIU data is two dwords per entry */
+	size = A330_CP_MERCIU_QUEUE_SIZE << 1;
+
+	if (remain < DEBUG_SECTION_SZ(size)) {
+		SNAPSHOT_ERR_NOMEM(device, "CP MERCIU DEBUG");
+		return 0;
+	}
+
+	header->type = SNAPSHOT_DEBUG_CP_MERCIU;
+	header->size = size;
+
+	adreno_regwrite(device, A3XX_CP_MERCIU_ADDR, 0x0);
+
+	for (i = 0; i < A330_CP_MERCIU_QUEUE_SIZE; i++) {
+		adreno_regread(device, A3XX_CP_MERCIU_DATA,
+			&data[(i * 2)]);
+		adreno_regread(device, A3XX_CP_MERCIU_DATA2,
+			&data[(i * 2) + 1]);
+	}
+
+	return DEBUG_SECTION_SZ(size);
 }
 
 #define DEBUGFS_BLOCK_SIZE 0x40
@@ -245,52 +297,97 @@
 	return snapshot;
 }
 
+static void _snapshot_a3xx_regs(struct kgsl_snapshot_registers *regs,
+	struct kgsl_snapshot_registers_list *list)
+{
+	regs[list->count].regs = (unsigned int *) a3xx_registers;
+	regs[list->count].count = a3xx_registers_count;
+	list->count++;
+}
+
+static void _snapshot_hlsq_regs(struct kgsl_snapshot_registers *regs,
+	struct kgsl_snapshot_registers_list *list,
+	struct adreno_device *adreno_dev)
+{
+	/* HLSQ specific registers */
+	/*
+	 * Don't dump any a3xx HLSQ registers just yet.  Reading the HLSQ
+	 * registers can cause the device to hang if the HLSQ block is
+	 * busy.  Add specific checks for each a3xx core as the requirements
+	 * are discovered.  Disable by default for now.
+	 */
+	if (!adreno_is_a3xx(adreno_dev)) {
+		regs[list->count].regs = (unsigned int *) a3xx_hlsq_registers;
+		regs[list->count].count = a3xx_hlsq_registers_count;
+		list->count++;
+	}
+}
+
+static void _snapshot_a330_regs(struct kgsl_snapshot_registers *regs,
+	struct kgsl_snapshot_registers_list *list)
+{
+	/* For A330, append the additional list of new registers to grab */
+	regs[list->count].regs = (unsigned int *) a330_registers;
+	regs[list->count].count = a330_registers_count;
+	list->count++;
+}
+
+/* A3XX GPU snapshot function - this is where all of the A3XX specific
+ * bits and pieces are grabbed into the snapshot memory
+ */
 
 void *a3xx_snapshot(struct adreno_device *adreno_dev, void *snapshot,
 	int *remain, int hang)
 {
 	struct kgsl_device *device = &adreno_dev->dev;
-	struct kgsl_snapshot_registers regs;
+	struct kgsl_snapshot_registers_list list;
+	struct kgsl_snapshot_registers regs[5];
 
-	regs.regs = (unsigned int *) a3xx_registers;
-	regs.count = a3xx_registers_count;
+	list.registers = regs;
+	list.count = 0;
 
-	
+	/* Disable Clock gating temporarily for the debug bus to work */
 	adreno_regwrite(device, A3XX_RBBM_CLOCK_CTL, 0x00);
 
-	
+	/* Store relevant registers in list to snapshot */
+	_snapshot_a3xx_regs(regs, &list);
+	_snapshot_hlsq_regs(regs, &list, adreno_dev);
+	if (adreno_is_a330(adreno_dev))
+		_snapshot_a330_regs(regs, &list);
+
+	/* Master set of (non debug) registers */
 	snapshot = kgsl_snapshot_add_section(device,
 		KGSL_SNAPSHOT_SECTION_REGS, snapshot, remain,
-		kgsl_snapshot_dump_regs, &regs);
+		kgsl_snapshot_dump_regs, &list);
 
-	
+	/* CP_STATE_DEBUG indexed registers */
 	snapshot = kgsl_snapshot_indexed_registers(device, snapshot,
 			remain, REG_CP_STATE_DEBUG_INDEX,
 			REG_CP_STATE_DEBUG_DATA, 0x0, 0x14);
 
-	
+	/* CP_ME indexed registers */
 	snapshot = kgsl_snapshot_indexed_registers(device, snapshot,
 			remain, REG_CP_ME_CNTL, REG_CP_ME_STATUS,
 			64, 44);
 
-	
+	/* VPC memory */
 	snapshot = kgsl_snapshot_add_section(device,
 			KGSL_SNAPSHOT_SECTION_DEBUG, snapshot, remain,
 			a3xx_snapshot_vpc_memory, NULL);
 
-	
+	/* CP MEQ */
 	snapshot = kgsl_snapshot_add_section(device,
 			KGSL_SNAPSHOT_SECTION_DEBUG, snapshot, remain,
 			a3xx_snapshot_cp_meq, NULL);
 
-	
+	/* Shader working/shadow memory */
 	snapshot = kgsl_snapshot_add_section(device,
 			KGSL_SNAPSHOT_SECTION_DEBUG, snapshot, remain,
 			a3xx_snapshot_shader_memory, NULL);
 
 
-	
-	
+	/* CP PFP and PM4 */
+	/* Reading these will hang the GPU if it isn't already hung */
 
 	if (hang) {
 		snapshot = kgsl_snapshot_add_section(device,
@@ -302,14 +399,20 @@
 			a3xx_snapshot_cp_pm4_ram, NULL);
 	}
 
-	
+	/* CP ROQ */
 	snapshot = kgsl_snapshot_add_section(device,
 			KGSL_SNAPSHOT_SECTION_DEBUG, snapshot, remain,
 			a3xx_snapshot_cp_roq, NULL);
 
+	if (adreno_is_a330(adreno_dev)) {
+		snapshot = kgsl_snapshot_add_section(device,
+			KGSL_SNAPSHOT_SECTION_DEBUG, snapshot, remain,
+			a330_snapshot_cp_merciu, NULL);
+	}
+
 	snapshot = a3xx_snapshot_debugbus(device, snapshot, remain);
 
-	
+	/* Enable Clock gating */
 	adreno_regwrite(device, A3XX_RBBM_CLOCK_CTL,
 			A3XX_RBBM_CLOCK_CTL_DEFAULT);
 
diff --git a/drivers/gpu/msm/adreno_a3xx_trace.c b/drivers/gpu/msm/adreno_a3xx_trace.c
index 80756c6..325b068 100644
--- a/drivers/gpu/msm/adreno_a3xx_trace.c
+++ b/drivers/gpu/msm/adreno_a3xx_trace.c
@@ -1,4 +1,4 @@
-/* Copyright (c) 2012, Code Aurora Forum. All rights reserved.
+/* Copyright (c) 2012, The Linux Foundation. All rights reserved.
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License version 2 and
@@ -14,6 +14,7 @@
 #include "kgsl.h"
 #include "adreno.h"
 
+/* Instantiate tracepoints */
 #define CREATE_TRACE_POINTS
 #include "a3xx_reg.h"
 #include "adreno_a3xx_trace.h"
diff --git a/drivers/gpu/msm/adreno_a3xx_trace.h b/drivers/gpu/msm/adreno_a3xx_trace.h
index e4b4d11..d48faf4 100644
--- a/drivers/gpu/msm/adreno_a3xx_trace.h
+++ b/drivers/gpu/msm/adreno_a3xx_trace.h
@@ -1,4 +1,4 @@
-/* Copyright (c) 2012, Code Aurora Forum. All rights reserved.
+/* Copyright (c) 2012, The Linux Foundation. All rights reserved.
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License version 2 and
@@ -25,6 +25,9 @@
 
 struct kgsl_device;
 
+/*
+ * Tracepoint for a3xx irq. Includes status info
+ */
 TRACE_EVENT(kgsl_a3xx_irq_status,
 
 	TP_PROTO(struct kgsl_device *device, unsigned int status),
@@ -80,6 +83,7 @@
 	)
 );
 
-#endif 
+#endif /* _ADRENO_A3XX_TRACE_H */
 
+/* This part must be outside protection */
 #include <trace/define_trace.h>
diff --git a/drivers/gpu/msm/adreno_debugfs.c b/drivers/gpu/msm/adreno_debugfs.c
index 70eb2db..890c8a1 100644
--- a/drivers/gpu/msm/adreno_debugfs.c
+++ b/drivers/gpu/msm/adreno_debugfs.c
@@ -1,4 +1,4 @@
-/* Copyright (c) 2002,2008-2012, Code Aurora Forum. All rights reserved.
+/* Copyright (c) 2002,2008-2012, The Linux Foundation. All rights reserved.
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License version 2 and
@@ -18,67 +18,11 @@
 #include <linux/io.h>
 
 #include "kgsl.h"
-#include "adreno_postmortem.h"
 #include "adreno.h"
 
 #include "a2xx_reg.h"
 
 unsigned int kgsl_cff_dump_enable;
-int adreno_pm_regs_enabled;
-int adreno_pm_ib_enabled;
-
-static struct dentry *pm_d_debugfs;
-
-static int pm_dump_set(void *data, u64 val)
-{
-	struct kgsl_device *device = data;
-
-	if (val) {
-		mutex_lock(&device->mutex);
-		adreno_postmortem_dump(device, 1);
-		mutex_unlock(&device->mutex);
-	}
-
-	return 0;
-}
-
-DEFINE_SIMPLE_ATTRIBUTE(pm_dump_fops,
-			NULL,
-			pm_dump_set, "%llu\n");
-
-static int pm_regs_enabled_set(void *data, u64 val)
-{
-	adreno_pm_regs_enabled = val ? 1 : 0;
-	return 0;
-}
-
-static int pm_regs_enabled_get(void *data, u64 *val)
-{
-	*val = adreno_pm_regs_enabled;
-	return 0;
-}
-
-static int pm_ib_enabled_set(void *data, u64 val)
-{
-	adreno_pm_ib_enabled = val ? 1 : 0;
-	return 0;
-}
-
-static int pm_ib_enabled_get(void *data, u64 *val)
-{
-	*val = adreno_pm_ib_enabled;
-	return 0;
-}
-
-
-DEFINE_SIMPLE_ATTRIBUTE(pm_regs_enabled_fops,
-			pm_regs_enabled_get,
-			pm_regs_enabled_set, "%llu\n");
-
-DEFINE_SIMPLE_ATTRIBUTE(pm_ib_enabled_fops,
-			pm_ib_enabled_get,
-			pm_ib_enabled_set, "%llu\n");
-
 
 static int kgsl_cff_dump_enable_set(void *data, u64 val)
 {
@@ -116,23 +60,43 @@
 		&adreno_dev->wait_timeout);
 	debugfs_create_u32("ib_check", 0644, device->d_debugfs,
 			   &adreno_dev->ib_check_level);
-
-	
+	/* By default, enable fast hang detection */
 	adreno_dev->fast_hang_detect = 1;
 	debugfs_create_u32("fast_hang_detect", 0644, device->d_debugfs,
 			   &adreno_dev->fast_hang_detect);
 
-	
+	/* Top level switch to enable/disable userspace FT control */
+	adreno_dev->ft_user_control = 0;
+	debugfs_create_u32("ft_user_control", 0644, device->d_debugfs,
+			   &adreno_dev->ft_user_control);
+	/*
+	 * FT policy can be set to any of the options below.
+	 * KGSL_FT_DISABLE -> BIT(0) Set to disable FT
+	 * KGSL_FT_REPLAY  -> BIT(1) Set to enable replay
+	 * KGSL_FT_SKIPIB  -> BIT(2) Set to skip IB
+	 * KGSL_FT_SKIPFRAME -> BIT(3) Set to skip frame
+	 * by default set FT policy to KGSL_FT_DEFAULT_POLICY
+	 */
+	adreno_dev->ft_policy = KGSL_FT_DEFAULT_POLICY;
+	debugfs_create_u32("ft_policy", 0644, device->d_debugfs,
+			   &adreno_dev->ft_policy);
 
-	pm_d_debugfs = debugfs_create_dir("postmortem", device->d_debugfs);
+	/* By default enable long IB detection */
+	adreno_dev->long_ib_detect = 1;
+	debugfs_create_u32("long_ib_detect", 0644, device->d_debugfs,
+			   &adreno_dev->long_ib_detect);
 
-	if (IS_ERR(pm_d_debugfs))
-		return;
-
-	debugfs_create_file("dump",  0600, pm_d_debugfs, device,
-			    &pm_dump_fops);
-	debugfs_create_file("regs_enabled", 0644, pm_d_debugfs, device,
-			    &pm_regs_enabled_fops);
-	debugfs_create_file("ib_enabled", 0644, pm_d_debugfs, device,
-				    &pm_ib_enabled_fops);
+	/*
+	 * FT pagefault policy can be set to any of the options below.
+	 * KGSL_FT_PAGEFAULT_INT_ENABLE -> BIT(0) set to enable pagefault INT
+	 * KGSL_FT_PAGEFAULT_GPUHALT_ENABLE  -> BIT(1) Set to enable GPU HALT on
+	 * pagefaults. This stalls the GPU on a pagefault on IOMMU v1 HW.
+	 * KGSL_FT_PAGEFAULT_LOG_ONE_PER_PAGE  -> BIT(2) Set to log only one
+	 * pagefault per page.
+	 * KGSL_FT_PAGEFAULT_LOG_ONE_PER_INT -> BIT(3) Set to log only one
+	 * pagefault per INT.
+	 */
+	adreno_dev->ft_pf_policy = KGSL_FT_PAGEFAULT_DEFAULT_POLICY;
+	debugfs_create_u32("ft_pagefault_policy", 0644, device->d_debugfs,
+			   &adreno_dev->ft_pf_policy);
 }
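The ft_policy and ft_pagefault_policy nodes take bitmasks built from the flags listed in the comments above. A small hedged example of composing a policy value; KGSL_FT_REPLAY and KGSL_FT_SKIPIB are assumed to be the BIT(1)/BIT(2) values the comment describes:

/*
 * Illustrative only: ask fault tolerance to first replay the command
 * stream and, failing that, to skip the offending IB.
 */
unsigned int ft_policy = KGSL_FT_REPLAY | KGSL_FT_SKIPIB;	/* 0x2 | 0x4 = 0x6 */

Writing the same numeric value (0x6 here) to the ft_policy debugfs node would select that combination from user space.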
diff --git a/drivers/gpu/msm/adreno_drawctxt.c b/drivers/gpu/msm/adreno_drawctxt.c
index 4db7258..6fbcdee 100644
--- a/drivers/gpu/msm/adreno_drawctxt.c
+++ b/drivers/gpu/msm/adreno_drawctxt.c
@@ -1,4 +1,4 @@
-/* Copyright (c) 2002,2007-2012, Code Aurora Forum. All rights reserved.
+/* Copyright (c) 2002,2007-2012, The Linux Foundation. All rights reserved.
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License version 2 and
@@ -19,6 +19,7 @@
 
 #define KGSL_INIT_REFTIMESTAMP		0x7FFFFFFF
 
+/* quad for copying GMEM to context shadow */
 #define QUAD_LEN 12
 #define QUAD_RESTORE_LEN 14
 
@@ -46,7 +47,15 @@
 	0x3f800000, 0x00000000
 };
 
+/*
+ * Helper functions
+ * These are global helper functions used by the GPUs during context switch
+ */
 
+/**
+ * uint2float - convert a uint to IEEE754 single precision float
+ * @ uintval - value to convert
+ */
 
 unsigned int uint2float(unsigned int uintval)
 {
@@ -57,11 +66,11 @@
 
 	exp = ilog2(uintval);
 
-	
+	/* Calculate fraction */
 	if (23 > exp)
 		frac = (uintval & (~(1 << exp))) << (23 - exp);
 
-	
+	/* Exp is biased by 127 and shifted 23 bits */
 	exp = (exp + 127) << 23;
 
 	return exp | frac;
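A quick worked example of the conversion above (illustrative arithmetic only): for uint2float(5), exp = ilog2(5) = 2, frac = (5 & ~(1 << 2)) << (23 - 2) = 0x00200000, and (2 + 127) << 23 = 0x40800000, so the function returns 0x40A00000, which is the IEEE754 encoding of 5.0f.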
@@ -69,7 +78,7 @@
 
 static void set_gmem_copy_quad(struct gmem_shadow_t *shadow)
 {
-	
+	/* set vertex buffer values */
 	gmem_copy_quad[1] = uint2float(shadow->height);
 	gmem_copy_quad[3] = uint2float(shadow->width);
 	gmem_copy_quad[4] = uint2float(shadow->height);
@@ -86,26 +95,33 @@
 		TEXCOORD_LEN << 2);
 }
 
+/**
+ * build_quad_vtxbuff - Create a quad for saving/restoring GMEM
+ * @ drawctxt - Pointer to the draw context being created
+ * @ shadow - Pointer to the GMEM shadow structure
+ * @ incmd - Pointer to pointer to the temporary command buffer
+ */
 
+/* quad for saving/restoring gmem */
 void build_quad_vtxbuff(struct adreno_context *drawctxt,
 		struct gmem_shadow_t *shadow, unsigned int **incmd)
 {
 	 unsigned int *cmd = *incmd;
 
-	
+	/* quad vertex buffer location (in GPU space) */
 	shadow->quad_vertices.hostptr = cmd;
 	shadow->quad_vertices.gpuaddr = virt2gpu(cmd, &drawctxt->gpustate);
 
 	cmd += QUAD_LEN;
 
-	
+	/* Used by A3XX, but define for both to make the code easier */
 	shadow->quad_vertices_restore.hostptr = cmd;
 	shadow->quad_vertices_restore.gpuaddr =
 		virt2gpu(cmd, &drawctxt->gpustate);
 
 	cmd += QUAD_RESTORE_LEN;
 
-	
+	/* tex coord buffer location (in GPU space) */
 	shadow->quad_texcoords.hostptr = cmd;
 	shadow->quad_texcoords.gpuaddr = virt2gpu(cmd, &drawctxt->gpustate);
 
@@ -115,12 +131,23 @@
 	*incmd = cmd;
 }
 
+/**
+ * adreno_drawctxt_create - create a new adreno draw context
+ * @device - KGSL device to create the context on
+ * @pagetable - Pagetable for the context
+ * @context- Generic KGSL context structure
+ * @flags - flags for the context (passed from user space)
+ *
+ * Create a new draw context for the 3D core.  Return 0 on success,
+ * or error code on failure.
+ */
 int adreno_drawctxt_create(struct kgsl_device *device,
 			struct kgsl_pagetable *pagetable,
 			struct kgsl_context *context, uint32_t flags)
 {
 	struct adreno_context *drawctxt;
 	struct adreno_device *adreno_dev = ADRENO_DEVICE(device);
+	struct adreno_ringbuffer *rb = &adreno_dev->ringbuffer;
 	int ret;
 
 	drawctxt = kzalloc(sizeof(struct adreno_context), GFP_KERNEL);
@@ -128,9 +155,12 @@
 	if (drawctxt == NULL)
 		return -ENOMEM;
 
+	drawctxt->pid = task_pid_nr(current);
+	strlcpy(drawctxt->pid_name, current->comm, TASK_COMM_LEN);
 	drawctxt->pagetable = pagetable;
 	drawctxt->bin_base_offset = 0;
 	drawctxt->id = context->id;
+	rb->timestamp[context->id] = 0;
 
 	if (flags & KGSL_CONTEXT_PREAMBLE)
 		drawctxt->flags |= CTXT_FLAGS_PREAMBLE;
@@ -141,6 +171,17 @@
 	if (flags & KGSL_CONTEXT_PER_CONTEXT_TS)
 		drawctxt->flags |= CTXT_FLAGS_PER_CONTEXT_TS;
 
+	if (flags & KGSL_CONTEXT_USER_GENERATED_TS) {
+		if (!(flags & KGSL_CONTEXT_PER_CONTEXT_TS)) {
+			ret = -EINVAL;
+			goto err;
+		}
+		drawctxt->flags |= CTXT_FLAGS_USER_GENERATED_TS;
+	}
+
+	if (flags & KGSL_CONTEXT_NO_FAULT_TOLERANCE)
+		drawctxt->flags |= CTXT_FLAGS_NO_FAULT_TOLERANCE;
+
 	ret = adreno_dev->gpudev->ctxt_create(adreno_dev, drawctxt);
 	if (ret)
 		goto err;
@@ -148,6 +189,12 @@
 	kgsl_sharedmem_writel(&device->memstore,
 			KGSL_MEMSTORE_OFFSET(drawctxt->id, ref_wait_ts),
 			KGSL_INIT_REFTIMESTAMP);
+	kgsl_sharedmem_writel(&device->memstore,
+			KGSL_MEMSTORE_OFFSET(drawctxt->id, ts_cmp_enable), 0);
+	kgsl_sharedmem_writel(&device->memstore,
+			KGSL_MEMSTORE_OFFSET(drawctxt->id, soptimestamp), 0);
+	kgsl_sharedmem_writel(&device->memstore,
+			KGSL_MEMSTORE_OFFSET(drawctxt->id, eoptimestamp), 0);
 
 	context->devctxt = drawctxt;
 	return 0;
@@ -156,7 +203,16 @@
 	return ret;
 }
 
+/**
+ * adreno_drawctxt_destroy - destroy a draw context
+ * @device - KGSL device that owns the context
+ * @context- Generic KGSL context container for the context
+ *
+ * Destroy an existing draw context.
+ */
 
+/* destroy a drawing context */
 
 void adreno_drawctxt_destroy(struct kgsl_device *device,
 			  struct kgsl_context *context)
@@ -168,15 +224,18 @@
 		return;
 
 	drawctxt = context->devctxt;
-	
+	/* deactivate context */
 	if (adreno_dev->drawctxt_active == drawctxt) {
+		/* no need to save GMEM or shader, the context is
+		 * being destroyed.
+		 */
 		drawctxt->flags &= ~(CTXT_FLAGS_GMEM_SAVE |
 				     CTXT_FLAGS_SHADER_SAVE |
 				     CTXT_FLAGS_GMEM_SHADOW |
 				     CTXT_FLAGS_STATE_SHADOW);
-#ifdef CONFIG_MSM_KGSL_GPU_USAGE
-		device->current_process_priv = NULL;
-#endif
+
+		drawctxt->flags |= CTXT_FLAGS_BEING_DESTROYED;
+
 		adreno_drawctxt_switch(adreno_dev, NULL, 0);
 	}
 
@@ -194,6 +253,14 @@
 	context->devctxt = NULL;
 }
 
+/**
+ * adreno_drawctxt_set_bin_base_offset - set bin base offset for the context
+ * @device - KGSL device that owns the context
+ * @context- Generic KGSL context container for the context
+ * @offset - Offset to set
+ *
+ * Set the bin base offset for A2XX devices.  Not valid for A3XX devices.
+ */
 
 void adreno_drawctxt_set_bin_base_offset(struct kgsl_device *device,
 				      struct kgsl_context *context,
@@ -205,6 +272,14 @@
 		drawctxt->bin_base_offset = offset;
 }
 
+/**
+ * adreno_drawctxt_switch - switch the current draw context
+ * @adreno_dev - The 3D device that owns the context
+ * @drawctxt - the 3D context to switch to
+ * @flags - Flags to accompany the switch (from user space)
+ *
+ * Switch the current draw context
+ */
 
 void adreno_drawctxt_switch(struct adreno_device *adreno_dev,
 				struct adreno_context *drawctxt,
@@ -214,13 +289,15 @@
 
 	if (drawctxt) {
 		if (flags & KGSL_CONTEXT_SAVE_GMEM)
+			/* Set the flag in context so that the save is done
+			 * when this context is switched out. */
 			drawctxt->flags |= CTXT_FLAGS_GMEM_SAVE;
 		else
-			
+			/* Remove GMEM saving flag from the context */
 			drawctxt->flags &= ~CTXT_FLAGS_GMEM_SAVE;
 	}
 
-	
+	/* already current? */
 	if (adreno_dev->drawctxt_active == drawctxt) {
 		if (adreno_dev->gpudev->ctxt_draw_workaround &&
 			adreno_is_a225(adreno_dev))
@@ -232,10 +309,10 @@
 	KGSL_CTXT_INFO(device, "from %p to %p flags %d\n",
 			adreno_dev->drawctxt_active, drawctxt, flags);
 
-	
+	/* Save the old context */
 	adreno_dev->gpudev->ctxt_save(adreno_dev, adreno_dev->drawctxt_active);
 
-	
+	/* Set the new context */
 	adreno_dev->gpudev->ctxt_restore(adreno_dev, drawctxt);
 	adreno_dev->drawctxt_active = drawctxt;
 }
diff --git a/drivers/gpu/msm/adreno_drawctxt.h b/drivers/gpu/msm/adreno_drawctxt.h
index f66dfbb..fd60688 100644
--- a/drivers/gpu/msm/adreno_drawctxt.h
+++ b/drivers/gpu/msm/adreno_drawctxt.h
@@ -1,4 +1,4 @@
-/* Copyright (c) 2002,2007-2012, Code Aurora Forum. All rights reserved.
+/* Copyright (c) 2002,2007-2012, The Linux Foundation. All rights reserved.
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License version 2 and
@@ -13,44 +13,73 @@
 #ifndef __ADRENO_DRAWCTXT_H
 #define __ADRENO_DRAWCTXT_H
 
+#include <linux/sched.h>
+
 #include "adreno_pm4types.h"
 #include "a2xx_reg.h"
 
+/* Flags */
 
 #define CTXT_FLAGS_NOT_IN_USE		0x00000000
-#define CTXT_FLAGS_IN_USE		0x00000001
+#define CTXT_FLAGS_IN_USE		BIT(0)
 
-#define CTXT_FLAGS_STATE_SHADOW		0x00000010
+/* state shadow memory allocated */
+#define CTXT_FLAGS_STATE_SHADOW		BIT(1)
 
-#define CTXT_FLAGS_GMEM_SHADOW		0x00000100
-#define CTXT_FLAGS_GMEM_SAVE		0x00000200
-#define CTXT_FLAGS_GMEM_RESTORE		0x00000400
-#define CTXT_FLAGS_PREAMBLE		0x00000800
-#define CTXT_FLAGS_SHADER_SAVE		0x00002000
-#define CTXT_FLAGS_SHADER_RESTORE	0x00004000
-#define CTXT_FLAGS_GPU_HANG		0x00008000
-#define CTXT_FLAGS_NOGMEMALLOC          0x00010000
-#define CTXT_FLAGS_TRASHSTATE		0x00020000
-#define CTXT_FLAGS_PER_CONTEXT_TS	0x00040000
-#define CTXT_FLAGS_GPU_HANG_RECOVERED	0x00008000
+/* gmem shadow memory allocated */
+#define CTXT_FLAGS_GMEM_SHADOW		BIT(2)
+/* gmem must be copied to shadow */
+#define CTXT_FLAGS_GMEM_SAVE		BIT(3)
+/* gmem can be restored from shadow */
+#define CTXT_FLAGS_GMEM_RESTORE		BIT(4)
+/* preamble packed in cmdbuffer for context switching */
+#define CTXT_FLAGS_PREAMBLE		BIT(5)
+/* shader must be copied to shadow */
+#define CTXT_FLAGS_SHADER_SAVE		BIT(6)
+/* shader can be restored from shadow */
+#define CTXT_FLAGS_SHADER_RESTORE	BIT(7)
+/* Context has caused a GPU hang */
+#define CTXT_FLAGS_GPU_HANG		BIT(8)
+/* Specifies there is no need to save GMEM */
+#define CTXT_FLAGS_NOGMEMALLOC          BIT(9)
+/* Trash state for context */
+#define CTXT_FLAGS_TRASHSTATE		BIT(10)
+/* per context timestamps enabled */
+#define CTXT_FLAGS_PER_CONTEXT_TS	BIT(11)
+/* Context has caused a GPU hang and fault tolerance successful */
+#define CTXT_FLAGS_GPU_HANG_FT	BIT(12)
+/* Context is being destroyed so dont save it */
+#define CTXT_FLAGS_BEING_DESTROYED	BIT(13)
+/* User mode generated timestamps enabled */
+#define CTXT_FLAGS_USER_GENERATED_TS    BIT(14)
+/* Context skip till EOF */
+#define CTXT_FLAGS_SKIP_EOF             BIT(15)
+/* Context no fault tolerance */
+#define CTXT_FLAGS_NO_FAULT_TOLERANCE  BIT(16)
 
 struct kgsl_device;
 struct adreno_device;
 struct kgsl_device_private;
 struct kgsl_context;
 
+/* draw context */
 struct gmem_shadow_t {
-	struct kgsl_memdesc gmemshadow;	
+	struct kgsl_memdesc gmemshadow;	/* Shadow buffer address */
 
+	/*
+	 * 256 KB GMEM surface = 4 bytes-per-pixel x 256 pixels/row x
+	 * 256 rows. Width & height must be multiples of 32 in case tiled
+	 * textures are used.
+	 */
 
-	enum COLORFORMATX format; 
-	unsigned int size;	
-	unsigned int width;	
-	unsigned int height;	
-	unsigned int pitch;	
-	unsigned int gmem_pitch;	
-	unsigned int *gmem_save_commands;    
-	unsigned int *gmem_restore_commands; 
+	enum COLORFORMATX format; /* Unused on A3XX */
+	unsigned int size;	/* Size of surface used to store GMEM */
+	unsigned int width;	/* Width of surface used to store GMEM */
+	unsigned int height;	/* Height of surface used to store GMEM */
+	unsigned int pitch;	/* Pitch of surface used to store GMEM */
+	unsigned int gmem_pitch;	/* Pitch value used for GMEM */
+	unsigned int *gmem_save_commands;    /* Unused on A3XX */
+	unsigned int *gmem_restore_commands; /* Unused on A3XX */
 	unsigned int gmem_save[3];
 	unsigned int gmem_restore[3];
 	struct kgsl_memdesc quad_vertices;
@@ -59,7 +88,10 @@
 };
 
 struct adreno_context {
+	pid_t pid;
+	char pid_name[TASK_COMM_LEN];
 	unsigned int id;
+	unsigned int ib_gpu_time_used;
 	uint32_t flags;
 	struct kgsl_pagetable *pagetable;
 	struct kgsl_memdesc gpustate;
@@ -67,16 +99,16 @@
 	unsigned int shader_save[3];
 	unsigned int shader_restore[3];
 
-	
+	/* Information of the GMEM shadow that is created in context create */
 	struct gmem_shadow_t context_gmem_shadow;
 
-	
+	/* A2XX specific items */
 	unsigned int reg_save[3];
 	unsigned int shader_fixup[3];
 	unsigned int chicken_restore[3];
 	unsigned int bin_base_offset;
 
-	
+	/* A3XX specific items */
 	unsigned int regconstant_save[3];
 	unsigned int constant_restore[3];
 	unsigned int hlsqcontrol_restore[3];
@@ -105,6 +137,7 @@
 					struct kgsl_context *context,
 					unsigned int offset);
 
+/* GPU context switch helper functions */
 
 void build_quad_vtxbuff(struct adreno_context *drawctxt,
 		struct gmem_shadow_t *shadow, unsigned int **incmd);
@@ -131,8 +164,8 @@
 static inline unsigned int *reg_range(unsigned int *cmd, unsigned int start,
 	unsigned int end)
 {
-	*cmd++ = CP_REG(start);		
-	*cmd++ = end - start + 1;	
+	*cmd++ = CP_REG(start);		/* h/w regs, start addr */
+	*cmd++ = end - start + 1;	/* count */
 	return cmd;
 }
 
@@ -142,7 +175,7 @@
 
 	shadow->format = COLORX_8_8_8_8;
 
-	
+	/* convert from bytes to 32-bit words */
 	gmem_size = (gmem_size + 3) / 4;
 
 	while ((w * h) < gmem_size) {
@@ -158,4 +191,4 @@
 	shadow->size = shadow->pitch * shadow->height * 4;
 }
 
-#endif  
+#endif  /* __ADRENO_DRAWCTXT_H */
diff --git a/drivers/gpu/msm/adreno_pm4types.h b/drivers/gpu/msm/adreno_pm4types.h
index c6ebed4..a3fa312 100644
--- a/drivers/gpu/msm/adreno_pm4types.h
+++ b/drivers/gpu/msm/adreno_pm4types.h
@@ -1,4 +1,4 @@
-/* Copyright (c) 2002,2007-2012, Code Aurora Forum. All rights reserved.
+/* Copyright (c) 2002,2007-2012, The Linux Foundation. All rights reserved.
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License version 2 and
@@ -22,103 +22,159 @@
 #define CP_TYPE3_PKT	((unsigned int)3 << 30)
 
 
+/* type3 packets */
+/* initialize CP's micro-engine */
 #define CP_ME_INIT		0x48
 
+/* skip N 32-bit words to get to the next packet */
 #define CP_NOP			0x10
 
+/* indirect buffer dispatch.  same as IB, but init is pipelined */
 #define CP_INDIRECT_BUFFER_PFD	0x37
 
+/* wait for the IDLE state of the engine */
 #define CP_WAIT_FOR_IDLE	0x26
 
+/* wait until a register or memory location is a specific value */
 #define CP_WAIT_REG_MEM	0x3c
 
+/* wait until a register location is equal to a specific value */
 #define CP_WAIT_REG_EQ		0x52
 
+/* wait until a register location is >= a specific value */
 #define CP_WAT_REG_GTE		0x53
 
+/* wait until a read completes */
 #define CP_WAIT_UNTIL_READ	0x5c
 
+/* wait until all base/size writes from an IB_PFD packet have completed */
 #define CP_WAIT_IB_PFD_COMPLETE 0x5d
 
+/* register read/modify/write */
 #define CP_REG_RMW		0x21
 
+/* Set binning configuration registers */
 #define CP_SET_BIN_DATA             0x2f
 
+/* reads register in chip and writes to memory */
 #define CP_REG_TO_MEM		0x3e
 
+/* write N 32-bit words to memory */
 #define CP_MEM_WRITE		0x3d
 
+/* write CP_PROG_COUNTER value to memory */
 #define CP_MEM_WRITE_CNTR	0x4f
 
+/* conditional execution of a sequence of packets */
 #define CP_COND_EXEC		0x44
 
+/* conditional write to memory or register */
 #define CP_COND_WRITE		0x45
 
+/* generate an event that creates a write to memory when completed */
 #define CP_EVENT_WRITE		0x46
 
+/* generate a VS|PS_done event */
 #define CP_EVENT_WRITE_SHD	0x58
 
+/* generate a cache flush done event */
 #define CP_EVENT_WRITE_CFL	0x59
 
+/* generate a z_pass done event */
 #define CP_EVENT_WRITE_ZPD	0x5b
 
 
+/* initiate fetch of index buffer and draw */
 #define CP_DRAW_INDX		0x22
 
+/* draw using supplied indices in packet */
 #define CP_DRAW_INDX_2		0x36
 
+/* initiate fetch of index buffer and binIDs and draw */
 #define CP_DRAW_INDX_BIN	0x34
 
+/* initiate fetch of bin IDs and draw using supplied indices */
 #define CP_DRAW_INDX_2_BIN	0x35
 
 
+/* begin/end initiator for viz query extent processing */
 #define CP_VIZ_QUERY		0x23
 
+/* fetch state sub-blocks and initiate shader code DMAs */
 #define CP_SET_STATE		0x25
 
+/* load constant into chip and to memory */
 #define CP_SET_CONSTANT	0x2d
 
+/* load sequencer instruction memory (pointer-based) */
 #define CP_IM_LOAD		0x27
 
+/* load sequencer instruction memory (code embedded in packet) */
 #define CP_IM_LOAD_IMMEDIATE	0x2b
 
+/* load constants from a location in memory */
 #define CP_LOAD_CONSTANT_CONTEXT 0x2e
 
+/* (A2x) sets binning configuration registers */
 #define CP_SET_BIN_DATA             0x2f
 
+/* selective invalidation of state pointers */
 #define CP_INVALIDATE_STATE	0x3b
 
 
+/* dynamically changes shader instruction memory partition */
 #define CP_SET_SHADER_BASES	0x4A
 
+/* sets the 64-bit BIN_MASK register in the PFP */
 #define CP_SET_BIN_MASK	0x50
 
+/* sets the 64-bit BIN_SELECT register in the PFP */
 #define CP_SET_BIN_SELECT	0x51
 
 
+/* updates the current context, if needed */
 #define CP_CONTEXT_UPDATE	0x5e
 
+/* generate interrupt from the command stream */
 #define CP_INTERRUPT		0x40
 
 
+/* copy sequencer instruction memory to system memory */
 #define CP_IM_STORE            0x2c
 
+/* test 2 memory locations to dword values specified */
 #define CP_TEST_TWO_MEMS    0x71
 
+/* PFP waits until the FIFO between the PFP and the ME is empty */
 #define CP_WAIT_FOR_ME      0x13
 
+/*
+ * for a20x
+ * program an offset that will be added to the BIN_BASE value of
+ * the 3D_DRAW_INDX_BIN packet
+ */
 #define CP_SET_BIN_BASE_OFFSET     0x4B
 
+/*
+ * for a22x
+ * sets draw initiator flags register in PFP, gets bitwise-ORed into
+ * every draw initiator
+ */
 #define CP_SET_DRAW_INIT_FLAGS      0x4B
 
-#define CP_SET_PROTECTED_MODE  0x5f 
+#define CP_SET_PROTECTED_MODE  0x5f /* sets the register protection mode */
 
+/*
+ * for a3xx
+ */
 
-#define CP_LOAD_STATE 0x30 
+#define CP_LOAD_STATE 0x30 /* load high level sequencer command */
 
-#define CP_COND_INDIRECT_BUFFER_PFE 0x3A 
-#define CP_COND_INDIRECT_BUFFER_PFD 0x32 
+/* Conditionally load a IB based on a flag */
+#define CP_COND_INDIRECT_BUFFER_PFE 0x3A /* prefetch enabled */
+#define CP_COND_INDIRECT_BUFFER_PFD 0x32 /* prefetch disabled */
 
+/* Load a buffer with pre-fetch enabled */
 #define CP_INDIRECT_BUFFER_PFE 0x3F
 
 #define CP_LOADSTATE_DSTOFFSET_SHIFT 0x00000000
@@ -128,6 +184,7 @@
 #define CP_LOADSTATE_STATETYPE_SHIFT 0x00000000
 #define CP_LOADSTATE_EXTSRCADDR_SHIFT 0x00000002
 
+/* packet header building macros */
 #define cp_type0_packet(regindx, cnt) \
 	(CP_TYPE0_PKT | (((cnt)-1) << 16) | ((regindx) & 0x7FFF))
 
@@ -152,6 +209,10 @@
 #define type0_pkt_size(pkt) ((((pkt) >> 16) & 0x3FFF) + 1)
 #define type0_pkt_offset(pkt) ((pkt) & 0x7FFF)
 
+/*
+ * Check both for the type3 opcode and make sure that the reserved bits [1:7]
+ * and 15 are 0
+ */
 
 #define pkt_is_type3(pkt) \
 	((((pkt) & 0xC0000000) == CP_TYPE3_PKT) && \
@@ -160,15 +221,19 @@
 #define cp_type3_opcode(pkt) (((pkt) >> 8) & 0xFF)
 #define type3_pkt_size(pkt) ((((pkt) >> 16) & 0x3FFF) + 1)
 
+/* packet headers */
 #define CP_HDR_ME_INIT	cp_type3_packet(CP_ME_INIT, 18)
 #define CP_HDR_INDIRECT_BUFFER_PFD cp_type3_packet(CP_INDIRECT_BUFFER_PFD, 2)
 #define CP_HDR_INDIRECT_BUFFER_PFE cp_type3_packet(CP_INDIRECT_BUFFER_PFE, 2)
 
+/* dword base address of the GFX decode space */
 #define SUBBLOCK_OFFSET(reg) ((unsigned int)((reg) - (0x2000)))
 
+/* gmem command buffer length */
 #define CP_REG(reg) ((0x4 << 16) | (SUBBLOCK_OFFSET(reg)))
 
 
+/* Return 1 if the command is an indirect buffer of any kind */
 static inline int adreno_cmd_is_ib(unsigned int cmd)
 {
 	return (cmd == cp_type3_packet(CP_INDIRECT_BUFFER_PFE, 2) ||
@@ -177,4 +242,4 @@
 		cmd == cp_type3_packet(CP_COND_INDIRECT_BUFFER_PFD, 2));
 }
 
-#endif	
+#endif	/* __ADRENO_PM4TYPES_H */
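As a worked example of the packet header macros above (assuming cp_type3_packet() packs (count - 1) into bits [29:16] and the opcode into bits [15:8], which is the layout the type3_pkt_size() and cp_type3_opcode() accessors decode), CP_HDR_ME_INIT = cp_type3_packet(CP_ME_INIT, 18) comes out to 0xC0000000 | (17 << 16) | (0x48 << 8) = 0xC0114800; running that value back through type3_pkt_size() and cp_type3_opcode() yields 18 and 0x48 respectively.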
diff --git a/drivers/gpu/msm/adreno_postmortem.c b/drivers/gpu/msm/adreno_postmortem.c
index 45286dd..cf1cf90 100644
--- a/drivers/gpu/msm/adreno_postmortem.c
+++ b/drivers/gpu/msm/adreno_postmortem.c
@@ -1,4 +1,4 @@
-/* Copyright (c) 2010-2012, Code Aurora Forum. All rights reserved.
+/* Copyright (c) 2010-2012, The Linux Foundation. All rights reserved.
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License version 2 and
@@ -19,8 +19,6 @@
 #include "adreno.h"
 #include "adreno_pm4types.h"
 #include "adreno_ringbuffer.h"
-#include "adreno_postmortem.h"
-#include "adreno_debugfs.h"
 #include "kgsl_cffdump.h"
 #include "kgsl_pwrctrl.h"
 
@@ -136,7 +134,7 @@
 	int range = 0, offset = 0;
 
 	for (range = 0; range < size; range++) {
-		
+		/* start and end are in dword offsets */
 		int start = registers[range * 2];
 		int end = registers[range * 2 + 1];
 
@@ -194,7 +192,7 @@
 	dump_ib(device, "IB1:", pt_base, base_offset, ib1_base,
 		ib1_size, dump);
 
-	
+	/* fetch virtual address for given IB base */
 	ib1_addr = (uint32_t *)adreno_convertaddr(device, pt_base,
 		ib1_base, ib1_size*sizeof(uint32_t));
 	if (!ib1_addr)
@@ -206,7 +204,7 @@
 			uint32_t ib2_base = ib1_addr[i++];
 			uint32_t ib2_size = ib1_addr[i++];
 
-			
+			/* find previous match */
 			for (j = 0; j < ib_list->count; ++j)
 				if (ib_list->sizes[j] == ib2_size
 					&& ib_list->bases[j] == ib2_base)
@@ -216,7 +214,7 @@
 				>= IB_LIST_SIZE)
 				continue;
 
-			
+			/* store match */
 			ib_list->sizes[ib_list->count] = ib2_size;
 			ib_list->bases[ib_list->count] = ib2_base;
 			ib_list->offsets[ib_list->count] = i<<2;
@@ -270,7 +268,7 @@
 #endif
 }
 
-static void adreno_dump_rb(struct kgsl_device *device, const void *buf,
+void adreno_dump_rb(struct kgsl_device *device, const void *buf,
 			 size_t len, int start, int size)
 {
 	const uint32_t *ptr = buf;
@@ -678,7 +676,7 @@
 		"MH_INTERRUPT: MASK = %08X | STATUS   = %08X\n", r1, r2);
 }
 
-static int adreno_dump(struct kgsl_device *device)
+int adreno_dump(struct kgsl_device *device, int manual)
 {
 	unsigned int cp_ib1_base, cp_ib1_bufsz;
 	unsigned int cp_ib2_base, cp_ib2_bufsz;
@@ -694,21 +692,24 @@
 	unsigned int ts_processed = 0xdeaddead;
 	struct kgsl_context *context;
 	unsigned int context_id;
+	unsigned int rbbm_status;
 
 	static struct ib_list ib_list;
 
 	struct adreno_device *adreno_dev = ADRENO_DEVICE(device);
 
-	struct kgsl_memdesc **reg_map;
-	void *reg_map_array;
 	int num_iommu_units = 0;
 
 	mb();
 
-	if (adreno_is_a2xx(adreno_dev))
-		adreno_dump_a2xx(device);
-	else if (adreno_is_a3xx(adreno_dev))
-		adreno_dump_a3xx(device);
+	if (device->pm_dump_enable) {
+		if (adreno_is_a2xx(adreno_dev))
+			adreno_dump_a2xx(device);
+		else if (adreno_is_a3xx(adreno_dev))
+			adreno_dump_a3xx(device);
+	}
+
+	kgsl_regread(device, adreno_dev->gpudev->reg_rbbm_status, &rbbm_status);
 
 	pt_base = kgsl_mmu_get_current_ptbase(&device->mmu);
 	cur_pt_base = pt_base;
@@ -723,6 +724,18 @@
 	kgsl_regread(device, REG_CP_IB2_BASE, &cp_ib2_base);
 	kgsl_regread(device, REG_CP_IB2_BUFSZ, &cp_ib2_bufsz);
 
+	/* If postmortem dump is not enabled, dump minimal set and return */
+	if (!device->pm_dump_enable) {
+
+		KGSL_LOG_DUMP(device,
+			"RBBM STATUS %08X | IB1:%08X/%08X | IB2: %08X/%08X"
+			" | RPTR: %04X | WPTR: %04X\n",
+			rbbm_status,  cp_ib1_base, cp_ib1_bufsz, cp_ib2_base,
+			cp_ib2_bufsz, cp_rb_rptr, cp_rb_wptr);
+
+		return 0;
+	}
+
 	kgsl_sharedmem_readl(&device->memstore,
 			(unsigned int *) &context_id,
 			KGSL_MEMSTORE_OFFSET(KGSL_MEMSTORE_GLOBAL,
@@ -731,7 +744,7 @@
 	if (context) {
 		ts_processed = kgsl_readtimestamp(device, context,
 						  KGSL_TIMESTAMP_RETIRED);
-		KGSL_LOG_DUMP(device, "CTXT: %d  TIMESTM RTRD: %08X\n",
+		KGSL_LOG_DUMP(device, "FT CTXT: %d  TIMESTM RTRD: %08X\n",
 				context->id, ts_processed);
 	} else
 		KGSL_LOG_DUMP(device, "BAD CTXT: %d\n", context_id);
@@ -781,13 +794,11 @@
 		memcpy(rb_copy+part1_c, rb_vaddr, (num_item-part1_c)<<2);
 	}
 
-	
+	/* extract the latest ib commands from the buffer */
 	ib_list.count = 0;
 	i = 0;
-	
-	num_iommu_units = kgsl_mmu_get_reg_map_desc(&device->mmu,
-							&reg_map_array);
-	reg_map = reg_map_array;
+	/* get the number of IOMMU units, in case we are using the IOMMU */
+	num_iommu_units = kgsl_mmu_get_num_iommu_units(&device->mmu);
 	for (read_idx = 0; read_idx < num_item; ) {
 		uint32_t this_cmd = rb_copy[read_idx++];
 		if (adreno_cmd_is_ib(this_cmd)) {
@@ -801,27 +812,29 @@
 					ib_list.bases[i],
 					ib_list.sizes[i], 0);
 		} else if (this_cmd == cp_type0_packet(MH_MMU_PT_BASE, 1) ||
-			(num_iommu_units && this_cmd == (reg_map[0]->gpuaddr +
-			(KGSL_IOMMU_CONTEXT_USER << KGSL_IOMMU_CTX_SHIFT) +
-			KGSL_IOMMU_TTBR0))) {
-
+			(num_iommu_units && this_cmd ==
+			kgsl_mmu_get_reg_gpuaddr(&device->mmu, 0,
+						KGSL_IOMMU_CONTEXT_USER,
+						KGSL_IOMMU_CTX_TTBR0))) {
 			KGSL_LOG_DUMP(device, "Current pagetable: %x\t"
 				"pagetable base: %x\n",
-				kgsl_mmu_get_ptname_from_ptbase(cur_pt_base),
+				kgsl_mmu_get_ptname_from_ptbase(&device->mmu,
+								cur_pt_base),
 				cur_pt_base);
 
-			
+			/* Set cur_pt_base to the new pagetable base */
 			cur_pt_base = rb_copy[read_idx++];
 
 			KGSL_LOG_DUMP(device, "New pagetable: %x\t"
 				"pagetable base: %x\n",
-				kgsl_mmu_get_ptname_from_ptbase(cur_pt_base),
+				kgsl_mmu_get_ptname_from_ptbase(&device->mmu,
+								cur_pt_base),
 				cur_pt_base);
 		}
 	}
-	if (num_iommu_units)
-		kfree(reg_map_array);
 
+	/* Restore cur_pt_base back to the pt_base of
+	   the process in whose context the GPU hung */
 	cur_pt_base = pt_base;
 
 	read_idx = (int)cp_rb_rptr - NUM_DWORDS_OF_RINGBUFFER_HISTORY;
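The walk above treats a type-0 write to the pagetable register (or, on IOMMU targets, a write to the TTBR0 register address) as a pagetable switch and takes the next dword as the new base. A stripped-down sketch of the GPUMMU case; the PKT0 macro is re-derived here for illustration and is an assumption, not the driver's header.

/* Assumed PM4 type-0 header layout: bits 31:30 = 0, bits 29:16 = count - 1,
 * bits 14:0 = starting register offset.
 */
#define PKT0(reg, cnt)	((((cnt) - 1) << 16) | ((reg) & 0x7FFF))

static unsigned int scan_for_pt_switch(const unsigned int *rb_copy,
				       int num_dwords, unsigned int pt_reg,
				       unsigned int cur_pt_base)
{
	int i;

	for (i = 0; i < num_dwords - 1; i++) {
		/* a one-register write to pt_reg is followed by the new base */
		if (rb_copy[i] == PKT0(pt_reg, 1))
			cur_pt_base = rb_copy[i + 1];
	}

	return cur_pt_base;
}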
@@ -832,7 +845,7 @@
 		cp_rb_base, cp_rb_rptr, cp_rb_wptr, read_idx);
 	adreno_dump_rb(device, rb_copy, num_item<<2, read_idx, rb_count);
 
-	if (is_adreno_pm_ib_enabled()) {
+	if (device->pm_ib_enabled) {
 		for (read_idx = NUM_DWORDS_OF_RINGBUFFER_HISTORY;
 			read_idx >= 0; --read_idx) {
 			uint32_t this_cmd = rb_copy[read_idx];
@@ -862,8 +875,8 @@
 		}
 	}
 
-	
-	if (is_adreno_pm_regs_enabled()) {
+	/* Dump the registers if the user asked for it */
+	if (device->pm_regs_enabled) {
 		if (adreno_is_a20x(adreno_dev))
 			adreno_dump_regs(device, a200_registers,
 					a200_registers_count);
@@ -873,9 +886,14 @@
 		else if (adreno_is_a225(adreno_dev))
 			adreno_dump_regs(device, a225_registers,
 				a225_registers_count);
-		else if (adreno_is_a3xx(adreno_dev))
+		else if (adreno_is_a3xx(adreno_dev)) {
 			adreno_dump_regs(device, a3xx_registers,
 					a3xx_registers_count);
+
+			if (adreno_is_a330(adreno_dev))
+				adreno_dump_regs(device, a330_registers,
+					a330_registers_count);
+		}
 	}
 
 error_vfree:
@@ -883,72 +901,3 @@
 end:
 	return result;
 }
-
-
-int adreno_postmortem_dump(struct kgsl_device *device, int manual)
-{
-	bool saved_nap;
-	struct kgsl_pwrctrl *pwr = &device->pwrctrl;
-
-	BUG_ON(device == NULL);
-
-	kgsl_cffdump_hang(device->id);
-
-	
-
-	if (manual) {
-		if (device->active_cnt != 0) {
-			mutex_unlock(&device->mutex);
-			wait_for_completion(&device->suspend_gate);
-			mutex_lock(&device->mutex);
-		}
-
-		if (device->state == KGSL_STATE_ACTIVE)
-			kgsl_idle(device);
-
-	}
-	KGSL_LOG_DUMP(device, "POWER: FLAGS = %08lX | ACTIVE POWERLEVEL = %08X",
-			pwr->power_flags, pwr->active_pwrlevel);
-
-	KGSL_LOG_DUMP(device, "POWER: INTERVAL TIMEOUT = %08X ",
-		pwr->interval_timeout);
-
-	KGSL_LOG_DUMP(device, "GRP_CLK = %lu ",
-				  kgsl_get_clkrate(pwr->grp_clks[0]));
-
-	KGSL_LOG_DUMP(device, "BUS CLK = %lu ",
-		kgsl_get_clkrate(pwr->ebi1_clk));
-
-	
-	del_timer_sync(&device->idle_timer);
-	mutex_unlock(&device->mutex);
-	flush_workqueue(device->work_queue);
-	mutex_lock(&device->mutex);
-
-	saved_nap = device->pwrctrl.nap_allowed;
-	device->pwrctrl.nap_allowed = false;
-
-	
-	kgsl_pwrctrl_wake(device);
-
-	
-	kgsl_pwrctrl_irq(device, KGSL_PWRFLAGS_OFF);
-
-	adreno_dump(device);
-
-	
-	device->pwrctrl.nap_allowed = saved_nap;
-
-
-	if (manual) {
-		kgsl_pwrctrl_irq(device, KGSL_PWRFLAGS_ON);
-
-		
-		kgsl_pwrctrl_request_state(device, KGSL_STATE_SLEEP);
-		kgsl_pwrctrl_sleep(device);
-	}
-
-	KGSL_DRV_ERR(device, "Dump Finished\n");
-
-	return 0;
-}
diff --git a/drivers/gpu/msm/adreno_ringbuffer.c b/drivers/gpu/msm/adreno_ringbuffer.c
index 90ff642..179027c 100644
--- a/drivers/gpu/msm/adreno_ringbuffer.c
+++ b/drivers/gpu/msm/adreno_ringbuffer.c
@@ -1,4 +1,4 @@
-/* Copyright (c) 2002,2007-2012, Code Aurora Forum. All rights reserved.
+/* Copyright (c) 2002,2007-2013, The Linux Foundation. All rights reserved.
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License version 2 and
@@ -18,34 +18,44 @@
 #include "kgsl.h"
 #include "kgsl_sharedmem.h"
 #include "kgsl_cffdump.h"
-#include "kgsl_trace.h"
 
 #include "adreno.h"
 #include "adreno_pm4types.h"
 #include "adreno_ringbuffer.h"
-#include "adreno_debugfs.h"
 
 #include "a2xx_reg.h"
 #include "a3xx_reg.h"
 
 #define GSL_RB_NOP_SIZEDWORDS				2
 
-#define CP_DEBUG_DEFAULT 0xA000000
+/*
+ * CP DEBUG settings for all cores:
+ * DYNAMIC_CLK_DISABLE [27] - turn off the dynamic clock control
+ * PROG_END_PTR_ENABLE [25] - Allow 128 bit writes to the VBIF
+ */
+
+#define CP_DEBUG_DEFAULT ((1 << 27) | (1 << 25))
 
 void adreno_ringbuffer_submit(struct adreno_ringbuffer *rb)
 {
 	BUG_ON(rb->wptr == 0);
 
+	/* Let the pwrscale policy know that new commands have
+	 * been submitted. */
 	kgsl_pwrscale_busy(rb->device);
 
+	/* synchronize memory before informing the hardware of the
+	 * new commands.
+	 */
 	mb();
 
 	adreno_regwrite(rb->device, REG_CP_RB_WPTR, rb->wptr);
 }
 
-static void
-adreno_ringbuffer_waitspace(struct adreno_ringbuffer *rb, unsigned int numcmds,
-			  int wptr_ahead)
+static int
+adreno_ringbuffer_waitspace(struct adreno_ringbuffer *rb,
+				struct adreno_context *context,
+				unsigned int numcmds, int wptr_ahead)
 {
 	int nopcount;
 	unsigned int freecmds;
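The CP_DEBUG_DEFAULT value introduced at the top of this hunk is just the two named bits OR'd together. A quick standalone check that it matches the 0x0A000000 constant the a3xx path used before this change:

#include <stdio.h>

int main(void)
{
	unsigned int dyn_clk_disable = 1u << 27;	/* DYNAMIC_CLK_DISABLE */
	unsigned int prog_end_ptr_en = 1u << 25;	/* PROG_END_PTR_ENABLE */

	/* prints CP_DEBUG_DEFAULT = 0x0a000000 */
	printf("CP_DEBUG_DEFAULT = 0x%08x\n",
	       dyn_clk_disable | prog_end_ptr_en);
	return 0;
}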
@@ -54,13 +64,13 @@
 	unsigned long wait_time;
 	unsigned long wait_timeout = msecs_to_jiffies(ADRENO_IDLE_TIMEOUT);
 	unsigned long wait_time_part;
-	unsigned int prev_reg_val[hang_detect_regs_count];
+	unsigned int prev_reg_val[ft_detect_regs_count];
 
 	memset(prev_reg_val, 0, sizeof(prev_reg_val));
 
-	
+	/* if wptr ahead, fill the remaining with NOPs */
 	if (wptr_ahead) {
-		
+		/* -1 for header */
 		nopcount = rb->sizedwords - rb->wptr - 1;
 
 		cmds = (unsigned int *)rb->buffer_desc.hostptr + rb->wptr;
@@ -68,6 +78,10 @@
 
 		GSL_RB_WRITE(cmds, cmds_gpu, cp_nop_packet(nopcount));
 
+		/* Make sure that rptr is not 0 before submitting
+		 * commands at the end of ringbuffer. We do not
+		 * want the rptr and wptr to become equal when
+		 * the ringbuffer is not empty */
 		do {
 			GSL_RB_GET_READPTR(rb, &rb->rptr);
 		} while (!rb->rptr);
@@ -81,7 +95,7 @@
 
 	wait_time = jiffies + wait_timeout;
 	wait_time_part = jiffies + msecs_to_jiffies(KGSL_TIMEOUT_PART);
-	
+	/* wait for space in ringbuffer */
 	while (1) {
 		GSL_RB_GET_READPTR(rb, &rb->rptr);
 
@@ -90,10 +104,12 @@
 		if (freecmds == 0 || freecmds > numcmds)
 			break;
 
+		/* Don't wait for the full timeout, detect a hang faster.
+		 */
 		if (time_after(jiffies, wait_time_part)) {
 			wait_time_part = jiffies +
 				msecs_to_jiffies(KGSL_TIMEOUT_PART);
-			if ((adreno_hang_detect(rb->device,
+			if ((adreno_ft_detect(rb->device,
 						prev_reg_val))){
 				KGSL_DRV_ERR(rb->device,
 				"Hang detected while waiting for freespace in"
@@ -113,43 +129,56 @@
 		continue;
 
 err:
-		if (!adreno_dump_and_recover(rb->device)) {
+		if (!adreno_dump_and_exec_ft(rb->device)) {
+			if (context && context->flags & CTXT_FLAGS_GPU_HANG) {
+				KGSL_CTXT_WARN(rb->device,
+				"Context %p caused a gpu hang. Will not accept commands for context %d\n",
+				context, context->id);
+				return -EDEADLK;
+			}
 			wait_time = jiffies + wait_timeout;
 		} else {
-			
+			/* GPU is hung and fault tolerance failed */
 			BUG();
 		}
 	}
+	return 0;
 }
 
 unsigned int *adreno_ringbuffer_allocspace(struct adreno_ringbuffer *rb,
-					     unsigned int numcmds)
+					struct adreno_context *context,
+					unsigned int numcmds)
 {
-	unsigned int	*ptr = NULL;
-
+	unsigned int *ptr = NULL;
+	int ret = 0;
 	BUG_ON(numcmds >= rb->sizedwords);
 
 	GSL_RB_GET_READPTR(rb, &rb->rptr);
-	
+	/* check for available space */
 	if (rb->wptr >= rb->rptr) {
-		
-		
+		/* wptr ahead or equal to rptr */
+		/* reserve dwords for nop packet */
 		if ((rb->wptr + numcmds) > (rb->sizedwords -
 				GSL_RB_NOP_SIZEDWORDS))
-			adreno_ringbuffer_waitspace(rb, numcmds, 1);
+			ret = adreno_ringbuffer_waitspace(rb, context,
+							numcmds, 1);
 	} else {
-		
+		/* wptr behind rptr */
 		if ((rb->wptr + numcmds) >= rb->rptr)
-			adreno_ringbuffer_waitspace(rb, numcmds, 0);
-		
-		
-		if ((rb->wptr + numcmds) > (rb->sizedwords -
+			ret = adreno_ringbuffer_waitspace(rb, context,
+							numcmds, 0);
+		/* check for remaining space */
+		/* reserve dwords for nop packet */
+		if (!ret && (rb->wptr + numcmds) > (rb->sizedwords -
 				GSL_RB_NOP_SIZEDWORDS))
-			adreno_ringbuffer_waitspace(rb, numcmds, 1);
+			ret = adreno_ringbuffer_waitspace(rb, context,
+							numcmds, 1);
 	}
 
-	ptr = (unsigned int *)rb->buffer_desc.hostptr + rb->wptr;
-	rb->wptr += numcmds;
+	if (!ret) {
+		ptr = (unsigned int *)rb->buffer_desc.hostptr + rb->wptr;
+		rb->wptr += numcmds;
+	}
 
 	return ptr;
 }
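A standalone model of the space accounting used by the waitspace/allocspace pair above: free space is the gap between wptr and rptr modulo the ring size, and an allocation only proceeds while wptr cannot land on rptr. This is a sketch of the idea under that invariant, not the driver's exact bookkeeping.

/* sizes and pointers are in dwords */
static unsigned int rb_free_dwords(unsigned int rptr, unsigned int wptr,
				   unsigned int sizedwords)
{
	if (wptr >= rptr)
		/* used region is [rptr, wptr); everything else is free */
		return sizedwords - (wptr - rptr);
	/* wptr has wrapped; free region is [wptr, rptr) */
	return rptr - wptr;
}

static int rb_has_room(unsigned int rptr, unsigned int wptr,
		       unsigned int sizedwords, unsigned int numcmds)
{
	/* keep one dword of slack so wptr never becomes equal to rptr */
	return rb_free_dwords(rptr, wptr, sizedwords) > numcmds;
}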
@@ -195,7 +224,7 @@
 		if (ret)
 			goto err;
 
-		
+		/* PM4 size is 3 dword aligned plus 1 dword of version */
 		if (len % ((sizeof(uint32_t) * 3)) != sizeof(uint32_t)) {
 			KGSL_DRV_ERR(device, "Bad firmware size: %d\n", len);
 			ret = -EINVAL;
@@ -226,10 +255,8 @@
 
 	KGSL_DRV_INFO(device, "loading pm4 ucode version: %d\n",
 		adreno_dev->pm4_fw_version);
-	if (adreno_is_a3xx(adreno_dev))
-		adreno_regwrite(device, REG_CP_DEBUG, CP_DEBUG_DEFAULT);
-	else
-		adreno_regwrite(device, REG_CP_DEBUG, 0x02000000);
+
+	adreno_regwrite(device, REG_CP_DEBUG, CP_DEBUG_DEFAULT);
 	adreno_regwrite(device, REG_CP_ME_RAM_WADDR, 0);
 	for (i = 1; i < adreno_dev->pm4_fw_size; i++)
 		adreno_regwrite(device, REG_CP_ME_RAM_DATA,
@@ -252,7 +279,7 @@
 		if (ret)
 			goto err;
 
-		
+		/* PFP size should be dword aligned */
 		if (len % sizeof(uint32_t) != 0) {
 			KGSL_DRV_ERR(device, "Bad firmware size: %d\n", len);
 			ret = -EINVAL;
@@ -281,20 +308,21 @@
 	}
 
 	KGSL_DRV_INFO(device, "loading pfp ucode version: %d\n",
-		adreno_dev->pfp_fw_version);
+			adreno_dev->pfp_fw_version);
 
 	adreno_regwrite(device, adreno_dev->gpudev->reg_cp_pfp_ucode_addr, 0);
 	for (i = 1; i < adreno_dev->pfp_fw_size; i++)
 		adreno_regwrite(device,
-			adreno_dev->gpudev->reg_cp_pfp_ucode_data,
-			adreno_dev->pfp_fw[i]);
+		adreno_dev->gpudev->reg_cp_pfp_ucode_data,
+		adreno_dev->pfp_fw[i]);
+
 	return 0;
 }
 
 int adreno_ringbuffer_start(struct adreno_ringbuffer *rb, unsigned int init_ram)
 {
 	int status;
-	
+	/*cp_rb_cntl_u cp_rb_cntl; */
 	union reg_cp_rb_cntl cp_rb_cntl;
 	unsigned int rb_cntl;
 	struct kgsl_device *device = rb->device;
@@ -317,25 +345,34 @@
 			(rb->memptrs_desc.gpuaddr
 			+ GSL_RB_MEMPTRS_WPTRPOLL_OFFSET));
 
-		
+		/* setup WPTR delay */
 		adreno_regwrite(device, REG_CP_RB_WPTR_DELAY,
-			0 );
+			0 /*0x70000010 */);
 	}
 
-	
+	/* setup REG_CP_RB_CNTL */
 	adreno_regread(device, REG_CP_RB_CNTL, &rb_cntl);
 	cp_rb_cntl.val = rb_cntl;
 
+	/*
+	 * The size of the ringbuffer in the hardware is the log2
+	 * representation of the size in quadwords (sizedwords / 2)
+	 */
 	cp_rb_cntl.f.rb_bufsz = ilog2(rb->sizedwords >> 1);
 
+	/*
+	 * Specify the quadwords to read before updating mem RPTR.
+	 * Like above, pass the log2 representation of the blocksize
+	 * in quadwords.
+	*/
 	cp_rb_cntl.f.rb_blksz = ilog2(KGSL_RB_BLKSIZE >> 3);
 
 	if (adreno_is_a2xx(adreno_dev)) {
-		
+		/* WPTR polling */
 		cp_rb_cntl.f.rb_poll_en = GSL_RB_CNTL_POLL_EN;
 	}
 
-	
+	/* mem RPTR writebacks */
 	cp_rb_cntl.f.rb_no_update =  GSL_RB_CNTL_NO_UPDATE;
 
 	adreno_regwrite(device, REG_CP_RB_CNTL, cp_rb_cntl.val);
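Plugging in the defaults from adreno_ringbuffer.h: KGSL_RB_SIZE is 32 KB, so sizedwords is 8192 and rb_bufsz becomes ilog2(4096) = 12; KGSL_RB_BLKSIZE is 16 bytes, i.e. 2 quadwords, so rb_blksz becomes ilog2(2) = 1. A standalone check of that arithmetic, with ilog2 re-implemented locally:

#include <stdio.h>

static unsigned int my_ilog2(unsigned int v)
{
	unsigned int r = 0;

	while (v >>= 1)
		r++;
	return r;
}

int main(void)
{
	unsigned int rb_size = 32 * 1024;	/* KGSL_RB_SIZE, bytes */
	unsigned int blksize = 16;		/* KGSL_RB_BLKSIZE, bytes */
	unsigned int sizedwords = rb_size >> 2;	/* 8192 dwords */

	/* rb_bufsz: log2 of the ring size in quadwords -> 12 */
	printf("rb_bufsz = %u\n", my_ilog2(sizedwords >> 1));
	/* rb_blksz: log2 of the report block size in quadwords -> 1 */
	printf("rb_blksz = %u\n", my_ilog2(blksize >> 3));
	return 0;
}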
@@ -347,10 +384,10 @@
 			     GSL_RB_MEMPTRS_RPTR_OFFSET);
 
 	if (adreno_is_a3xx(adreno_dev)) {
-		
+		/* enable access protection to privileged registers */
 		adreno_regwrite(device, A3XX_CP_PROTECT_CTRL, 0x00000007);
 
-		
+		/* RBBM registers */
 		adreno_regwrite(device, A3XX_CP_PROTECT_REG_0, 0x63000040);
 		adreno_regwrite(device, A3XX_CP_PROTECT_REG_1, 0x62000080);
 		adreno_regwrite(device, A3XX_CP_PROTECT_REG_2, 0x600000CC);
@@ -358,26 +395,26 @@
 		adreno_regwrite(device, A3XX_CP_PROTECT_REG_4, 0x64000140);
 		adreno_regwrite(device, A3XX_CP_PROTECT_REG_5, 0x66000400);
 
-		
+		/* CP registers */
 		adreno_regwrite(device, A3XX_CP_PROTECT_REG_6, 0x65000700);
 		adreno_regwrite(device, A3XX_CP_PROTECT_REG_7, 0x610007D8);
 		adreno_regwrite(device, A3XX_CP_PROTECT_REG_8, 0x620007E0);
 		adreno_regwrite(device, A3XX_CP_PROTECT_REG_9, 0x61001178);
 		adreno_regwrite(device, A3XX_CP_PROTECT_REG_A, 0x64001180);
 
-		
+		/* RB registers */
 		adreno_regwrite(device, A3XX_CP_PROTECT_REG_B, 0x60003300);
 
-		
+		/* VBIF registers */
 		adreno_regwrite(device, A3XX_CP_PROTECT_REG_C, 0x6B00C000);
 	}
 
 	if (adreno_is_a2xx(adreno_dev)) {
-		
+		/* explicitly clear all cp interrupts */
 		adreno_regwrite(device, REG_CP_INT_ACK, 0xFFFFFFFF);
 	}
 
-	
+	/* setup scratch/timestamp */
 	adreno_regwrite(device, REG_SCRATCH_ADDR, device->memstore.gpuaddr +
 			     KGSL_MEMSTORE_OFFSET(KGSL_MEMSTORE_GLOBAL,
 				     soptimestamp));
@@ -385,31 +422,30 @@
 	adreno_regwrite(device, REG_SCRATCH_UMSK,
 			     GSL_RB_MEMPTRS_SCRATCH_MASK);
 
-	
-
+	/* load the CP ucode */
 	status = adreno_ringbuffer_load_pm4_ucode(device);
 	if (status != 0)
 		return status;
 
-	
+	/* load the prefetch parser ucode */
 	status = adreno_ringbuffer_load_pfp_ucode(device);
 	if (status != 0)
 		return status;
 
-	
+	/* CP ROQ queue sizes (bytes) - RB:16, ST:16, IB1:32, IB2:64 */
 	if (adreno_is_a305(adreno_dev) || adreno_is_a320(adreno_dev))
 		adreno_regwrite(device, REG_CP_QUEUE_THRESHOLDS, 0x000E0602);
 
 	rb->rptr = 0;
 	rb->wptr = 0;
 
-	
+	/* clear ME_HALT to start micro engine */
 	adreno_regwrite(device, REG_CP_ME_CNTL, 0);
 
-	
+	/* ME init is GPU specific, so jump into the sub-function */
 	adreno_dev->gpudev->rb_init(adreno_dev, rb);
 
-	
+	/* idle device to validate ME INIT */
 	status = adreno_idle(device);
 
 	if (status == 0)
@@ -420,9 +456,13 @@
 
 void adreno_ringbuffer_stop(struct adreno_ringbuffer *rb)
 {
+	struct kgsl_device *device = rb->device;
+	struct adreno_device *adreno_dev = ADRENO_DEVICE(device);
+
 	if (rb->flags & KGSL_FLAGS_STARTED) {
-		
-		adreno_regwrite(rb->device, REG_CP_ME_CNTL, 0x10000000);
+		if (adreno_is_a200(adreno_dev))
+			adreno_regwrite(rb->device, REG_CP_ME_CNTL, 0x10000000);
+
 		rb->flags &= ~KGSL_FLAGS_STARTED;
 	}
 }
@@ -434,9 +474,14 @@
 	struct adreno_ringbuffer *rb = &adreno_dev->ringbuffer;
 
 	rb->device = device;
+	/*
+	 * It is silly to convert this to words and then back to bytes
+	 * immediately below, but most of the rest of the code deals
+	 * in words, so we might as well only do the math once
+	 */
 	rb->sizedwords = KGSL_RB_SIZE >> 2;
 
-	
+	/* allocate memory for ringbuffer */
 	status = kgsl_allocate_contiguous(&rb->buffer_desc,
 		(rb->sizedwords << 2));
 
@@ -445,7 +490,9 @@
 		return status;
 	}
 
-	
+	/* allocate memory for polling and timestamps */
+	/* This could sit at a 4 byte alignment boundary, but since we use
+	 * the MMU it needs to be at a page boundary */
 	status = kgsl_allocate_contiguous(&rb->memptrs_desc,
 		sizeof(struct kgsl_rbmemptrs));
 
@@ -454,7 +501,7 @@
 		return status;
 	}
 
-	
+	/* overlay structure on memptrs memory */
 	rb->memptrs = (struct kgsl_rbmemptrs *) rb->memptrs_desc.hostptr;
 
 	return 0;
@@ -480,42 +527,67 @@
 adreno_ringbuffer_addcmds(struct adreno_ringbuffer *rb,
 				struct adreno_context *context,
 				unsigned int flags, unsigned int *cmds,
-				int sizedwords)
+				int sizedwords, uint32_t timestamp)
 {
 	struct adreno_device *adreno_dev = ADRENO_DEVICE(rb->device);
 	unsigned int *ringcmds;
-	unsigned int timestamp;
 	unsigned int total_sizedwords = sizedwords;
 	unsigned int i;
 	unsigned int rcmd_gpu;
 	unsigned int context_id = KGSL_MEMSTORE_GLOBAL;
 	unsigned int gpuaddr = rb->device->memstore.gpuaddr;
 
-	if (context && (context->flags & CTXT_FLAGS_PER_CONTEXT_TS))
+	/*
+	 * if the context was not created with per context timestamp
+	 * support, we must use the global timestamp since issueibcmds
+	 * will be returning that one.
+	 */
+	if (context && context->flags & CTXT_FLAGS_PER_CONTEXT_TS)
 		context_id = context->id;
 
+	if ((context && context->flags & CTXT_FLAGS_USER_GENERATED_TS) &&
+			(!(flags & KGSL_CMD_FLAGS_INTERNAL_ISSUE))) {
+		if (timestamp_cmp(rb->timestamp[context_id],
+						timestamp) >= 0) {
+			KGSL_DRV_ERR(rb->device,
+				"Invalid user generated ts <%d:0x%x>, "
+				"less than last issued ts <%d:0x%x>\n",
+				context_id, timestamp, context_id,
+				rb->timestamp[context_id]);
+			return -ERANGE;
+		}
+	}
+
+	/* reserve space to temporarily turn off protected mode
+	*  error checking if needed
+	*/
 	total_sizedwords += flags & KGSL_CMD_FLAGS_PMODE ? 4 : 0;
-	total_sizedwords += !(flags & KGSL_CMD_FLAGS_NO_TS_CMP) ? 7 : 0;
-	
+	/* 2 dwords to store the start of command sequence */
 	total_sizedwords += 2;
 
+	/* Add CP_COND_EXEC commands to generate CP_INTERRUPT */
+	total_sizedwords += context ? 13 : 0;
+
 	if (adreno_is_a3xx(adreno_dev))
 		total_sizedwords += 7;
 
-	total_sizedwords += 2; 
-	if (context && (context->flags & CTXT_FLAGS_PER_CONTEXT_TS)) {
-		total_sizedwords += 3; 
-		total_sizedwords += 4; 
-		total_sizedwords += 3; 
+	total_sizedwords += 2; /* scratchpad ts for fault tolerance */
+	if (context && context->flags & CTXT_FLAGS_PER_CONTEXT_TS &&
+			!(flags & KGSL_CMD_FLAGS_INTERNAL_ISSUE)) {
+		total_sizedwords += 3; /* sop timestamp */
+		total_sizedwords += 4; /* eop timestamp */
+		total_sizedwords += 3; /* global timestamp without cache
+					* flush for non-zero context */
 	} else {
-		total_sizedwords += 4; 
+		total_sizedwords += 4; /* global timestamp for fault tolerance*/
 	}
 
-	ringcmds = adreno_ringbuffer_allocspace(rb, total_sizedwords);
-	if (context && (context->flags & CTXT_FLAGS_GPU_HANG)) {
-		KGSL_CTXT_WARN(rb->device,
-		"Context %p caused a gpu hang. Will not accept commands for context %d\n",
-		context, context->id);
+	ringcmds = adreno_ringbuffer_allocspace(rb, context, total_sizedwords);
+	if (!ringcmds) {
+		/*
+		 * We could not allocate space in ringbuffer, just return the
+		 * last timestamp
+		 */
 		return rb->timestamp[context_id];
 	}
 
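The user-generated timestamp check above only works if the comparison tolerates 32-bit wrap-around. A hedged re-implementation of that idea; the real timestamp_cmp() lives in the kgsl headers and may differ in detail, and the window constant here is an assumption made for this sketch.

#define TS_WINDOW	0x80000000u	/* assumed wrap window for this sketch */

/* returns > 0 if a is newer than b, 0 if equal, < 0 if older */
static int ts_cmp(unsigned int a, unsigned int b)
{
	if (a == b)
		return 0;
	/* "newer" means the forward distance from b to a is inside the window */
	return ((a - b) < TS_WINDOW) ? 1 : -1;
}

With a comparison like this, a submission whose timestamp does not compare newer than the last one issued for the context is rejected with -ERANGE, as in the check above; ts_cmp(5, 0xFFFFFFFE) still counts 5 as newer across the wrap.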
@@ -526,7 +598,7 @@
 	GSL_RB_WRITE(ringcmds, rcmd_gpu, KGSL_CMD_IDENTIFIER);
 
 	if (flags & KGSL_CMD_FLAGS_PMODE) {
-		
+		/* disable protected mode error checking */
 		GSL_RB_WRITE(ringcmds, rcmd_gpu,
 			cp_type3_packet(CP_SET_PROTECTED_MODE, 1));
 		GSL_RB_WRITE(ringcmds, rcmd_gpu, 0);
@@ -538,58 +610,67 @@
 	}
 
 	if (flags & KGSL_CMD_FLAGS_PMODE) {
-		
+		/* re-enable protected mode error checking */
 		GSL_RB_WRITE(ringcmds, rcmd_gpu,
 			cp_type3_packet(CP_SET_PROTECTED_MODE, 1));
 		GSL_RB_WRITE(ringcmds, rcmd_gpu, 1);
 	}
 
-	
+	/* always increment the global timestamp. once. */
 	rb->timestamp[KGSL_MEMSTORE_GLOBAL]++;
-	if (context) {
+
+	/* Do not update context's timestamp for internal submissions */
+	if (context && !(flags & KGSL_CMD_FLAGS_INTERNAL_ISSUE)) {
 		if (context_id == KGSL_MEMSTORE_GLOBAL)
-			rb->timestamp[context_id] =
+			rb->timestamp[context->id] =
 				rb->timestamp[KGSL_MEMSTORE_GLOBAL];
+		else if (context->flags & CTXT_FLAGS_USER_GENERATED_TS)
+			rb->timestamp[context_id] = timestamp;
 		else
 			rb->timestamp[context_id]++;
 	}
 	timestamp = rb->timestamp[context_id];
 
-	
+	/* scratchpad ts for fault tolerance */
 	GSL_RB_WRITE(ringcmds, rcmd_gpu, cp_type0_packet(REG_CP_TIMESTAMP, 1));
 	GSL_RB_WRITE(ringcmds, rcmd_gpu, rb->timestamp[KGSL_MEMSTORE_GLOBAL]);
 
 	if (adreno_is_a3xx(adreno_dev)) {
+		/*
+		 * Flush HLSQ lazy updates to make sure there are no
+		 * resources pending for indirect loads after the timestamp
+		 */
 
 		GSL_RB_WRITE(ringcmds, rcmd_gpu,
 			cp_type3_packet(CP_EVENT_WRITE, 1));
-		GSL_RB_WRITE(ringcmds, rcmd_gpu, 0x07); 
+		GSL_RB_WRITE(ringcmds, rcmd_gpu, 0x07); /* HLSQ_FLUSH */
 		GSL_RB_WRITE(ringcmds, rcmd_gpu,
 			cp_type3_packet(CP_WAIT_FOR_IDLE, 1));
 		GSL_RB_WRITE(ringcmds, rcmd_gpu, 0x00);
 	}
 
-	if (context && (context->flags & CTXT_FLAGS_PER_CONTEXT_TS)) {
-		
+	if (context && context->flags & CTXT_FLAGS_PER_CONTEXT_TS
+			&& !(flags & KGSL_CMD_FLAGS_INTERNAL_ISSUE)) {
+		/* start-of-pipeline timestamp */
 		GSL_RB_WRITE(ringcmds, rcmd_gpu,
 			cp_type3_packet(CP_MEM_WRITE, 2));
 		GSL_RB_WRITE(ringcmds, rcmd_gpu, (gpuaddr +
-			KGSL_MEMSTORE_OFFSET(context->id, soptimestamp)));
+			KGSL_MEMSTORE_OFFSET(context_id, soptimestamp)));
 		GSL_RB_WRITE(ringcmds, rcmd_gpu, timestamp);
 
-		
+		/* end-of-pipeline timestamp */
 		GSL_RB_WRITE(ringcmds, rcmd_gpu,
 			cp_type3_packet(CP_EVENT_WRITE, 3));
 		GSL_RB_WRITE(ringcmds, rcmd_gpu, CACHE_FLUSH_TS);
 		GSL_RB_WRITE(ringcmds, rcmd_gpu, (gpuaddr +
-			KGSL_MEMSTORE_OFFSET(context->id, eoptimestamp)));
+			KGSL_MEMSTORE_OFFSET(context_id, eoptimestamp)));
 		GSL_RB_WRITE(ringcmds, rcmd_gpu, timestamp);
 
 		GSL_RB_WRITE(ringcmds, rcmd_gpu,
 			cp_type3_packet(CP_MEM_WRITE, 2));
 		GSL_RB_WRITE(ringcmds, rcmd_gpu, (gpuaddr +
-			      KGSL_MEMSTORE_OFFSET(KGSL_MEMSTORE_GLOBAL,
-				      eoptimestamp)));
+			KGSL_MEMSTORE_OFFSET(KGSL_MEMSTORE_GLOBAL,
+				eoptimestamp)));
 		GSL_RB_WRITE(ringcmds, rcmd_gpu,
 			rb->timestamp[KGSL_MEMSTORE_GLOBAL]);
 	} else {
@@ -597,14 +678,13 @@
 			cp_type3_packet(CP_EVENT_WRITE, 3));
 		GSL_RB_WRITE(ringcmds, rcmd_gpu, CACHE_FLUSH_TS);
 		GSL_RB_WRITE(ringcmds, rcmd_gpu, (gpuaddr +
-			      KGSL_MEMSTORE_OFFSET(KGSL_MEMSTORE_GLOBAL,
-				      eoptimestamp)));
+			KGSL_MEMSTORE_OFFSET(KGSL_MEMSTORE_GLOBAL,
+						eoptimestamp)));
 		GSL_RB_WRITE(ringcmds, rcmd_gpu,
-			rb->timestamp[KGSL_MEMSTORE_GLOBAL]);
+				rb->timestamp[KGSL_MEMSTORE_GLOBAL]);
 	}
-
-	if (!(flags & KGSL_CMD_FLAGS_NO_TS_CMP)) {
-		
+	if (context) {
+		/* Conditional execution based on memory values */
 		GSL_RB_WRITE(ringcmds, rcmd_gpu,
 			cp_type3_packet(CP_COND_EXEC, 4));
 		GSL_RB_WRITE(ringcmds, rcmd_gpu, (gpuaddr +
@@ -614,15 +694,33 @@
 			KGSL_MEMSTORE_OFFSET(
 				context_id, ref_wait_ts)) >> 2);
 		GSL_RB_WRITE(ringcmds, rcmd_gpu, timestamp);
-		
-		GSL_RB_WRITE(ringcmds, rcmd_gpu, 2);
+		/* # of conditional command DWORDs */
+		GSL_RB_WRITE(ringcmds, rcmd_gpu, 8);
+
+		/* Clear the ts_cmp_enable for the context */
+		GSL_RB_WRITE(ringcmds, rcmd_gpu,
+			cp_type3_packet(CP_MEM_WRITE, 2));
+		GSL_RB_WRITE(ringcmds, rcmd_gpu, gpuaddr +
+			KGSL_MEMSTORE_OFFSET(
+				context_id, ts_cmp_enable));
+		GSL_RB_WRITE(ringcmds, rcmd_gpu, 0x0);
+
+		/* Clear the ts_cmp_enable for the global timestamp */
+		GSL_RB_WRITE(ringcmds, rcmd_gpu,
+			cp_type3_packet(CP_MEM_WRITE, 2));
+		GSL_RB_WRITE(ringcmds, rcmd_gpu, gpuaddr +
+			KGSL_MEMSTORE_OFFSET(
+				KGSL_MEMSTORE_GLOBAL, ts_cmp_enable));
+		GSL_RB_WRITE(ringcmds, rcmd_gpu, 0x0);
+
+		/* Trigger the interrupt */
 		GSL_RB_WRITE(ringcmds, rcmd_gpu,
 			cp_type3_packet(CP_INTERRUPT, 1));
 		GSL_RB_WRITE(ringcmds, rcmd_gpu, CP_INT_CNTL__RB_INT_MASK);
 	}
 
 	if (adreno_is_a3xx(adreno_dev)) {
-		
+		/* Dummy set-constant to trigger context rollover */
 		GSL_RB_WRITE(ringcmds, rcmd_gpu,
 			cp_type3_packet(CP_SET_CONSTANT, 2));
 		GSL_RB_WRITE(ringcmds, rcmd_gpu,
@@ -630,6 +728,11 @@
 		GSL_RB_WRITE(ringcmds, rcmd_gpu, 0);
 	}
 
+	if (flags & KGSL_CMD_FLAGS_EOF) {
+		GSL_RB_WRITE(ringcmds, rcmd_gpu, cp_nop_packet(1));
+		GSL_RB_WRITE(ringcmds, rcmd_gpu, KGSL_END_OF_FRAME_IDENTIFIER);
+	}
+
 	adreno_ringbuffer_submit(rb);
 
 	return timestamp;
@@ -648,7 +751,11 @@
 	if (device->state & KGSL_STATE_HUNG)
 		return kgsl_readtimestamp(device, KGSL_MEMSTORE_GLOBAL,
 					KGSL_TIMESTAMP_RETIRED);
-	return adreno_ringbuffer_addcmds(rb, drawctxt, flags, cmds, sizedwords);
+
+	flags |= KGSL_CMD_FLAGS_INTERNAL_ISSUE;
+
+	return adreno_ringbuffer_addcmds(rb, drawctxt, flags, cmds,
+							sizedwords, 0);
 }
 
 static bool _parse_ibs(struct kgsl_device_private *dev_priv, uint gpuaddr,
@@ -702,7 +809,7 @@
 	case CP_IM_STORE:
 	case CP_LOAD_STATE:
 		break;
-	
+	/* these shouldn't come from userspace */
 	case CP_ME_INIT:
 	case CP_SET_PROTECTED_MODE:
 	default:
@@ -727,13 +834,19 @@
 	return true;
 }
 
+/*
+ * Traverse IBs and dump them to the test vector. Detect a swap by
+ * inspecting register writes, keeping note of the current state, and dump
+ * the framebuffer config to the test vector
+ */
 static bool _parse_ibs(struct kgsl_device_private *dev_priv,
 			   uint gpuaddr, int sizedwords)
 {
-	static uint level; 
+	static uint level; /* recursion level */
 	bool ret = false;
 	uint *hostaddr, *hoststart;
-	int dwords_left = sizedwords; 
+	int dwords_left = sizedwords; /* dwords left in the current command
+					 buffer */
 	struct kgsl_mem_entry *entry;
 
 	spin_lock(&dev_priv->process_priv->mem_lock);
@@ -763,17 +876,17 @@
 	mb();
 	while (dwords_left > 0) {
 		bool cur_ret = true;
-		int count = 0; 
+		int count = 0; /* dword count including packet header */
 
 		switch (*hostaddr >> 30) {
-		case 0x0: 
+		case 0x0: /* type-0 */
 			count = (*hostaddr >> 16)+2;
 			cur_ret = _handle_type0(dev_priv, hostaddr);
 			break;
-		case 0x1: 
+		case 0x1: /* type-1 */
 			count = 2;
 			break;
-		case 0x3: 
+		case 0x3: /* type-3 */
 			count = ((*hostaddr >> 16) & 0x3fff) + 2;
 			cur_ret = _handle_type3(dev_priv, hostaddr);
 			break;
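The walker above sizes each packet purely from its header; a compact standalone sketch of that decode, using the same field positions as _parse_ibs():

/* returns the packet length in dwords, including the header, or -1 for a
 * packet type the parser does not expect from userspace
 */
static int pm4_packet_dwords(unsigned int header)
{
	switch (header >> 30) {
	case 0x0:			/* type-0: count field + 2 */
		return (header >> 16) + 2;
	case 0x1:			/* type-1: always two dwords */
		return 2;
	case 0x3:			/* type-3: 14-bit count field + 2 */
		return ((header >> 16) & 0x3fff) + 2;
	default:			/* type-2 / reserved */
		return -1;
	}
}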
@@ -804,7 +917,7 @@
 			goto done;
 		}
 
-		
+		/* jump to next packet */
 		dwords_left -= count;
 		hostaddr += count;
 		if (dwords_left < 0) {
@@ -852,9 +965,6 @@
 	unsigned int i;
 	struct adreno_context *drawctxt;
 	unsigned int start_index = 0;
-#ifdef CONFIG_MSM_KGSL_GPU_USAGE_SYSTRACE
-	struct kgsl_pwrctrl *pwr = &device->pwrctrl;
-#endif
 
 	if (device->state & KGSL_STATE_HUNG)
 		return -EBUSY;
@@ -865,12 +975,22 @@
 	drawctxt = context->devctxt;
 
 	if (drawctxt->flags & CTXT_FLAGS_GPU_HANG) {
-		KGSL_CTXT_WARN(device, "Context %p caused a gpu hang.."
+		KGSL_CTXT_ERR(device, "proc %s failed fault tolerance"
 			" will not accept commands for context %d\n",
-			drawctxt, drawctxt->id);
+			drawctxt->pid_name, drawctxt->id);
 		return -EDEADLK;
 	}
 
+	if (drawctxt->flags & CTXT_FLAGS_SKIP_EOF) {
+		KGSL_CTXT_ERR(device,
+			"proc %s triggered fault tolerance"
+			" skipping commands for context till EOF %d\n",
+			drawctxt->pid_name, drawctxt->id);
+		if (flags & KGSL_CMD_FLAGS_EOF)
+			drawctxt->flags &= ~CTXT_FLAGS_SKIP_EOF;
+		numibs = 0;
+	}
+
 	cmds = link = kzalloc(sizeof(unsigned int) * (numibs * 3 + 4),
 				GFP_KERNEL);
 	if (!link) {
@@ -879,6 +999,9 @@
 		return -ENOMEM;
 	}
 
+	/* When preamble is enabled, the preamble buffer with state restoration
+	commands is stored in the first node of the IB chain. We can skip that
+	if a context switch hasn't occurred */
 
 	if (drawctxt->flags & CTXT_FLAGS_PREAMBLE &&
 		adreno_dev->drawctxt_active == drawctxt)
@@ -913,24 +1036,12 @@
 		      kgsl_mmu_pt_get_flags(device->mmu.hwpagetable,
 					device->id));
 
-#ifdef CONFIG_MSM_KGSL_GPU_USAGE_SYSTRACE
-	if(device->id == 0 && device->prev_pid != -1 && device->prev_pid != task_tgid_nr(current)) {
-		trace_kgsl_usage(device, KGSL_PWRFLAGS_ON, dev_priv->process_priv->pid, device->gputime.total, device->gputime.busy,
-			pwr->active_pwrlevel, pwr->pwrlevels[pwr->active_pwrlevel].gpu_freq);
-		device->prev_pid = task_tgid_nr(current);
-	}
-#endif
-
-#ifdef CONFIG_MSM_KGSL_GPU_USAGE
-	if(device->current_process_priv == NULL || device->current_process_priv->pid != dev_priv->process_priv->pid)
-		device->current_process_priv = dev_priv->process_priv;
-#endif
-
 	adreno_drawctxt_switch(adreno_dev, drawctxt, flags);
 
 	*timestamp = adreno_ringbuffer_addcmds(&adreno_dev->ringbuffer,
-					drawctxt, 0,
-					&link[0], (cmds - link));
+					drawctxt,
+					(flags & KGSL_CMD_FLAGS_EOF),
+					&link[0], (cmds - link), *timestamp);
 
 	KGSL_CMD_INFO(device, "ctxt %d g %08x numibs %d ts %d\n",
 		context->id, (unsigned int)ibdesc, numibs, *timestamp);
@@ -938,156 +1049,23 @@
 	kfree(link);
 
 #ifdef CONFIG_MSM_KGSL_CFF_DUMP
+	/*
+	 * insert wait for idle after every IB1
+	 * this is conservative but works reliably and is ok
+	 * even for performance simulations
+	 */
 	adreno_idle(device);
 #endif
-	if (drawctxt->flags & CTXT_FLAGS_GPU_HANG_RECOVERED)
-		return -EDEADLK;
-	else
+
+	/*
+	 * If context hung and recovered then return error so that the
+	 * application may handle it
+	 */
+	if (drawctxt->flags & CTXT_FLAGS_GPU_HANG_FT) {
+		drawctxt->flags &= ~CTXT_FLAGS_GPU_HANG_FT;
+		return -EPROTO;
+	} else
 		return 0;
-
-}
-
-static int _find_start_of_cmd_seq(struct adreno_ringbuffer *rb,
-					unsigned int *ptr,
-					bool inc)
-{
-	int status = -EINVAL;
-	unsigned int val1;
-	unsigned int size = rb->buffer_desc.size;
-	unsigned int start_ptr = *ptr;
-
-	while ((start_ptr / sizeof(unsigned int)) != rb->wptr) {
-		if (inc)
-			start_ptr = adreno_ringbuffer_inc_wrapped(start_ptr,
-									size);
-		else
-			start_ptr = adreno_ringbuffer_dec_wrapped(start_ptr,
-									size);
-		kgsl_sharedmem_readl(&rb->buffer_desc, &val1, start_ptr);
-		if (KGSL_CMD_IDENTIFIER == val1) {
-			if ((start_ptr / sizeof(unsigned int)) != rb->wptr)
-				start_ptr = adreno_ringbuffer_dec_wrapped(
-							start_ptr, size);
-				*ptr = start_ptr;
-				status = 0;
-				break;
-		}
-	}
-	return status;
-}
-
-static int _find_cmd_seq_after_eop_ts(struct adreno_ringbuffer *rb,
-					unsigned int *rb_rptr,
-					unsigned int global_eop,
-					bool inc)
-{
-	int status = -EINVAL;
-	unsigned int temp_rb_rptr = *rb_rptr;
-	unsigned int size = rb->buffer_desc.size;
-	unsigned int val[3];
-	int i = 0;
-	bool check = false;
-
-	if (inc && temp_rb_rptr / sizeof(unsigned int) != rb->wptr)
-		return status;
-
-	do {
-		if (!inc)
-			temp_rb_rptr = adreno_ringbuffer_dec_wrapped(
-					temp_rb_rptr, size);
-		kgsl_sharedmem_readl(&rb->buffer_desc, &val[i],
-					temp_rb_rptr);
-
-		if (check && ((inc && val[i] == global_eop) ||
-			(!inc && (val[i] ==
-			cp_type3_packet(CP_MEM_WRITE, 2) ||
-			val[i] == CACHE_FLUSH_TS)))) {
-			i = (i + 2) % 3;
-			if (val[i] == rb->device->memstore.gpuaddr +
-				KGSL_MEMSTORE_OFFSET(KGSL_MEMSTORE_GLOBAL,
-						eoptimestamp)) {
-				int j = ((i + 2) % 3);
-				if ((inc && (val[j] == CACHE_FLUSH_TS ||
-						val[j] == cp_type3_packet(
-							CP_MEM_WRITE, 2))) ||
-					(!inc && val[j] == global_eop)) {
-						
-						status = 0;
-						break;
-				}
-			}
-			i = (i + 1) % 3;
-		}
-		if (inc)
-			temp_rb_rptr = adreno_ringbuffer_inc_wrapped(
-						temp_rb_rptr, size);
-
-		i = (i + 1) % 3;
-		if (2 == i)
-			check = true;
-	} while (temp_rb_rptr / sizeof(unsigned int) != rb->wptr);
-	if (!status) {
-		status = _find_start_of_cmd_seq(rb, &temp_rb_rptr, false);
-		if (!status) {
-			*rb_rptr = temp_rb_rptr;
-			KGSL_DRV_ERR(rb->device,
-			"Offset of cmd sequence after eop timestamp: 0x%x\n",
-			temp_rb_rptr / sizeof(unsigned int));
-		}
-	}
-	if (status)
-		KGSL_DRV_ERR(rb->device,
-		"Failed to find the command sequence after eop timestamp\n");
-	return status;
-}
-
-static int _find_hanging_ib_sequence(struct adreno_ringbuffer *rb,
-				unsigned int *rb_rptr,
-				unsigned int ib1)
-{
-	int status = -EINVAL;
-	unsigned int temp_rb_rptr = *rb_rptr;
-	unsigned int size = rb->buffer_desc.size;
-	unsigned int val[2];
-	int i = 0;
-	bool check = false;
-	bool ctx_switch = false;
-
-	while (temp_rb_rptr / sizeof(unsigned int) != rb->wptr) {
-		kgsl_sharedmem_readl(&rb->buffer_desc, &val[i], temp_rb_rptr);
-
-		if (check && val[i] == ib1) {
-			
-			i = (i + 1) % 2;
-			if (adreno_cmd_is_ib(val[i])) {
-				
-				status = _find_start_of_cmd_seq(rb,
-						&temp_rb_rptr, false);
-				KGSL_DRV_ERR(rb->device,
-				"Found the hanging IB at offset 0x%x\n",
-				temp_rb_rptr / sizeof(unsigned int));
-				break;
-			}
-			i = (i + 1) % 2;
-		}
-		if (val[i] == KGSL_CONTEXT_TO_MEM_IDENTIFIER) {
-			if (ctx_switch) {
-				KGSL_DRV_ERR(rb->device,
-				"Context switch encountered before bad "
-				"IB found\n");
-				break;
-			}
-			ctx_switch = true;
-		}
-		i = (i + 1) % 2;
-		if (1 == i)
-			check = true;
-		temp_rb_rptr = adreno_ringbuffer_inc_wrapped(temp_rb_rptr,
-								size);
-	}
-	if  (!status)
-		*rb_rptr = temp_rb_rptr;
-	return status;
 }
 
 static void _turn_preamble_on_for_ib_seq(struct adreno_ringbuffer *rb,
@@ -1100,11 +1078,11 @@
 	bool check = false;
 	bool cmd_start = false;
 
-	
+	/* Go till the start of the ib sequence and turn on preamble */
 	while (temp_rb_rptr / sizeof(unsigned int) != rb->wptr) {
 		kgsl_sharedmem_readl(&rb->buffer_desc, &val[i], temp_rb_rptr);
 		if (check && KGSL_START_OF_IB_IDENTIFIER == val[i]) {
-			
+			/* decrement i */
 			i = (i + 1) % 2;
 			if (val[i] == cp_nop_packet(4)) {
 				temp_rb_rptr = adreno_ringbuffer_dec_wrapped(
@@ -1112,11 +1090,14 @@
 				kgsl_sharedmem_writel(&rb->buffer_desc,
 					temp_rb_rptr, cp_nop_packet(1));
 			}
-			KGSL_DRV_ERR(rb->device,
+			KGSL_FT_INFO(rb->device,
 			"Turned preamble on at offset 0x%x\n",
 			temp_rb_rptr / 4);
 			break;
 		}
+		/* If we reach the beginning of the next command sequence then
+		 * exit. The first command encountered is the current one, so
+		 * don't break on that. */
 		if (KGSL_CMD_IDENTIFIER == val[i]) {
 			if (cmd_start)
 				break;
@@ -1131,119 +1112,122 @@
 	}
 }
 
-static void _copy_valid_rb_content(struct adreno_ringbuffer *rb,
-		unsigned int rb_rptr, unsigned int *temp_rb_buffer,
-		int *rb_size, unsigned int *bad_rb_buffer,
-		int *bad_rb_size,
-		int *last_valid_ctx_id)
+void adreno_ringbuffer_extract(struct adreno_ringbuffer *rb,
+				struct adreno_ft_data *ft_data)
 {
-	unsigned int good_rb_idx = 0, cmd_start_idx = 0;
+	struct kgsl_device *device = rb->device;
+	unsigned int rb_rptr = ft_data->start_of_replay_cmds;
+	unsigned int good_rb_idx = 0, bad_rb_idx = 0, temp_rb_idx = 0;
+	unsigned int last_good_cmd_end_idx = 0, last_bad_cmd_end_idx = 0;
+	unsigned int cmd_start_idx = 0;
 	unsigned int val1 = 0;
-	struct kgsl_context *k_ctxt;
-	struct adreno_context *a_ctxt;
-	unsigned int bad_rb_idx = 0;
 	int copy_rb_contents = 0;
 	unsigned int temp_rb_rptr;
+	struct kgsl_context *k_ctxt;
+	struct adreno_context *a_ctxt;
 	unsigned int size = rb->buffer_desc.size;
-	unsigned int good_cmd_start_idx = 0;
+	unsigned int *temp_rb_buffer = ft_data->rb_buffer;
+	int *rb_size = &ft_data->rb_size;
+	unsigned int *bad_rb_buffer = ft_data->bad_rb_buffer;
+	int *bad_rb_size = &ft_data->bad_rb_size;
+	unsigned int *good_rb_buffer = ft_data->good_rb_buffer;
+	int *good_rb_size = &ft_data->good_rb_size;
 
+	/*
+	 * If the start index from where commands need to be copied is invalid
+	 * then no need to save off any commands
+	 */
+	if (0xFFFFFFFF == ft_data->start_of_replay_cmds)
+		return;
+
+	k_ctxt = idr_find(&device->context_idr, ft_data->context_id);
+	if (k_ctxt) {
+		a_ctxt = k_ctxt->devctxt;
+		if (a_ctxt->flags & CTXT_FLAGS_PREAMBLE)
+			_turn_preamble_on_for_ib_seq(rb, rb_rptr);
+	}
+	k_ctxt = NULL;
+
+	/* Walk the rb from the context switch. Omit any commands
+	 * for an invalid context. */
 	while ((rb_rptr / sizeof(unsigned int)) != rb->wptr) {
 		kgsl_sharedmem_readl(&rb->buffer_desc, &val1, rb_rptr);
 
 		if (KGSL_CMD_IDENTIFIER == val1) {
-			cmd_start_idx = bad_rb_idx - 1;
-			if (copy_rb_contents)
-				good_cmd_start_idx = good_rb_idx - 1;
+			/* Start is the NOP dword that comes before
+			 * KGSL_CMD_IDENTIFIER */
+			cmd_start_idx = temp_rb_idx - 1;
+			if ((copy_rb_contents) && (good_rb_idx))
+				last_good_cmd_end_idx = good_rb_idx - 1;
+			if ((!copy_rb_contents) && (bad_rb_idx))
+				last_bad_cmd_end_idx = bad_rb_idx - 1;
 		}
 
-		
+		/* check for context switch indicator */
 		if (val1 == KGSL_CONTEXT_TO_MEM_IDENTIFIER) {
 			unsigned int temp_idx, val2;
-			
+			/* increment by 3 to get to the context_id */
 			temp_rb_rptr = rb_rptr + (3 * sizeof(unsigned int)) %
 					size;
 			kgsl_sharedmem_readl(&rb->buffer_desc, &val2,
 						temp_rb_rptr);
 
+			/* if context switches to a context that did not cause
+			 * hang then start saving the rb contents as those
+			 * commands can be executed */
 			k_ctxt = idr_find(&rb->device->context_idr, val2);
 			if (k_ctxt) {
 				a_ctxt = k_ctxt->devctxt;
 
+			/* If we are changing to a good context and were not
+			 * copying commands then copy over commands to the good
+			 * context */
 			if (!copy_rb_contents && ((k_ctxt &&
 				!(a_ctxt->flags & CTXT_FLAGS_GPU_HANG)) ||
 				!k_ctxt)) {
 				for (temp_idx = cmd_start_idx;
-					temp_idx < bad_rb_idx;
+					temp_idx < temp_rb_idx;
 					temp_idx++)
-					temp_rb_buffer[good_rb_idx++] =
-						bad_rb_buffer[temp_idx];
-				*last_valid_ctx_id = val2;
+					good_rb_buffer[good_rb_idx++] =
+						temp_rb_buffer[temp_idx];
+				ft_data->last_valid_ctx_id = val2;
 				copy_rb_contents = 1;
+				/* remove the good commands from bad buffer */
+				bad_rb_idx = last_bad_cmd_end_idx;
 			} else if (copy_rb_contents && k_ctxt &&
 				(a_ctxt->flags & CTXT_FLAGS_GPU_HANG)) {
-				good_rb_idx = good_cmd_start_idx;
+
+				/* If we are changing from a good context
+				 * back to a bad context while commands were
+				 * being copied, then copy this sequence over
+				 * to the bad context instead */
+				for (temp_idx = cmd_start_idx;
+					temp_idx < temp_rb_idx;
+					temp_idx++)
+					bad_rb_buffer[bad_rb_idx++] =
+						temp_rb_buffer[temp_idx];
+				/* If we are changing to bad context then
+				 * remove the dwords we copied for this
+				 * sequence from the good buffer */
+				good_rb_idx = last_good_cmd_end_idx;
 				copy_rb_contents = 0;
 			}
 			}
 		}
 
 		if (copy_rb_contents)
-			temp_rb_buffer[good_rb_idx++] = val1;
-		bad_rb_buffer[bad_rb_idx++] = val1;
+			good_rb_buffer[good_rb_idx++] = val1;
+		else
+			bad_rb_buffer[bad_rb_idx++] = val1;
+
+		/* Copy both good and bad commands to temp buffer */
+		temp_rb_buffer[temp_rb_idx++] = val1;
 
 		rb_rptr = adreno_ringbuffer_inc_wrapped(rb_rptr, size);
 	}
-	*rb_size = good_rb_idx;
+	*good_rb_size = good_rb_idx;
 	*bad_rb_size = bad_rb_idx;
-}
-
-int adreno_ringbuffer_extract(struct adreno_ringbuffer *rb,
-				struct adreno_recovery_data *rec_data)
-{
-	int status;
-	struct kgsl_device *device = rb->device;
-	unsigned int rb_rptr = rb->wptr * sizeof(unsigned int);
-	struct kgsl_context *context;
-	struct adreno_context *adreno_context;
-
-	context = idr_find(&device->context_idr, rec_data->context_id);
-
-	
-	status = _find_cmd_seq_after_eop_ts(rb, &rb_rptr,
-				rec_data->global_eop + 1, false);
-	if (status)
-		goto done;
-
-	if (context) {
-		adreno_context = context->devctxt;
-
-		if (adreno_context->flags & CTXT_FLAGS_PREAMBLE) {
-			if (rec_data->ib1) {
-				status = _find_hanging_ib_sequence(rb, &rb_rptr,
-								rec_data->ib1);
-				if (status)
-					goto copy_rb_contents;
-			}
-			_turn_preamble_on_for_ib_seq(rb, rb_rptr);
-		} else {
-			status = -EINVAL;
-		}
-	}
-
-copy_rb_contents:
-	_copy_valid_rb_content(rb, rb_rptr, rec_data->rb_buffer,
-				&rec_data->rb_size,
-				rec_data->bad_rb_buffer,
-				&rec_data->bad_rb_size,
-				&rec_data->last_valid_ctx_id);
-	if (status) {
-		rec_data->bad_rb_size = 0;
-		status = 0;
-	}
-	if (!context)
-		rec_data->rb_size = 0;
-done:
-	return status;
+	*rb_size = temp_rb_idx;
 }
 
 void
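A much-reduced model of the extraction loop above: every dword also lands in the temp (full replay) buffer, and at each context-switch marker the stream is re-routed to either the good or the bad buffer depending on whether the incoming context is marked as hung. Struct and helper names are local to this sketch; the real code additionally moves the partially copied command sequence between buffers when the direction flips.

struct split_buffers {
	unsigned int *good, *bad, *temp;
	int good_idx, bad_idx, temp_idx;
};

static void route_dword(struct split_buffers *s, unsigned int val,
			int *copy_to_good, int is_ctx_switch,
			int switching_to_hung_ctx)
{
	if (is_ctx_switch)
		/* flip direction whenever the incoming context changes */
		*copy_to_good = !switching_to_hung_ctx;

	if (*copy_to_good)
		s->good[s->good_idx++] = val;
	else
		s->bad[s->bad_idx++] = val;

	/* everything is also kept in the temp buffer for full replay */
	s->temp[s->temp_idx++] = val;
}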
diff --git a/drivers/gpu/msm/adreno_ringbuffer.h b/drivers/gpu/msm/adreno_ringbuffer.h
index 7560848..fa03c05 100644
--- a/drivers/gpu/msm/adreno_ringbuffer.h
+++ b/drivers/gpu/msm/adreno_ringbuffer.h
@@ -1,4 +1,4 @@
-/* Copyright (c) 2002,2007-2012, Code Aurora Forum. All rights reserved.
+/* Copyright (c) 2002,2007-2013, The Linux Foundation. All rights reserved.
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License version 2 and
@@ -13,16 +13,21 @@
 #ifndef __ADRENO_RINGBUFFER_H
 #define __ADRENO_RINGBUFFER_H
 
+/*
+ * Adreno ringbuffer sizes in bytes - these are converted to
+ * the appropriate log2 values in the code
+ */
 
 #define KGSL_RB_SIZE (32 * 1024)
 #define KGSL_RB_BLKSIZE 16
 
+/* CP timestamp register */
 #define	REG_CP_TIMESTAMP		 REG_SCRATCH_REG0
 
 
 struct kgsl_device;
 struct kgsl_device_private;
-struct adreno_recovery_data;
+struct adreno_ft_data;
 
 #define GSL_RB_MEMPTRS_SCRATCH_COUNT	 8
 struct kgsl_rbmemptrs {
@@ -45,11 +50,11 @@
 	struct kgsl_memdesc memptrs_desc;
 	struct kgsl_rbmemptrs *memptrs;
 
-	
+	/*ringbuffer size */
 	unsigned int sizedwords;
 
-	unsigned int wptr; 
-	unsigned int rptr; 
+	unsigned int wptr; /* write pointer offset in dwords from baseaddr */
+	unsigned int rptr; /* read pointer offset in dwords from baseaddr */
 
 	unsigned int timestamp[KGSL_MEMSTORE_MAX];
 };
@@ -64,16 +69,23 @@
 		gpuaddr += sizeof(uint); \
 	} while (0)
 
+/* enable timestamp (...scratch0) memory shadowing */
 #define GSL_RB_MEMPTRS_SCRATCH_MASK 0x1
 
-#define GSL_RB_CNTL_NO_UPDATE 0x0 
+/* mem rptr */
+#define GSL_RB_CNTL_NO_UPDATE 0x0 /* enable */
 #define GSL_RB_GET_READPTR(rb, data) \
 	do { \
 		*(data) = rb->memptrs->rptr; \
 	} while (0)
 
-#define GSL_RB_CNTL_POLL_EN 0x0 
+#define GSL_RB_CNTL_POLL_EN 0x0 /* disable */
 
+/*
+ * protected mode error checking below register address 0x800
+ * note: if CP_INTERRUPT packet is used then checking needs
+ * to change to below register address 0x7C8
+ */
 #define GSL_RB_PROTECTED_MODE_CONTROL		0x200001F2
 
 int adreno_ringbuffer_issueibcmds(struct kgsl_device_private *dev_priv,
@@ -102,15 +114,16 @@
 
 void kgsl_cp_intrcallback(struct kgsl_device *device);
 
-int adreno_ringbuffer_extract(struct adreno_ringbuffer *rb,
-				struct adreno_recovery_data *rec_data);
+void adreno_ringbuffer_extract(struct adreno_ringbuffer *rb,
+				struct adreno_ft_data *ft_data);
 
 void
 adreno_ringbuffer_restore(struct adreno_ringbuffer *rb, unsigned int *rb_buff,
 			int num_rb_contents);
 
 unsigned int *adreno_ringbuffer_allocspace(struct adreno_ringbuffer *rb,
-					     unsigned int numcmds);
+						struct adreno_context *context,
+						unsigned int numcmds);
 
 int adreno_ringbuffer_read_pfp_ucode(struct kgsl_device *device);
 
@@ -124,16 +137,18 @@
 	return rb->wptr + rb->sizedwords - rptr;
 }
 
+/* Increment a value by 4 bytes with wrap-around based on size */
 static inline unsigned int adreno_ringbuffer_inc_wrapped(unsigned int val,
 							unsigned int size)
 {
 	return (val + sizeof(unsigned int)) % size;
 }
 
+/* Decrement a value by 4 bytes with wrap-around based on size */
 static inline unsigned int adreno_ringbuffer_dec_wrapped(unsigned int val,
 							unsigned int size)
 {
 	return (val + size - sizeof(unsigned int)) % size;
 }
 
-#endif  
+#endif  /* __ADRENO_RINGBUFFER_H */
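Worked numbers for the wrap helpers above, assuming the default 32 KB ring and 4-byte unsigned ints: incrementing from byte offset 32764 wraps to 0, and decrementing from 0 wraps back to 32764. A standalone check:

#include <assert.h>

static unsigned int inc_wrapped(unsigned int val, unsigned int size)
{
	return (val + sizeof(unsigned int)) % size;
}

static unsigned int dec_wrapped(unsigned int val, unsigned int size)
{
	return (val + size - sizeof(unsigned int)) % size;
}

int main(void)
{
	unsigned int size = 32 * 1024;	/* KGSL_RB_SIZE in bytes */

	assert(inc_wrapped(32764, size) == 0);
	assert(dec_wrapped(0, size) == 32764);
	assert(dec_wrapped(inc_wrapped(100, size), size) == 100);
	return 0;
}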
diff --git a/drivers/gpu/msm/adreno_snapshot.c b/drivers/gpu/msm/adreno_snapshot.c
index a412c12..f23586e 100644
--- a/drivers/gpu/msm/adreno_snapshot.c
+++ b/drivers/gpu/msm/adreno_snapshot.c
@@ -1,4 +1,4 @@
-/* Copyright (c) 2012, Code Aurora Forum. All rights reserved.
+/* Copyright (c) 2012, The Linux Foundation. All rights reserved.
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License version 2 and
@@ -19,13 +19,16 @@
 #include "a2xx_reg.h"
 #include "a3xx_reg.h"
 
+/* Number of dwords of ringbuffer history to record */
 #define NUM_DWORDS_OF_RINGBUFFER_HISTORY 100
 
+/* Maintain a list of the objects we see during parsing */
 
 #define SNAPSHOT_OBJ_BUFSIZE 64
 
 #define SNAPSHOT_OBJ_TYPE_IB 0
 
+/* Keep track of how many bytes are frozen after a snapshot and tell the user */
 static int snapshot_frozen_objsize;
 
 static struct kgsl_snapshot_obj {
@@ -36,14 +39,22 @@
 	int dwords;
 } objbuf[SNAPSHOT_OBJ_BUFSIZE];
 
+/* Pointer to the next open entry in the object list */
 static int objbufptr;
 
+/* Push a new buffer object onto the list */
 static void push_object(struct kgsl_device *device, int type, uint32_t ptbase,
 	uint32_t gpuaddr, int dwords)
 {
 	int index;
 	void *ptr;
 
+	/*
+	 * Sometimes IBs can be reused in the same dump.  Because we parse from
+	 * oldest to newest, if we come across an IB that has already been used,
+	 * assume that it has been reused and update the list with the newest
+	 * size.
+	 */
 
 	for (index = 0; index < objbufptr; index++) {
 		if (objbuf[index].gpuaddr == gpuaddr &&
@@ -58,6 +69,10 @@
 		return;
 	}
 
+	/*
+	 * adreno_convertaddr verifies that the IB size is valid - at least in
+	 * the context of it being smaller than the allocated memory space
+	 */
 	ptr = adreno_convertaddr(device, ptbase, gpuaddr, dwords << 2);
 
 	if (ptr == NULL) {
@@ -66,7 +81,7 @@
 		return;
 	}
 
-	
+	/* Put it on the list of things to parse */
 	objbuf[objbufptr].type = type;
 	objbuf[objbufptr].gpuaddr = gpuaddr;
 	objbuf[objbufptr].ptbase = ptbase;
@@ -74,6 +89,10 @@
 	objbuf[objbufptr++].ptr = ptr;
 }
 
+/*
+ * Return a 1 if the specified object is already on the list of buffers
+ * to be dumped
+ */
 
 static int find_object(int type, unsigned int gpuaddr, unsigned int ptbase)
 {
@@ -89,31 +108,64 @@
 	return 0;
 }
 
+/*
+ * This structure keeps track of type0 writes to VSC_PIPE_DATA_ADDRESS_x and
+ * VSC_PIPE_DATA_LENGTH_x. When a draw initiator is called these registers
+ * point to buffers that we need to freeze for a snapshot
+ */
 
 static struct {
 	unsigned int base;
 	unsigned int size;
 } vsc_pipe[8];
 
+/*
+ * This is the cached value of type0 writes to the VSC_SIZE_ADDRESS which
+ * contains the buffer address of the visibility stream size buffer during a
+ * binning pass
+ */
 
 static unsigned int vsc_size_address;
 
+/*
+ * This struct keeps track of type0 writes to VFD_FETCH_INSTR_0_X and
+ * VFD_FETCH_INSTR_1_X registers. When a draw initator is called the addresses
+ * and sizes in these registers point to VBOs that we need to freeze for a
+ * snapshot
+ */
 
 static struct {
 	unsigned int base;
 	unsigned int stride;
 } vbo[16];
 
+/*
+ * This is the cached value of type0 writes to VFD_INDEX_MAX.  This will be used
+ * to calculate the size of the VBOs when the draw initiator is called
+ */
 
 static unsigned int vfd_index_max;
 
+/*
+ * This is the cached value of type0 writes to VFD_CONTROL_0 which tells us how
+ * many VBOs are active when the draw initiator is called
+ */
 
 static unsigned int vfd_control_0;
 
+/*
+ * Cached value of type0 writes to SP_VS_PVT_MEM_ADDR and SP_FS_PVT_MEM_ADDR.
+ * This is a buffer that contains private stack information for the shader
+ */
 
 static unsigned int sp_vs_pvt_mem_addr;
 static unsigned int sp_fs_pvt_mem_addr;
 
+/*
+ * Each load state block has two possible types.  Each type has a different
+ * number of dwords per unit.  Use this handy lookup table to make sure
+ * we dump the right amount of data from the indirect buffer
+ */
 
 static int load_state_unit_sizes[7][2] = {
 	{ 2, 4 },
@@ -125,15 +177,31 @@
 	{ 8, 2 },
 };
 
-static void ib_parse_load_state(struct kgsl_device *device, unsigned int *pkt,
+static int ib_parse_load_state(struct kgsl_device *device, unsigned int *pkt,
 	unsigned int ptbase)
 {
 	unsigned int block, source, type;
+	int ret = 0;
 
+	/*
+	 * The object here is to find indirect shaders, i.e. shaders loaded from
+	 * GPU memory instead of directly in the command.  These should be added
+	 * to the list of memory objects to dump. So look at the load state
+	 * if the block is indirect (source = 4). If so then add the memory
+	 * address to the list.  The size of the object differs depending on the
+	 * type per the load_state_unit_sizes array above.
+	 */
 
 	if (type3_pkt_size(pkt[0]) < 2)
-		return;
+		return 0;
 
+	/*
+	 * pkt[1] 18:16 - source
+	 * pkt[1] 21:19 - state block
+	 * pkt[1] 31:22 - size in units
+	 * pkt[2] 0:1 - type
+	 * pkt[2] 31:2 - GPU memory address
+	 */
 
 	block = (pkt[1] >> 19) & 0x07;
 	source = (pkt[1] >> 16) & 0x07;
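A standalone decode of the CP_LOAD_STATE fields documented in the comment above; the field positions are taken from that comment, and the struct and function names are local to this sketch.

struct load_state_fields {
	unsigned int source;	/* pkt[1] bits 18:16 */
	unsigned int block;	/* pkt[1] bits 21:19 */
	unsigned int units;	/* pkt[1] bits 31:22 */
	unsigned int type;	/* pkt[2] bits  1:0  */
	unsigned int gpuaddr;	/* pkt[2] bits 31:2, dword aligned */
};

static void decode_load_state(const unsigned int *pkt,
			      struct load_state_fields *f)
{
	f->source  = (pkt[1] >> 16) & 0x07;
	f->block   = (pkt[1] >> 19) & 0x07;
	f->units   = (pkt[1] >> 22) & 0x3FF;
	f->type    = pkt[2] & 0x03;
	f->gpuaddr = pkt[2] & 0xFFFFFFFC;
}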
@@ -147,34 +215,52 @@
 		else
 			unitsize = load_state_unit_sizes[block][1];
 
-		
+		/* Freeze the GPU buffer containing the shader */
 
 		ret = kgsl_snapshot_get_object(device, ptbase,
 				pkt[2] & 0xFFFFFFFC,
 				(((pkt[1] >> 22) & 0x03FF) * unitsize) << 2,
 				SNAPSHOT_GPU_OBJECT_SHADER);
+
+		if (ret < 0)
+			return -EINVAL;
+
 		snapshot_frozen_objsize += ret;
 	}
+
+	return ret;
 }
 
+/*
+ * This opcode sets the base addresses for the visibility stream buffer and the
+ * visibility stream size buffer.
+ */
 
-static void ib_parse_set_bin_data(struct kgsl_device *device, unsigned int *pkt,
+static int ib_parse_set_bin_data(struct kgsl_device *device, unsigned int *pkt,
 	unsigned int ptbase)
 {
 	int ret;
 
 	if (type3_pkt_size(pkt[0]) < 2)
-		return;
+		return 0;
 
-	
+	/* Visibility stream buffer */
 	ret = kgsl_snapshot_get_object(device, ptbase, pkt[1], 0,
 			SNAPSHOT_GPU_OBJECT_GENERIC);
+
+	if (ret < 0)
+		return -EINVAL;
+
 	snapshot_frozen_objsize += ret;
 
-	
+	/* visibility stream size buffer (fixed size 8 dwords) */
 	ret = kgsl_snapshot_get_object(device, ptbase, pkt[2], 32,
 			SNAPSHOT_GPU_OBJECT_GENERIC);
-	snapshot_frozen_objsize += ret;
+
+	if (ret >= 0)
+		snapshot_frozen_objsize += ret;
+
+	return ret;
 }
 
 /*
@@ -183,13 +269,13 @@
  * buffers that are written to as frozen
  */
 
-static void ib_parse_mem_write(struct kgsl_device *device, unsigned int *pkt,
+static int ib_parse_mem_write(struct kgsl_device *device, unsigned int *pkt,
 	unsigned int ptbase)
 {
 	int ret;
 
 	if (type3_pkt_size(pkt[0]) < 1)
-		return;
+		return 0;
 
 	/*
 	 * The address is where the data in the rest of this packet is written
@@ -201,52 +287,77 @@
 	ret = kgsl_snapshot_get_object(device, ptbase, pkt[1] & 0xFFFFFFFC, 0,
 		SNAPSHOT_GPU_OBJECT_GENERIC);
 
-	snapshot_frozen_objsize += ret;
+	if (ret >= 0)
+		snapshot_frozen_objsize += ret;
+
+	return ret;
 }
 
+/*
+ * The DRAW_INDX opcode sends a draw initiator which starts a draw operation in
+ * the GPU, so this is the point where all the registers and buffers become
+ * "valid".  The DRAW_INDX may also have an index buffer pointer that should be
+ * frozen with the others
+ */
 
-static void ib_parse_draw_indx(struct kgsl_device *device, unsigned int *pkt,
+static int ib_parse_draw_indx(struct kgsl_device *device, unsigned int *pkt,
 	unsigned int ptbase)
 {
-	int ret, i;
+	int ret = 0, i;
 
 	if (type3_pkt_size(pkt[0]) < 3)
-		return;
+		return 0;
 
-	
+	/* DRAW_INDX may have an index buffer pointer */
 
 	if (type3_pkt_size(pkt[0]) > 3) {
 		ret = kgsl_snapshot_get_object(device, ptbase, pkt[4], pkt[5],
 			SNAPSHOT_GPU_OBJECT_GENERIC);
+		if (ret < 0)
+			return -EINVAL;
+
 		snapshot_frozen_objsize += ret;
 	}
 
+	/*
+	 * All of the type0 writes are valid at a draw initiator, so freeze
+	 * the various buffers that we are tracking
+	 */
 
-	
+	/* First up the visibility stream buffer */
 
 	for (i = 0; i < ARRAY_SIZE(vsc_pipe); i++) {
 		if (vsc_pipe[i].base != 0 && vsc_pipe[i].size != 0) {
 			ret = kgsl_snapshot_get_object(device, ptbase,
 				vsc_pipe[i].base, vsc_pipe[i].size,
 				SNAPSHOT_GPU_OBJECT_GENERIC);
+			if (ret < 0)
+				return -EINVAL;
+
 			snapshot_frozen_objsize += ret;
 		}
 	}
 
-	
+	/* Next the visibility stream size buffer */
 
 	if (vsc_size_address) {
 		ret = kgsl_snapshot_get_object(device, ptbase,
 				vsc_size_address, 32,
 				SNAPSHOT_GPU_OBJECT_GENERIC);
+		if (ret < 0)
+			return -EINVAL;
+
 		snapshot_frozen_objsize += ret;
 	}
 
-	
+	/* Next private shader buffer memory */
 	if (sp_vs_pvt_mem_addr) {
 		ret = kgsl_snapshot_get_object(device, ptbase,
 				sp_vs_pvt_mem_addr, 8192,
 				SNAPSHOT_GPU_OBJECT_GENERIC);
+		if (ret < 0)
+			return -EINVAL;
+
 		snapshot_frozen_objsize += ret;
 		sp_vs_pvt_mem_addr = 0;
 	}
@@ -255,16 +366,24 @@
 		ret = kgsl_snapshot_get_object(device, ptbase,
 				sp_fs_pvt_mem_addr, 8192,
 				SNAPSHOT_GPU_OBJECT_GENERIC);
+		if (ret < 0)
+			return -EINVAL;
+
 		snapshot_frozen_objsize += ret;
 		sp_fs_pvt_mem_addr = 0;
 	}
 
-	
+	/* Finally: VBOs */
 
-	
+	/* The number of active VBOs is stored in VFD_CONTROL_0[31:27] */
 	for (i = 0; i < (vfd_control_0) >> 27; i++) {
 		int size;
 
+		/*
+		 * The size of the VBO is the stride stored in
+		 * VFD_FETCH_INSTR_0_X.BUFSTRIDE * VFD_INDEX_MAX. The base
+		 * is stored in VFD_FETCH_INSTR_1_X
+		 */
 
 		if (vbo[i].base != 0) {
 			size = vbo[i].stride * vfd_index_max;
@@ -272,6 +391,9 @@
 			ret = kgsl_snapshot_get_object(device, ptbase,
 				vbo[i].base,
 				0, SNAPSHOT_GPU_OBJECT_GENERIC);
+			if (ret < 0)
+				return -EINVAL;
+
 			snapshot_frozen_objsize += ret;
 		}
 
@@ -281,26 +403,30 @@
 
 	vfd_control_0 = 0;
 	vfd_index_max = 0;
+
+	return ret;
 }
 
+/*
+ * Parse all the type3 opcode packets that may contain important information,
+ * such as additional GPU buffers to grab or a draw initiator
+ */
 
-static void ib_parse_type3(struct kgsl_device *device, unsigned int *ptr,
+static int ib_parse_type3(struct kgsl_device *device, unsigned int *ptr,
 	unsigned int ptbase)
 {
-	switch (cp_type3_opcode(*ptr)) {
-	case CP_LOAD_STATE:
-		ib_parse_load_state(device, ptr, ptbase);
-		break;
-	case CP_SET_BIN_DATA:
-		ib_parse_set_bin_data(device, ptr, ptbase);
-		break;
-	case CP_MEM_WRITE:
-		ib_parse_mem_write(device, ptr, ptbase);
-		break;
-	case CP_DRAW_INDX:
-		ib_parse_draw_indx(device, ptr, ptbase);
-		break;
-	}
+	int opcode = cp_type3_opcode(*ptr);
+
+	if (opcode == CP_LOAD_STATE)
+		return ib_parse_load_state(device, ptr, ptbase);
+	else if (opcode == CP_SET_BIN_DATA)
+		return ib_parse_set_bin_data(device, ptr, ptbase);
+	else if (opcode == CP_MEM_WRITE)
+		return ib_parse_mem_write(device, ptr, ptbase);
+	else if (opcode == CP_DRAW_INDX)
+		return ib_parse_draw_indx(device, ptr, ptbase);
+
+	return 0;
 }
 
 /*
@@ -320,12 +446,23 @@
 
 	for (i = 0; i < size; i++, offset++) {
 
-		
+		/* Visibility stream buffer */
 
 		if (offset >= A3XX_VSC_PIPE_DATA_ADDRESS_0 &&
 			offset <= A3XX_VSC_PIPE_DATA_LENGTH_7) {
 			int index = offset - A3XX_VSC_PIPE_DATA_ADDRESS_0;
 
+			/* Each bank of address and length registers are
+			 * interleaved with an empty register:
+			 *
+			 * address 0
+			 * length 0
+			 * empty
+			 * address 1
+			 * length 1
+			 * empty
+			 * ...
+			 */
 
 			if ((index % 3) == 0)
 				vsc_pipe[index / 3].base = ptr[i + 1];
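The interleaving described above reduces to modulo arithmetic on the offset relative to A3XX_VSC_PIPE_DATA_ADDRESS_0. A small sketch of the mapping; the struct and helper are hypothetical stand-ins for the static vsc_pipe array used in this file.

struct vsc_pipe_entry {
	unsigned int base;
	unsigned int size;
};

/* index is the register offset relative to A3XX_VSC_PIPE_DATA_ADDRESS_0.
 * Each bank is three registers wide: address, length, then an empty slot.
 */
static void cache_vsc_pipe_write(unsigned int index, unsigned int value,
				 struct vsc_pipe_entry *pipes)
{
	if ((index % 3) == 0)		/* ADDRESS register */
		pipes[index / 3].base = value;
	else if ((index % 3) == 1)	/* LENGTH register */
		pipes[index / 3].size = value;
	/* (index % 3) == 2 is the empty slot, nothing to cache */
}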
@@ -335,6 +472,11 @@
 			(offset <= A3XX_VFD_FETCH_INSTR_1_F)) {
 			int index = offset - A3XX_VFD_FETCH_INSTR_0_0;
 
+			/*
+			 * FETCH_INSTR_0_X and FETCH_INSTR_1_X banks are
+			 * interleaved as above but without the empty register
+			 * in between
+			 */
 
 			if ((index % 2) == 0)
 				vbo[index >> 1].stride =
@@ -342,6 +484,10 @@
 			else
 				vbo[index >> 1].base = ptr[i + 1];
 		} else {
+			/*
+			 * Cache various support registers for calculating
+			 * buffer sizes
+			 */
 
 			switch (offset) {
 			case A3XX_VFD_CONTROL_0:
@@ -364,26 +510,32 @@
 	}
 }
 
+/* Add an IB as a GPU object, but first, parse it to find more goodies within */
 
-static void ib_add_gpu_object(struct kgsl_device *device, unsigned int ptbase,
+static int ib_add_gpu_object(struct kgsl_device *device, unsigned int ptbase,
 		unsigned int gpuaddr, unsigned int dwords)
 {
 	int i, ret, rem = dwords;
 	unsigned int *src;
 
+	/*
+	 * If the object is already in the list, we don't need to parse it again
+	 */
 
 	if (kgsl_snapshot_have_object(device, ptbase, gpuaddr, dwords << 2))
-		return;
+		return 0;
 
 	src = (unsigned int *) adreno_convertaddr(device, ptbase, gpuaddr,
 		dwords << 2);
 
 	if (src == NULL)
-		return;
+		return -EINVAL;
 
 	for (i = 0; rem > 0; rem--, i++) {
 		int pktsize;
 
+		/* If the packet isn't a type 0 or a type 3, then don't bother
+		 * parsing it - it is likely corrupted */
 
 		if (!pkt_is_type0(src[i]) && !pkt_is_type3(src[i]))
 			break;
@@ -394,11 +546,46 @@
 			break;
 
 		if (pkt_is_type3(src[i])) {
-			if (adreno_cmd_is_ib(src[i]))
-				ib_add_gpu_object(device, ptbase,
-					src[i + 1], src[i + 2]);
-			else
-				ib_parse_type3(device, &src[i], ptbase);
+			if (adreno_cmd_is_ib(src[i])) {
+				unsigned int gpuaddr = src[i + 1];
+				unsigned int size = src[i + 2];
+				unsigned int ibbase;
+
+				/* Address of the last processed IB2 */
+				kgsl_regread(device, REG_CP_IB2_BASE, &ibbase);
+
+				/*
+				 * If this is the last IB2 that was executed,
+				 * then push it to make sure it goes into the
+				 * static space
+				 */
+
+				if (ibbase == gpuaddr)
+					push_object(device,
+						SNAPSHOT_OBJ_TYPE_IB, ptbase,
+						gpuaddr, size);
+				else {
+					ret = ib_add_gpu_object(device,
+						ptbase, gpuaddr, size);
+
+					/*
+					 * If adding the IB failed then stop
+					 * parsing
+					 */
+					if (ret < 0)
+						goto done;
+				}
+			} else {
+				ret = ib_parse_type3(device, &src[i], ptbase);
+				/*
+				 * If the parse function failed (probably
+				 * because of a bad decode) then bail out and
+				 * just capture the binary IB data
+				 */
+
+				if (ret < 0)
+					goto done;
+			}
 		} else if (pkt_is_type0(src[i])) {
 			ib_parse_type0(device, &src[i], ptbase);
 		}
@@ -407,12 +594,17 @@
 		rem -= pktsize;
 	}
 
+done:
 	ret = kgsl_snapshot_get_object(device, ptbase, gpuaddr, dwords << 2,
 		SNAPSHOT_GPU_OBJECT_IB);
 
-	snapshot_frozen_objsize += ret;
+	if (ret >= 0)
+		snapshot_frozen_objsize += ret;
+
+	return ret;
 }
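/*
 * Minimal sketch, not from the driver, of how the walk above steps from
 * packet to packet.  It assumes the standard PM4 header layout: the
 * packet type sits in bits 31:30 and the payload length minus one in
 * bits 29:16, so a packet occupies (payload + 1) dwords in the stream.
 */
static unsigned int pm4_payload_dwords(unsigned int header)
{
	return ((header >> 16) & 0x3fff) + 1;
}

static unsigned int pm4_packet_type(unsigned int header)
{
	return header >> 30;	/* 0 = type0, 3 = type3 */
}

/* Example: advance i past one whole packet (header plus payload):
 *	i += pm4_payload_dwords(src[i]) + 1;
 */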
 
+/* Snapshot the istore memory */
 static int snapshot_istore(struct kgsl_device *device, void *snapshot,
 	int remain, void *priv)
 {
@@ -437,6 +629,7 @@
 	return (count * 4) + sizeof(*header);
 }
 
+/* Snapshot the ringbuffer memory */
 static int snapshot_rb(struct kgsl_device *device, void *snapshot,
 	int remain, void *priv)
 {
@@ -448,15 +641,20 @@
 	int index, size, i;
 	int parse_ibs = 0, ib_parse_start;
 
-	
+	/* Get the physical address of the MMU pagetable */
 	ptbase = kgsl_mmu_get_current_ptbase(&device->mmu);
 
-	
+	/* Get the current read pointers for the RB */
 	kgsl_regread(device, REG_CP_RB_RPTR, &rptr);
 
-	
+	/* Address of the last processed IB */
 	kgsl_regread(device, REG_CP_IB1_BASE, &ibbase);
 
+	/*
+	 * Figure out the window of ringbuffer data to dump.  First we need to
+	 * find where the last processed IB was submitted.  Start walking back
+	 * from the rptr
+	 */
 
 	index = rptr;
 	rbptr = rb->buffer_desc.hostptr;
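/*
 * Illustrative helper (not the driver's code) for the two backwards
 * walks below: step 'step' dwords back through the ring, and if that
 * falls off the start, wrap to the top unless doing so would cross
 * wptr, in which case stop at wptr.
 */
static int rb_walk_back(int index, int step, int sizedwords, int wptr,
	int *hit_wptr)
{
	index -= step;
	if (index < 0) {
		index = sizedwords - step;
		if (index < wptr) {
			index = wptr;
			*hit_wptr = 1;
		}
	}
	return index;
}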
@@ -467,7 +665,7 @@
 		if (index < 0) {
 			index = rb->sizedwords - 3;
 
-			
+			/* We wrapped without finding what we wanted */
 			if (index < rb->wptr) {
 				index = rb->wptr;
 				break;
@@ -479,6 +677,11 @@
 			break;
 	} while (index != rb->wptr);
 
+	/*
+	 * index points at the last submitted IB. We can only trust that the
+	 * memory between the context switch and the hanging IB is valid, so
+	 * the next step is to find the context switch before the submission
+	 */
 
 	while (index != rb->wptr) {
 		index--;
@@ -486,6 +689,11 @@
 		if (index < 0) {
 			index = rb->sizedwords - 2;
 
+			/*
+			 * Wrapped without finding the context switch. This is
+			 * harmless - we should still have enough data to dump a
+			 * valid state
+			 */
 
 			if (index < rb->wptr) {
 				index = rb->wptr;
@@ -493,15 +701,23 @@
 			}
 		}
 
-		
+		/* Break if the current packet is a context switch identifier */
 		if ((rbptr[index] == cp_nop_packet(1)) &&
 			(rbptr[index + 1] == KGSL_CONTEXT_TO_MEM_IDENTIFIER))
 			break;
 	}
 
+	/*
+	 * Index represents the start of the window of interest.  We will try
+	 * to dump all buffers between here and the rptr
+	 */
 
 	ib_parse_start = index;
 
+	/*
+	 * Dump the entire ringbuffer - the parser can choose how much of it to
+	 * process
+	 */
 
 	size = (rb->sizedwords << 2);
 
@@ -511,18 +727,26 @@
 		return 0;
 	}
 
-	
+	/* Write the sub-header for the section */
 	header->start = rb->wptr;
 	header->end = rb->wptr;
 	header->wptr = rb->wptr;
 	header->rbsize = rb->sizedwords;
 	header->count = rb->sizedwords;
 
+	/*
+	 * Loop through the RB, copying the data and looking for indirect
+	 * buffers and MMU pagetable changes
+	 */
 
 	index = rb->wptr;
 	for (i = 0; i < rb->sizedwords; i++) {
 		*data = rbptr[index];
 
+		/*
+		 * Only parse IBs between the start and the rptr or the next
+		 * context switch, whichever comes first
+		 */
 
 		if (parse_ibs == 0 && index == ib_parse_start)
 			parse_ibs = 1;
@@ -533,17 +757,27 @@
 			unsigned int ibaddr = rbptr[index + 1];
 			unsigned int ibsize = rbptr[index + 2];
 
+			/*
+			 * This will return non NULL if the IB happens to be
+			 * part of the context memory (i.e - context switch
+			 * command buffers)
+			 */
 
 			struct kgsl_memdesc *memdesc =
 				adreno_find_ctxtmem(device, ptbase, ibaddr,
 					ibsize);
 
-			
+			/* IOMMU uses a NOP IB placed in setstate memory */
 			if (NULL == memdesc)
 				if (kgsl_gpuaddr_in_memdesc(
 						&device->mmu.setstate_memory,
 						ibaddr, ibsize))
 					memdesc = &device->mmu.setstate_memory;
+			/*
+			 * The IB from CP_IB1_BASE and the IBs for legacy
+			 * context switch go into the snapshot; all
+			 * others get marked as GPU objects
+			 */
 
 			if (ibaddr == ibbase || memdesc != NULL)
 				push_object(device, SNAPSHOT_OBJ_TYPE_IB,
@@ -561,10 +795,11 @@
 		data++;
 	}
 
-	
+	/* Return the size of the section */
 	return size + sizeof(*header);
 }
 
+/* Snapshot the memory for an indirect buffer */
 static int snapshot_ib(struct kgsl_device *device, void *snapshot,
 	int remain, void *priv)
 {
@@ -572,7 +807,7 @@
 	struct kgsl_snapshot_obj *obj = priv;
 	unsigned int *src = obj->ptr;
 	unsigned int *dst = snapshot + sizeof(*header);
-	int i;
+	int i, ret;
 
 	if (remain < (obj->dwords << 2) + sizeof(*header)) {
 		KGSL_DRV_ERR(device,
@@ -580,12 +815,12 @@
 		return 0;
 	}
 
-	
+	/* Write the sub-header for the section */
 	header->gpuaddr = obj->gpuaddr;
 	header->ptbase = obj->ptbase;
 	header->size = obj->dwords;
 
-	
+	/* Write the contents of the ib */
 	for (i = 0; i < obj->dwords; i++, src++, dst++) {
 		*dst = *src;
 
@@ -596,14 +831,20 @@
 			if (adreno_cmd_is_ib(*src))
 				push_object(device, SNAPSHOT_OBJ_TYPE_IB,
 					obj->ptbase, src[1], src[2]);
-			else
-				ib_parse_type3(device, src, obj->ptbase);
+			else {
+				ret = ib_parse_type3(device, src, obj->ptbase);
+
+				/* Stop parsing if the type3 decode fails */
+				if (ret < 0)
+					break;
+			}
 		}
 	}
 
 	return (obj->dwords << 2) + sizeof(*header);
 }
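/*
 * Every section callback above follows the same contract, sketched here
 * with illustrative names: check that the section fits in 'remain',
 * fill in the sub-header, copy the payload after it, and return the
 * number of bytes written (returning 0 simply drops the section).
 */
struct example_header {			/* hypothetical sub-header */
	unsigned int size;
};

static int snapshot_example_section(struct kgsl_device *device,
	void *snapshot, int remain, void *priv)
{
	struct example_header *header = snapshot;
	unsigned int *data = snapshot + sizeof(*header);
	int payload = 16;		/* illustrative payload size in bytes */

	if (remain < payload + sizeof(*header))
		return 0;		/* not enough room - skip this section */

	header->size = payload;
	memset(data, 0, payload);	/* stand-in for the real copy */

	return payload + sizeof(*header);
}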
 
+/* Dump another item on the current pending list */
 static void *dump_object(struct kgsl_device *device, int obj, void *snapshot,
 	int *remain)
 {
@@ -623,6 +864,15 @@
 	return snapshot;
 }
 
+/* adreno_snapshot - Snapshot the Adreno GPU state
+ * @device - KGSL device to snapshot
+ * @snapshot - Pointer to the start of memory to write into
+ * @remain - A pointer to how many bytes of memory are remaining in the snapshot
+ * @hang - set if this snapshot was automatically triggered by a GPU hang
+ * This is a hook function called by kgsl_snapshot to snapshot the
+ * Adreno specific information for the GPU snapshot.  In turn, this function
+ * calls the GPU specific snapshot function to get core specific information.
+ */
 
 void *adreno_snapshot(struct kgsl_device *device, void *snapshot, int *remain,
 		int hang)
@@ -631,12 +881,12 @@
 	uint32_t ptbase, ibbase, ibsize;
 	struct adreno_device *adreno_dev = ADRENO_DEVICE(device);
 
-	
+	/* Reset the list of objects */
 	objbufptr = 0;
 
 	snapshot_frozen_objsize = 0;
 
-	
+	/* Clear the caches for the visibility stream and VBO parsing */
 
 	vfd_control_0 = 0;
 	vfd_index_max = 0;
@@ -645,17 +895,30 @@
 	memset(vsc_pipe, 0, sizeof(vsc_pipe));
 	memset(vbo, 0, sizeof(vbo));
 
-	
+	/* Get the physical address of the MMU pagetable */
 	ptbase = kgsl_mmu_get_current_ptbase(&device->mmu);
 
-	
+	/* Dump the ringbuffer */
 	snapshot = kgsl_snapshot_add_section(device, KGSL_SNAPSHOT_SECTION_RB,
 		snapshot, remain, snapshot_rb, NULL);
 
+	/*
+	 * Make sure that the last IB1 that was being executed is dumped.
+	 * Since this was the last IB1 that was processed, we should have
+	 * already added it to the list during the ringbuffer parse but we
+	 * want to be double plus sure.
+	 */
 
 	kgsl_regread(device, REG_CP_IB1_BASE, &ibbase);
 	kgsl_regread(device, REG_CP_IB1_BUFSZ, &ibsize);
 
+	/*
+	 * The problem is that the IB size from the register is the unprocessed
+	 * size of the buffer, not the original size, so if we didn't catch
+	 * this buffer being directly used in the RB, then we might not be able
+	 * to dump the whole thing. Print a warning message so we can try to
+	 * figure out how often this really happens.
+	 */
 
 	if (!find_object(SNAPSHOT_OBJ_TYPE_IB, ibbase, ptbase) && ibsize) {
 		push_object(device, SNAPSHOT_OBJ_TYPE_IB, ptbase,
@@ -667,15 +930,30 @@
 	kgsl_regread(device, REG_CP_IB2_BASE, &ibbase);
 	kgsl_regread(device, REG_CP_IB2_BUFSZ, &ibsize);
 
+	/*
+	 * Add the last parsed IB2 to the list. The IB2 should be found as we
+	 * parse the objects below, but we try to add it to the list first, so
+	 * it too can be parsed.  Don't print an error message in this case - if
+	 * the IB2 is found during parsing, the list will be updated with the
+	 * correct size.
+	 */
 
 	if (!find_object(SNAPSHOT_OBJ_TYPE_IB, ibbase, ptbase) && ibsize) {
 		push_object(device, SNAPSHOT_OBJ_TYPE_IB, ptbase,
 			ibbase, ibsize);
 	}
 
+	/*
+	 * Go through the list of found objects and dump each one.  As the IBs
+	 * are parsed, more objects might be found, and objbufptr will increase
+	 */
 	for (i = 0; i < objbufptr; i++)
 		snapshot = dump_object(device, i, snapshot, remain);
 
+	/*
+	 * Only dump the istore on a hang - reading it on a running system
+	 * has a non-zero chance of hanging the GPU
+	 */
 
 	if (hang) {
 		snapshot = kgsl_snapshot_add_section(device,
@@ -683,7 +961,7 @@
 			snapshot_istore, NULL);
 	}
 
-	
+	/* Add GPU specific sections - registers mainly, but other stuff too */
 	if (adreno_dev->gpudev->snapshot)
 		snapshot = adreno_dev->gpudev->snapshot(adreno_dev, snapshot,
 			remain, hang);
diff --git a/drivers/gpu/msm/kgsl.c b/drivers/gpu/msm/kgsl.c
index 55692a6..3582a41 100644
--- a/drivers/gpu/msm/kgsl.c
+++ b/drivers/gpu/msm/kgsl.c
@@ -1,4 +1,4 @@
-/* Copyright (c) 2008-2012, Code Aurora Forum. All rights reserved.
+/* Copyright (c) 2008-2012, The Linux Foundation. All rights reserved.
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License version 2 and
@@ -25,7 +25,7 @@
 #include <linux/rbtree.h>
 #include <linux/ashmem.h>
 #include <linux/major.h>
-#include <linux/ion.h>
+#include <linux/msm_ion.h>
 #include <linux/io.h>
 #include <mach/socinfo.h>
 
@@ -36,6 +36,7 @@
 #include "kgsl_sharedmem.h"
 #include "kgsl_device.h"
 #include "kgsl_trace.h"
+#include "kgsl_sync.h"
 
 #undef MODULE_PARAM_PREFIX
 #define MODULE_PARAM_PREFIX "kgsl."
@@ -51,112 +52,15 @@
 
 static struct ion_client *kgsl_ion_client;
 
+/* kgsl_get_mem_entry - get the mem_entry structure for the specified object
+ * @device - Pointer to the device structure
+ * @ptbase - the pagetable base of the object
+ * @gpuaddr - the GPU address of the object
+ * @size - Size of the region to search
+ */
 
-int kgsl_add_event(struct kgsl_device *device, u32 id, u32 ts,
-	void (*cb)(struct kgsl_device *, void *, u32, u32), void *priv,
-	void *owner)
-{
-	struct kgsl_event *event;
-	struct list_head *n;
-	unsigned int cur_ts;
-	struct kgsl_context *context = NULL;
-
-	if (cb == NULL)
-		return -EINVAL;
-
-	if (id != KGSL_MEMSTORE_GLOBAL) {
-		context = idr_find(&device->context_idr, id);
-		if (context == NULL)
-			return -EINVAL;
-	}
-	cur_ts = kgsl_readtimestamp(device, context, KGSL_TIMESTAMP_RETIRED);
-
-	
-
-	if (timestamp_cmp(cur_ts, ts) >= 0) {
-		cb(device, priv, id, cur_ts);
-		return 0;
-	}
-
-	event = kzalloc(sizeof(*event), GFP_KERNEL);
-	if (event == NULL)
-		return -ENOMEM;
-
-	event->context = context;
-	event->timestamp = ts;
-	event->priv = priv;
-	event->func = cb;
-	event->owner = owner;
-
-
-	for (n = device->events.next ; n != &device->events; n = n->next) {
-		struct kgsl_event *e =
-			list_entry(n, struct kgsl_event, list);
-
-		if (e->context != context)
-			continue;
-
-		if (timestamp_cmp(e->timestamp, ts) > 0) {
-			list_add(&event->list, n->prev);
-			break;
-		}
-	}
-
-	if (n == &device->events)
-		list_add_tail(&event->list, &device->events);
-
-	queue_work(device->work_queue, &device->ts_expired_ws);
-	return 0;
-}
-EXPORT_SYMBOL(kgsl_add_event);
-
-static void kgsl_cancel_events_ctxt(struct kgsl_device *device,
-	struct kgsl_context *context)
-{
-	struct kgsl_event *event, *event_tmp;
-	unsigned int id, cur;
-
-	cur = kgsl_readtimestamp(device, context, KGSL_TIMESTAMP_RETIRED);
-	id = context->id;
-
-	list_for_each_entry_safe(event, event_tmp, &device->events, list) {
-		if (event->context != context)
-			continue;
-
-		if (event->func)
-			event->func(device, event->priv, id, cur);
-
-		list_del(&event->list);
-		kfree(event);
-	}
-}
-
-void kgsl_cancel_events(struct kgsl_device *device,
-	void *owner)
-{
-	struct kgsl_event *event, *event_tmp;
-	unsigned int id, cur;
-
-	list_for_each_entry_safe(event, event_tmp, &device->events, list) {
-		if (event->owner != owner)
-			continue;
-
-		cur = kgsl_readtimestamp(device, event->context,
-					 KGSL_TIMESTAMP_RETIRED);
-
-		id = event->context ? event->context->id : KGSL_MEMSTORE_GLOBAL;
-		if (event->func)
-			event->func(device, event->priv, id, cur);
-
-		list_del(&event->list);
-		kfree(event);
-	}
-}
-EXPORT_SYMBOL(kgsl_cancel_events);
-
-
-struct kgsl_mem_entry *kgsl_get_mem_entry(unsigned int ptbase,
-	unsigned int gpuaddr, unsigned int size)
+struct kgsl_mem_entry *kgsl_get_mem_entry(struct kgsl_device *device,
+	unsigned int ptbase, unsigned int gpuaddr, unsigned int size)
 {
 	struct kgsl_process_private *priv;
 	struct kgsl_mem_entry *entry;
@@ -164,7 +68,7 @@
 	mutex_lock(&kgsl_driver.process_mutex);
 
 	list_for_each_entry(priv, &kgsl_driver.process_list, list) {
-		if (!kgsl_mmu_pt_equal(priv->pagetable, ptbase))
+		if (!kgsl_mmu_pt_equal(&device->mmu, priv->pagetable, ptbase))
 			continue;
 		spin_lock(&priv->mem_lock);
 		entry = kgsl_sharedmem_find_region(priv, gpuaddr, size);
@@ -189,10 +93,8 @@
 
 	if (!entry)
 		KGSL_CORE_ERR("kzalloc(%d) failed\n", sizeof(*entry));
-	else {
+	else
 		kref_init(&entry->refcount);
-		entry->memdesc.handle = NULL;
-	}
 
 	return entry;
 }
@@ -200,36 +102,6 @@
 unsigned int kgsl_get_alloc_size(int detailed)
 {
 	unsigned int ret = 0;
-	struct kgsl_process_private *private;
-	int i = 0;
-
-	ret = kgsl_driver.stats.page_alloc;
-	printk("kgsl: kgsl_driver.stats.page_alloc = %u\n", kgsl_driver.stats.page_alloc);
-	printk("kgsl: kgsl_driver.stats.page_alloc_kernel = %u\n", kgsl_driver.stats.page_alloc_kernel);
-	printk("kgsl: kgsl_driver.stats.pre_alloc = %u\n", kgsl_driver.stats.pre_alloc);
-	printk("kgsl: kgsl_driver.stats.pre_alloc_kernel = %u\n", kgsl_driver.stats.pre_alloc_kernel);
-
-	if (!detailed)
-		return ret;
-
-	mutex_lock(&kgsl_driver.process_mutex);
-
-	list_for_each_entry(private, &kgsl_driver.process_list, list) {
-		printk("kgsl: below is going to list all memory info of pid:%d \n", private->pid);
-		for (i = 0; i < KGSL_MEM_ENTRY_MAX; i++) {
-			switch (i) {
-			case KGSL_MEM_ENTRY_PAGE_ALLOC:
-				if (private->stats[KGSL_MEM_ENTRY_PAGE_ALLOC].cur != 0)
-					printk("kgsl: page alloc %d\n", private->stats[KGSL_MEM_ENTRY_PAGE_ALLOC].cur);
-				break;
-			case KGSL_MEM_ENTRY_PRE_ALLOC:
-				if (private->stats[KGSL_MEM_ENTRY_PRE_ALLOC].cur != 0)
-					printk("kgsl: pre alloc %d\n", private->stats[KGSL_MEM_ENTRY_PRE_ALLOC].cur);
-				break;
-			}
-		}
-	}
-	mutex_unlock(&kgsl_driver.process_mutex);
 
 	return ret;
 }
@@ -244,6 +116,11 @@
 	if (entry->memtype != KGSL_MEM_ENTRY_KERNEL)
 		kgsl_driver.stats.mapped -= entry->memdesc.size;
 
+	/*
+	 * Ion takes care of freeing the sglist for us, so
+	 * clear the sg before freeing the sharedmem; that way
+	 * kgsl_sharedmem_free doesn't try to free it again
+	 */
 
 	if (entry->memtype == KGSL_MEM_ENTRY_ION) {
 		entry->memdesc.sg = NULL;
@@ -295,9 +172,9 @@
 	spin_unlock(&process->mem_lock);
 
 	entry->priv = process;
-	entry->memdesc.private = process;
 }
 
+/* Detach a memory entry from a process and unmap it from the MMU */
 
 static void kgsl_mem_entry_detach_process(struct kgsl_mem_entry *entry)
 {
@@ -312,6 +189,7 @@
 	kgsl_mem_entry_put(entry);
 }
 
+/* Allocate a new context id */
 
 static struct kgsl_context *
 kgsl_create_context(struct kgsl_device_private *dev_priv)
@@ -343,7 +221,7 @@
 		return NULL;
 	}
 
-	
+	/* MAX - 1, there is one memdesc in memstore for device info */
 	if (id >= KGSL_MEMSTORE_MAX) {
 		KGSL_DRV_ERR(dev_priv->device, "cannot have more than %d "
 				"ctxts due to memstore limitation\n",
@@ -357,9 +235,44 @@
 	context->id = id;
 	context->dev_priv = dev_priv;
 
+	ret = kgsl_sync_timeline_create(context);
+	if (ret) {
+		idr_remove(&dev_priv->device->context_idr, id);
+		goto func_end;
+	}
+
+	/* Initialize the pending event list */
+	INIT_LIST_HEAD(&context->events);
+
+	/*
+	 * Initialize the node that is used to maintain the master list of
+	 * contexts with pending events in the device structure. Normally we
+	 * wouldn't take the time to initalize a node but at event add time we
+	 * wouldn't take the time to initialize a node but at event add time we
+	 * context is already in the master list so it needs to always be either
+	 * active or in an unused but initialized state
+	 */
+
+	INIT_LIST_HEAD(&context->events_list);
+
+func_end:
+	if (ret) {
+		kfree(context);
+		return NULL;
+	}
+
 	return context;
 }
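/*
 * A sketch of the list_empty() trick described above, with a
 * hypothetical master list: an initialized but unlinked node points at
 * itself, so list_empty() on the node doubles as an "already queued?"
 * test and no separate flag is needed.
 */
static void queue_context_events(struct list_head *master,
	struct kgsl_context *context)
{
	if (list_empty(&context->events_list))
		list_add_tail(&context->events_list, master);
}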
 
+/**
+ * kgsl_context_detach - Release the "master" context reference
+ * @context - The context that will be detached
+ *
+ * This is called when a context becomes unusable, because userspace
+ * has requested that it be destroyed. The context itself may
+ * exist a bit longer until its reference count goes to zero.
+ * Other code referencing the context can detect that it has been
+ * detached because the context id will be set to KGSL_CONTEXT_INVALID.
+ */
 void
 kgsl_context_detach(struct kgsl_context *context)
 {
@@ -373,8 +286,13 @@
 
 	if (device->ftbl->drawctxt_destroy)
 		device->ftbl->drawctxt_destroy(device, context);
-	
+	/*device specific drawctxt_destroy MUST clean up devctxt */
 	BUG_ON(context->devctxt);
+	/*
+	 * Cancel events after the device-specific context is
+	 * destroyed, to avoid possibly freeing memory while
+	 * it is still in use by the GPU.
+	 */
 	kgsl_cancel_events_ctxt(device, context);
 	idr_remove(&device->context_idr, id);
 	context->id = KGSL_CONTEXT_INVALID;
@@ -386,45 +304,17 @@
 {
 	struct kgsl_context *context = container_of(kref, struct kgsl_context,
 						    refcount);
+	kgsl_sync_timeline_destroy(context);
 	kfree(context);
 }
 
-void kgsl_timestamp_expired(struct work_struct *work)
-{
-	struct kgsl_device *device = container_of(work, struct kgsl_device,
-		ts_expired_ws);
-	struct kgsl_event *event, *event_tmp;
-	uint32_t ts_processed;
-	unsigned int id;
-
-	mutex_lock(&device->mutex);
-
-	
-	list_for_each_entry_safe(event, event_tmp, &device->events, list) {
-		ts_processed = kgsl_readtimestamp(device, event->context,
-						  KGSL_TIMESTAMP_RETIRED);
-		if (timestamp_cmp(ts_processed, event->timestamp) < 0)
-			continue;
-
-		id = event->context ? event->context->id : KGSL_MEMSTORE_GLOBAL;
-
-		if (event->func)
-			event->func(device, event->priv, id, ts_processed);
-
-		list_del(&event->list);
-		kfree(event);
-	}
-
-	mutex_unlock(&device->mutex);
-}
-EXPORT_SYMBOL(kgsl_timestamp_expired);
-
 static void kgsl_check_idle_locked(struct kgsl_device *device)
 {
 	if (device->pwrctrl.nap_allowed == true &&
 	    device->state == KGSL_STATE_ACTIVE &&
 		device->requested_state == KGSL_STATE_NONE) {
 		kgsl_pwrctrl_request_state(device, KGSL_STATE_NAP);
+		kgsl_pwrscale_idle(device);
 		if (kgsl_pwrctrl_sleep(device) != 0)
 			mod_timer(&device->idle_timer,
 				  jiffies +
@@ -519,22 +409,24 @@
 	policy_saved = device->pwrscale.policy;
 	device->pwrscale.policy = NULL;
 	kgsl_pwrctrl_request_state(device, KGSL_STATE_SUSPEND);
+	/* Make sure no user process is waiting for a timestamp
+	 * before suspending */
 	if (device->active_cnt != 0) {
 		mutex_unlock(&device->mutex);
 		wait_for_completion(&device->suspend_gate);
 		mutex_lock(&device->mutex);
 	}
-	
+	/* Don't let the timer wake us during suspended sleep. */
 	del_timer_sync(&device->idle_timer);
 	switch (device->state) {
 		case KGSL_STATE_INIT:
 			break;
 		case KGSL_STATE_ACTIVE:
-			
+			/* Wait for the device to become idle */
 			device->ftbl->idle(device);
 		case KGSL_STATE_NAP:
 		case KGSL_STATE_SLEEP:
-			
+			/* Get the completion ready to be waited upon. */
 			INIT_COMPLETION(device->hwaccess_gate);
 			device->ftbl->suspend_context(device);
 			device->ftbl->stop(device);
@@ -566,19 +458,16 @@
 {
 	int status = -EINVAL;
 
-	if (!device) {
-		printk("kgsl_resume_device: device is null!\n");
+	if (!device)
 		return -EINVAL;
-	}
+
 	KGSL_PWR_WARN(device, "resume start\n");
 	mutex_lock(&device->mutex);
 	if (device->state == KGSL_STATE_SUSPEND) {
 		kgsl_pwrctrl_set_state(device, KGSL_STATE_SLUMBER);
 		status = 0;
 		complete_all(&device->hwaccess_gate);
-	}else
-		printk(" kgsl_resume_device: state=%d\n", device->state);
-
+	}
 	kgsl_pwrctrl_request_state(device, KGSL_STATE_NONE);
 
 	mutex_unlock(&device->mutex);
@@ -663,13 +552,11 @@
 }
 EXPORT_SYMBOL(kgsl_late_resume_driver);
 
+/* file operations */
 static struct kgsl_process_private *
 kgsl_get_process_private(struct kgsl_device_private *cur_dev_priv)
 {
 	struct kgsl_process_private *private;
-#ifdef CONFIG_MSM_KGSL_GPU_USAGE
-	int i;
-#endif
 
 	mutex_lock(&kgsl_driver.process_mutex);
 	list_for_each_entry(private, &kgsl_driver.process_list, list) {
@@ -679,7 +566,7 @@
 		}
 	}
 
-	
+	/* no existing process private found for this dev_priv, create one */
 	private = kzalloc(sizeof(struct kgsl_process_private), GFP_KERNEL);
 	if (private == NULL) {
 		KGSL_DRV_ERR(cur_dev_priv->device, "kzalloc(%d) failed\n",
@@ -687,15 +574,6 @@
 		goto out;
 	}
 
-#ifdef CONFIG_MSM_KGSL_GPU_USAGE
-	private->gputime.total = 0;
-	private->gputime.busy = 0;
-	for(i=0;i<KGSL_MAX_PWRLEVELS;i++) {
-		private->gputime_in_state[i].total = 0;
-		private->gputime_in_state[i].busy = 0;
-	}
-#endif
-
 	spin_lock_init(&private->mem_lock);
 	private->refcnt = 1;
 	private->pid = task_tgid_nr(current);
@@ -717,6 +595,7 @@
 	list_add(&private->list, &kgsl_driver.process_list);
 
 	kgsl_process_init_sysfs(private);
+	kgsl_process_init_debugfs(private);
 
 out:
 	mutex_unlock(&kgsl_driver.process_mutex);
@@ -739,6 +618,7 @@
 		goto unlock;
 
 	kgsl_process_uninit_sysfs(private);
+	debugfs_remove_recursive(private->debug_root);
 
 	list_del(&private->list);
 
@@ -779,6 +659,12 @@
 
 		next = next + 1;
 	}
+	/*
+	 * Clean up any to-be-freed entries that belong to this
+	 * process and this device. This is done after the context
+	 * are destroyed to avoid possibly freeing memory while
+	 * it is still in use by the GPU.
+	 */
 	kgsl_cancel_events(device, dev_priv);
 
 	device->open_count--;
@@ -831,7 +717,7 @@
 	dev_priv->device = device;
 	filep->private_data = dev_priv;
 
-	
+	/* Get file (per process) private struct */
 	dev_priv->process_priv = kgsl_get_process_private(dev_priv);
 	if (dev_priv->process_priv ==  NULL) {
 		result = -ENOMEM;
@@ -872,6 +758,7 @@
 	return result;
 }
 
+/*call with private->mem_lock locked */
 struct kgsl_mem_entry *
 kgsl_sharedmem_find_region(struct kgsl_process_private *private,
 	unsigned int gpuaddr, size_t size)
@@ -904,12 +791,14 @@
 }
 EXPORT_SYMBOL(kgsl_sharedmem_find_region);
 
+/*call with private->mem_lock locked */
 static inline struct kgsl_mem_entry *
 kgsl_sharedmem_find(struct kgsl_process_private *private, unsigned int gpuaddr)
 {
 	return kgsl_sharedmem_find_region(private, gpuaddr, 1);
 }
 
+/*call all ioctl sub functions with driver locked*/
 static long kgsl_ioctl_device_getproperty(struct kgsl_device_private *dev_priv,
 					  unsigned int cmd, void *data)
 {
@@ -937,7 +826,7 @@
 	}
 	case KGSL_PROP_GPU_RESET_STAT:
 	{
-		
+		/* Return reset status of given context and clear it */
 		uint32_t id;
 		struct kgsl_context *context;
 
@@ -945,7 +834,7 @@
 			result = -EINVAL;
 			break;
 		}
-		
+		/* We expect the value passed in to contain the context id */
 		if (copy_from_user(&id, param->value,
 			sizeof(unsigned int))) {
 			result = -EFAULT;
@@ -956,12 +845,16 @@
 			result = -EINVAL;
 			break;
 		}
+		/*
+		 * Copy the reset status to value which also serves as
+		 * the out parameter
+		 */
 		if (copy_to_user(param->value, &(context->reset_status),
 			sizeof(unsigned int))) {
 			result = -EFAULT;
 			break;
 		}
-		
+		/* Clear reset status once it has been queried */
 		context->reset_status = KGSL_CTX_STAT_NO_ERROR;
 		break;
 	}
@@ -979,7 +872,7 @@
 					  unsigned int cmd, void *data)
 {
 	int result = 0;
-	
+	/* The getproperty struct is reused for setproperty too */
 	struct kgsl_device_getproperty *param = data;
 
 	if (dev_priv->device->ftbl->setproperty)
@@ -999,7 +892,7 @@
 	struct kgsl_device *device = dev_priv->device;
 	unsigned int context_id = context ? context->id : KGSL_MEMSTORE_GLOBAL;
 
-	
+	/* Set the active count so that suspend doesn't do the wrong thing */
 
 	device->active_cnt++;
 
@@ -1016,11 +909,8 @@
 							KGSL_TIMESTAMP_RETIRED),
 				      result);
 
-	
-
-	INIT_COMPLETION(dev_priv->device->suspend_gate);
-	dev_priv->device->active_cnt--;
-	complete(&dev_priv->device->suspend_gate);
+	/* Fire off any pending suspend operations that are in flight */
+	kgsl_active_count_put(dev_priv->device);
 
 	return result;
 }
@@ -1049,6 +939,11 @@
 			param->context_id);
 		return -EINVAL;
 	}
+	/*
+	 * A reference count is needed here, because waittimestamp may
+	 * block with the device mutex unlocked and userspace could
+	 * request for the context to be destroyed during that time.
+	 */
 	kgsl_context_get(context);
 	result = _device_waittimestamp(dev_priv, context,
 			param->timestamp, param->timeout);
@@ -1085,6 +980,19 @@
 			goto done;
 		}
 
+		/*
+		 * Put a reasonable upper limit on the number of IBs that can be
+		 * submitted
+		 */
+
+		if (param->numibs > 10000) {
+			KGSL_DRV_ERR(dev_priv->device,
+				"Too many IBs submitted. count: %d max 10000\n",
+				param->numibs);
+			result = -EINVAL;
+			goto done;
+		}
+
 		ibdesc = kzalloc(sizeof(struct kgsl_ibdesc) * param->numibs,
 					GFP_KERNEL);
 		if (!ibdesc) {
@@ -1105,6 +1013,8 @@
 	} else {
 		KGSL_DRV_INFO(dev_priv->device,
 			"Using single IB submission mode for ib submission\n");
+		/* If the user space driver is still using the old mode of
+		 * submitting a single IB, then we need to support that as well */
 		ibdesc = kzalloc(sizeof(struct kgsl_ibdesc), GFP_KERNEL);
 		if (!ibdesc) {
 			KGSL_MEM_ERR(dev_priv->device,
@@ -1332,97 +1242,6 @@
 	return vma;
 }
 
-static long
-kgsl_ioctl_sharedmem_from_vmalloc(struct kgsl_device_private *dev_priv,
-				unsigned int cmd, void *data)
-{
-	int result = 0, len = 0;
-	struct kgsl_process_private *private = dev_priv->process_priv;
-	struct kgsl_sharedmem_from_vmalloc *param = data;
-	struct kgsl_mem_entry *entry = NULL;
-	struct vm_area_struct *vma;
-
-	KGSL_DEV_ERR_ONCE(dev_priv->device, "IOCTL_KGSL_SHAREDMEM_FROM_VMALLOC"
-			" is deprecated\n");
-	if (!kgsl_mmu_enabled())
-		return -ENODEV;
-
-	if (!param->hostptr) {
-		KGSL_CORE_ERR("invalid hostptr %x\n", param->hostptr);
-		result = -EINVAL;
-		goto error;
-	}
-
-	vma = kgsl_get_vma_from_start_addr(param->hostptr);
-	if (!vma) {
-		result = -EINVAL;
-		goto error;
-	}
-
-	if (param->gpuaddr != 0) {
-		len = param->gpuaddr;
-	} else {
-		if (vma->vm_pgoff || (param->hostptr != vma->vm_start)) {
-			KGSL_CORE_ERR("VMA region does not match hostaddr\n");
-			result = -EINVAL;
-			goto error;
-		}
-
-		len = vma->vm_end - vma->vm_start;
-	}
-
-	
-	if (len == 0 || param->hostptr + len > vma->vm_end) {
-		KGSL_CORE_ERR("Invalid memory allocation length %d\n", len);
-		result = -EINVAL;
-		goto error;
-	}
-
-	entry = kgsl_mem_entry_create();
-	if (entry == NULL) {
-		result = -ENOMEM;
-		goto error;
-	}
-
-	result = kgsl_sharedmem_page_alloc_user(&entry->memdesc,
-					     private,
-					     private->pagetable, len,
-					     param->flags);
-	if (result != 0)
-		goto error_free_entry;
-
-	vma->vm_page_prot = pgprot_writecombine(vma->vm_page_prot);
-
-	result = kgsl_sharedmem_map_vma(vma, &entry->memdesc);
-	if (result) {
-		KGSL_CORE_ERR("kgsl_sharedmem_map_vma failed: %d\n", result);
-		goto error_free_alloc;
-	}
-
-	param->gpuaddr = entry->memdesc.gpuaddr;
-
-	entry->memtype = KGSL_MEM_ENTRY_KERNEL;
-
-	kgsl_mem_entry_attach_process(entry, private);
-
-	trace_kgsl_mem_alloc(entry);
-	
-	kgsl_process_add_stats(private, entry->memtype, len);
-
-	kgsl_check_idle(dev_priv->device);
-	return 0;
-
-error_free_alloc:
-	kgsl_sharedmem_free(&entry->memdesc);
-
-error_free_entry:
-	kfree(entry);
-
-error:
-	kgsl_check_idle(dev_priv->device);
-	return result;
-}
-
 static inline int _check_region(unsigned long start, unsigned long size,
 				uint64_t len)
 {
@@ -1483,20 +1302,35 @@
 
 	ret = -ERANGE;
 
-	if (phys == 0)
-		goto err;
-
-	if ((len & ~PAGE_MASK) ||
-		(offset & ~PAGE_MASK) ||
-		(size & ~PAGE_MASK)) {
-		KGSL_CORE_ERR("length offset or size is not page aligned\n");
+	if (phys == 0) {
+		KGSL_CORE_ERR("kgsl_get_phys_file returned phys=0\n");
 		goto err;
 	}
 
-	
-	if (offset >= len || size > len)
+	/* Make sure the length of the region, the offset and the desired
+	 * size are all page aligned or bail
+	 */
+	if ((len & ~PAGE_MASK) ||
+		(offset & ~PAGE_MASK) ||
+		(size & ~PAGE_MASK)) {
+		KGSL_CORE_ERR("length %lu, offset %u or size %u "
+				"is not page aligned\n",
+				len, offset, size);
 		goto err;
+	}
 
+	/* The size or offset can never be greater than the PMEM length */
+	if (offset >= len || size > len) {
+		KGSL_CORE_ERR("offset %u or size %u "
+				"exceeds pmem length %lu\n",
+				offset, size, len);
+		goto err;
+	}
+
+	/* If size is 0, then adjust it to default to the size of the region
+	 * minus the offset.  If size isn't zero, then make sure that it will
+	 * fit inside of the region.
+	 */
 	if (size == 0)
 		size = len - offset;
 
@@ -1535,6 +1369,8 @@
 		return -ENOMEM;
 
 	memdesc->sglen = sglen;
+	memdesc->sglen_alloc = sglen;
+
 	sg_init_table(memdesc->sg, sglen);
 
 	spin_lock(&current->mm->page_table_lock);
@@ -1593,7 +1429,7 @@
 		return -EINVAL;
 	}
 
-	
+	/* We don't necessarily start at vma->vm_start */
 	len = vma->vm_end - (unsigned long) hostptr;
 
 	if (offset >= len)
@@ -1610,7 +1446,7 @@
 	if (size == 0)
 		size = len;
 
-	
+	/* Adjust the size of the region to account for the offset */
 	size += offset & ~PAGE_MASK;
 
 	size = ALIGN(size, PAGE_SIZE);
@@ -1726,7 +1562,7 @@
 
 	entry->memdesc.sg = sg_table->sgl;
 
-	
+	/* Calculate the size of the memdesc from the sglist */
 
 	entry->memdesc.sglen = 0;
 
@@ -1760,6 +1596,8 @@
 	else
 		memtype = param->memtype;
 
+	entry->memdesc.flags = param->flags;
+
 	switch (memtype) {
 	case KGSL_USER_MEM_TYPE_PMEM:
 		if (param->fd == 0 || param->len == 0)
@@ -1819,6 +1657,11 @@
 	if (result)
 		goto error;
 
+	if (entry->memdesc.size >= SZ_1M)
+		kgsl_memdesc_set_align(&entry->memdesc, ilog2(SZ_1M));
+	else if (entry->memdesc.size >= SZ_64K)
+		kgsl_memdesc_set_align(&entry->memdesc, ilog2(SZ_64K));
+
 	result = kgsl_mmu_map(private->pagetable,
 			      &entry->memdesc,
 			      GSL_PT_PAGE_RV | GSL_PT_PAGE_WV);
@@ -1826,7 +1669,7 @@
 	if (result)
 		goto error_put_file_ptr;
 
-	
+	/* Adjust the returned value for a non 4k aligned offset */
 	param->gpuaddr = entry->memdesc.gpuaddr + (param->offset & ~PAGE_MASK);
 
 	KGSL_STATS_ADD(param->len, kgsl_driver.stats.mapped,
@@ -1859,6 +1702,8 @@
 	return result;
 }
 
+/* This function flushes a graphics memory allocation from the CPU cache
+ * when caching is enabled with the MMU */
 static long
 kgsl_ioctl_sharedmem_flush_cache(struct kgsl_device_private *dev_priv,
 				 unsigned int cmd, void *data)
@@ -1900,7 +1745,7 @@
 	if (entry == NULL)
 		return -ENOMEM;
 
-	result = kgsl_allocate_user(&entry->memdesc, private, private->pagetable,
+	result = kgsl_allocate_user(&entry->memdesc, private->pagetable,
 		param->size, param->flags);
 
 	if (result == 0) {
@@ -1953,6 +1798,15 @@
 	struct genlock *lock;
 };
 
+/**
+ * kgsl_genlock_event_cb - Event callback for a genlock timestamp event
+ * @device - The KGSL device that expired the timestamp
+ * @priv - private data for the event
+ * @context_id - the context id that goes with the timestamp
+ * @timestamp - the timestamp that triggered the event
+ *
+ * Release a genlock lock following the expiration of a timestamp
+ */
 
 static void kgsl_genlock_event_cb(struct kgsl_device *device,
 	void *priv, u32 context_id, u32 timestamp)
@@ -1969,6 +1823,18 @@
 	kfree(ev);
 }
 
+/**
+ * kgsl_add_genlock_event - Create a new genlock event
+ * @device - KGSL device to create the event on
+ * @timestamp - Timestamp to trigger the event
+ * @data - User space buffer containing struct kgsl_genlock_event_priv
+ * @len - length of the userspace buffer
+ * @owner - driver instance that owns this event
+ * @returns 0 on success or error code on error
+ *
+ * Attach to a genlock handle and register an event to release the
+ * genlock lock when the timestamp expires
+ */
 
 static int kgsl_add_genlock_event(struct kgsl_device *device,
 	u32 context_id, u32 timestamp, void __user *data, int len,
@@ -2013,6 +1879,13 @@
 }
 #endif
 
+/**
+ * kgsl_ioctl_timestamp_event - Register a new timestamp event from userspace
+ * @dev_priv - pointer to the private device structure
+ * @cmd - the ioctl cmd passed from kgsl_ioctl
+ * @data - the user data buffer from kgsl_ioctl
+ * @returns 0 on success or error code on failure
+ */
 
 static long kgsl_ioctl_timestamp_event(struct kgsl_device_private *dev_priv,
 		unsigned int cmd, void *data)
@@ -2026,6 +1899,11 @@
 			param->context_id, param->timestamp, param->priv,
 			param->len, dev_priv);
 		break;
+	case KGSL_TIMESTAMP_EVENT_FENCE:
+		ret = kgsl_add_fence_event(dev_priv->device,
+			param->context_id, param->timestamp, param->priv,
+			param->len, dev_priv);
+		break;
 	default:
 		ret = -EINVAL;
 	}
@@ -2036,42 +1914,54 @@
 typedef long (*kgsl_ioctl_func_t)(struct kgsl_device_private *,
 	unsigned int, void *);
 
-#define KGSL_IOCTL_FUNC(_cmd, _func, _lock) \
-	[_IOC_NR(_cmd)] = { .cmd = _cmd, .func = _func, .lock = _lock }
+#define KGSL_IOCTL_FUNC(_cmd, _func, _flags) \
+	[_IOC_NR((_cmd))] = \
+		{ .cmd = (_cmd), .func = (_func), .flags = (_flags) }
+
+#define KGSL_IOCTL_LOCK		BIT(0)
+#define KGSL_IOCTL_WAKE		BIT(1)
 
 static const struct {
 	unsigned int cmd;
 	kgsl_ioctl_func_t func;
-	int lock;
+	int flags;
 } kgsl_ioctl_funcs[] = {
 	KGSL_IOCTL_FUNC(IOCTL_KGSL_DEVICE_GETPROPERTY,
-			kgsl_ioctl_device_getproperty, 1),
+			kgsl_ioctl_device_getproperty,
+			KGSL_IOCTL_LOCK | KGSL_IOCTL_WAKE),
 	KGSL_IOCTL_FUNC(IOCTL_KGSL_DEVICE_WAITTIMESTAMP,
-			kgsl_ioctl_device_waittimestamp, 1),
+			kgsl_ioctl_device_waittimestamp,
+			KGSL_IOCTL_LOCK | KGSL_IOCTL_WAKE),
 	KGSL_IOCTL_FUNC(IOCTL_KGSL_DEVICE_WAITTIMESTAMP_CTXTID,
-			kgsl_ioctl_device_waittimestamp_ctxtid, 1),
+			kgsl_ioctl_device_waittimestamp_ctxtid,
+			KGSL_IOCTL_LOCK | KGSL_IOCTL_WAKE),
 	KGSL_IOCTL_FUNC(IOCTL_KGSL_RINGBUFFER_ISSUEIBCMDS,
-			kgsl_ioctl_rb_issueibcmds, 1),
+			kgsl_ioctl_rb_issueibcmds,
+			KGSL_IOCTL_LOCK | KGSL_IOCTL_WAKE),
 	KGSL_IOCTL_FUNC(IOCTL_KGSL_CMDSTREAM_READTIMESTAMP,
-			kgsl_ioctl_cmdstream_readtimestamp, 1),
+			kgsl_ioctl_cmdstream_readtimestamp,
+			KGSL_IOCTL_LOCK),
 	KGSL_IOCTL_FUNC(IOCTL_KGSL_CMDSTREAM_READTIMESTAMP_CTXTID,
-			kgsl_ioctl_cmdstream_readtimestamp_ctxtid, 1),
+			kgsl_ioctl_cmdstream_readtimestamp_ctxtid,
+			KGSL_IOCTL_LOCK),
 	KGSL_IOCTL_FUNC(IOCTL_KGSL_CMDSTREAM_FREEMEMONTIMESTAMP,
-			kgsl_ioctl_cmdstream_freememontimestamp, 1),
+			kgsl_ioctl_cmdstream_freememontimestamp,
+			KGSL_IOCTL_LOCK | KGSL_IOCTL_WAKE),
 	KGSL_IOCTL_FUNC(IOCTL_KGSL_CMDSTREAM_FREEMEMONTIMESTAMP_CTXTID,
-			kgsl_ioctl_cmdstream_freememontimestamp_ctxtid, 1),
+			kgsl_ioctl_cmdstream_freememontimestamp_ctxtid,
+			KGSL_IOCTL_LOCK | KGSL_IOCTL_WAKE),
 	KGSL_IOCTL_FUNC(IOCTL_KGSL_DRAWCTXT_CREATE,
-			kgsl_ioctl_drawctxt_create, 1),
+			kgsl_ioctl_drawctxt_create,
+			KGSL_IOCTL_LOCK | KGSL_IOCTL_WAKE),
 	KGSL_IOCTL_FUNC(IOCTL_KGSL_DRAWCTXT_DESTROY,
-			kgsl_ioctl_drawctxt_destroy, 1),
+			kgsl_ioctl_drawctxt_destroy,
+			KGSL_IOCTL_LOCK | KGSL_IOCTL_WAKE),
 	KGSL_IOCTL_FUNC(IOCTL_KGSL_MAP_USER_MEM,
 			kgsl_ioctl_map_user_mem, 0),
 	KGSL_IOCTL_FUNC(IOCTL_KGSL_SHAREDMEM_FROM_PMEM,
 			kgsl_ioctl_map_user_mem, 0),
 	KGSL_IOCTL_FUNC(IOCTL_KGSL_SHAREDMEM_FREE,
 			kgsl_ioctl_sharedmem_free, 0),
-	KGSL_IOCTL_FUNC(IOCTL_KGSL_SHAREDMEM_FROM_VMALLOC,
-			kgsl_ioctl_sharedmem_from_vmalloc, 0),
 	KGSL_IOCTL_FUNC(IOCTL_KGSL_SHAREDMEM_FLUSH_CACHE,
 			kgsl_ioctl_sharedmem_flush_cache, 0),
 	KGSL_IOCTL_FUNC(IOCTL_KGSL_GPUMEM_ALLOC,
@@ -2081,9 +1971,11 @@
 	KGSL_IOCTL_FUNC(IOCTL_KGSL_CFF_USER_EVENT,
 			kgsl_ioctl_cff_user_event, 0),
 	KGSL_IOCTL_FUNC(IOCTL_KGSL_TIMESTAMP_EVENT,
-			kgsl_ioctl_timestamp_event, 1),
+			kgsl_ioctl_timestamp_event,
+			KGSL_IOCTL_LOCK),
 	KGSL_IOCTL_FUNC(IOCTL_KGSL_SETPROPERTY,
-			kgsl_ioctl_device_setproperty, 1),
+			kgsl_ioctl_device_setproperty,
+			KGSL_IOCTL_LOCK | KGSL_IOCTL_WAKE)
 };
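/*
 * Illustrative entry (hypothetical ioctl and handler) showing how the
 * new flags combine: take the device mutex and wake the hardware before
 * the handler runs.
 *
 *	KGSL_IOCTL_FUNC(IOCTL_KGSL_EXAMPLE, kgsl_ioctl_example,
 *			KGSL_IOCTL_LOCK | KGSL_IOCTL_WAKE),
 */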
 
 static long kgsl_ioctl(struct file *filep, unsigned int cmd, unsigned long arg)
@@ -2091,17 +1983,21 @@
 	struct kgsl_device_private *dev_priv = filep->private_data;
 	unsigned int nr;
 	kgsl_ioctl_func_t func;
-	int lock, ret;
+	int lock, ret, use_hw;
 	char ustack[64];
 	void *uptr = NULL;
 
 	BUG_ON(dev_priv == NULL);
 
+	/* Workaround for a previously incorrectly defined ioctl code.
+	   This helps ensure binary compatibility */
 
 	if (cmd == IOCTL_KGSL_CMDSTREAM_FREEMEMONTIMESTAMP_OLD)
 		cmd = IOCTL_KGSL_CMDSTREAM_FREEMEMONTIMESTAMP;
 	else if (cmd == IOCTL_KGSL_CMDSTREAM_READTIMESTAMP_OLD)
 		cmd = IOCTL_KGSL_CMDSTREAM_READTIMESTAMP;
+	else if (cmd == IOCTL_KGSL_TIMESTAMP_EVENT_OLD)
+		cmd = IOCTL_KGSL_TIMESTAMP_EVENT;
 
 	nr = _IOC_NR(cmd);
 
@@ -2131,6 +2027,10 @@
 	if (nr < ARRAY_SIZE(kgsl_ioctl_funcs) &&
 		kgsl_ioctl_funcs[nr].func != NULL) {
 
+		/*
+		 * Make sure that nobody tried to send us a malformed ioctl code
+		 * with a valid NR but bogus flags
+		 */
 
 		if (kgsl_ioctl_funcs[nr].cmd != cmd) {
 			KGSL_DRV_ERR(dev_priv->device,
@@ -2140,7 +2040,8 @@
 		}
 
 		func = kgsl_ioctl_funcs[nr].func;
-		lock = kgsl_ioctl_funcs[nr].lock;
+		lock = kgsl_ioctl_funcs[nr].flags & KGSL_IOCTL_LOCK;
+		use_hw = kgsl_ioctl_funcs[nr].flags & KGSL_IOCTL_WAKE;
 	} else {
 		func = dev_priv->device->ftbl->ioctl;
 		if (!func) {
@@ -2150,11 +2051,13 @@
 			goto done;
 		}
 		lock = 1;
+		use_hw = 1;
 	}
 
 	if (lock) {
 		mutex_lock(&dev_priv->device->mutex);
-		kgsl_check_suspended(dev_priv->device);
+		if (use_hw)
+			kgsl_check_suspended(dev_priv->device);
 	}
 
 	ret = func(dev_priv, cmd, uptr);
@@ -2182,7 +2085,8 @@
 	struct kgsl_memdesc *memdesc = &device->memstore;
 	int result;
 	unsigned int vma_size = vma->vm_end - vma->vm_start;
-	
+
+	/* The memstore can only be mapped as read only */
 
 	if (vma->vm_flags & VM_WRITE)
 		return -EPERM;
@@ -2195,7 +2099,8 @@
 
 	vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
 
-	result = remap_pfn_range(vma, vma->vm_start, vma->vm_pgoff,
+	result = remap_pfn_range(vma, vma->vm_start,
+				device->memstore.physaddr >> PAGE_SHIFT,
 				 vma_size, vma->vm_page_prot);
 	if (result != 0)
 		KGSL_MEM_ERR(device, "remap_pfn_range failed: %d\n",
@@ -2204,6 +2109,10 @@
 	return result;
 }
 
+/*
+ * kgsl_gpumem_vm_open is called whenever a vma region is copied or split.
+ * Increase the refcount to make sure that the accounting stays correct
+ */
 
 static void kgsl_gpumem_vm_open(struct vm_area_struct *vma)
 {
@@ -2242,14 +2151,13 @@
 	struct kgsl_process_private *private = dev_priv->process_priv;
 	struct kgsl_mem_entry *entry = NULL;
 	struct kgsl_device *device = dev_priv->device;
-	int i;
 
-	
+	/* Handle legacy behavior for memstore */
 
-	if (vma_offset == device->memstore.physaddr)
+	if (vma_offset == device->memstore.gpuaddr)
 		return kgsl_mmap_memstore(device, vma);
 
-	
+	/* Find a chunk of GPU memory */
 
 	spin_lock(&private->mem_lock);
 	entry = kgsl_sharedmem_find(private, vma_offset);
@@ -2274,21 +2182,6 @@
 	vma->vm_ops = &kgsl_gpumem_vm_ops;
 	vma->vm_file = file;
 
-	
-	if(entry->memdesc.handle == NULL)
-	{
-		for(i = 0; (i*PAGE_SIZE) < (vma->vm_end - vma->vm_start); i++)
-		{
-			vm_insert_page(vma, vma->vm_start + i*PAGE_SIZE, sg_page(&entry->memdesc.sg[i]));
-		}
-	}
-	else
-	{
-		remap_pfn_range(vma, vma->vm_start, __phys_to_pfn(entry->memdesc.sg[0].dma_address),
-		entry->memdesc.size, vma->vm_page_prot);
-	}
-	
-
 	return 0;
 }
 
@@ -2337,7 +2230,7 @@
 	int minor, ret;
 	dev_t dev;
 
-	
+	/* Find a minor for the device */
 
 	mutex_lock(&kgsl_driver.devlock);
 	for (minor = 0; minor < KGSL_DEVICE_MAX; minor++) {
@@ -2353,7 +2246,7 @@
 		return -ENODEV;
 	}
 
-	
+	/* Create the device */
 	dev = MKDEV(MAJOR(kgsl_driver.major), minor);
 	device->dev = device_create(kgsl_driver.class,
 				    device->parentdev,
@@ -2376,7 +2269,6 @@
 int kgsl_device_platform_probe(struct kgsl_device *device)
 {
 	int result;
-	int i;
 	int status = -EINVAL;
 	struct resource *res;
 	struct platform_device *pdev =
@@ -2386,20 +2278,7 @@
 	if (status)
 		return status;
 
-#ifdef CONFIG_MSM_KGSL_GPU_USAGE
-	device->current_process_priv = NULL;
-#endif
-
-	
-	device->gputime.total = 0;
-	device->gputime.busy = 0;
-	for(i=0;i<KGSL_MAX_PWRLEVELS;i++) {
-		device->gputime_in_state[i].total = 0;
-		device->gputime_in_state[i].busy = 0;
-	}
-	
-
-	
+	/* Initialize logging first, so that failures below actually print. */
 	kgsl_device_debugfs_init(device);
 
 	status = kgsl_pwrctrl_init(device);
@@ -2440,7 +2319,7 @@
 		status = -ENODEV;
 		goto error_pwrctrl_close;
 	}
-	
+	/*acquire interrupt */
 	device->pwrctrl.interrupt_num =
 		platform_get_irq_byname(pdev, device->pwrctrl.irq_name);
 
@@ -2495,10 +2374,10 @@
 	pm_qos_add_request(&device->pm_qos_req_dma, PM_QOS_CPU_DMA_LATENCY,
 				PM_QOS_DEFAULT_VALUE);
 
-	
+	/* Initialize the snapshot engine */
 	kgsl_device_snapshot_init(device);
 
-	
+	/* Initialize common sysfs entries */
 	kgsl_pwrctrl_init_sysfs(device);
 
 	return 0;
@@ -2516,6 +2395,80 @@
 }
 EXPORT_SYMBOL(kgsl_device_platform_probe);
 
+int kgsl_postmortem_dump(struct kgsl_device *device, int manual)
+{
+	bool saved_nap;
+	struct kgsl_pwrctrl *pwr = &device->pwrctrl;
+
+	BUG_ON(device == NULL);
+
+	kgsl_cffdump_hang(device->id);
+
+	/* For a manual dump, make sure that the system is idle */
+
+	if (manual) {
+		if (device->active_cnt != 0) {
+			mutex_unlock(&device->mutex);
+			wait_for_completion(&device->suspend_gate);
+			mutex_lock(&device->mutex);
+		}
+
+		if (device->state == KGSL_STATE_ACTIVE)
+			kgsl_idle(device);
+
+	}
+
+	if (device->pm_dump_enable) {
+
+		KGSL_LOG_DUMP(device,
+				"POWER: FLAGS = %08lX | ACTIVE POWERLEVEL = %08X",
+				pwr->power_flags, pwr->active_pwrlevel);
+
+		KGSL_LOG_DUMP(device, "POWER: INTERVAL TIMEOUT = %08X ",
+				pwr->interval_timeout);
+
+	}
+
+	/* Disable the idle timer so we don't get interrupted */
+	del_timer_sync(&device->idle_timer);
+	mutex_unlock(&device->mutex);
+	flush_workqueue(device->work_queue);
+	mutex_lock(&device->mutex);
+
+	/* Turn off napping to make sure we have the clocks' full
+	   attention through the following process */
+	saved_nap = device->pwrctrl.nap_allowed;
+	device->pwrctrl.nap_allowed = false;
+
+	/* Force on the clocks */
+	kgsl_pwrctrl_wake(device);
+
+	/* Disable the irq */
+	kgsl_pwrctrl_irq(device, KGSL_PWRFLAGS_OFF);
+
+	/*Call the device specific postmortem dump function*/
+	device->ftbl->postmortem_dump(device, manual);
+
+	/* Restore nap mode */
+	device->pwrctrl.nap_allowed = saved_nap;
+
+	/* On a manual trigger, turn on the interrupts and put
+	   the clocks to sleep.  They will recover themselves
+	   on the next event.  For a hang, leave things as they
+	   are until fault tolerance kicks in. */
+
+	if (manual) {
+		kgsl_pwrctrl_irq(device, KGSL_PWRFLAGS_ON);
+
+		/* try to go into a sleep mode until the next event */
+		kgsl_pwrctrl_request_state(device, KGSL_STATE_SLEEP);
+		kgsl_pwrctrl_sleep(device);
+	}
+
+	return 0;
+}
+EXPORT_SYMBOL(kgsl_postmortem_dump);
+
 void kgsl_device_platform_remove(struct kgsl_device *device)
 {
 	kgsl_device_snapshot_close(device);
@@ -2560,6 +2513,12 @@
 	kgsl_cffdump_destroy();
 	kgsl_core_debugfs_close();
 
+	/*
+	 * We call kgsl_sharedmem_uninit_sysfs() and device_unregister()
+	 * only if kgsl_driver.virtdev has been populated.
+	 * We check at least one member of kgsl_driver.virtdev to
+	 * see if it is not NULL (and thus, has been populated).
+	 */
 	if (kgsl_driver.virtdev.class) {
 		kgsl_sharedmem_uninit_sysfs();
 		device_unregister(&kgsl_driver.virtdev);
@@ -2576,7 +2535,7 @@
 static int __init kgsl_core_init(void)
 {
 	int result = 0;
-	
+	/* alloc major and minor device numbers */
 	result = alloc_chrdev_region(&kgsl_driver.major, 0, KGSL_DEVICE_MAX,
 				  KGSL_NAME);
 	if (result < 0) {
@@ -2604,6 +2563,8 @@
 		goto err;
 	}
 
+	/* Make a virtual device for managing core related things
+	   in sysfs */
 	kgsl_driver.virtdev.class = kgsl_driver.class;
 	dev_set_name(&kgsl_driver.virtdev, "kgsl");
 	result = device_register(&kgsl_driver.virtdev);
@@ -2612,7 +2573,7 @@
 		goto err;
 	}
 
-	
+	/* Make kobjects in the virtual device for storing statistics */
 
 	kgsl_driver.ptkobj =
 	  kobject_create_and_add("pagetables",
diff --git a/drivers/gpu/msm/kgsl.h b/drivers/gpu/msm/kgsl.h
index 7e62475..3935164 100644
--- a/drivers/gpu/msm/kgsl.h
+++ b/drivers/gpu/msm/kgsl.h
@@ -1,4 +1,4 @@
-/* Copyright (c) 2008-2012, Code Aurora Forum. All rights reserved.
+/* Copyright (c) 2008-2013, The Linux Foundation. All rights reserved.
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License version 2 and
@@ -22,24 +22,33 @@
 #include <linux/cdev.h>
 #include <linux/regulator/consumer.h>
 #include <linux/mm.h>
-#include <linux/ion.h>
+
+#include <mach/kgsl.h>
 
 #define KGSL_NAME "kgsl"
 
+/* The number of memstore arrays limits the number of contexts allowed.
+ * If more contexts are needed, increase the page multiplier used for
+ * KGSL_MEMSTORE_SIZE.
+ */
 #define KGSL_MEMSTORE_SIZE	((int)(PAGE_SIZE * 2))
 #define KGSL_MEMSTORE_GLOBAL	(0)
 #define KGSL_MEMSTORE_MAX	(KGSL_MEMSTORE_SIZE / \
 		sizeof(struct kgsl_devmemstore) - 1)
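/*
 * Worked example with illustrative numbers: with 4 KiB pages the
 * memstore spans 8192 bytes, so if sizeof(struct kgsl_devmemstore) were
 * 32 bytes, KGSL_MEMSTORE_MAX would evaluate to 8192 / 32 - 1 = 255;
 * slot 0 is KGSL_MEMSTORE_GLOBAL and the final record is kept back for
 * device info (see the check against KGSL_MEMSTORE_MAX in
 * kgsl_create_context()).
 */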
 
+/* Timestamp window used to detect rollovers (half of integer range) */
 #define KGSL_TIMESTAMP_WINDOW 0x80000000
 
+/*cache coherency ops */
 #define DRM_KGSL_GEM_CACHE_OP_TO_DEV	0x0001
 #define DRM_KGSL_GEM_CACHE_OP_FROM_DEV	0x0002
 
+/* The size of each entry in a page table */
 #define KGSL_PAGETABLE_ENTRY_SIZE  4
 
+/* Pagetable Virtual Address base */
 #define KGSL_PAGETABLE_BASE	0x10000000
 
+/* Extra accounting entries needed in the pagetable */
 #define KGSL_PT_EXTRA_ENTRIES      16
 
 #define KGSL_PAGETABLE_ENTRIES(_sz) (((_sz) >> PAGE_SHIFT) + \
@@ -51,36 +60,41 @@
 #define KGSL_PAGETABLE_COUNT 1
 #endif
 
+/* Casting using container_of() for structures that kgsl owns. */
 #define KGSL_CONTAINER_OF(ptr, type, member) \
 		container_of(ptr, type, member)
 
+/* A macro for memory statistics - add the new size to the stat and if
+   the statistic is greater than _max, set _max
+*/
 
 #define KGSL_STATS_ADD(_size, _stat, _max) \
 	do { _stat += (_size); if (_stat > _max) _max = _stat; } while (0)
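/*
 * Example use, mirroring the call sites in kgsl.c: grow the mapped
 * statistic by one page and bump its high-water mark in one shot:
 *
 *	KGSL_STATS_ADD(PAGE_SIZE, kgsl_driver.stats.mapped,
 *		       kgsl_driver.stats.mapped_max);
 */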
 
 struct kgsl_device;
+struct kgsl_context;
 
 struct kgsl_driver {
 	struct cdev cdev;
 	dev_t major;
 	struct class *class;
-	
+	/* Virtual device for managing the core */
 	struct device virtdev;
-	
+	/* Kobjects for storing pagetable and process statistics */
 	struct kobject *ptkobj;
 	struct kobject *prockobj;
 	struct kgsl_device *devp[KGSL_DEVICE_MAX];
 
-	
+	/* Global list of open processes */
 	struct list_head process_list;
-	
+	/* Global list of pagetables */
 	struct list_head pagetable_list;
-	
+	/* Spinlock for accessing the pagetable list */
 	spinlock_t ptlock;
-	
+	/* Mutex for accessing the process list */
 	struct mutex process_mutex;
 
-	
+	/* Mutex for protecting the device list */
 	struct mutex devlock;
 
 	void *ptpool;
@@ -90,14 +104,10 @@
 		unsigned int vmalloc_max;
 		unsigned int page_alloc;
 		unsigned int page_alloc_max;
-		unsigned int page_alloc_kernel;
 		unsigned int coherent;
 		unsigned int coherent_max;
 		unsigned int mapped;
 		unsigned int mapped_max;
-		unsigned int pre_alloc;
-		unsigned int pre_alloc_max;
-		unsigned int pre_alloc_kernel;
 		unsigned int histogram[16];
 	} stats;
 };
@@ -115,45 +125,36 @@
 	int (*map_kernel_mem)(struct kgsl_memdesc *);
 };
 
+/* Internal definitions for memdesc->priv */
 #define KGSL_MEMDESC_GUARD_PAGE BIT(0)
+/* Set if the memdesc is mapped into all pagetables */
+#define KGSL_MEMDESC_GLOBAL BIT(1)
 
+/* shared memory allocation */
 struct kgsl_memdesc {
 	struct kgsl_pagetable *pagetable;
 	void *hostptr;
 	unsigned int gpuaddr;
 	unsigned int physaddr;
 	unsigned int size;
-	unsigned int priv;
+	unsigned int priv; /* Internal flags and settings */
 	struct scatterlist *sg;
-	unsigned int sglen;
+	unsigned int sglen; /* Active entries in the sglist */
+	unsigned int sglen_alloc;  /* Allocated entries in the sglist */
 	struct kgsl_memdesc_ops *ops;
-	int flags;
-	struct ion_handle* handle;
-	struct kgsl_process_private *private;
+	unsigned int flags; /* Flags set from userspace */
 };
 
-#if 0
-#define KGSL_MEM_ENTRY_KERNEL		0
-#define KGSL_MEM_ENTRY_PMEM		1
-#define KGSL_MEM_ENTRY_ASHMEM		2
-#define KGSL_MEM_ENTRY_USER		3
-#define KGSL_MEM_ENTRY_ION		4
-#define KGSL_MEM_ENTRY_PAGE_ALLOC	5
-#define KGSL_MEM_ENTRY_PRE_ALLOC	6
-#define KGSL_MEM_ENTRY_MAX		7
-#else
-enum {
-	KGSL_MEM_ENTRY_KERNEL = 0,
-	KGSL_MEM_ENTRY_PMEM,
-	KGSL_MEM_ENTRY_ASHMEM,
-	KGSL_MEM_ENTRY_USER,
-	KGSL_MEM_ENTRY_ION,
-	KGSL_MEM_ENTRY_PAGE_ALLOC,
-	KGSL_MEM_ENTRY_PRE_ALLOC,
-	KGSL_MEM_ENTRY_MAX,
-};
-#endif
+/* List of different memory entry types */
 
+#define KGSL_MEM_ENTRY_KERNEL 0
+#define KGSL_MEM_ENTRY_PMEM   1
+#define KGSL_MEM_ENTRY_ASHMEM 2
+#define KGSL_MEM_ENTRY_USER   3
+#define KGSL_MEM_ENTRY_ION    4
+#define KGSL_MEM_ENTRY_MAX    5
+
+/* List of flags */
 
 #define KGSL_MEM_ENTRY_FROZEN (1 << 0)
 
@@ -165,6 +166,8 @@
 	void *priv_data;
 	struct rb_node node;
 	unsigned int context_id;
+	/* back pointer to private structure under whose context this
+	* allocation is made */
 	struct kgsl_process_private *priv;
 };
 
@@ -175,14 +178,17 @@
 #endif
 
 void kgsl_mem_entry_destroy(struct kref *kref);
+int kgsl_postmortem_dump(struct kgsl_device *device, int manual);
 
-struct kgsl_mem_entry *kgsl_get_mem_entry(unsigned int ptbase,
-		unsigned int gpuaddr, unsigned int size);
+struct kgsl_mem_entry *kgsl_get_mem_entry(struct kgsl_device *device,
+		unsigned int ptbase, unsigned int gpuaddr, unsigned int size);
 
 struct kgsl_mem_entry *kgsl_sharedmem_find_region(
 	struct kgsl_process_private *private, unsigned int gpuaddr,
 	size_t size);
 
+void kgsl_get_memory_usage(char *str, size_t len, unsigned int memflags);
+
 int kgsl_add_event(struct kgsl_device *device, u32 id, u32 ts,
 	void (*cb)(struct kgsl_device *, void *, u32, u32), void *priv,
 	void *owner);
@@ -190,6 +196,9 @@
 void kgsl_cancel_events(struct kgsl_device *device,
 	void *owner);
 
+void kgsl_cancel_events_ctxt(struct kgsl_device *device,
+	struct kgsl_context *context);
+
 extern const struct dev_pm_ops kgsl_pm_ops;
 
 struct early_suspend;
@@ -245,14 +254,19 @@
 
 static inline int timestamp_cmp(unsigned int a, unsigned int b)
 {
-	
+	/* check for equal */
 	if (a == b)
 		return 0;
 
-	
+	/* check for greater-than for non-rollover case */
 	if ((a > b) && (a - b < KGSL_TIMESTAMP_WINDOW))
 		return 1;
 
+	/* check for greater-than for rollover case
+	 * note that <= is required to ensure that consistent
+	 * results are returned for values whose difference is
+	 * equal to the window size
+	 */
 	a += KGSL_TIMESTAMP_WINDOW;
 	b += KGSL_TIMESTAMP_WINDOW;
 	return ((a > b) && (a - b <= KGSL_TIMESTAMP_WINDOW)) ? 1 : -1;
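/*
 * Worked example of the rollover handling above: with a = 0x00000005
 * and b = 0xFFFFFFF0 the plain comparison says b is larger, but after
 * adding the 0x80000000 window a becomes 0x80000005 and b wraps to
 * 0x7FFFFFF0, so timestamp_cmp() returns 1 - a is treated as the newer
 * timestamp because the counter has rolled over.
 */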
@@ -270,4 +284,4 @@
 	kref_put(&entry->refcount, kgsl_mem_entry_destroy);
 }
 
-#endif 
+#endif /* __KGSL_H */
diff --git a/drivers/gpu/msm/kgsl_cffdump.c b/drivers/gpu/msm/kgsl_cffdump.c
new file mode 100644
index 0000000..e06c94d
--- /dev/null
+++ b/drivers/gpu/msm/kgsl_cffdump.c
@@ -0,0 +1,591 @@
+/* Copyright (c) 2010-2012, The Linux Foundation. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 and
+ * only version 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ */
+
+/* #define DEBUG */
+#define ALIGN_CPU
+
+#include <linux/spinlock.h>
+#include <linux/debugfs.h>
+#include <linux/relay.h>
+#include <linux/slab.h>
+#include <linux/time.h>
+#include <linux/sched.h>
+#include <mach/socinfo.h>
+
+#include "kgsl.h"
+#include "kgsl_cffdump.h"
+#include "kgsl_debugfs.h"
+#include "kgsl_log.h"
+#include "kgsl_sharedmem.h"
+#include "adreno_pm4types.h"
+
+static struct rchan	*chan;
+static struct dentry	*dir;
+static int		suspended;
+static size_t		dropped;
+static size_t		subbuf_size = 256*1024;
+static size_t		n_subbufs = 64;
+
+/* forward declarations */
+static void destroy_channel(void);
+static struct rchan *create_channel(unsigned subbuf_size, unsigned n_subbufs);
+
+static spinlock_t cffdump_lock;
+static ulong serial_nr;
+static ulong total_bytes;
+static ulong total_syncmem;
+static long last_sec;
+
+#define MEMBUF_SIZE	64
+
+#define CFF_OP_WRITE_REG        0x00000002
+struct cff_op_write_reg {
+	unsigned char op;
+	uint addr;
+	uint value;
+} __packed;
+
+#define CFF_OP_POLL_REG         0x00000004
+struct cff_op_poll_reg {
+	unsigned char op;
+	uint addr;
+	uint value;
+	uint mask;
+} __packed;
+
+#define CFF_OP_WAIT_IRQ         0x00000005
+struct cff_op_wait_irq {
+	unsigned char op;
+} __packed;
+
+#define CFF_OP_RMW              0x0000000a
+
+#define CFF_OP_WRITE_MEM        0x0000000b
+struct cff_op_write_mem {
+	unsigned char op;
+	uint addr;
+	uint value;
+} __packed;
+
+#define CFF_OP_WRITE_MEMBUF     0x0000000c
+struct cff_op_write_membuf {
+	unsigned char op;
+	uint addr;
+	ushort count;
+	uint buffer[MEMBUF_SIZE];
+} __packed;
+
+#define CFF_OP_MEMORY_BASE	0x0000000d
+struct cff_op_memory_base {
+	unsigned char op;
+	uint base;
+	uint size;
+	uint gmemsize;
+} __packed;
+
+#define CFF_OP_HANG		0x0000000e
+struct cff_op_hang {
+	unsigned char op;
+} __packed;
+
+#define CFF_OP_EOF              0xffffffff
+struct cff_op_eof {
+	unsigned char op;
+} __packed;
+
+#define CFF_OP_VERIFY_MEM_FILE  0x00000007
+#define CFF_OP_WRITE_SURFACE_PARAMS 0x00000011
+struct cff_op_user_event {
+	unsigned char op;
+	unsigned int op1;
+	unsigned int op2;
+	unsigned int op3;
+	unsigned int op4;
+	unsigned int op5;
+} __packed;
+
+
+static void b64_encodeblock(unsigned char in[3], unsigned char out[4], int len)
+{
+	static const char tob64[] = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmno"
+		"pqrstuvwxyz0123456789+/";
+
+	out[0] = tob64[in[0] >> 2];
+	out[1] = tob64[((in[0] & 0x03) << 4) | ((in[1] & 0xf0) >> 4)];
+	out[2] = (unsigned char) (len > 1 ? tob64[((in[1] & 0x0f) << 2)
+		| ((in[2] & 0xc0) >> 6)] : '=');
+	out[3] = (unsigned char) (len > 2 ? tob64[in[2] & 0x3f] : '=');
+}
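/*
 * Worked example of the encoder above: the input bytes "Man" (0x4d 0x61
 * 0x6e) split into the 6-bit groups 19, 22, 5 and 46, which index the
 * table to give "TWFu"; when fewer than three input bytes remain, the
 * missing groups are emitted as '=' padding.
 */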
+
+static void b64_encode(const unsigned char *in_buf, int in_size,
+	unsigned char *out_buf, int out_bufsize, int *out_size)
+{
+	unsigned char in[3], out[4];
+	int i, len;
+
+	*out_size = 0;
+	while (in_size > 0) {
+		len = 0;
+		for (i = 0; i < 3; ++i) {
+			if (in_size-- > 0) {
+				in[i] = *in_buf++;
+				++len;
+			} else
+				in[i] = 0;
+		}
+		if (len) {
+			b64_encodeblock(in, out, len);
+			if (out_bufsize < 4) {
+				pr_warn("kgsl: cffdump: %s: out of buffer\n",
+					__func__);
+				return;
+			}
+			for (i = 0; i < 4; ++i)
+				*out_buf++ = out[i];
+			*out_size += 4;
+			out_bufsize -= 4;
+		}
+	}
+}
+
+#define KLOG_TMPBUF_SIZE (1024)
+static void klog_printk(const char *fmt, ...)
+{
+	/* per-cpu klog formatting temporary buffer */
+	static char klog_buf[NR_CPUS][KLOG_TMPBUF_SIZE];
+
+	va_list args;
+	int len;
+	char *cbuf;
+	unsigned long flags;
+
+	local_irq_save(flags);
+	cbuf = klog_buf[smp_processor_id()];
+	va_start(args, fmt);
+	len = vsnprintf(cbuf, KLOG_TMPBUF_SIZE, fmt, args);
+	total_bytes += len;
+	va_end(args);
+	relay_write(chan, cbuf, len);
+	local_irq_restore(flags);
+}
+
+static struct cff_op_write_membuf cff_op_write_membuf;
+static void cffdump_membuf(int id, unsigned char *out_buf, int out_bufsize)
+{
+	void *data;
+	int len, out_size;
+	struct cff_op_write_mem cff_op_write_mem;
+
+	uint addr = cff_op_write_membuf.addr
+		- sizeof(uint)*cff_op_write_membuf.count;
+
+	if (!cff_op_write_membuf.count) {
+		pr_warn("kgsl: cffdump: membuf: count == 0, skipping\n");
+		return;
+	}
+
+	if (cff_op_write_membuf.count != 1) {
+		cff_op_write_membuf.op = CFF_OP_WRITE_MEMBUF;
+		cff_op_write_membuf.addr = addr;
+		len = sizeof(cff_op_write_membuf) -
+			sizeof(uint)*(MEMBUF_SIZE - cff_op_write_membuf.count);
+		data = &cff_op_write_membuf;
+	} else {
+		cff_op_write_mem.op = CFF_OP_WRITE_MEM;
+		cff_op_write_mem.addr = addr;
+		cff_op_write_mem.value = cff_op_write_membuf.buffer[0];
+		data = &cff_op_write_mem;
+		len = sizeof(cff_op_write_mem);
+	}
+	b64_encode(data, len, out_buf, out_bufsize, &out_size);
+	out_buf[out_size] = 0;
+	klog_printk("%ld:%d;%s\n", ++serial_nr, id, out_buf);
+	cff_op_write_membuf.count = 0;
+	cff_op_write_membuf.addr = 0;
+}
+
+static void cffdump_printline(int id, uint opcode, uint op1, uint op2,
+	uint op3, uint op4, uint op5)
+{
+	struct cff_op_write_reg cff_op_write_reg;
+	struct cff_op_poll_reg cff_op_poll_reg;
+	struct cff_op_wait_irq cff_op_wait_irq;
+	struct cff_op_memory_base cff_op_memory_base;
+	struct cff_op_hang cff_op_hang;
+	struct cff_op_eof cff_op_eof;
+	struct cff_op_user_event cff_op_user_event;
+	unsigned char out_buf[sizeof(cff_op_write_membuf)/3*4 + 16];
+	void *data;
+	int len = 0, out_size;
+	long cur_secs;
+
+	spin_lock(&cffdump_lock);
+	if (opcode == CFF_OP_WRITE_MEM) {
+		if ((cff_op_write_membuf.addr != op1 &&
+			cff_op_write_membuf.count)
+			|| (cff_op_write_membuf.count == MEMBUF_SIZE))
+			cffdump_membuf(id, out_buf, sizeof(out_buf));
+
+		cff_op_write_membuf.buffer[cff_op_write_membuf.count++] = op2;
+		cff_op_write_membuf.addr = op1 + sizeof(uint);
+		spin_unlock(&cffdump_lock);
+		return;
+	} else if (cff_op_write_membuf.count)
+		cffdump_membuf(id, out_buf, sizeof(out_buf));
+	spin_unlock(&cffdump_lock);
+
+	switch (opcode) {
+	case CFF_OP_WRITE_REG:
+		cff_op_write_reg.op = opcode;
+		cff_op_write_reg.addr = op1;
+		cff_op_write_reg.value = op2;
+		data = &cff_op_write_reg;
+		len = sizeof(cff_op_write_reg);
+		break;
+
+	case CFF_OP_POLL_REG:
+		cff_op_poll_reg.op = opcode;
+		cff_op_poll_reg.addr = op1;
+		cff_op_poll_reg.value = op2;
+		cff_op_poll_reg.mask = op3;
+		data = &cff_op_poll_reg;
+		len = sizeof(cff_op_poll_reg);
+		break;
+
+	case CFF_OP_WAIT_IRQ:
+		cff_op_wait_irq.op = opcode;
+		data = &cff_op_wait_irq;
+		len = sizeof(cff_op_wait_irq);
+		break;
+
+	case CFF_OP_MEMORY_BASE:
+		cff_op_memory_base.op = opcode;
+		cff_op_memory_base.base = op1;
+		cff_op_memory_base.size = op2;
+		cff_op_memory_base.gmemsize = op3;
+		data = &cff_op_memory_base;
+		len = sizeof(cff_op_memory_base);
+		break;
+
+	case CFF_OP_HANG:
+		cff_op_hang.op = opcode;
+		data = &cff_op_hang;
+		len = sizeof(cff_op_hang);
+		break;
+
+	case CFF_OP_EOF:
+		cff_op_eof.op = opcode;
+		data = &cff_op_eof;
+		len = sizeof(cff_op_eof);
+		break;
+
+	case CFF_OP_WRITE_SURFACE_PARAMS:
+	case CFF_OP_VERIFY_MEM_FILE:
+		cff_op_user_event.op = opcode;
+		cff_op_user_event.op1 = op1;
+		cff_op_user_event.op2 = op2;
+		cff_op_user_event.op3 = op3;
+		cff_op_user_event.op4 = op4;
+		cff_op_user_event.op5 = op5;
+		data = &cff_op_user_event;
+		len = sizeof(cff_op_user_event);
+		break;
+	}
+
+	if (len) {
+		b64_encode(data, len, out_buf, sizeof(out_buf), &out_size);
+		out_buf[out_size] = 0;
+		klog_printk("%ld:%d;%s\n", ++serial_nr, id, out_buf);
+	} else
+		pr_warn("kgsl: cffdump: unhandled opcode: %d\n", opcode);
+
+	cur_secs = get_seconds();
+	if ((cur_secs - last_sec) > 10 || (last_sec - cur_secs) > 10) {
+		pr_info("kgsl: cffdump: total [bytes:%lu kB, syncmem:%lu kB], "
+			"seq#: %lu\n", total_bytes/1024, total_syncmem/1024,
+			serial_nr);
+		last_sec = cur_secs;
+	}
+}
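+
+/*
+ * Illustrative call sequence (addresses and values are made up):
+ * back-to-back CFF_OP_WRITE_MEM calls to contiguous addresses are
+ * coalesced by the logic above into a single CFF_OP_WRITE_MEMBUF record,
+ * e.g.
+ *
+ *	cffdump_printline(-1, CFF_OP_WRITE_MEM, 0x1000, 0xAAAAAAAA, 0, 0, 0);
+ *	cffdump_printline(-1, CFF_OP_WRITE_MEM, 0x1004, 0xBBBBBBBB, 0, 0, 0);
+ *	cffdump_printline(-1, CFF_OP_WRITE_REG, 0x2000, 0x1, 0, 0, 0);
+ *
+ * buffers the two memory writes and flushes them through cffdump_membuf()
+ * as one membuf record when the register write arrives; a lone buffered
+ * write is flushed as a plain CFF_OP_WRITE_MEM instead.
+ */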
+
+void kgsl_cffdump_init(void)
+{
+	struct dentry *debugfs_dir = kgsl_get_debugfs_dir();
+
+#ifdef ALIGN_CPU
+	cpumask_t mask;
+
+	cpumask_clear(&mask);
+	cpumask_set_cpu(0, &mask);
+	sched_setaffinity(0, &mask);
+#endif
+	if (!debugfs_dir || IS_ERR(debugfs_dir)) {
+		KGSL_CORE_ERR("Debugfs directory is bad\n");
+		return;
+	}
+
+	kgsl_cff_dump_enable = 1;
+
+	spin_lock_init(&cffdump_lock);
+
+	dir = debugfs_create_dir("cff", debugfs_dir);
+	if (!dir) {
+		KGSL_CORE_ERR("debugfs_create_dir failed\n");
+		return;
+	}
+
+	chan = create_channel(subbuf_size, n_subbufs);
+}
+
+void kgsl_cffdump_destroy(void)
+{
+	if (chan)
+		relay_flush(chan);
+	destroy_channel();
+	if (dir)
+		debugfs_remove(dir);
+}
+
+void kgsl_cffdump_open(enum kgsl_deviceid device_id)
+{
+	kgsl_cffdump_memory_base(device_id, KGSL_PAGETABLE_BASE,
+			kgsl_mmu_get_ptsize(), SZ_256K);
+}
+
+void kgsl_cffdump_memory_base(enum kgsl_deviceid device_id, unsigned int base,
+			      unsigned int range, unsigned gmemsize)
+{
+	cffdump_printline(device_id, CFF_OP_MEMORY_BASE, base,
+			range, gmemsize, 0, 0);
+}
+
+void kgsl_cffdump_hang(enum kgsl_deviceid device_id)
+{
+	cffdump_printline(device_id, CFF_OP_HANG, 0, 0, 0, 0, 0);
+}
+
+void kgsl_cffdump_close(enum kgsl_deviceid device_id)
+{
+	cffdump_printline(device_id, CFF_OP_EOF, 0, 0, 0, 0, 0);
+}
+
+void kgsl_cffdump_user_event(unsigned int cff_opcode, unsigned int op1,
+		unsigned int op2, unsigned int op3,
+		unsigned int op4, unsigned int op5)
+{
+	cffdump_printline(-1, cff_opcode, op1, op2, op3, op4, op5);
+}
+
+void kgsl_cffdump_syncmem(struct kgsl_device_private *dev_priv,
+	const struct kgsl_memdesc *memdesc, uint gpuaddr, uint sizebytes,
+	bool clean_cache)
+{
+	const void *src;
+
+	if (!kgsl_cff_dump_enable)
+		return;
+
+	total_syncmem += sizebytes;
+
+	if (memdesc == NULL) {
+		struct kgsl_mem_entry *entry;
+		spin_lock(&dev_priv->process_priv->mem_lock);
+		entry = kgsl_sharedmem_find_region(dev_priv->process_priv,
+			gpuaddr, sizebytes);
+		spin_unlock(&dev_priv->process_priv->mem_lock);
+		if (entry == NULL) {
+			KGSL_CORE_ERR("did not find mapping "
+				"for gpuaddr: 0x%08x\n", gpuaddr);
+			return;
+		}
+		memdesc = &entry->memdesc;
+	}
+	src = (uint *)kgsl_gpuaddr_to_vaddr(memdesc, gpuaddr);
+	if (memdesc->hostptr == NULL) {
+		KGSL_CORE_ERR("no kernel mapping for "
+			"gpuaddr: 0x%08x, m->host: 0x%p, phys: 0x%08x\n",
+			gpuaddr, memdesc->hostptr, memdesc->physaddr);
+		return;
+	}
+
+	if (clean_cache) {
+		/* Ensure that this memory region is not read from the
+		 * cache but fetched fresh */
+
+		mb();
+
+		kgsl_cache_range_op((struct kgsl_memdesc *)memdesc,
+				KGSL_CACHE_OP_INV);
+	}
+
+	while (sizebytes > 3) {
+		cffdump_printline(-1, CFF_OP_WRITE_MEM, gpuaddr, *(uint *)src,
+			0, 0, 0);
+		gpuaddr += 4;
+		src += 4;
+		sizebytes -= 4;
+	}
+	if (sizebytes > 0)
+		cffdump_printline(-1, CFF_OP_WRITE_MEM, gpuaddr, *(uint *)src,
+			0, 0, 0);
+}
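+
+/*
+ * Note: the loop above emits one CFF_OP_WRITE_MEM record per 32-bit word,
+ * and a 1-3 byte tail is still dumped as a full word read from *src, e.g.
+ * syncing 6 bytes produces two word records covering 8 bytes.
+ */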
+
+void kgsl_cffdump_setmem(uint addr, uint value, uint sizebytes)
+{
+	if (!kgsl_cff_dump_enable)
+		return;
+
+	while (sizebytes > 3) {
+		/* Use 32bit memory writes as long as there's at least
+		 * 4 bytes left */
+		cffdump_printline(-1, CFF_OP_WRITE_MEM, addr, value,
+				0, 0, 0);
+		addr += 4;
+		sizebytes -= 4;
+	}
+	if (sizebytes > 0)
+		cffdump_printline(-1, CFF_OP_WRITE_MEM, addr, value,
+				0, 0, 0);
+}
+
+void kgsl_cffdump_regwrite(enum kgsl_deviceid device_id, uint addr,
+	uint value)
+{
+	if (!kgsl_cff_dump_enable)
+		return;
+
+	cffdump_printline(device_id, CFF_OP_WRITE_REG, addr, value,
+			0, 0, 0);
+}
+
+void kgsl_cffdump_regpoll(enum kgsl_deviceid device_id, uint addr,
+	uint value, uint mask)
+{
+	if (!kgsl_cff_dump_enable)
+		return;
+
+	cffdump_printline(device_id, CFF_OP_POLL_REG, addr, value,
+			mask, 0, 0);
+}
+
+void kgsl_cffdump_slavewrite(uint addr, uint value)
+{
+	if (!kgsl_cff_dump_enable)
+		return;
+
+	cffdump_printline(-1, CFF_OP_WRITE_REG, addr, value, 0, 0, 0);
+}
+
+int kgsl_cffdump_waitirq(void)
+{
+	if (!kgsl_cff_dump_enable)
+		return 0;
+
+	cffdump_printline(-1, CFF_OP_WAIT_IRQ, 0, 0, 0, 0, 0);
+
+	return 1;
+}
+EXPORT_SYMBOL(kgsl_cffdump_waitirq);
+
+static int subbuf_start_handler(struct rchan_buf *buf,
+	void *subbuf, void *prev_subbuf, uint prev_padding)
+{
+	pr_debug("kgsl: cffdump: subbuf_start_handler(subbuf=%p, prev_subbuf"
+		"=%p, prev_padding=%08x)\n", subbuf, prev_subbuf, prev_padding);
+
+	if (relay_buf_full(buf)) {
+		if (!suspended) {
+			suspended = 1;
+			pr_warn("kgsl: cffdump: relay: cpu %d buffer full!!!\n",
+				smp_processor_id());
+		}
+		dropped++;
+		return 0;
+	} else if (suspended) {
+		suspended = 0;
+		pr_warn("kgsl: cffdump: relay: cpu %d buffer no longer full.\n",
+			smp_processor_id());
+	}
+
+	subbuf_start_reserve(buf, 0);
+	return 1;
+}
+
+static struct dentry *create_buf_file_handler(const char *filename,
+	struct dentry *parent, int mode, struct rchan_buf *buf,
+	int *is_global)
+{
+	return debugfs_create_file(filename, mode, parent, buf,
+				       &relay_file_operations);
+}
+
+/*
+ * remove_buf_file() default callback.  Removes the relay file from debugfs.
+ */
+static int remove_buf_file_handler(struct dentry *dentry)
+{
+	pr_info("kgsl: cffdump: %s()\n", __func__);
+	debugfs_remove(dentry);
+	return 0;
+}
+
+/*
+ * relay callbacks
+ */
+static struct rchan_callbacks relay_callbacks = {
+	.subbuf_start = subbuf_start_handler,
+	.create_buf_file = create_buf_file_handler,
+	.remove_buf_file = remove_buf_file_handler,
+};
+
+/**
+ *	create_channel - creates channel /debug/kgsl/cff/cpuXXX
+ *
+ *	Creates channel along with associated produced/consumed control files
+ *
+ *	Returns channel on success, NULL otherwise
+ */
+static struct rchan *create_channel(unsigned subbuf_size, unsigned n_subbufs)
+{
+	struct rchan *chan;
+
+	pr_info("kgsl: cffdump: relay: create_channel: subbuf_size %u, "
+		"n_subbufs %u, dir 0x%p\n", subbuf_size, n_subbufs, dir);
+
+	chan = relay_open("cpu", dir, subbuf_size,
+			  n_subbufs, &relay_callbacks, NULL);
+	if (!chan) {
+		KGSL_CORE_ERR("relay_open failed\n");
+		return NULL;
+	}
+
+	suspended = 0;
+	dropped = 0;
+
+	return chan;
+}
+
+/**
+ *	destroy_channel - destroys channel /debug/kgsl/cff/cpuXXX
+ *
+ *	Destroys channel along with associated produced/consumed control files
+ */
+static void destroy_channel(void)
+{
+	pr_info("kgsl: cffdump: relay: destroy_channel\n");
+	if (chan) {
+		relay_close(chan);
+		chan = NULL;
+	}
+}
+
diff --git a/drivers/gpu/msm/kgsl_cffdump.h b/drivers/gpu/msm/kgsl_cffdump.h
index cea8ea0..2733cc3 100644
--- a/drivers/gpu/msm/kgsl_cffdump.h
+++ b/drivers/gpu/msm/kgsl_cffdump.h
@@ -1,4 +1,4 @@
-/* Copyright (c) 2010-2011, Code Aurora Forum. All rights reserved.
+/* Copyright (c) 2010-2011, The Linux Foundation. All rights reserved.
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License version 2 and
@@ -64,6 +64,6 @@
 #define kgsl_cffdump_user_event(cff_opcode, op1, op2, op3, op4, op5) \
 	(void)param
 
-#endif 
+#endif /* CONFIG_MSM_KGSL_CFF_DUMP */
 
-#endif 
+#endif /* __KGSL_CFFDUMP_H */
diff --git a/drivers/gpu/msm/kgsl_debugfs.c b/drivers/gpu/msm/kgsl_debugfs.c
index 68fee6d..b41bd6b 100644
--- a/drivers/gpu/msm/kgsl_debugfs.c
+++ b/drivers/gpu/msm/kgsl_debugfs.c
@@ -1,4 +1,4 @@
-/* Copyright (c) 2002,2008-2011, Code Aurora Forum. All rights reserved.
+/* Copyright (c) 2002,2008-2012, The Linux Foundation. All rights reserved.
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License version 2 and
@@ -16,11 +16,86 @@
 
 #include "kgsl.h"
 #include "kgsl_device.h"
+#include "kgsl_sharedmem.h"
 
+/* default log level is error for everything */
 #define KGSL_LOG_LEVEL_DEFAULT 3
 #define KGSL_LOG_LEVEL_MAX     7
 
 struct dentry *kgsl_debugfs_dir;
+static struct dentry *pm_d_debugfs;
+struct dentry *proc_d_debugfs;
+
+static int pm_dump_set(void *data, u64 val)
+{
+	struct kgsl_device *device = data;
+
+	if (val) {
+		mutex_lock(&device->mutex);
+		kgsl_postmortem_dump(device, 1);
+		mutex_unlock(&device->mutex);
+	}
+
+	return 0;
+}
+DEFINE_SIMPLE_ATTRIBUTE(pm_dump_fops,
+			NULL,
+			pm_dump_set, "%llu\n");
+
+static int pm_regs_enabled_set(void *data, u64 val)
+{
+	struct kgsl_device *device = data;
+	device->pm_regs_enabled = val ? 1 : 0;
+	return 0;
+}
+
+static int pm_regs_enabled_get(void *data, u64 *val)
+{
+	struct kgsl_device *device = data;
+	*val = device->pm_regs_enabled;
+	return 0;
+}
+
+static int pm_ib_enabled_set(void *data, u64 val)
+{
+	struct kgsl_device *device = data;
+	device->pm_ib_enabled = val ? 1 : 0;
+	return 0;
+}
+
+static int pm_ib_enabled_get(void *data, u64 *val)
+{
+	struct kgsl_device *device = data;
+	*val = device->pm_ib_enabled;
+	return 0;
+}
+
+static int pm_enabled_set(void *data, u64 val)
+{
+	struct kgsl_device *device = data;
+	device->pm_dump_enable = val;
+	return 0;
+}
+
+static int pm_enabled_get(void *data, u64 *val)
+{
+	struct kgsl_device *device = data;
+	*val = device->pm_dump_enable;
+	return 0;
+}
+
+
+DEFINE_SIMPLE_ATTRIBUTE(pm_regs_enabled_fops,
+			pm_regs_enabled_get,
+			pm_regs_enabled_set, "%llu\n");
+
+DEFINE_SIMPLE_ATTRIBUTE(pm_ib_enabled_fops,
+			pm_ib_enabled_get,
+			pm_ib_enabled_set, "%llu\n");
+
+DEFINE_SIMPLE_ATTRIBUTE(pm_enabled_fops,
+			pm_enabled_get,
+			pm_enabled_set, "%llu\n");
 
 static inline int kgsl_log_set(unsigned int *log_val, void *data, u64 val)
 {
@@ -48,6 +123,7 @@
 KGSL_DEBUGFS_LOG(ctxt_log);
 KGSL_DEBUGFS_LOG(mem_log);
 KGSL_DEBUGFS_LOG(pwr_log);
+KGSL_DEBUGFS_LOG(ft_log);
 
 void kgsl_device_debugfs_init(struct kgsl_device *device)
 {
@@ -63,6 +139,7 @@
 	device->drv_log = KGSL_LOG_LEVEL_DEFAULT;
 	device->mem_log = KGSL_LOG_LEVEL_DEFAULT;
 	device->pwr_log = KGSL_LOG_LEVEL_DEFAULT;
+	device->ft_log = KGSL_LOG_LEVEL_DEFAULT;
 
 	debugfs_create_file("log_level_cmd", 0644, device->d_debugfs, device,
 			    &cmd_log_fops);
@@ -74,11 +151,113 @@
 				&mem_log_fops);
 	debugfs_create_file("log_level_pwr", 0644, device->d_debugfs, device,
 				&pwr_log_fops);
+	debugfs_create_file("log_level_ft", 0644, device->d_debugfs, device,
+				&ft_log_fops);
+
+	/* Create postmortem dump control files */
+
+	pm_d_debugfs = debugfs_create_dir("postmortem", device->d_debugfs);
+
+	if (IS_ERR(pm_d_debugfs))
+		return;
+
+	debugfs_create_file("dump",  0600, pm_d_debugfs, device,
+			    &pm_dump_fops);
+	debugfs_create_file("regs_enabled", 0644, pm_d_debugfs, device,
+			    &pm_regs_enabled_fops);
+	debugfs_create_file("ib_enabled", 0644, pm_d_debugfs, device,
+				    &pm_ib_enabled_fops);
+	device->pm_dump_enable = 0;
+	debugfs_create_file("enable", 0644, pm_d_debugfs, device,
+				    &pm_enabled_fops);
+
+}
+
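+/*
+ * Illustrative usage (assumes debugfs is mounted at /debug and a device
+ * directory named "kgsl-3d0"; both are assumptions for this sketch).  The
+ * nodes created above let a postmortem dump be triggered manually:
+ *
+ *	echo 1 > /debug/kgsl/kgsl-3d0/postmortem/enable
+ *	echo 1 > /debug/kgsl/kgsl-3d0/postmortem/dump
+ */
+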
+static const char * const memtype_strings[] = {
+	"gpumem",
+	"pmem",
+	"ashmem",
+	"usermap",
+	"ion",
+};
+
+static const char *memtype_str(int memtype)
+{
+	if (memtype < ARRAY_SIZE(memtype_strings))
+		return memtype_strings[memtype];
+	return "unknown";
+}
+
+static char get_alignflag(const struct kgsl_memdesc *m)
+{
+	int align = kgsl_memdesc_get_align(m);
+	if (align >= ilog2(SZ_1M))
+		return 'L';
+	else if (align >= ilog2(SZ_64K))
+		return 'l';
+	return '-';
+}
+
+static int process_mem_print(struct seq_file *s, void *unused)
+{
+	struct kgsl_mem_entry *entry;
+	struct rb_node *node;
+	struct kgsl_process_private *private = s->private;
+	char flags[4];
+	char usage[16];
+
+	spin_lock(&private->mem_lock);
+	seq_printf(s, "%8s %8s %5s %10s %16s %5s\n",
+		   "gpuaddr", "size", "flags", "type", "usage", "sglen");
+	for (node = rb_first(&private->mem_rb); node; node = rb_next(node)) {
+		struct kgsl_memdesc *m;
+
+		entry = rb_entry(node, struct kgsl_mem_entry, node);
+		m = &entry->memdesc;
+
+		flags[0] = m->priv & KGSL_MEMDESC_GLOBAL ?  'g' : '-';
+		flags[1] = m->flags & KGSL_MEMFLAGS_GPUREADONLY ? 'r' : '-';
+		flags[2] = get_alignflag(m);
+		flags[3] = '\0';
+
+		kgsl_get_memory_usage(usage, sizeof(usage), m->flags);
+
+		seq_printf(s, "%08x %8d %5s %10s %16s %5d\n",
+			   m->gpuaddr, m->size, flags,
+			   memtype_str(entry->memtype), usage, m->sglen);
+	}
+	spin_unlock(&private->mem_lock);
+	return 0;
+}
+
+static int process_mem_open(struct inode *inode, struct file *file)
+{
+	return single_open(file, process_mem_print, inode->i_private);
+}
+
+static const struct file_operations process_mem_fops = {
+	.open = process_mem_open,
+	.read = seq_read,
+	.llseek = seq_lseek,
+	.release = single_release,
+};
+
+void
+kgsl_process_init_debugfs(struct kgsl_process_private *private)
+{
+	unsigned char name[16];
+
+	snprintf(name, sizeof(name), "%d", private->pid);
+
+	private->debug_root = debugfs_create_dir(name, proc_d_debugfs);
+	debugfs_create_file("mem", 0400, private->debug_root, private,
+			    &process_mem_fops);
 }
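+
+/*
+ * Illustrative output (values are made up): reading the per-process "mem"
+ * node created above prints one line per rb-tree entry in the format used
+ * by process_mem_print(), roughly
+ *
+ *	 gpuaddr     size flags       type            usage sglen
+ *	40001000     4096   g-l     gpumem           any(0)     1
+ *
+ * where the usage column comes from kgsl_get_memory_usage().
+ */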
 
 void kgsl_core_debugfs_init(void)
 {
 	kgsl_debugfs_dir = debugfs_create_dir("kgsl", 0);
+	proc_d_debugfs = debugfs_create_dir("proc", kgsl_debugfs_dir);
 }
 
 void kgsl_core_debugfs_close(void)
diff --git a/drivers/gpu/msm/kgsl_debugfs.h b/drivers/gpu/msm/kgsl_debugfs.h
index 5e10988..ae5601f 100644
--- a/drivers/gpu/msm/kgsl_debugfs.h
+++ b/drivers/gpu/msm/kgsl_debugfs.h
@@ -1,4 +1,4 @@
-/* Copyright (c) 2002,2008-2011, Code Aurora Forum. All rights reserved.
+/* Copyright (c) 2002,2008-2011, The Linux Foundation. All rights reserved.
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License version 2 and
@@ -15,6 +15,7 @@
 #define _KGSL_DEBUGFS_H
 
 struct kgsl_device;
+struct kgsl_process_private;
 
 #ifdef CONFIG_DEBUG_FS
 void kgsl_core_debugfs_init(void);
@@ -28,11 +29,16 @@
 	return kgsl_debugfs_dir;
 }
 
+void kgsl_process_init_debugfs(struct kgsl_process_private *priv);
 #else
 static inline void kgsl_core_debugfs_init(void) { }
 static inline void kgsl_device_debugfs_init(struct kgsl_device *device) { }
 static inline void kgsl_core_debugfs_close(void) { }
 static inline struct dentry *kgsl_get_debugfs_dir(void) { return NULL; }
+static inline void kgsl_process_init_debugfs(struct kgsl_process_private *priv)
+{
+}
 
 #endif
 
diff --git a/drivers/gpu/msm/kgsl_device.h b/drivers/gpu/msm/kgsl_device.h
index df61717..b215d8c 100644
--- a/drivers/gpu/msm/kgsl_device.h
+++ b/drivers/gpu/msm/kgsl_device.h
@@ -1,4 +1,4 @@
-/* Copyright (c) 2002,2007-2012, Code Aurora Forum. All rights reserved.
+/* Copyright (c) 2002,2007-2012, The Linux Foundation. All rights reserved.
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License version 2 and
@@ -22,14 +22,23 @@
 #include "kgsl_pwrctrl.h"
 #include "kgsl_log.h"
 #include "kgsl_pwrscale.h"
+#include <linux/sync.h>
 
-#define KGSL_TIMEOUT_NONE       0
-#define KGSL_TIMEOUT_DEFAULT    0xFFFFFFFF
-#define KGSL_TIMEOUT_PART       2000 
+#define KGSL_TIMEOUT_NONE           0
+#define KGSL_TIMEOUT_DEFAULT        0xFFFFFFFF
+#define KGSL_TIMEOUT_PART           50 /* 50 msec */
+#define KGSL_TIMEOUT_LONG_IB_DETECTION  2000 /* 2 sec */
 
 #define FIRST_TIMEOUT (HZ / 2)
 
 
+/* KGSL device state is initialized to INIT when platform_probe		*
+ * successfully initialized the device.  Once a device has been opened	*
+ * (started) it becomes active.  NAP implies that only low latency	*
+ * resources (for now clocks on some platforms) are off.  SLEEP implies	*
+ * that the KGSL module believes a device is idle (has been inactive	*
+ * past its timer) and all system resources are released.  SUSPEND is	*
+ * requested by the kernel and will be enforced upon all open devices.	*/
 
 #define KGSL_STATE_NONE		0x00000000
 #define KGSL_STATE_INIT		0x00000001
@@ -38,7 +47,7 @@
 #define KGSL_STATE_SLEEP	0x00000008
 #define KGSL_STATE_SUSPEND	0x00000010
 #define KGSL_STATE_HUNG		0x00000020
-#define KGSL_STATE_DUMP_AND_RECOVER	0x00000040
+#define KGSL_STATE_DUMP_AND_FT	0x00000040
 #define KGSL_STATE_SLUMBER	0x00000080
 
 #define KGSL_GRAPHICS_MEMORY_LOW_WATERMARK  0x1000000
@@ -50,8 +59,13 @@
 struct kgsl_device_private;
 struct kgsl_context;
 struct kgsl_power_stats;
+struct kgsl_event;
 
 struct kgsl_functable {
+	/* Mandatory functions - these functions must be implemented
+	   by the client device.  The driver will not check for a NULL
+	   pointer before calling the hook.
+	 */
 	void (*regread) (struct kgsl_device *device,
 		unsigned int offsetwords, unsigned int *value);
 	void (*regwrite) (struct kgsl_device *device,
@@ -84,6 +98,9 @@
 	void * (*snapshot)(struct kgsl_device *device, void *snapshot,
 		int *remain, int hang);
 	irqreturn_t (*irq_handler)(struct kgsl_device *device);
+	/* Optional functions - these functions are not mandatory.  The
+	   driver will check that the function pointer is not NULL before
+	   calling the hook */
 	void (*setstate) (struct kgsl_device *device, unsigned int context_id,
 			uint32_t flags);
 	int (*drawctxt_create) (struct kgsl_device *device,
@@ -96,8 +113,12 @@
 	int (*setproperty) (struct kgsl_device *device,
 		enum kgsl_property_type type, void *value,
 		unsigned int sizebytes);
+	int (*postmortem_dump) (struct kgsl_device *device, int manual);
+	int (*next_event)(struct kgsl_device *device,
+		struct kgsl_event *event);
 };
 
+/* MH register values */
 struct kgsl_mh {
 	unsigned int     mharb;
 	unsigned int     mh_intf_cfg1;
@@ -113,12 +134,9 @@
 	void *priv;
 	struct list_head list;
 	void *owner;
+	unsigned int created;
 };
 
-struct kgsl_gpubusy {
-	s64 busy;
-	s64 total;
-};
 
 struct kgsl_device {
 	struct device *dev;
@@ -153,74 +171,94 @@
 	wait_queue_head_t wait_queue;
 	struct workqueue_struct *work_queue;
 	struct device *parentdev;
-	struct completion recovery_gate;
+	struct completion ft_gate;
 	struct dentry *d_debugfs;
 	struct idr context_idr;
 	struct early_suspend display_off;
 
-	void *snapshot;		
-	int snapshot_maxsize;   
-	int snapshot_size;      
-	u32 snapshot_timestamp;	
-	int snapshot_frozen;	
-	int snapshot_no_panic;	
+	void *snapshot;		/* Pointer to the snapshot memory region */
+	int snapshot_maxsize;   /* Max size of the snapshot region */
+	int snapshot_size;      /* Current size of the snapshot region */
+	u32 snapshot_timestamp;	/* Timestamp of the last valid snapshot */
+	int snapshot_frozen;	/* 1 if the snapshot output is frozen until
+				   it gets read by the user.  This avoids
+				   losing the output on multiple hangs  */
 	struct kobject snapshot_kobj;
 
+	/*
+	 * List of GPU buffers that have been frozen in memory until they can be
+	 * dumped
+	 */
 	struct list_head snapshot_obj_list;
 
-	
+	/* Logging levels */
 	int cmd_log;
 	int ctxt_log;
 	int drv_log;
 	int mem_log;
 	int pwr_log;
+	int ft_log;
+	int pm_dump_enable;
 	struct kgsl_pwrscale pwrscale;
 	struct kobject pwrscale_kobj;
 	struct pm_qos_request pm_qos_req_dma;
 	struct work_struct ts_expired_ws;
 	struct list_head events;
+	struct list_head events_pending_list;
 	s64 on_time;
 
-	
-	struct kgsl_gpubusy gputime;
-	struct kgsl_gpubusy gputime_in_state[KGSL_MAX_PWRLEVELS];
-#ifdef CONFIG_MSM_KGSL_GPU_USAGE
-	struct kgsl_process_private *current_process_priv;
-#endif
-#if defined(CONFIG_MSM_KGSL_GPU_USAGE_SYSTRACE)
-	int prev_pid;
-#endif
+	/* Postmortem Control switches */
+	int pm_regs_enabled;
+	int pm_ib_enabled;
 };
 
-void kgsl_timestamp_expired(struct work_struct *work);
+void kgsl_process_events(struct work_struct *work);
+void kgsl_check_fences(struct work_struct *work);
 
 #define KGSL_DEVICE_COMMON_INIT(_dev) \
 	.hwaccess_gate = COMPLETION_INITIALIZER((_dev).hwaccess_gate),\
 	.suspend_gate = COMPLETION_INITIALIZER((_dev).suspend_gate),\
-	.recovery_gate = COMPLETION_INITIALIZER((_dev).recovery_gate),\
+	.ft_gate = COMPLETION_INITIALIZER((_dev).ft_gate),\
 	.ts_notifier_list = ATOMIC_NOTIFIER_INIT((_dev).ts_notifier_list),\
 	.idle_check_ws = __WORK_INITIALIZER((_dev).idle_check_ws,\
 			kgsl_idle_check),\
 	.ts_expired_ws  = __WORK_INITIALIZER((_dev).ts_expired_ws,\
-			kgsl_timestamp_expired),\
+			kgsl_process_events),\
 	.context_idr = IDR_INIT((_dev).context_idr),\
 	.events = LIST_HEAD_INIT((_dev).events),\
+	.events_pending_list = LIST_HEAD_INIT((_dev).events_pending_list), \
 	.wait_queue = __WAIT_QUEUE_HEAD_INITIALIZER((_dev).wait_queue),\
 	.mutex = __MUTEX_INITIALIZER((_dev).mutex),\
 	.state = KGSL_STATE_INIT,\
 	.ver_major = DRIVER_VERSION_MAJOR,\
 	.ver_minor = DRIVER_VERSION_MINOR
 
+
+/**
+ * struct kgsl_context - Master structure for a KGSL context object
+ * @refcount - kref object for reference counting the context
+ * @id - integer identifier for the context
+ * @dev_priv - pointer to the owning device instance
+ * @devctxt - pointer to the device specific context information
+ * @reset_status - status indication whether a gpu reset occurred and whether
+ * this context was responsible for causing it
+ * @wait_on_invalid_ts - flag indicating if this context has tried to wait on a
+ * bad timestamp
+ * @timeline - sync timeline used to create fences that can be signaled when a
+ * sync_pt timestamp expires
+ * @events - list head of pending events for this context
+ * @events_list - list node for the list of all contexts that have pending events
+ */
 struct kgsl_context {
 	struct kref refcount;
 	uint32_t id;
-
-	
 	struct kgsl_device_private *dev_priv;
-
-	
 	void *devctxt;
 	unsigned int reset_status;
+	bool wait_on_invalid_ts;
+	struct sync_timeline *timeline;
+	struct list_head events;
+	struct list_head events_list;
 };
 
 struct kgsl_process_private {
@@ -231,15 +269,12 @@
 	struct kgsl_pagetable *pagetable;
 	struct list_head list;
 	struct kobject kobj;
+	struct dentry *debug_root;
 
 	struct {
 		unsigned int cur;
 		unsigned int max;
 	} stats[KGSL_MEM_ENTRY_MAX];
-#ifdef CONFIG_MSM_KGSL_GPU_USAGE
-	struct kgsl_gpubusy gputime;
-	struct kgsl_gpubusy gputime_in_state[KGSL_MAX_PWRLEVELS];
-#endif
 };
 
 struct kgsl_device_private {
@@ -262,12 +297,6 @@
 		priv->stats[type].max = priv->stats[type].cur;
 }
 
-static inline void kgsl_process_sub_stats(struct kgsl_process_private *priv,
-	unsigned int type, size_t size)
-{
-	priv->stats[type].cur -= size;
-}
-
 static inline void kgsl_regread(struct kgsl_device *device,
 				unsigned int offsetwords,
 				unsigned int *value)
@@ -353,6 +382,8 @@
 	struct kgsl_context *ctxt =
 		idr_find(&dev_priv->device->context_idr, id);
 
+	/* Make sure that the context belongs to the current instance so
+	   that other processes can't guess context IDs and mess things up */
 
 	return  (ctxt && ctxt->dev_priv == dev_priv) ? ctxt : NULL;
 }
@@ -385,6 +416,15 @@
 	return pdev->dev.platform_data;
 }
 
+/**
+ * kgsl_context_get - Get context reference count
+ * @context
+ *
+ * Asynchronous code that holds a pointer to a context
+ * must hold a reference count on it. The kgsl device
+ * mutex must be held while the context reference count
+ * is changed.
+ */
 static inline void
 kgsl_context_get(struct kgsl_context *context)
 {
@@ -393,10 +433,34 @@
 
 void kgsl_context_destroy(struct kref *kref);
 
+/**
+ * kgsl_context_put - Release context reference count
+ * @context
+ *
+ */
 static inline void
 kgsl_context_put(struct kgsl_context *context)
 {
 	kref_put(&context->refcount, kgsl_context_destroy);
 }
 
-#endif  
+/**
+ * kgsl_active_count_put - Decrease the device active count
+ * @device: Pointer to a KGSL device
+ *
+ * Decrease the active count for the KGSL device and trigger the suspend_gate
+ * completion if it hits zero
+ */
+static inline void
+kgsl_active_count_put(struct kgsl_device *device)
+{
+	if (device->active_cnt == 1)
+		INIT_COMPLETION(device->suspend_gate);
+
+	device->active_cnt--;
+
+	if (device->active_cnt == 0)
+		complete(&device->suspend_gate);
+}
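+
+/*
+ * Illustrative pairing sketch; the matching "get" side shown here is an
+ * assumption, not code from this patch.  Callers are expected to bump
+ * active_cnt and drop it through this helper while holding the device
+ * mutex, so the suspend path can wait on suspend_gate once the count
+ * reaches zero:
+ *
+ *	mutex_lock(&device->mutex);
+ *	device->active_cnt++;
+ *	... program hardware ...
+ *	kgsl_active_count_put(device);
+ *	mutex_unlock(&device->mutex);
+ */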
+
+#endif  /* __KGSL_DEVICE_H */
diff --git a/drivers/gpu/msm/kgsl_drm.c b/drivers/gpu/msm/kgsl_drm.c
new file mode 100644
index 0000000..2a5a5fa
--- /dev/null
+++ b/drivers/gpu/msm/kgsl_drm.c
@@ -0,0 +1,1507 @@
+/* Copyright (c) 2009-2012, The Linux Foundation. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 and
+ * only version 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ */
+
+/* Implements an interface between KGSL and the DRM subsystem.  For now this
+ * is pretty simple, but it will take on more of the workload as time goes
+ * on
+ */
+#include "drmP.h"
+#include "drm.h"
+#include <linux/android_pmem.h>
+
+#include "kgsl.h"
+#include "kgsl_device.h"
+#include "kgsl_drm.h"
+#include "kgsl_mmu.h"
+#include "kgsl_sharedmem.h"
+
+#define DRIVER_AUTHOR           "Qualcomm"
+#define DRIVER_NAME             "kgsl"
+#define DRIVER_DESC             "KGSL DRM"
+#define DRIVER_DATE             "20100127"
+
+#define DRIVER_MAJOR            2
+#define DRIVER_MINOR            1
+#define DRIVER_PATCHLEVEL       1
+
+#define DRM_KGSL_GEM_FLAG_MAPPED (1 << 0)
+
+#define ENTRY_EMPTY -1
+#define ENTRY_NEEDS_CLEANUP -2
+
+#define DRM_KGSL_NOT_INITED -1
+#define DRM_KGSL_INITED   1
+
+#define DRM_KGSL_NUM_FENCE_ENTRIES (DRM_KGSL_HANDLE_WAIT_ENTRIES << 2)
+#define DRM_KGSL_HANDLE_WAIT_ENTRIES 5
+
+/* Returns true if the memory type is in PMEM */
+
+#ifdef CONFIG_KERNEL_PMEM_SMI_REGION
+#define TYPE_IS_PMEM(_t) \
+  (((_t & DRM_KGSL_GEM_TYPE_MEM_MASK) == DRM_KGSL_GEM_TYPE_EBI) || \
+   ((_t & DRM_KGSL_GEM_TYPE_MEM_MASK) == DRM_KGSL_GEM_TYPE_SMI) || \
+   ((_t) & DRM_KGSL_GEM_TYPE_PMEM))
+#else
+#define TYPE_IS_PMEM(_t) \
+  (((_t & DRM_KGSL_GEM_TYPE_MEM_MASK) == DRM_KGSL_GEM_TYPE_EBI) || \
+   ((_t) & (DRM_KGSL_GEM_TYPE_PMEM | DRM_KGSL_GEM_PMEM_EBI)))
+#endif
+
+/* Returns true if the memory type is regular */
+
+#define TYPE_IS_MEM(_t) \
+  (((_t & DRM_KGSL_GEM_TYPE_MEM_MASK) == DRM_KGSL_GEM_TYPE_KMEM) || \
+   ((_t & DRM_KGSL_GEM_TYPE_MEM_MASK) == DRM_KGSL_GEM_TYPE_KMEM_NOCACHE) || \
+   ((_t) & DRM_KGSL_GEM_TYPE_MEM))
+
+#define TYPE_IS_FD(_t) ((_t) & DRM_KGSL_GEM_TYPE_FD_MASK)
+
+/* Returns true if KMEM region is uncached */
+
+#define IS_MEM_UNCACHED(_t) \
+  ((_t == DRM_KGSL_GEM_TYPE_KMEM_NOCACHE) || \
+   (_t == DRM_KGSL_GEM_TYPE_KMEM) || \
+   (TYPE_IS_MEM(_t) && (_t & DRM_KGSL_GEM_CACHE_WCOMBINE)))
+
+struct drm_kgsl_gem_object_wait_list_entry {
+	struct list_head list;
+	int pid;
+	int in_use;
+	wait_queue_head_t process_wait_q;
+};
+
+struct drm_kgsl_gem_object_fence {
+	int32_t fence_id;
+	unsigned int num_buffers;
+	int ts_valid;
+	unsigned int timestamp;
+	int ts_device;
+	int lockpid;
+	struct list_head buffers_in_fence;
+};
+
+struct drm_kgsl_gem_object_fence_list_entry {
+	struct list_head list;
+	int in_use;
+	struct drm_gem_object *gem_obj;
+};
+
+static int32_t fence_id = 0x1;
+
+static struct drm_kgsl_gem_object_fence
+			  gem_buf_fence[DRM_KGSL_NUM_FENCE_ENTRIES];
+
+struct drm_kgsl_gem_object {
+	struct drm_gem_object *obj;
+	uint32_t type;
+	struct kgsl_memdesc memdesc;
+	struct kgsl_pagetable *pagetable;
+	uint64_t mmap_offset;
+	int bufcount;
+	int flags;
+	struct list_head list;
+	int active;
+
+	struct {
+		uint32_t offset;
+		uint32_t gpuaddr;
+	} bufs[DRM_KGSL_GEM_MAX_BUFFERS];
+
+	int bound;
+	int lockpid;
+	/* Put these here to avoid allocating all the time */
+	struct drm_kgsl_gem_object_wait_list_entry
+	wait_entries[DRM_KGSL_HANDLE_WAIT_ENTRIES];
+	/* Each object can only appear in a single fence */
+	struct drm_kgsl_gem_object_fence_list_entry
+	fence_entries[DRM_KGSL_NUM_FENCE_ENTRIES];
+
+	struct list_head wait_list;
+};
+
+static int kgsl_drm_inited = DRM_KGSL_NOT_INITED;
+
+/* This is a global list of all the memory currently mapped in the MMU */
+static struct list_head kgsl_mem_list;
+
+static void kgsl_gem_mem_flush(struct kgsl_memdesc *memdesc, int type, int op)
+{
+	int cacheop = 0;
+
+	switch (op) {
+	case DRM_KGSL_GEM_CACHE_OP_TO_DEV:
+		if (type & (DRM_KGSL_GEM_CACHE_WBACK |
+			    DRM_KGSL_GEM_CACHE_WBACKWA))
+			cacheop = KGSL_CACHE_OP_CLEAN;
+
+		break;
+
+	case DRM_KGSL_GEM_CACHE_OP_FROM_DEV:
+		if (type & (DRM_KGSL_GEM_CACHE_WBACK |
+			    DRM_KGSL_GEM_CACHE_WBACKWA |
+			    DRM_KGSL_GEM_CACHE_WTHROUGH))
+			cacheop = KGSL_CACHE_OP_INV;
+	}
+
+	kgsl_cache_range_op(memdesc, cacheop);
+}
+
+/* TODO:
+ * Add vsync wait */
+
+static int kgsl_drm_load(struct drm_device *dev, unsigned long flags)
+{
+	return 0;
+}
+
+static int kgsl_drm_unload(struct drm_device *dev)
+{
+	return 0;
+}
+
+struct kgsl_drm_device_priv {
+	struct kgsl_device *device[KGSL_DEVICE_MAX];
+	struct kgsl_device_private *devpriv[KGSL_DEVICE_MAX];
+};
+
+void kgsl_drm_preclose(struct drm_device *dev, struct drm_file *file_priv)
+{
+}
+
+static int kgsl_drm_suspend(struct drm_device *dev, pm_message_t state)
+{
+	return 0;
+}
+
+static int kgsl_drm_resume(struct drm_device *dev)
+{
+	return 0;
+}
+
+static void
+kgsl_gem_free_mmap_offset(struct drm_gem_object *obj)
+{
+	struct drm_device *dev = obj->dev;
+	struct drm_gem_mm *mm = dev->mm_private;
+	struct drm_kgsl_gem_object *priv = obj->driver_private;
+	struct drm_map_list *list;
+
+	list = &obj->map_list;
+	drm_ht_remove_item(&mm->offset_hash, &list->hash);
+	if (list->file_offset_node) {
+		drm_mm_put_block(list->file_offset_node);
+		list->file_offset_node = NULL;
+	}
+
+	kfree(list->map);
+	list->map = NULL;
+
+	priv->mmap_offset = 0;
+}
+
+static int
+kgsl_gem_memory_allocated(struct drm_gem_object *obj)
+{
+	struct drm_kgsl_gem_object *priv = obj->driver_private;
+	return priv->memdesc.size ? 1 : 0;
+}
+
+static int
+kgsl_gem_alloc_memory(struct drm_gem_object *obj)
+{
+	struct drm_kgsl_gem_object *priv = obj->driver_private;
+	int index;
+	int result = 0;
+
+	/* Return if the memory is already allocated */
+
+	if (kgsl_gem_memory_allocated(obj) || TYPE_IS_FD(priv->type))
+		return 0;
+
+	if (priv->pagetable == NULL) {
+		priv->pagetable = kgsl_mmu_getpagetable(KGSL_MMU_GLOBAL_PT);
+
+		if (priv->pagetable == NULL) {
+			DRM_ERROR("Unable to get the GPU MMU pagetable\n");
+			return -EINVAL;
+		}
+	}
+
+	/* Set the flags for the memdesc (probably 0, unless it is cached) */
+	priv->memdesc.priv = 0;
+
+	if (TYPE_IS_PMEM(priv->type)) {
+		if (priv->type == DRM_KGSL_GEM_TYPE_EBI ||
+		    priv->type & DRM_KGSL_GEM_PMEM_EBI) {
+				result = kgsl_sharedmem_ebimem_user(
+						&priv->memdesc,
+						priv->pagetable,
+						obj->size * priv->bufcount);
+				if (result) {
+					DRM_ERROR(
+					"Unable to allocate PMEM memory\n");
+					return result;
+				}
+		} else
+			return -EINVAL;
+
+	} else if (TYPE_IS_MEM(priv->type)) {
+
+		if (priv->type == DRM_KGSL_GEM_TYPE_KMEM ||
+			priv->type & DRM_KGSL_GEM_CACHE_MASK)
+				list_add(&priv->list, &kgsl_mem_list);
+
+		result = kgsl_sharedmem_page_alloc_user(&priv->memdesc,
+					priv->pagetable,
+					obj->size * priv->bufcount);
+
+		if (result != 0) {
+				DRM_ERROR(
+				"Unable to allocate Vmalloc user memory\n");
+				return result;
+		}
+	} else
+		return -EINVAL;
+
+	for (index = 0; index < priv->bufcount; index++) {
+		priv->bufs[index].offset = index * obj->size;
+		priv->bufs[index].gpuaddr =
+			priv->memdesc.gpuaddr +
+			priv->bufs[index].offset;
+	}
+	priv->flags |= DRM_KGSL_GEM_FLAG_MAPPED;
+
+	return 0;
+}
+
+static void
+kgsl_gem_free_memory(struct drm_gem_object *obj)
+{
+	struct drm_kgsl_gem_object *priv = obj->driver_private;
+
+	if (!kgsl_gem_memory_allocated(obj) || TYPE_IS_FD(priv->type))
+		return;
+
+	kgsl_gem_mem_flush(&priv->memdesc,  priv->type,
+			   DRM_KGSL_GEM_CACHE_OP_FROM_DEV);
+
+	kgsl_sharedmem_free(&priv->memdesc);
+
+	kgsl_mmu_putpagetable(priv->pagetable);
+	priv->pagetable = NULL;
+
+	if ((priv->type == DRM_KGSL_GEM_TYPE_KMEM) ||
+	    (priv->type & DRM_KGSL_GEM_CACHE_MASK))
+		list_del(&priv->list);
+
+	priv->flags &= ~DRM_KGSL_GEM_FLAG_MAPPED;
+
+}
+
+int
+kgsl_gem_init_object(struct drm_gem_object *obj)
+{
+	struct drm_kgsl_gem_object *priv;
+	priv = kzalloc(sizeof(*priv), GFP_KERNEL);
+	if (priv == NULL) {
+		DRM_ERROR("Unable to create GEM object\n");
+		return -ENOMEM;
+	}
+
+	obj->driver_private = priv;
+	priv->obj = obj;
+
+	return 0;
+}
+
+void
+kgsl_gem_free_object(struct drm_gem_object *obj)
+{
+	kgsl_gem_free_memory(obj);
+	kgsl_gem_free_mmap_offset(obj);
+	drm_gem_object_release(obj);
+	kfree(obj->driver_private);
+}
+
+static int
+kgsl_gem_create_mmap_offset(struct drm_gem_object *obj)
+{
+	struct drm_device *dev = obj->dev;
+	struct drm_gem_mm *mm = dev->mm_private;
+	struct drm_kgsl_gem_object *priv = obj->driver_private;
+	struct drm_map_list *list;
+	int msize;
+
+	list = &obj->map_list;
+	list->map = kzalloc(sizeof(struct drm_map_list), GFP_KERNEL);
+	if (list->map == NULL) {
+		DRM_ERROR("Unable to allocate drm_map_list\n");
+		return -ENOMEM;
+	}
+
+	msize = obj->size * priv->bufcount;
+
+	list->map->type = _DRM_GEM;
+	list->map->size = msize;
+	list->map->handle = obj;
+
+	/* Allocate a mmap offset */
+	list->file_offset_node = drm_mm_search_free(&mm->offset_manager,
+						    msize / PAGE_SIZE,
+						    0, 0);
+
+	if (!list->file_offset_node) {
+		DRM_ERROR("Failed to allocate offset for %d\n", obj->name);
+		kfree(list->map);
+		return -ENOMEM;
+	}
+
+	list->file_offset_node = drm_mm_get_block(list->file_offset_node,
+						  msize / PAGE_SIZE, 0);
+
+	if (!list->file_offset_node) {
+		DRM_ERROR("Unable to create the file_offset_node\n");
+		kfree(list->map);
+		return -ENOMEM;
+	}
+
+	list->hash.key = list->file_offset_node->start;
+	if (drm_ht_insert_item(&mm->offset_hash, &list->hash)) {
+		DRM_ERROR("Failed to add to map hash\n");
+		drm_mm_put_block(list->file_offset_node);
+		kfree(list->map);
+		return -ENOMEM;
+	}
+
+	priv->mmap_offset = ((uint64_t) list->hash.key) << PAGE_SHIFT;
+
+	return 0;
+}
+
+int
+kgsl_gem_obj_addr(int drm_fd, int handle, unsigned long *start,
+			unsigned long *len)
+{
+	struct file *filp;
+	struct drm_device *dev;
+	struct drm_file *file_priv;
+	struct drm_gem_object *obj;
+	struct drm_kgsl_gem_object *priv;
+	int ret = 0;
+
+	filp = fget(drm_fd);
+	if (unlikely(filp == NULL)) {
+		DRM_ERROR("Unable to get the DRM file descriptor\n");
+		return -EINVAL;
+	}
+	file_priv = filp->private_data;
+	if (unlikely(file_priv == NULL)) {
+		DRM_ERROR("Unable to get the file private data\n");
+		fput(filp);
+		return -EINVAL;
+	}
+	dev = file_priv->minor->dev;
+	if (unlikely(dev == NULL)) {
+		DRM_ERROR("Unable to get the minor device\n");
+		fput(filp);
+		return -EINVAL;
+	}
+
+	obj = drm_gem_object_lookup(dev, file_priv, handle);
+	if (unlikely(obj == NULL)) {
+		DRM_ERROR("Invalid GEM handle %x\n", handle);
+		fput(filp);
+		return -EBADF;
+	}
+
+	mutex_lock(&dev->struct_mutex);
+	priv = obj->driver_private;
+
+	/* We can only use the MDP for PMEM regions */
+
+	if (TYPE_IS_PMEM(priv->type)) {
+		*start = priv->memdesc.physaddr +
+			priv->bufs[priv->active].offset;
+
+		*len = priv->memdesc.size;
+
+		kgsl_gem_mem_flush(&priv->memdesc,
+				   priv->type, DRM_KGSL_GEM_CACHE_OP_TO_DEV);
+	} else {
+		*start = 0;
+		*len = 0;
+		ret = -EINVAL;
+	}
+
+	drm_gem_object_unreference(obj);
+	mutex_unlock(&dev->struct_mutex);
+
+	fput(filp);
+	return ret;
+}
+
+static int
+kgsl_gem_init_obj(struct drm_device *dev,
+		  struct drm_file *file_priv,
+		  struct drm_gem_object *obj,
+		  int *handle)
+{
+	struct drm_kgsl_gem_object *priv;
+	int ret, i;
+
+	mutex_lock(&dev->struct_mutex);
+	priv = obj->driver_private;
+
+	memset(&priv->memdesc, 0, sizeof(priv->memdesc));
+	priv->bufcount = 1;
+	priv->active = 0;
+	priv->bound = 0;
+
+	/* To preserve backwards compatibility, the default memory source
+	   is EBI */
+
+	priv->type = DRM_KGSL_GEM_TYPE_PMEM | DRM_KGSL_GEM_PMEM_EBI;
+
+	ret = drm_gem_handle_create(file_priv, obj, handle);
+
+	drm_gem_object_unreference(obj);
+	INIT_LIST_HEAD(&priv->wait_list);
+
+	for (i = 0; i < DRM_KGSL_HANDLE_WAIT_ENTRIES; i++) {
+		INIT_LIST_HEAD((struct list_head *) &priv->wait_entries[i]);
+		priv->wait_entries[i].pid = 0;
+		init_waitqueue_head(&priv->wait_entries[i].process_wait_q);
+	}
+
+	for (i = 0; i < DRM_KGSL_NUM_FENCE_ENTRIES; i++) {
+		INIT_LIST_HEAD((struct list_head *) &priv->fence_entries[i]);
+		priv->fence_entries[i].in_use = 0;
+		priv->fence_entries[i].gem_obj = obj;
+	}
+
+	mutex_unlock(&dev->struct_mutex);
+	return ret;
+}
+
+int
+kgsl_gem_create_ioctl(struct drm_device *dev, void *data,
+		      struct drm_file *file_priv)
+{
+	struct drm_kgsl_gem_create *create = data;
+	struct drm_gem_object *obj;
+	int ret, handle;
+
+	/* Page align the size so we can allocate multiple buffers */
+	create->size = ALIGN(create->size, 4096);
+
+	obj = drm_gem_object_alloc(dev, create->size);
+
+	if (obj == NULL) {
+		DRM_ERROR("Unable to allocate the GEM object\n");
+		return -ENOMEM;
+	}
+
+	ret = kgsl_gem_init_obj(dev, file_priv, obj, &handle);
+	if (ret)
+		return ret;
+
+	create->handle = handle;
+	return 0;
+}
+
+int
+kgsl_gem_create_fd_ioctl(struct drm_device *dev, void *data,
+			      struct drm_file *file_priv)
+{
+	struct drm_kgsl_gem_create_fd *args = data;
+	struct file *file;
+	dev_t rdev;
+	struct fb_info *info;
+	struct drm_gem_object *obj;
+	struct drm_kgsl_gem_object *priv;
+	int ret, put_needed, handle;
+
+	file = fget_light(args->fd, &put_needed);
+
+	if (file == NULL) {
+		DRM_ERROR("Unable to get the file object\n");
+		return -EBADF;
+	}
+
+	rdev = file->f_dentry->d_inode->i_rdev;
+
+	/* Only framebuffer objects are supported ATM */
+
+	if (MAJOR(rdev) != FB_MAJOR) {
+		DRM_ERROR("File descriptor is not a framebuffer\n");
+		ret = -EBADF;
+		goto error_fput;
+	}
+
+	info = registered_fb[MINOR(rdev)];
+
+	if (info == NULL) {
+		DRM_ERROR("Framebuffer minor %d is not registered\n",
+			  MINOR(rdev));
+		ret = -EBADF;
+		goto error_fput;
+	}
+
+	obj = drm_gem_object_alloc(dev, info->fix.smem_len);
+
+	if (obj == NULL) {
+		DRM_ERROR("Unable to allocate GEM object\n");
+		ret = -ENOMEM;
+		goto error_fput;
+	}
+
+	ret = kgsl_gem_init_obj(dev, file_priv, obj, &handle);
+
+	if (ret)
+		goto error_fput;
+
+	mutex_lock(&dev->struct_mutex);
+
+	priv = obj->driver_private;
+	priv->memdesc.physaddr = info->fix.smem_start;
+	priv->type = DRM_KGSL_GEM_TYPE_FD_FBMEM;
+
+	mutex_unlock(&dev->struct_mutex);
+	args->handle = handle;
+
+error_fput:
+	fput_light(file, put_needed);
+
+	return ret;
+}
+
+int
+kgsl_gem_setmemtype_ioctl(struct drm_device *dev, void *data,
+			struct drm_file *file_priv)
+{
+	struct drm_kgsl_gem_memtype *args = data;
+	struct drm_gem_object *obj;
+	struct drm_kgsl_gem_object *priv;
+	int ret = 0;
+
+	obj = drm_gem_object_lookup(dev, file_priv, args->handle);
+
+	if (obj == NULL) {
+		DRM_ERROR("Invalid GEM handle %x\n", args->handle);
+		return -EBADF;
+	}
+
+	mutex_lock(&dev->struct_mutex);
+	priv = obj->driver_private;
+
+	if (TYPE_IS_FD(priv->type))
+		ret = -EINVAL;
+	else {
+		if (TYPE_IS_PMEM(args->type) || TYPE_IS_MEM(args->type))
+			priv->type = args->type;
+		else
+			ret = -EINVAL;
+	}
+
+	drm_gem_object_unreference(obj);
+	mutex_unlock(&dev->struct_mutex);
+
+	return ret;
+}
+
+int
+kgsl_gem_getmemtype_ioctl(struct drm_device *dev, void *data,
+			  struct drm_file *file_priv)
+{
+	struct drm_kgsl_gem_memtype *args = data;
+	struct drm_gem_object *obj;
+	struct drm_kgsl_gem_object *priv;
+
+	obj = drm_gem_object_lookup(dev, file_priv, args->handle);
+
+	if (obj == NULL) {
+		DRM_ERROR("Invalid GEM handle %x\n", args->handle);
+		return -EBADF;
+	}
+
+	mutex_lock(&dev->struct_mutex);
+	priv = obj->driver_private;
+
+	args->type = priv->type;
+
+	drm_gem_object_unreference(obj);
+	mutex_unlock(&dev->struct_mutex);
+
+	return 0;
+}
+
+int
+kgsl_gem_unbind_gpu_ioctl(struct drm_device *dev, void *data,
+			struct drm_file *file_priv)
+{
+	return 0;
+}
+
+int
+kgsl_gem_bind_gpu_ioctl(struct drm_device *dev, void *data,
+			struct drm_file *file_priv)
+{
+	return 0;
+}
+
+/* Allocate the memory and prepare it for CPU mapping */
+
+int
+kgsl_gem_alloc_ioctl(struct drm_device *dev, void *data,
+		    struct drm_file *file_priv)
+{
+	struct drm_kgsl_gem_alloc *args = data;
+	struct drm_gem_object *obj;
+	struct drm_kgsl_gem_object *priv;
+	int ret;
+
+	obj = drm_gem_object_lookup(dev, file_priv, args->handle);
+
+	if (obj == NULL) {
+		DRM_ERROR("Invalid GEM handle %x\n", args->handle);
+		return -EBADF;
+	}
+
+	mutex_lock(&dev->struct_mutex);
+	priv = obj->driver_private;
+
+	ret = kgsl_gem_alloc_memory(obj);
+
+	if (ret) {
+		DRM_ERROR("Unable to allocate object memory\n");
+	} else if (!priv->mmap_offset) {
+		ret = kgsl_gem_create_mmap_offset(obj);
+		if (ret)
+			DRM_ERROR("Unable to create a mmap offset\n");
+	}
+
+	args->offset = priv->mmap_offset;
+
+	drm_gem_object_unreference(obj);
+	mutex_unlock(&dev->struct_mutex);
+
+	return ret;
+}
+
+int
+kgsl_gem_mmap_ioctl(struct drm_device *dev, void *data,
+			struct drm_file *file_priv)
+{
+	struct drm_kgsl_gem_mmap *args = data;
+	struct drm_gem_object *obj;
+	unsigned long addr;
+
+	obj = drm_gem_object_lookup(dev, file_priv, args->handle);
+
+	if (obj == NULL) {
+		DRM_ERROR("Invalid GEM handle %x\n", args->handle);
+		return -EBADF;
+	}
+
+	down_write(&current->mm->mmap_sem);
+
+	addr = do_mmap(obj->filp, 0, args->size,
+		       PROT_READ | PROT_WRITE, MAP_SHARED,
+		       args->offset);
+
+	up_write(&current->mm->mmap_sem);
+
+	mutex_lock(&dev->struct_mutex);
+	drm_gem_object_unreference(obj);
+	mutex_unlock(&dev->struct_mutex);
+
+	if (IS_ERR((void *) addr))
+		return addr;
+
+	args->hostptr = (uint32_t) addr;
+	return 0;
+}
+
+/* This function is deprecated */
+
+int
+kgsl_gem_prep_ioctl(struct drm_device *dev, void *data,
+			struct drm_file *file_priv)
+{
+	struct drm_kgsl_gem_prep *args = data;
+	struct drm_gem_object *obj;
+	struct drm_kgsl_gem_object *priv;
+	int ret;
+
+	obj = drm_gem_object_lookup(dev, file_priv, args->handle);
+
+	if (obj == NULL) {
+		DRM_ERROR("Invalid GEM handle %x\n", args->handle);
+		return -EBADF;
+	}
+
+	mutex_lock(&dev->struct_mutex);
+	priv = obj->driver_private;
+
+	ret = kgsl_gem_alloc_memory(obj);
+	if (ret) {
+		DRM_ERROR("Unable to allocate object memory\n");
+		drm_gem_object_unreference(obj);
+		mutex_unlock(&dev->struct_mutex);
+		return ret;
+	}
+
+	if (priv->mmap_offset == 0) {
+		ret = kgsl_gem_create_mmap_offset(obj);
+		if (ret) {
+			drm_gem_object_unreference(obj);
+			mutex_unlock(&dev->struct_mutex);
+			return ret;
+		}
+	}
+
+	args->offset = priv->mmap_offset;
+	args->phys = priv->memdesc.physaddr;
+
+	drm_gem_object_unreference(obj);
+	mutex_unlock(&dev->struct_mutex);
+
+	return 0;
+}
+
+int
+kgsl_gem_get_bufinfo_ioctl(struct drm_device *dev, void *data,
+			   struct drm_file *file_priv)
+{
+	struct drm_kgsl_gem_bufinfo *args = data;
+	struct drm_gem_object *obj;
+	struct drm_kgsl_gem_object *priv;
+	int ret = -EINVAL;
+	int index;
+
+	obj = drm_gem_object_lookup(dev, file_priv, args->handle);
+
+	if (obj == NULL) {
+		DRM_ERROR("Invalid GEM handle %x\n", args->handle);
+		return -EBADF;
+	}
+
+	mutex_lock(&dev->struct_mutex);
+	priv = obj->driver_private;
+
+	if (!kgsl_gem_memory_allocated(obj)) {
+		DRM_ERROR("Memory not allocated for this object\n");
+		goto out;
+	}
+
+	for (index = 0; index < priv->bufcount; index++) {
+		args->offset[index] = priv->bufs[index].offset;
+		args->gpuaddr[index] = priv->bufs[index].gpuaddr;
+	}
+
+	args->count = priv->bufcount;
+	args->active = priv->active;
+
+	ret = 0;
+
+out:
+	drm_gem_object_unreference(obj);
+	mutex_unlock(&dev->struct_mutex);
+
+	return ret;
+}
+
+int
+kgsl_gem_set_bufcount_ioctl(struct drm_device *dev, void *data,
+			  struct drm_file *file_priv)
+{
+	struct drm_kgsl_gem_bufcount *args = data;
+	struct drm_gem_object *obj;
+	struct drm_kgsl_gem_object *priv;
+	int ret = -EINVAL;
+
+	if (args->bufcount < 1 || args->bufcount > DRM_KGSL_GEM_MAX_BUFFERS)
+		return -EINVAL;
+
+	obj = drm_gem_object_lookup(dev, file_priv, args->handle);
+
+	if (obj == NULL) {
+		DRM_ERROR("Invalid GEM handle %x\n", args->handle);
+		return -EBADF;
+	}
+
+	mutex_lock(&dev->struct_mutex);
+	priv = obj->driver_private;
+
+	/* It is too much math to worry about what happens if we are already
+	   allocated, so just bail if we are */
+
+	if (kgsl_gem_memory_allocated(obj)) {
+		DRM_ERROR("Memory already allocated - cannot change"
+			  " number of buffers\n");
+		goto out;
+	}
+
+	priv->bufcount = args->bufcount;
+	ret = 0;
+
+out:
+	drm_gem_object_unreference(obj);
+	mutex_unlock(&dev->struct_mutex);
+
+	return ret;
+}
+
+int
+kgsl_gem_set_active_ioctl(struct drm_device *dev, void *data,
+			  struct drm_file *file_priv)
+{
+	struct drm_kgsl_gem_active *args = data;
+	struct drm_gem_object *obj;
+	struct drm_kgsl_gem_object *priv;
+	int ret = -EINVAL;
+
+	obj = drm_gem_object_lookup(dev, file_priv, args->handle);
+
+	if (obj == NULL) {
+		DRM_ERROR("Invalid GEM handle %x\n", args->handle);
+		return -EBADF;
+	}
+
+	mutex_lock(&dev->struct_mutex);
+	priv = obj->driver_private;
+
+	if (args->active < 0 || args->active >= priv->bufcount) {
+		DRM_ERROR("Invalid active buffer %d\n", args->active);
+		goto out;
+	}
+
+	priv->active = args->active;
+	ret = 0;
+
+out:
+	drm_gem_object_unreference(obj);
+	mutex_unlock(&dev->struct_mutex);
+
+	return ret;
+}
+
+int kgsl_gem_kmem_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
+{
+	struct drm_gem_object *obj = vma->vm_private_data;
+	struct drm_device *dev = obj->dev;
+	struct drm_kgsl_gem_object *priv;
+	unsigned long offset;
+	struct page *page;
+	int i;
+
+	mutex_lock(&dev->struct_mutex);
+
+	priv = obj->driver_private;
+
+	offset = (unsigned long) vmf->virtual_address - vma->vm_start;
+	i = offset >> PAGE_SHIFT;
+	page = sg_page(&(priv->memdesc.sg[i]));
+
+	if (!page) {
+		mutex_unlock(&dev->struct_mutex);
+		return VM_FAULT_SIGBUS;
+	}
+
+	get_page(page);
+	vmf->page = page;
+
+	mutex_unlock(&dev->struct_mutex);
+	return 0;
+}
+
+int kgsl_gem_phys_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
+{
+	struct drm_gem_object *obj = vma->vm_private_data;
+	struct drm_device *dev = obj->dev;
+	struct drm_kgsl_gem_object *priv;
+	unsigned long offset, pfn;
+	int ret = 0;
+
+	offset = ((unsigned long) vmf->virtual_address - vma->vm_start) >>
+		PAGE_SHIFT;
+
+	mutex_lock(&dev->struct_mutex);
+
+	priv = obj->driver_private;
+
+	pfn = (priv->memdesc.physaddr >> PAGE_SHIFT) + offset;
+	ret = vm_insert_pfn(vma,
+			    (unsigned long) vmf->virtual_address, pfn);
+	mutex_unlock(&dev->struct_mutex);
+
+	switch (ret) {
+	case -ENOMEM:
+	case -EAGAIN:
+		return VM_FAULT_OOM;
+	case -EFAULT:
+		return VM_FAULT_SIGBUS;
+	default:
+		return VM_FAULT_NOPAGE;
+	}
+}
+
+static struct vm_operations_struct kgsl_gem_kmem_vm_ops = {
+	.fault = kgsl_gem_kmem_fault,
+	.open = drm_gem_vm_open,
+	.close = drm_gem_vm_close,
+};
+
+static struct vm_operations_struct kgsl_gem_phys_vm_ops = {
+	.fault = kgsl_gem_phys_fault,
+	.open = drm_gem_vm_open,
+	.close = drm_gem_vm_close,
+};
+
+/* This is a clone of the standard drm_gem_mmap function modified to allow
+   us to properly map KMEM regions as well as the PMEM regions */
+
+int msm_drm_gem_mmap(struct file *filp, struct vm_area_struct *vma)
+{
+	struct drm_file *priv = filp->private_data;
+	struct drm_device *dev = priv->minor->dev;
+	struct drm_gem_mm *mm = dev->mm_private;
+	struct drm_local_map *map = NULL;
+	struct drm_gem_object *obj;
+	struct drm_hash_item *hash;
+	struct drm_kgsl_gem_object *gpriv;
+	int ret = 0;
+
+	mutex_lock(&dev->struct_mutex);
+
+	if (drm_ht_find_item(&mm->offset_hash, vma->vm_pgoff, &hash)) {
+		mutex_unlock(&dev->struct_mutex);
+		return drm_mmap(filp, vma);
+	}
+
+	map = drm_hash_entry(hash, struct drm_map_list, hash)->map;
+	if (!map ||
+	    ((map->flags & _DRM_RESTRICTED) && !capable(CAP_SYS_ADMIN))) {
+		ret =  -EPERM;
+		goto out_unlock;
+	}
+
+	/* Check for valid size. */
+	if (map->size < vma->vm_end - vma->vm_start) {
+		ret = -EINVAL;
+		goto out_unlock;
+	}
+
+	obj = map->handle;
+
+	gpriv = obj->driver_private;
+
+	/* VM_PFNMAP is only for memory that doesn't use struct page
+	 * in other words, not "normal" memory.  If you try to use it
+	 * with "normal" memory then the mappings don't get flushed. */
+
+	if (TYPE_IS_MEM(gpriv->type)) {
+		vma->vm_flags |= VM_RESERVED | VM_DONTEXPAND;
+		vma->vm_ops = &kgsl_gem_kmem_vm_ops;
+	} else {
+		vma->vm_flags |= VM_RESERVED | VM_IO | VM_PFNMAP |
+			VM_DONTEXPAND;
+		vma->vm_ops = &kgsl_gem_phys_vm_ops;
+	}
+
+	vma->vm_private_data = map->handle;
+
+
+	/* Take care of requested caching policy */
+	if (gpriv->type == DRM_KGSL_GEM_TYPE_KMEM ||
+	    gpriv->type & DRM_KGSL_GEM_CACHE_MASK) {
+		if (gpriv->type & DRM_KGSL_GEM_CACHE_WBACKWA)
+			vma->vm_page_prot =
+			pgprot_writebackwacache(vma->vm_page_prot);
+		else if (gpriv->type & DRM_KGSL_GEM_CACHE_WBACK)
+				vma->vm_page_prot =
+				pgprot_writebackcache(vma->vm_page_prot);
+		else if (gpriv->type & DRM_KGSL_GEM_CACHE_WTHROUGH)
+				vma->vm_page_prot =
+				pgprot_writethroughcache(vma->vm_page_prot);
+		else
+			vma->vm_page_prot =
+			pgprot_writecombine(vma->vm_page_prot);
+	} else {
+		if (gpriv->type == DRM_KGSL_GEM_TYPE_KMEM_NOCACHE)
+			vma->vm_page_prot =
+			pgprot_noncached(vma->vm_page_prot);
+		else
+			/* default pmem is WC */
+			vma->vm_page_prot =
+			pgprot_writecombine(vma->vm_page_prot);
+	}
+
+	/* flush out existing KMEM cached mappings if new ones are
+	 * of uncached type */
+	if (IS_MEM_UNCACHED(gpriv->type))
+		kgsl_cache_range_op(&gpriv->memdesc,
+				    KGSL_CACHE_OP_FLUSH);
+
+	/* Add the other memory types here */
+
+	/* Take a ref for this mapping of the object, so that the fault
+	 * handler can dereference the mmap offset's pointer to the object.
+	 * This reference is cleaned up by the corresponding vm_close
+	 * (which should happen whether the vma was created by this call, or
+	 * by a vm_open due to mremap or partial unmap or whatever).
+	 */
+	drm_gem_object_reference(obj);
+
+	vma->vm_file = filp;	/* Needed for drm_vm_open() */
+	drm_vm_open_locked(vma);
+
+out_unlock:
+	mutex_unlock(&dev->struct_mutex);
+
+	return ret;
+}
+
+void
+cleanup_fence(struct drm_kgsl_gem_object_fence *fence, int check_waiting)
+{
+	int j;
+	struct drm_kgsl_gem_object_fence_list_entry *this_fence_entry = NULL;
+	struct drm_kgsl_gem_object *unlock_obj;
+	struct drm_gem_object *obj;
+	struct drm_kgsl_gem_object_wait_list_entry *lock_next;
+
+	fence->ts_valid = 0;
+	fence->timestamp = -1;
+	fence->ts_device = -1;
+
+	/* Walk the list of buffers in this fence and clean up the */
+	/* references. Note that this can cause memory allocations */
+	/* to be freed */
+	for (j = fence->num_buffers; j > 0; j--) {
+		this_fence_entry =
+				(struct drm_kgsl_gem_object_fence_list_entry *)
+				fence->buffers_in_fence.prev;
+
+		this_fence_entry->in_use = 0;
+		obj = this_fence_entry->gem_obj;
+		unlock_obj = obj->driver_private;
+
+		/* Delete it from the list */
+
+		list_del(&this_fence_entry->list);
+
+		/* we are unlocking - see if there are other pids waiting */
+		if (check_waiting) {
+			if (!list_empty(&unlock_obj->wait_list)) {
+				lock_next =
+				(struct drm_kgsl_gem_object_wait_list_entry *)
+					unlock_obj->wait_list.prev;
+
+				list_del((struct list_head *)&lock_next->list);
+
+				unlock_obj->lockpid = 0;
+				wake_up_interruptible(
+						&lock_next->process_wait_q);
+				lock_next->pid = 0;
+
+			} else {
+				/* List is empty so set pid to 0 */
+				unlock_obj->lockpid = 0;
+			}
+		}
+
+		drm_gem_object_unreference(obj);
+	}
+	/* here all the buffers in the fence are released */
+	/* clear the fence entry */
+	fence->fence_id = ENTRY_EMPTY;
+}
+
+int
+find_empty_fence(void)
+{
+	int i;
+
+	for (i = 0; i < DRM_KGSL_NUM_FENCE_ENTRIES; i++) {
+		if (gem_buf_fence[i].fence_id == ENTRY_EMPTY) {
+			gem_buf_fence[i].fence_id = fence_id++;
+			gem_buf_fence[i].ts_valid = 0;
+			INIT_LIST_HEAD(&(gem_buf_fence[i].buffers_in_fence));
+			if (fence_id == 0xFFFFFFF0)
+				fence_id = 1;
+			return i;
+		} else {
+
+			/* Look for entries to be cleaned up */
+			if (gem_buf_fence[i].fence_id == ENTRY_NEEDS_CLEANUP)
+				cleanup_fence(&gem_buf_fence[i], 0);
+		}
+	}
+
+	return ENTRY_EMPTY;
+}
+
+int
+find_fence(int index)
+{
+	int i;
+
+	for (i = 0; i < DRM_KGSL_NUM_FENCE_ENTRIES; i++) {
+		if (gem_buf_fence[i].fence_id == index)
+			return i;
+	}
+
+	return ENTRY_EMPTY;
+}
+
+void
+wakeup_fence_entries(struct drm_kgsl_gem_object_fence *fence)
+{
+	struct drm_kgsl_gem_object_fence_list_entry *this_fence_entry = NULL;
+	struct drm_kgsl_gem_object_wait_list_entry *lock_next;
+	struct drm_kgsl_gem_object *unlock_obj;
+	struct drm_gem_object *obj;
+
+	/* TS has expired when we get here */
+	fence->ts_valid = 0;
+	fence->timestamp = -1;
+	fence->ts_device = -1;
+
+	list_for_each_entry(this_fence_entry, &fence->buffers_in_fence, list) {
+		obj = this_fence_entry->gem_obj;
+		unlock_obj = obj->driver_private;
+
+		if (!list_empty(&unlock_obj->wait_list)) {
+			lock_next =
+				(struct drm_kgsl_gem_object_wait_list_entry *)
+					unlock_obj->wait_list.prev;
+
+			/* Unblock the pid */
+			lock_next->pid = 0;
+
+			/* Delete it from the list */
+			list_del((struct list_head *)&lock_next->list);
+
+			unlock_obj->lockpid = 0;
+			wake_up_interruptible(&lock_next->process_wait_q);
+
+		} else {
+			/* List is empty so set pid to 0 */
+			unlock_obj->lockpid = 0;
+		}
+	}
+	fence->fence_id = ENTRY_NEEDS_CLEANUP;  /* Mark it as needing cleanup */
+}
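+
+/*
+ * Lifecycle note: a fence slot moves ENTRY_EMPTY -> live fence_id
+ * (find_empty_fence) -> ENTRY_NEEDS_CLEANUP (wakeup_fence_entries above)
+ * -> ENTRY_EMPTY again (cleanup_fence), so stale entries are reclaimed
+ * lazily the next time a new fence is requested.
+ */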
+
+int
+kgsl_gem_lock_handle_ioctl(struct drm_device *dev, void *data,
+						   struct drm_file *file_priv)
+{
+	/* The purpose of this function is to lock a given set of handles. */
+	/* The driver will maintain a list of locked handles. */
+	/* If a request comes in for a handle that's locked the thread will */
+	/* block until it's no longer in use. */
+
+	struct drm_kgsl_gem_lock_handles *args = data;
+	struct drm_gem_object *obj;
+	struct drm_kgsl_gem_object *priv;
+	struct drm_kgsl_gem_object_fence_list_entry *this_fence_entry = NULL;
+	struct drm_kgsl_gem_object_fence *fence;
+	struct drm_kgsl_gem_object_wait_list_entry *lock_item;
+	int i, j;
+	int result = 0;
+	uint32_t *lock_list;
+	uint32_t *work_list = NULL;
+	int32_t fence_index;
+
+	/* copy in the data from user space */
+	lock_list = kzalloc(sizeof(uint32_t) * args->num_handles, GFP_KERNEL);
+	if (!lock_list) {
+		DRM_ERROR("Unable to allocate memory for lock list\n");
+		result = -ENOMEM;
+		goto error;
+	}
+
+	if (copy_from_user(lock_list, args->handle_list,
+			   sizeof(uint32_t) * args->num_handles)) {
+		DRM_ERROR("Unable to copy the lock list from the user\n");
+		result = -EFAULT;
+		goto free_handle_list;
+	}
+
+
+	work_list = lock_list;
+	mutex_lock(&dev->struct_mutex);
+
+	/* build the fence for this group of handles */
+	fence_index = find_empty_fence();
+	if (fence_index == ENTRY_EMPTY) {
+		DRM_ERROR("Unable to find an empty fence\n");
+		args->lock_id = 0xDEADBEEF;
+		result = -EFAULT;
+		goto out_unlock;
+	}
+
+	fence = &gem_buf_fence[fence_index];
+	gem_buf_fence[fence_index].num_buffers = args->num_handles;
+	args->lock_id = gem_buf_fence[fence_index].fence_id;
+
+	for (j = args->num_handles; j > 0; j--, lock_list++) {
+		obj = drm_gem_object_lookup(dev, file_priv, *lock_list);
+
+		if (obj == NULL) {
+			DRM_ERROR("Invalid GEM handle %x\n", *lock_list);
+			result = -EBADF;
+			goto out_unlock;
+		}
+
+		priv = obj->driver_private;
+		this_fence_entry = NULL;
+
+		/* get a fence entry to hook into the fence */
+		for (i = 0; i < DRM_KGSL_NUM_FENCE_ENTRIES; i++) {
+			if (!priv->fence_entries[i].in_use) {
+				this_fence_entry = &priv->fence_entries[i];
+				this_fence_entry->in_use = 1;
+				break;
+			}
+		}
+
+		if (this_fence_entry == NULL) {
+			fence->num_buffers = 0;
+			fence->fence_id = ENTRY_EMPTY;
+			args->lock_id = 0xDEADBEAD;
+			result = -EFAULT;
+			drm_gem_object_unreference(obj);
+			goto out_unlock;
+		}
+
+		/* We're trying to lock - add to a fence */
+		list_add((struct list_head *)this_fence_entry,
+				 &gem_buf_fence[fence_index].buffers_in_fence);
+		if (priv->lockpid) {
+
+			if (priv->lockpid == args->pid) {
+				/* now that things are running async this  */
+				/* happens when an op isn't done */
+				/* so it's already locked by the calling pid */
+					continue;
+			}
+
+
+			/* if a pid already had it locked */
+			/* create and add to wait list */
+			for (i = 0; i < DRM_KGSL_HANDLE_WAIT_ENTRIES; i++) {
+				if (priv->wait_entries[i].in_use == 0) {
+					/* this one is empty */
+					lock_item = &priv->wait_entries[i];
+					lock_item->in_use = 1;
+					lock_item->pid = args->pid;
+					INIT_LIST_HEAD((struct list_head *)
+						&priv->wait_entries[i]);
+					break;
+				}
+			}
+
+			if (i == DRM_KGSL_HANDLE_WAIT_ENTRIES) {
+
+				result =  -EFAULT;
+				drm_gem_object_unreference(obj);
+				goto out_unlock;
+			}
+
+			list_add_tail((struct list_head *)&lock_item->list,
+							&priv->wait_list);
+			mutex_unlock(&dev->struct_mutex);
+			/* here we need to block */
+			wait_event_interruptible_timeout(
+					priv->wait_entries[i].process_wait_q,
+					(priv->lockpid == 0),
+					msecs_to_jiffies(64));
+			mutex_lock(&dev->struct_mutex);
+			lock_item->in_use = 0;
+		}
+
+		/* Getting here means no one currently holds the lock */
+		priv->lockpid = args->pid;
+
+		args->lock_id = gem_buf_fence[fence_index].fence_id;
+	}
+	fence->lockpid = args->pid;
+
+out_unlock:
+	mutex_unlock(&dev->struct_mutex);
+
+free_handle_list:
+	kfree(work_list);
+
+error:
+	return result;
+}
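
From user space this path is normally reached through libdrm's generic command ioctls. The sketch below is only a guess at the call pattern: the field names come from the handler above, but the struct layout and the DRM_KGSL_GEM_LOCK_HANDLE request name are assumptions, not taken from this patch:

    #include <stdint.h>
    #include <unistd.h>
    #include <xf86drm.h>

    /* Assumed layout; the real definition lives in the kgsl_drm UAPI header */
    struct drm_kgsl_gem_lock_handles {
            uint32_t num_handles;
            uint32_t *handle_list;  /* read by the kernel with copy_from_user() */
            uint32_t pid;
            uint32_t lock_id;       /* fence id on success */
    };

    static int lock_gem_handles(int fd, uint32_t *handles, uint32_t count)
    {
            struct drm_kgsl_gem_lock_handles lock = {
                    .num_handles = count,
                    .handle_list = handles,
                    .pid = (uint32_t)getpid(),
            };

            /* DRM_KGSL_GEM_LOCK_HANDLE is an assumed command index name */
            int ret = drmCommandWriteRead(fd, DRM_KGSL_GEM_LOCK_HANDLE,
                                          &lock, sizeof(lock));

            return ret ? ret : (int)lock.lock_id;
    }
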
+
+int
+kgsl_gem_unlock_handle_ioctl(struct drm_device *dev, void *data,
+			 struct drm_file *file_priv)
+{
+	struct drm_kgsl_gem_unlock_handles *args = data;
+	int result = 0;
+	int32_t fence_index;
+
+	mutex_lock(&dev->struct_mutex);
+	fence_index = find_fence(args->lock_id);
+	if (fence_index == ENTRY_EMPTY) {
+		DRM_ERROR("Invalid lock ID: %x\n", args->lock_id);
+		result = -EFAULT;
+		goto out_unlock;
+	}
+
+	cleanup_fence(&gem_buf_fence[fence_index], 1);
+
+out_unlock:
+	mutex_unlock(&dev->struct_mutex);
+
+	return result;
+}
+
+
+int
+kgsl_gem_unlock_on_ts_ioctl(struct drm_device *dev, void *data,
+			struct drm_file *file_priv)
+{
+	struct drm_kgsl_gem_unlock_on_ts *args = data;
+	int result = 0;
+	int ts_done = 0;
+	int32_t fence_index, ts_device;
+	struct drm_kgsl_gem_object_fence *fence;
+	struct kgsl_device *device;
+
+	if (args->type == DRM_KGSL_GEM_TS_3D)
+		ts_device = KGSL_DEVICE_3D0;
+	else if (args->type == DRM_KGSL_GEM_TS_2D)
+		ts_device = KGSL_DEVICE_2D0;
+	else {
+		result = -EINVAL;
+		goto error;
+	}
+
+	device = kgsl_get_device(ts_device);
+	ts_done = kgsl_check_timestamp(device, NULL, args->timestamp);
+
+	mutex_lock(&dev->struct_mutex);
+
+	fence_index = find_fence(args->lock_id);
+	if (fence_index == ENTRY_EMPTY) {
+		DRM_ERROR("Invalid lock ID: %x\n", args->lock_id);
+		result = -EFAULT;
+		goto out_unlock;
+	}
+
+	fence = &gem_buf_fence[fence_index];
+	fence->ts_device = ts_device;
+
+	if (!ts_done)
+		fence->ts_valid = 1;
+	else
+		cleanup_fence(fence, 1);
+
+
+out_unlock:
+	mutex_unlock(&dev->struct_mutex);
+
+error:
+	return result;
+}
+
+struct drm_ioctl_desc kgsl_drm_ioctls[] = {
+	DRM_IOCTL_DEF_DRV(KGSL_GEM_CREATE, kgsl_gem_create_ioctl, 0),
+	DRM_IOCTL_DEF_DRV(KGSL_GEM_PREP, kgsl_gem_prep_ioctl, 0),
+	DRM_IOCTL_DEF_DRV(KGSL_GEM_SETMEMTYPE, kgsl_gem_setmemtype_ioctl, 0),
+	DRM_IOCTL_DEF_DRV(KGSL_GEM_GETMEMTYPE, kgsl_gem_getmemtype_ioctl, 0),
+	DRM_IOCTL_DEF_DRV(KGSL_GEM_BIND_GPU, kgsl_gem_bind_gpu_ioctl, 0),
+	DRM_IOCTL_DEF_DRV(KGSL_GEM_UNBIND_GPU, kgsl_gem_unbind_gpu_ioctl, 0),
+	DRM_IOCTL_DEF_DRV(KGSL_GEM_ALLOC, kgsl_gem_alloc_ioctl, 0),
+	DRM_IOCTL_DEF_DRV(KGSL_GEM_MMAP, kgsl_gem_mmap_ioctl, 0),
+	DRM_IOCTL_DEF_DRV(KGSL_GEM_GET_BUFINFO, kgsl_gem_get_bufinfo_ioctl, 0),
+	DRM_IOCTL_DEF_DRV(KGSL_GEM_SET_BUFCOUNT,
+		      kgsl_gem_set_bufcount_ioctl, 0),
+	DRM_IOCTL_DEF_DRV(KGSL_GEM_SET_ACTIVE, kgsl_gem_set_active_ioctl, 0),
+	DRM_IOCTL_DEF_DRV(KGSL_GEM_LOCK_HANDLE,
+				  kgsl_gem_lock_handle_ioctl, 0),
+	DRM_IOCTL_DEF_DRV(KGSL_GEM_UNLOCK_HANDLE,
+				  kgsl_gem_unlock_handle_ioctl, 0),
+	DRM_IOCTL_DEF_DRV(KGSL_GEM_UNLOCK_ON_TS,
+				  kgsl_gem_unlock_on_ts_ioctl, 0),
+	DRM_IOCTL_DEF_DRV(KGSL_GEM_CREATE_FD, kgsl_gem_create_fd_ioctl,
+		      DRM_MASTER),
+};
+
+static struct drm_driver driver = {
+	.driver_features = DRIVER_GEM,
+	.load = kgsl_drm_load,
+	.unload = kgsl_drm_unload,
+	.preclose = kgsl_drm_preclose,
+	.suspend = kgsl_drm_suspend,
+	.resume = kgsl_drm_resume,
+	.reclaim_buffers = drm_core_reclaim_buffers,
+	.gem_init_object = kgsl_gem_init_object,
+	.gem_free_object = kgsl_gem_free_object,
+	.ioctls = kgsl_drm_ioctls,
+
+	.fops = {
+		 .owner = THIS_MODULE,
+		 .open = drm_open,
+		 .release = drm_release,
+		 .unlocked_ioctl = drm_ioctl,
+		 .mmap = msm_drm_gem_mmap,
+		 .poll = drm_poll,
+		 .fasync = drm_fasync,
+		 },
+
+	.name = DRIVER_NAME,
+	.desc = DRIVER_DESC,
+	.date = DRIVER_DATE,
+	.major = DRIVER_MAJOR,
+	.minor = DRIVER_MINOR,
+	.patchlevel = DRIVER_PATCHLEVEL,
+};
+
+int kgsl_drm_init(struct platform_device *dev)
+{
+	int i;
+
+	/* Only initialize once */
+	if (kgsl_drm_inited == DRM_KGSL_INITED)
+		return 0;
+
+	kgsl_drm_inited = DRM_KGSL_INITED;
+
+	driver.num_ioctls = DRM_ARRAY_SIZE(kgsl_drm_ioctls);
+
+	INIT_LIST_HEAD(&kgsl_mem_list);
+
+	for (i = 0; i < DRM_KGSL_NUM_FENCE_ENTRIES; i++) {
+		gem_buf_fence[i].num_buffers = 0;
+		gem_buf_fence[i].ts_valid = 0;
+		gem_buf_fence[i].fence_id = ENTRY_EMPTY;
+	}
+
+	return drm_platform_init(&driver, dev);
+}
+
+void kgsl_drm_exit(void)
+{
+	kgsl_drm_inited = DRM_KGSL_NOT_INITED;
+	drm_platform_exit(&driver, driver.kdriver.platform_device);
+}
diff --git a/drivers/gpu/msm/kgsl_events.c b/drivers/gpu/msm/kgsl_events.c
new file mode 100644
index 0000000..6798eed
--- /dev/null
+++ b/drivers/gpu/msm/kgsl_events.c
@@ -0,0 +1,324 @@
+/* Copyright (c) 2011-2013, The Linux Foundation. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 and
+ * only version 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ */
+
+#include <linux/slab.h>
+#include <linux/list.h>
+#include <linux/module.h>
+#include <kgsl_device.h>
+
+#include "kgsl_trace.h"
+
+static void _add_event_to_list(struct list_head *head, struct kgsl_event *event)
+{
+	struct list_head *n;
+
+	for (n = head->next; n != head; n = n->next) {
+		struct kgsl_event *e =
+			list_entry(n, struct kgsl_event, list);
+
+		if (timestamp_cmp(e->timestamp, event->timestamp) > 0) {
+			list_add(&event->list, n->prev);
+			break;
+		}
+	}
+
+	if (n == head)
+		list_add_tail(&event->list, head);
+}
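
_add_event_to_list() keeps each event list sorted by timestamp: walk forward and link the new event in front of the first entry with a larger timestamp, or append at the tail if none is found. The same invariant in a self-contained form (plain integer compares here, whereas the driver uses the wraparound-aware timestamp_cmp()):

    #include <stdio.h>

    struct ev {
            unsigned int ts;
            struct ev *next;
    };

    /* Insert before the first entry whose timestamp is larger, else at tail */
    static void insert_sorted(struct ev **head, struct ev *e)
    {
            struct ev **pp = head;

            while (*pp && (*pp)->ts <= e->ts)
                    pp = &(*pp)->next;
            e->next = *pp;
            *pp = e;
    }

    int main(void)
    {
            struct ev a = { 30 }, b = { 10 }, c = { 20 }, *head = NULL, *p;

            insert_sorted(&head, &a);
            insert_sorted(&head, &b);
            insert_sorted(&head, &c);
            for (p = head; p; p = p->next)
                    printf("%u ", p->ts);   /* 10 20 30 */
            printf("\n");
            return 0;
    }
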
+
+/**
+ * kgsl_add_event - Add a new timestamp event for the KGSL device
+ * @device - KGSL device for the new event
+ * @id - the context ID that the event should be added to
+ * @ts - the timestamp to trigger the event on
+ * @cb - callback function to call when the timestamp expires
+ * @priv - private data for the specific event type
+ * @owner - driver instance that owns this event
+ *
+ * @returns - 0 on success or error code on failure
+ */
+int kgsl_add_event(struct kgsl_device *device, u32 id, u32 ts,
+	void (*cb)(struct kgsl_device *, void *, u32, u32), void *priv,
+	void *owner)
+{
+	struct kgsl_event *event;
+	unsigned int cur_ts;
+	struct kgsl_context *context = NULL;
+
+	if (cb == NULL)
+		return -EINVAL;
+
+	if (id != KGSL_MEMSTORE_GLOBAL) {
+		context = idr_find(&device->context_idr, id);
+		if (context == NULL)
+			return -EINVAL;
+	}
+	cur_ts = kgsl_readtimestamp(device, context, KGSL_TIMESTAMP_RETIRED);
+
+	/*
+	 * Check to see if the requested timestamp has already fired.  If it
+	 * did do the callback right away.  Make sure to send the timestamp that
+	 * the event expected instead of the current timestamp because sometimes
+	 * the event handlers can get confused.
+	 */
+
+	if (timestamp_cmp(cur_ts, ts) >= 0) {
+		trace_kgsl_fire_event(id, ts, 0);
+		cb(device, priv, id, ts);
+		return 0;
+	}
+
+	event = kzalloc(sizeof(*event), GFP_KERNEL);
+	if (event == NULL)
+		return -ENOMEM;
+
+	event->context = context;
+	event->timestamp = ts;
+	event->priv = priv;
+	event->func = cb;
+	event->owner = owner;
+	event->created = jiffies;
+
+	trace_kgsl_register_event(id, ts);
+
+	/* inc refcount to avoid race conditions in cleanup */
+	if (context)
+		kgsl_context_get(context);
+
+	/* Add the event to either the owning context or the global list */
+
+	if (context) {
+		_add_event_to_list(&context->events, event);
+
+		/*
+		 * Add it to the master list of contexts with pending events if
+		 * it isn't already there
+		 */
+
+		if (list_empty(&context->events_list))
+			list_add_tail(&context->events_list,
+				&device->events_pending_list);
+
+	} else
+		_add_event_to_list(&device->events, event);
+
+	/*
+	 * Increase the active count on the device to avoid going into power
+	 * saving modes while events are pending
+	 */
+
+	device->active_cnt++;
+
+	queue_work(device->work_queue, &device->ts_expired_ws);
+	return 0;
+}
+EXPORT_SYMBOL(kgsl_add_event);
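
A caller hands kgsl_add_event() a callback that fires once the chosen timestamp retires (or immediately if it already has); the IOMMU clock-disable path later in this patch uses exactly this pattern. A minimal kernel-side sketch with hypothetical names, using a completion as the private payload:

    #include <linux/completion.h>

    /* Hypothetical callback: signal a waiter when the timestamp retires */
    static void sample_event_cb(struct kgsl_device *device, void *priv,
                                unsigned int id, unsigned int ts)
    {
            struct completion *done = priv;

            complete(done);
    }

    static int arm_retire_event(struct kgsl_device *device, unsigned int ts,
                                struct completion *done)
    {
            init_completion(done);
            /* Global context; owner is NULL since nothing will cancel this */
            return kgsl_add_event(device, KGSL_MEMSTORE_GLOBAL, ts,
                                  sample_event_cb, done, NULL);
    }
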
+
+/**
+ * kgsl_cancel_events_ctxt - Cancel all events for a context
+ * @device - KGSL device for the events to cancel
+ * @context - context whose events we want to cancel
+ *
+ */
+void kgsl_cancel_events_ctxt(struct kgsl_device *device,
+	struct kgsl_context *context)
+{
+	struct kgsl_event *event, *event_tmp;
+	unsigned int id, cur;
+
+	cur = kgsl_readtimestamp(device, context, KGSL_TIMESTAMP_RETIRED);
+	id = context->id;
+
+	list_for_each_entry_safe(event, event_tmp, &context->events, list) {
+		/*
+		 * "cancel" the events by calling their callback.
+		 * Currently, events are used for lock and memory
+		 * management, so if the process is dying the right
+		 * thing to do is release or free.
+		 *
+		 * Send the current timestamp so the event knows how far the
+		 * system got before the event was canceled
+		 */
+
+		trace_kgsl_fire_event(id, cur, jiffies - event->created);
+
+		if (event->func)
+			event->func(device, event->priv, id, cur);
+
+		kgsl_context_put(context);
+		list_del(&event->list);
+		kfree(event);
+
+		kgsl_active_count_put(device);
+	}
+
+	/* Remove ourselves from the master pending list */
+	list_del_init(&context->events_list);
+}
+
+/**
+ * kgsl_cancel_events - Cancel all generic events for a process
+ * @device - KGSL device for the events to cancel
+ * @owner - driver instance that owns the events to cancel
+ *
+ */
+void kgsl_cancel_events(struct kgsl_device *device,
+	void *owner)
+{
+	struct kgsl_event *event, *event_tmp;
+	unsigned int cur;
+
+	cur = kgsl_readtimestamp(device, NULL, KGSL_TIMESTAMP_RETIRED);
+
+	list_for_each_entry_safe(event, event_tmp, &device->events, list) {
+		if (event->owner != owner)
+			continue;
+
+		/*
+		 * "cancel" the events by calling their callback.
+		 * Currently, events are used for lock and memory
+		 * management, so if the process is dying the right
+		 * thing to do is release or free. Send the current timestamp so
+		 * the callback knows how far the GPU made it before things
+		 * went wrong
+		 */
+
+		trace_kgsl_fire_event(KGSL_MEMSTORE_GLOBAL, cur,
+			jiffies - event->created);
+
+		if (event->func)
+			event->func(device, event->priv, KGSL_MEMSTORE_GLOBAL,
+				cur);
+
+		if (event->context)
+			kgsl_context_put(event->context);
+
+		list_del(&event->list);
+		kfree(event);
+
+		kgsl_active_count_put(device);
+	}
+}
+EXPORT_SYMBOL(kgsl_cancel_events);
+
+static void _process_event_list(struct kgsl_device *device,
+		struct list_head *head, unsigned int timestamp)
+{
+	struct kgsl_event *event, *tmp;
+	unsigned int id;
+
+	list_for_each_entry_safe(event, tmp, head, list) {
+		if (timestamp_cmp(timestamp, event->timestamp) < 0)
+			break;
+
+		id = event->context ? event->context->id : KGSL_MEMSTORE_GLOBAL;
+
+		/*
+		 * Send the timestamp of the expired event, not the current
+		 * timestamp.  This prevents the event handlers from getting
+		 * confused if they don't bother comparing the current timestamp
+		 * to the timestamp they wanted
+		 */
+
+		trace_kgsl_fire_event(id, event->timestamp,
+			jiffies - event->created);
+
+		if (event->func)
+			event->func(device, event->priv, id, event->timestamp);
+
+		if (event->context)
+			kgsl_context_put(event->context);
+
+		list_del(&event->list);
+		kfree(event);
+
+		kgsl_active_count_put(device);
+	}
+}
+
+static inline int _mark_next_event(struct kgsl_device *device,
+		struct list_head *head)
+{
+	struct kgsl_event *event;
+
+	if (!list_empty(head)) {
+		event = list_first_entry(head, struct kgsl_event, list);
+
+		/*
+		 * Next event will return 0 if the event was marked or 1 if the
+		 * timestamp on the event has passed - return that up a layer
+		 */
+
+		return device->ftbl->next_event(device, event);
+	}
+
+	return 0;
+}
+
+static int kgsl_process_context_events(struct kgsl_device *device,
+		struct kgsl_context *context)
+{
+	while (1) {
+		unsigned int timestamp = kgsl_readtimestamp(device, context,
+			KGSL_TIMESTAMP_RETIRED);
+
+		_process_event_list(device, &context->events, timestamp);
+
+		/*
+		 * _mark_next event will return 1 as long as the next event
+		 * timestamp has expired - this is to cope with an unavoidable
+		 * race condition with the GPU that is still processing events.
+		 */
+
+		if (!_mark_next_event(device, &context->events))
+			break;
+	}
+
+	/*
+	 * Return 0 if the list is empty so the calling function can remove the
+	 * context from the pending list
+	 */
+
+	return list_empty(&context->events) ? 0 : 1;
+}
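
Both the sorted insert above and these processing loops rely on timestamp_cmp() being safe across 32-bit wraparound; that helper is outside this patch. A common way to implement such a comparison (a sketch, not necessarily the driver's code) is plain serial-number arithmetic:

    /* <0 if a is older than b, 0 if equal, >0 if newer; valid while the two
     * timestamps are within 2^31 of each other, even across a wrap.
     * e.g. ts_cmp(5, 0xFFFFFFF0) == 21, so 5 counts as newer. */
    static int ts_cmp(unsigned int a, unsigned int b)
    {
            return (int)(a - b);
    }
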
+
+void kgsl_process_events(struct work_struct *work)
+{
+	struct kgsl_device *device = container_of(work, struct kgsl_device,
+		ts_expired_ws);
+	struct kgsl_context *context, *tmp;
+	uint32_t timestamp;
+
+	mutex_lock(&device->mutex);
+
+	/* Process expired global events */
+	timestamp = kgsl_readtimestamp(device, NULL, KGSL_TIMESTAMP_RETIRED);
+	_process_event_list(device, &device->events, timestamp);
+	_mark_next_event(device, &device->events);
+
+	/* Now process all of the pending contexts */
+	list_for_each_entry_safe(context, tmp, &device->events_pending_list,
+		events_list) {
+
+		/*
+		 * If kgsl_process_context_events() returns 0 then it no longer
+		 * has any pending events and can be removed from the list
+		 */
+
+		if (kgsl_process_context_events(device, context) == 0)
+			list_del_init(&context->events_list);
+	}
+
+	mutex_unlock(&device->mutex);
+}
+EXPORT_SYMBOL(kgsl_process_events);
diff --git a/drivers/gpu/msm/kgsl_gpummu.c b/drivers/gpu/msm/kgsl_gpummu.c
index 33f242b..8f28505 100644
--- a/drivers/gpu/msm/kgsl_gpummu.c
+++ b/drivers/gpu/msm/kgsl_gpummu.c
@@ -1,4 +1,4 @@
-/* Copyright (c) 2011-2012, Code Aurora Forum. All rights reserved.
+/* Copyright (c) 2011-2012, The Linux Foundation. All rights reserved.
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License version 2 and
@@ -180,6 +180,13 @@
 	return NULL;
 }
 
+/**
+ * kgsl_ptpool_add
+ * @pool:  A pointer to a ptpool structure
+ * @count: Number of entries to add
+ *
+ * Add static entries to the pagetable pool.
+ */
 
 static int
 kgsl_ptpool_add(struct kgsl_ptpool *pool, int count)
@@ -189,11 +196,15 @@
 
 	mutex_lock(&pool->lock);
 
+	/* Only 4MB can be allocated in one chunk, so larger allocations
+	   need to be split into multiple sections */
 
 	while (count) {
 		int entries = ((count * pool->ptsize) > SZ_4M) ?
 			SZ_4M / pool->ptsize : count;
 
+		/* Add the entries as static, i.e. they don't ever stand
+		   a chance of being removed */
 
 		ret =  _kgsl_ptpool_add_entries(pool, entries, 0);
 		if (ret)
@@ -206,6 +217,14 @@
 	return ret;
 }
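
The sizing rule in the loop above simply caps each pass at 4 MB worth of pagetables. A standalone rendering of that arithmetic (the 32 KB pagetable size is only an example value):

    #include <stdio.h>

    #define SZ_4M (4 * 1024 * 1024)

    /* Entries added per pass: never more than 4MB in one allocation */
    static int entries_per_pass(int count, int ptsize)
    {
            return (count * ptsize > SZ_4M) ? SZ_4M / ptsize : count;
    }

    int main(void)
    {
            int ptsize = 32 * 1024;   /* example pagetable size */
            int count = 512;

            while (count) {
                    int n = entries_per_pass(count, ptsize);

                    printf("adding %d entries\n", n);   /* 128 per pass here */
                    count -= n;
            }
            return 0;
    }
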
 
+/**
+ * kgsl_ptpool_alloc
+ * @pool:  A pointer to a ptpool structure
+ * @physaddr: A pointer to store the physical address of the chunk
+ *
+ * Allocate a pagetable from the pool.  Returns the virtual address
+ * of the pagetable, the physical address is returned in physaddr
+ */
 
 static void *kgsl_ptpool_alloc(struct kgsl_ptpool *pool,
 				unsigned int *physaddr)
@@ -218,7 +237,7 @@
 	if (addr)
 		goto done;
 
-	
+	/* Add a chunk for 1 more pagetable and mark it as dynamic */
 	ret = _kgsl_ptpool_add_entries(pool, 1, 1);
 
 	if (ret)
@@ -241,6 +260,13 @@
 	kfree(chunk);
 }
 
+/**
+ * kgsl_ptpool_free
+ * @pool:  A pointer to a ptpool structure
+ * @addr: A pointer to the virtual address to free
+ *
+ * Free a pagetable allocated from the pool
+ */
 
 static void kgsl_ptpool_free(struct kgsl_ptpool *pool, void *addr)
 {
@@ -286,6 +312,13 @@
 	kfree(pool);
 }
 
+/**
+ * kgsl_gpummu_ptpool_init
+ * @entries:  The number of initial entries to add to the pool
+ *
+ * Initialize a pool and allocate an initial chunk of entries.
+ * Returns a pointer to the new pool or NULL on failure.
+ */
 void *kgsl_gpummu_ptpool_init(int entries)
 {
 	int ptsize = KGSL_PAGETABLE_SIZE;
@@ -322,8 +355,9 @@
 	return NULL;
 }
 
-int kgsl_gpummu_pt_equal(struct kgsl_pagetable *pt,
-					unsigned int pt_base)
+int kgsl_gpummu_pt_equal(struct kgsl_mmu *mmu,
+			struct kgsl_pagetable *pt,
+			unsigned int pt_base)
 {
 	struct kgsl_gpummu_pt *gpummu_pt = pt ? pt->priv : NULL;
 	return gpummu_pt && pt_base && (gpummu_pt->base.gpuaddr == pt_base);
@@ -376,10 +410,10 @@
 	KGSL_MEM_CRIT(mmu->device,
 			"mmu page fault: page=0x%lx pt=%d op=%s axi=%d\n",
 			reg & ~(PAGE_SIZE - 1),
-			kgsl_mmu_get_ptname_from_ptbase(ptbase),
+			kgsl_mmu_get_ptname_from_ptbase(mmu, ptbase),
 			reg & 0x02 ? "WRITE" : "READ", (reg >> 4) & 0xF);
 	trace_kgsl_mmu_pagefault(mmu->device, reg & ~(PAGE_SIZE - 1),
-			kgsl_mmu_get_ptname_from_ptbase(ptbase),
+			kgsl_mmu_get_ptname_from_ptbase(mmu, ptbase),
 			reg & 0x02 ? "WRITE" : "READ");
 }
 
@@ -412,6 +446,8 @@
 	if (gpummu_pt->base.hostptr == NULL)
 		goto err_flushfilter;
 
+	/* ptpool allocations are from coherent memory, so update the
+	   device statistics accordingly */
 
 	KGSL_STATS_ADD(KGSL_PAGETABLE_SIZE, kgsl_driver.stats.coherent,
 		       kgsl_driver.stats.coherent_max);
@@ -444,7 +480,7 @@
 	}
 
 	if (flags & KGSL_MMUFLAGS_TLBFLUSH) {
-		
+		/* Invalidate all and tc */
 		kgsl_regwrite(mmu->device, MH_MMU_INVALIDATE,  0x00000003);
 	}
 }
@@ -454,11 +490,17 @@
 				unsigned int context_id)
 {
 	if (mmu->flags & KGSL_FLAGS_STARTED) {
+		/* If the page table is not current then set up the mmu to
+		 * use the newly specified page table
+		 */
 		if (mmu->hwpagetable != pagetable) {
 			mmu->hwpagetable = pagetable;
+			/* Since we do a TLB flush the tlb_flags should
+			 * be cleared by calling kgsl_mmu_pt_get_flags
+			 */
 			kgsl_mmu_pt_get_flags(pagetable, mmu->device->id);
 
-			
+			/* call device specific set page table */
 			kgsl_setstate(mmu, context_id, KGSL_MMUFLAGS_TLBFLUSH |
 				KGSL_MMUFLAGS_PTUPDATE);
 		}
@@ -467,11 +509,16 @@
 
 static int kgsl_gpummu_init(struct kgsl_mmu *mmu)
 {
+	/*
+	 * initialize device mmu
+	 *
+	 * call this with the global lock held
+	 */
 	int status = 0;
 
-	
+	/* sub-client MMU lookups require address translation */
 	if ((mmu->config & ~0x1) > 0) {
-		
+		/* make sure virtual address range is a multiple of 64KB */
 		if (CONFIG_MSM_KGSL_PAGE_TABLE_SIZE & ((1 << 16) - 1)) {
 			KGSL_CORE_ERR("Invalid pagetable size requested "
 			"for GPUMMU: %x\n", CONFIG_MSM_KGSL_PAGE_TABLE_SIZE);
@@ -486,6 +533,11 @@
 
 static int kgsl_gpummu_start(struct kgsl_mmu *mmu)
 {
+	/*
+	 * initialize device mmu
+	 *
+	 * call this with the global lock held
+	 */
 
 	struct kgsl_device *device = mmu->device;
 	struct kgsl_gpummu_pt *gpummu_pt;
@@ -493,23 +545,28 @@
 	if (mmu->flags & KGSL_FLAGS_STARTED)
 		return 0;
 
-	
+	/* MMU not enabled */
 	if ((mmu->config & 0x1) == 0)
 		return 0;
 
-	
+	/* setup MMU and sub-client behavior */
 	kgsl_regwrite(device, MH_MMU_CONFIG, mmu->config);
 
-	
+	/* idle device */
 	kgsl_idle(device);
 
-	
+	/* enable axi interrupts */
 	kgsl_regwrite(device, MH_INTERRUPT_MASK,
 			GSL_MMU_INT_MASK | MH_INTERRUPT_MASK__MMU_PAGE_FAULT);
 
 	kgsl_sharedmem_set(&mmu->setstate_memory, 0, 0,
 			   mmu->setstate_memory.size);
 
+	/* TRAN_ERROR needs a 32 byte (32 byte aligned) chunk of memory
+	 * to complete transactions in case of an MMU fault. Note that
+	 * we'll leave the bottom 32 bytes of the setstate_memory for other
+	 * purposes (e.g. use it when dummy read cycles are needed
+	 * for other blocks) */
 	kgsl_regwrite(device, MH_MMU_TRAN_ERROR,
 		mmu->setstate_memory.physaddr + 32);
 
@@ -517,7 +574,7 @@
 		mmu->defaultpagetable =
 			kgsl_mmu_getpagetable(KGSL_MMU_GLOBAL_PT);
 
-	
+	/* Return error if the default pagetable doesn't exist */
 	if (mmu->defaultpagetable == NULL)
 		return -ENOMEM;
 
@@ -544,6 +601,9 @@
 	unsigned int range = kgsl_sg_size(memdesc->sg, memdesc->sglen);
 	struct kgsl_gpummu_pt *gpummu_pt = mmu_specific_pt;
 
+	/* All GPU addresses as assigned are page aligned, but some
+	   functions perturb the gpuaddr with an offset, so apply the
+	   mask here to make sure we have the right address */
 
 	unsigned int gpuaddr = memdesc->gpuaddr &  KGSL_MMU_ALIGN_MASK;
 
@@ -558,7 +618,7 @@
 	GSL_TLBFLUSH_FILTER_SETDIRTY(superpte / GSL_PT_SUPER_PTE);
 	for (pte = ptefirst; pte < ptelast; pte++) {
 #ifdef VERBOSE_DEBUG
-		
+		/* check if PTE exists */
 		if (!kgsl_pt_map_get(gpummu_pt, pte))
 			KGSL_CORE_ERR("pt entry %x is already "
 			"unmapped for pagetable %p\n", pte, gpummu_pt);
@@ -570,7 +630,7 @@
 				GSL_PT_SUPER_PTE);
 	}
 
-	
+	/* Post all writes to the pagetable */
 	wmb();
 
 	return 0;
@@ -594,7 +654,7 @@
 
 	pte = kgsl_pt_entry_get(KGSL_PAGETABLE_BASE, memdesc->gpuaddr);
 
-	
+	/* Flush the TLB if the first PTE isn't at the superpte boundary */
 	if (pte & (GSL_PT_SUPER_PTE - 1))
 		flushtlb = 1;
 
@@ -602,7 +662,7 @@
 		unsigned int paddr = kgsl_get_sg_pa(s);
 		unsigned int j;
 
-		
+		/* Each sg entry might be multiple pages long */
 		for (j = paddr; j < paddr + s->length; pte++, j += PAGE_SIZE) {
 			if (SUPERPTE_IS_DIRTY(pte))
 				flushtlb = 1;
@@ -610,14 +670,14 @@
 		}
 	}
 
-	
+	/* Flush the TLB if the last PTE isn't at the superpte boundary */
 	if ((pte + 1) & (GSL_PT_SUPER_PTE - 1))
 		flushtlb = 1;
 
 	wmb();
 
 	if (flushtlb) {
-		
+		/* set all devices as needing flushing */
 		*tlb_flags = UINT_MAX;
 		GSL_TLBFLUSH_FILTER_RESET();
 	}
@@ -627,12 +687,16 @@
 
 static void kgsl_gpummu_stop(struct kgsl_mmu *mmu)
 {
-	kgsl_regwrite(mmu->device, MH_MMU_CONFIG, 0x00000000);
 	mmu->flags &= ~KGSL_FLAGS_STARTED;
 }
 
 static int kgsl_gpummu_close(struct kgsl_mmu *mmu)
 {
+	/*
+	 *  close device mmu
+	 *
+	 *  call this with the global lock held
+	 */
 	if (mmu->setstate_memory.gpuaddr)
 		kgsl_sharedmem_free(&mmu->setstate_memory);
 
@@ -651,12 +715,18 @@
 }
 
 static unsigned int
-kgsl_gpummu_pt_get_base_addr(struct kgsl_pagetable *pt)
+kgsl_gpummu_get_pt_base_addr(struct kgsl_mmu *mmu,
+			struct kgsl_pagetable *pt)
 {
 	struct kgsl_gpummu_pt *gpummu_pt = pt->priv;
 	return gpummu_pt->base.gpuaddr;
 }
 
+static int kgsl_gpummu_get_num_iommu_units(struct kgsl_mmu *mmu)
+{
+	return 1;
+}
+
 struct kgsl_mmu_ops gpummu_ops = {
 	.mmu_init = kgsl_gpummu_init,
 	.mmu_close = kgsl_gpummu_close,
@@ -666,10 +736,13 @@
 	.mmu_device_setstate = kgsl_gpummu_default_setstate,
 	.mmu_pagefault = kgsl_gpummu_pagefault,
 	.mmu_get_current_ptbase = kgsl_gpummu_get_current_ptbase,
+	.mmu_pt_equal = kgsl_gpummu_pt_equal,
+	.mmu_get_pt_base_addr = kgsl_gpummu_get_pt_base_addr,
 	.mmu_enable_clk = NULL,
 	.mmu_disable_clk_on_ts = NULL,
 	.mmu_get_pt_lsb = NULL,
-	.mmu_get_reg_map_desc = NULL,
+	.mmu_get_reg_gpuaddr = NULL,
+	.mmu_get_num_iommu_units = kgsl_gpummu_get_num_iommu_units,
 };
 
 struct kgsl_mmu_pt_ops gpummu_pt_ops = {
@@ -677,6 +750,4 @@
 	.mmu_unmap = kgsl_gpummu_unmap,
 	.mmu_create_pagetable = kgsl_gpummu_create_pagetable,
 	.mmu_destroy_pagetable = kgsl_gpummu_destroy_pagetable,
-	.mmu_pt_equal = kgsl_gpummu_pt_equal,
-	.mmu_pt_get_base_addr = kgsl_gpummu_pt_get_base_addr,
 };
diff --git a/drivers/gpu/msm/kgsl_gpummu.h b/drivers/gpu/msm/kgsl_gpummu.h
index d49a430..99e7d5f 100644
--- a/drivers/gpu/msm/kgsl_gpummu.h
+++ b/drivers/gpu/msm/kgsl_gpummu.h
@@ -1,4 +1,4 @@
-/* Copyright (c) 2011-2012, Code Aurora Forum. All rights reserved.
+/* Copyright (c) 2011-2012, The Linux Foundation. All rights reserved.
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License version 2 and
@@ -21,6 +21,7 @@
 	(MH_INTERRUPT_MASK__AXI_READ_ERROR | \
 	 MH_INTERRUPT_MASK__AXI_WRITE_ERROR)
 
+/* Macros to manage TLB flushing */
 #define GSL_TLBFLUSH_FILTER_ENTRY_NUMBITS     (sizeof(unsigned char) * 8)
 #define GSL_TLBFLUSH_FILTER_GET(superpte)			     \
 	      (*((unsigned char *)				    \
@@ -46,7 +47,7 @@
 struct kgsl_gpummu_pt {
 	struct kgsl_memdesc  base;
 	unsigned int   last_superpte;
-	
+	/* Maintain filter to manage tlb flushing */
 	struct kgsl_tlbflushfilter tlbflushfilter;
 };
 
@@ -74,4 +75,4 @@
 void *kgsl_gpummu_ptpool_init(int entries);
 void kgsl_gpummu_ptpool_destroy(void *ptpool);
 
-#endif 
+#endif /* __KGSL_GPUMMU_H */
diff --git a/drivers/gpu/msm/kgsl_iommu.c b/drivers/gpu/msm/kgsl_iommu.c
index dc517ae..f2393e4 100644
--- a/drivers/gpu/msm/kgsl_iommu.c
+++ b/drivers/gpu/msm/kgsl_iommu.c
@@ -1,4 +1,4 @@
-/* Copyright (c) 2011-2012, Code Aurora Forum. All rights reserved.
+/* Copyright (c) 2011-2012, The Linux Foundation. All rights reserved.
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License version 2 and
@@ -33,9 +33,28 @@
 #include "z180.h"
 
 
+static struct kgsl_iommu_register_list kgsl_iommuv1_reg[KGSL_IOMMU_REG_MAX] = {
+	{ 0, 0, 0 },				/* GLOBAL_BASE */
+	{ 0x10, 0x0003FFFF, 14 },		/* TTBR0 */
+	{ 0x14, 0x0003FFFF, 14 },		/* TTBR1 */
+	{ 0x20, 0, 0 },				/* FSR */
+	{ 0x800, 0, 0 },			/* TLBIALL */
+	{ 0x820, 0, 0 },			/* RESUME */
+};
+
+static struct kgsl_iommu_register_list kgsl_iommuv2_reg[KGSL_IOMMU_REG_MAX] = {
+	{ 0, 0, 0 },				/* GLOBAL_BASE */
+	{ 0x20, 0x00FFFFFF, 14 },		/* TTBR0 */
+	{ 0x28, 0x00FFFFFF, 14 },		/* TTBR1 */
+	{ 0x58, 0, 0 },				/* FSR */
+	{ 0x618, 0, 0 },			/* TLBIALL */
+	{ 0x008, 0, 0 }				/* RESUME */
+};
+
 struct remote_iommu_petersons_spinlock kgsl_iommu_sync_lock_vars;
 
-static struct kgsl_iommu_unit *get_iommu_unit(struct device *dev)
+static int get_iommu_unit(struct device *dev, struct kgsl_mmu **mmu_out,
+			struct kgsl_iommu_unit **iommu_unit_out)
 {
 	int i, j, k;
 
@@ -56,13 +75,16 @@
 			struct kgsl_iommu_unit *iommu_unit =
 				&iommu->iommu_units[j];
 			for (k = 0; k < iommu_unit->dev_count; k++) {
-				if (iommu_unit->dev[k].dev == dev)
-					return iommu_unit;
+				if (iommu_unit->dev[k].dev == dev) {
+					*mmu_out = mmu;
+					*iommu_unit_out = iommu_unit;
+					return 0;
+				}
 			}
 		}
 	}
 
-	return NULL;
+	return -EINVAL;
 }
 
 static struct kgsl_iommu_device *get_iommu_device(struct kgsl_iommu_unit *unit,
@@ -81,47 +103,71 @@
 static int kgsl_iommu_fault_handler(struct iommu_domain *domain,
 	struct device *dev, unsigned long addr, int flags)
 {
-	struct kgsl_iommu_unit *iommu_unit = get_iommu_unit(dev);
-	struct kgsl_iommu_device *iommu_dev = get_iommu_device(iommu_unit, dev);
+	int ret = 0;
+	struct kgsl_mmu *mmu;
+	struct kgsl_iommu *iommu;
+	struct kgsl_iommu_unit *iommu_unit;
+	struct kgsl_iommu_device *iommu_dev;
 	unsigned int ptbase, fsr;
-	static unsigned long last_pagefault_jiffies;
-	static int last_pid;
-	int current_pid;
-	unsigned long wait_time_jiff = 0;
+	struct kgsl_device *device;
+	struct adreno_device *adreno_dev;
+	unsigned int no_page_fault_log = 0;
 
+	ret = get_iommu_unit(dev, &mmu, &iommu_unit);
+	if (ret)
+		goto done;
+	iommu_dev = get_iommu_device(iommu_unit, dev);
 	if (!iommu_dev) {
 		KGSL_CORE_ERR("Invalid IOMMU device %p\n", dev);
-		return -ENOSYS;
+		ret = -ENOSYS;
+		goto done;
 	}
+	iommu = mmu->priv;
+	device = mmu->device;
+	adreno_dev = ADRENO_DEVICE(device);
 
-	wait_time_jiff = last_pagefault_jiffies + msecs_to_jiffies(500);
-	last_pagefault_jiffies = jiffies;
+	ptbase = KGSL_IOMMU_GET_CTX_REG(iommu, iommu_unit,
+					iommu_dev->ctx_id, TTBR0);
 
-	ptbase = KGSL_IOMMU_GET_IOMMU_REG(iommu_unit->reg_map.hostptr,
-			iommu_dev->ctx_id, TTBR0);
-	current_pid = kgsl_mmu_get_ptname_from_ptbase(ptbase);
+	fsr = KGSL_IOMMU_GET_CTX_REG(iommu, iommu_unit,
+		iommu_dev->ctx_id, FSR);
 
-	if ((last_pid != current_pid) ||
-	    (time_after(jiffies, wait_time_jiff))
-	   ) {
-		fsr = KGSL_IOMMU_GET_IOMMU_REG(iommu_unit->reg_map.hostptr,
-			iommu_dev->ctx_id, FSR);
+	if (adreno_dev->ft_pf_policy & KGSL_FT_PAGEFAULT_LOG_ONE_PER_PAGE)
+		no_page_fault_log = kgsl_mmu_log_fault_addr(mmu, ptbase, addr);
 
+	if (!no_page_fault_log) {
 		KGSL_MEM_CRIT(iommu_dev->kgsldev,
 			"GPU PAGE FAULT: addr = %lX pid = %d\n",
-			addr, kgsl_mmu_get_ptname_from_ptbase(ptbase));
+			addr, kgsl_mmu_get_ptname_from_ptbase(mmu, ptbase));
 		KGSL_MEM_CRIT(iommu_dev->kgsldev, "context = %d FSR = %X\n",
 			iommu_dev->ctx_id, fsr);
-
-		last_pid = current_pid;
 	}
 
-	trace_kgsl_mmu_pagefault(iommu_dev->kgsldev, addr,
-			kgsl_mmu_get_ptname_from_ptbase(ptbase), 0);
+	mmu->fault = 1;
+	iommu_dev->fault = 1;
 
-	return 0;
+	trace_kgsl_mmu_pagefault(iommu_dev->kgsldev, addr,
+			kgsl_mmu_get_ptname_from_ptbase(mmu, ptbase), 0);
+
+	/*
+	 * We do not want the h/w to resume fetching data from an iommu unit
+	 * that has faulted, this is better for debugging as it will stall
+	 * the GPU and trigger a snapshot. To stall the transaction return
+	 * EBUSY error.
+	 */
+	if (adreno_dev->ft_pf_policy & KGSL_FT_PAGEFAULT_GPUHALT_ENABLE)
+		ret = -EBUSY;
+done:
+	return ret;
 }
 
+/*
+ * kgsl_iommu_disable_clk - Disable iommu clocks
+ * @mmu - Pointer to mmu structure
+ *
+ * Disables iommu clocks
+ * Return - void
+ */
 static void kgsl_iommu_disable_clk(struct kgsl_mmu *mmu)
 {
 	struct kgsl_iommu *iommu = mmu->priv;
@@ -135,6 +181,8 @@
 				continue;
 			iommu_drvdata = dev_get_drvdata(
 					iommu_unit->dev[j].dev->parent);
+			if (iommu_drvdata->aclk)
+				clk_disable_unprepare(iommu_drvdata->aclk);
 			if (iommu_drvdata->clk)
 				clk_disable_unprepare(iommu_drvdata->clk);
 			clk_disable_unprepare(iommu_drvdata->pclk);
@@ -143,6 +191,18 @@
 	}
 }
 
+/*
+ * kgsl_iommu_clk_disable_event - An event function that is executed when
+ * the required timestamp is reached. It disables the IOMMU clocks if
+ * the timestamp on which the clocks can be disabled has expired.
+ * @device - The kgsl device pointer
+ * @data - The data passed during event creation, it is the MMU pointer
+ * @id - Context ID, should always be KGSL_MEMSTORE_GLOBAL
+ * @ts - The current timestamp that has expired for the device
+ *
+ * Disables IOMMU clocks if timestamp has expired
+ * Return - void
+ */
 static void kgsl_iommu_clk_disable_event(struct kgsl_device *device, void *data,
 					unsigned int id, unsigned int ts)
 {
@@ -162,6 +222,9 @@
 		kgsl_iommu_disable_clk(mmu);
 		iommu->clk_event_queued = false;
 	} else {
+		/* add new event to fire when ts is reached, this can happen
+		 * if we queued an event and someone requested the clocks to
+		 * be disabled on a later timestamp */
 		if (kgsl_add_event(device, id, iommu->iommu_last_cmd_ts,
 			kgsl_iommu_clk_disable_event, mmu, mmu)) {
 				KGSL_DRV_ERR(device,
@@ -171,6 +234,20 @@
 	}
 }
 
+/*
+ * kgsl_iommu_disable_clk_on_ts - Sets up event to disable IOMMU clocks
+ * @mmu - The kgsl MMU pointer
+ * @ts - Timestamp on which the clocks should be disabled
+ * @ts_valid - Indicates whether ts parameter is valid, if this parameter
+ * is false then it means that the calling function wants to disable the
+ * IOMMU clocks immediately without waiting for any timestamp
+ *
+ * Creates an event to disable the IOMMU clocks on timestamp and if event
+ * already exists then updates the timestamp of disabling the IOMMU clocks
+ * with the passed in ts if it is greater than the current value at which
+ * the clocks will be disabled
+ * Return - void
+ */
 static void
 kgsl_iommu_disable_clk_on_ts(struct kgsl_mmu *mmu, unsigned int ts,
 				bool ts_valid)
@@ -197,6 +274,14 @@
 	}
 }
 
+/*
+ * kgsl_iommu_enable_clk - Enable iommu clocks
+ * @mmu - Pointer to mmu structure
+ * @ctx_id - The context bank whose clocks are to be turned on
+ *
+ * Enables iommu clocks of a given context
+ * Return: 0 on success else error code
+ */
 static int kgsl_iommu_enable_clk(struct kgsl_mmu *mmu,
 				int ctx_id)
 {
@@ -224,6 +309,17 @@
 					goto done;
 				}
 			}
+			if (iommu_drvdata->aclk) {
+				ret = clk_prepare_enable(iommu_drvdata->aclk);
+				if (ret) {
+					if (iommu_drvdata->clk)
+						clk_disable_unprepare(
+							iommu_drvdata->clk);
+					clk_disable_unprepare(
+							iommu_drvdata->pclk);
+					goto done;
+				}
+			}
 			iommu_unit->dev[j].clk_enabled = true;
 		}
 	}
@@ -233,21 +329,44 @@
 	return ret;
 }
 
-static int kgsl_iommu_pt_equal(struct kgsl_pagetable *pt,
-					unsigned int pt_base)
+/*
+ * kgsl_iommu_pt_equal - Check if pagetables are equal
+ * @mmu - Pointer to mmu structure
+ * @pt - Pointer to pagetable
+ * @pt_base - Address of a pagetable that the IOMMU register is
+ * programmed with
+ *
+ * Checks whether the pt_base is equal to the base address of
+ * the pagetable which is contained in the pt structure
+ * Return - Non-zero if the pagetable addresses are equal else 0
+ */
+static int kgsl_iommu_pt_equal(struct kgsl_mmu *mmu,
+				struct kgsl_pagetable *pt,
+				unsigned int pt_base)
 {
+	struct kgsl_iommu *iommu = mmu->priv;
 	struct kgsl_iommu_pt *iommu_pt = pt ? pt->priv : NULL;
 	unsigned int domain_ptbase = iommu_pt ?
 				iommu_get_pt_base_addr(iommu_pt->domain) : 0;
-	
-	domain_ptbase &= (KGSL_IOMMU_TTBR0_PA_MASK <<
-				KGSL_IOMMU_TTBR0_PA_SHIFT);
-	pt_base &= (KGSL_IOMMU_TTBR0_PA_MASK <<
-				KGSL_IOMMU_TTBR0_PA_SHIFT);
+	/* Only compare the valid address bits of the pt_base */
+	domain_ptbase &=
+		(iommu->iommu_reg_list[KGSL_IOMMU_CTX_TTBR0].reg_mask <<
+		iommu->iommu_reg_list[KGSL_IOMMU_CTX_TTBR0].reg_shift);
+
+	pt_base &=
+		(iommu->iommu_reg_list[KGSL_IOMMU_CTX_TTBR0].reg_mask <<
+		iommu->iommu_reg_list[KGSL_IOMMU_CTX_TTBR0].reg_shift);
+
 	return domain_ptbase && pt_base &&
 		(domain_ptbase == pt_base);
 }
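
With the IOMMUv1 table above (TTBR0 mask 0x0003FFFF, shift 14), the comparison keeps only bits [31:14] of TTBR0, i.e. the pagetable base address, and discards the low control bits. A quick standalone check of that masking:

    #include <stdio.h>

    int main(void)
    {
            unsigned int reg_mask = 0x0003FFFF, reg_shift = 14; /* v1 TTBR0 */
            unsigned int ttbr0 = 0x8E40004A;    /* base plus low control bits */

            /* Same masking as kgsl_iommu_pt_equal() */
            unsigned int base = ttbr0 & (reg_mask << reg_shift);

            printf("0x%08X\n", base);   /* 0x8E400000 */
            return 0;
    }
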
 
+/*
+ * kgsl_iommu_destroy_pagetable - Free up resources held by a pagetable
+ * @mmu_specific_pt - Pointer to pagetable which is to be freed
+ *
+ * Return - void
+ */
 static void kgsl_iommu_destroy_pagetable(void *mmu_specific_pt)
 {
 	struct kgsl_iommu_pt *iommu_pt = mmu_specific_pt;
@@ -256,6 +375,13 @@
 	kfree(iommu_pt);
 }
 
+/*
+ * kgsl_iommu_create_pagetable - Create an IOMMU pagetable
+ *
+ * Allocate memory to hold a pagetable and allocate the IOMMU
+ * domain which is the actual IOMMU pagetable
+ * Return - pointer to the new pagetable or NULL on failure
+ */
 void *kgsl_iommu_create_pagetable(void)
 {
 	struct kgsl_iommu_pt *iommu_pt;
@@ -266,8 +392,13 @@
 				sizeof(struct kgsl_iommu_pt));
 		return NULL;
 	}
-	iommu_pt->domain = iommu_domain_alloc(&platform_bus_type,
-										  MSM_IOMMU_DOMAIN_PT_CACHEABLE);
+	/* L2 redirect is not stable on IOMMU v2 */
+	if (msm_soc_version_supports_iommu_v1())
+		iommu_pt->domain = iommu_domain_alloc(&platform_bus_type,
+					MSM_IOMMU_DOMAIN_PT_CACHEABLE);
+	else
+		iommu_pt->domain = iommu_domain_alloc(&platform_bus_type,
+					0);
 	if (!iommu_pt->domain) {
 		KGSL_CORE_ERR("Failed to create iommu domain\n");
 		kfree(iommu_pt);
@@ -280,6 +411,18 @@
 	return iommu_pt;
 }
 
+/*
+ * kgsl_detach_pagetable_iommu_domain - Detach the IOMMU unit from a
+ * pagetable
+ * @mmu - Pointer to the device mmu structure
+ * @priv - Flag indicating whether the private or user context is to be
+ * detached
+ *
+ * Detach the IOMMU unit with the domain that is contained in the
+ * hwpagetable of the given mmu. After detaching the IOMMU unit is not
+ * in use because the PTBR will not be set after a detach
+ * Return - void
+ */
 static void kgsl_detach_pagetable_iommu_domain(struct kgsl_mmu *mmu)
 {
 	struct kgsl_iommu_pt *iommu_pt;
@@ -290,6 +433,10 @@
 		struct kgsl_iommu_unit *iommu_unit = &iommu->iommu_units[i];
 		iommu_pt = mmu->defaultpagetable->priv;
 		for (j = 0; j < iommu_unit->dev_count; j++) {
+			/*
+			 * If there is a 2nd default pagetable then priv domain
+			 * is attached with this pagetable
+			 */
 			if (mmu->priv_bank_table &&
 				(KGSL_IOMMU_CONTEXT_PRIV == j))
 				iommu_pt = mmu->priv_bank_table->priv;
@@ -305,16 +452,37 @@
 	}
 }
 
+/*
+ * kgsl_attach_pagetable_iommu_domain - Attach the IOMMU unit to a
+ * pagetable, i.e. set the IOMMU's PTBR to the pagetable address and
+ * setup other IOMMU registers for the device so that it becomes
+ * active
+ * @mmu - Pointer to the device mmu structure
+ * @priv - Flag indicating whether the private or user context is to be
+ * attached
+ *
+ * Attach the IOMMU unit with the domain that is contained in the
+ * hwpagetable of the given mmu.
+ * Return - 0 on success else error code
+ */
 static int kgsl_attach_pagetable_iommu_domain(struct kgsl_mmu *mmu)
 {
 	struct kgsl_iommu_pt *iommu_pt;
 	struct kgsl_iommu *iommu = mmu->priv;
 	int i, j, ret = 0;
 
+	/*
+	 * Loop through all the iommu devices under all iommu units and
+	 * attach the domain
+	 */
 	for (i = 0; i < iommu->unit_count; i++) {
 		struct kgsl_iommu_unit *iommu_unit = &iommu->iommu_units[i];
 		iommu_pt = mmu->defaultpagetable->priv;
 		for (j = 0; j < iommu_unit->dev_count; j++) {
+			/*
+			 * If there is a 2nd default pagetable then priv domain
+			 * is attached to this pagetable
+			 */
 			if (mmu->priv_bank_table &&
 				(KGSL_IOMMU_CONTEXT_PRIV == j))
 				iommu_pt = mmu->priv_bank_table->priv;
@@ -339,6 +507,17 @@
 	return ret;
 }
 
+/*
+ * _get_iommu_ctxs - Get device pointer to IOMMU contexts
+ * @mmu - Pointer to mmu device
+ * @data - Pointer to the platform data containing information about
+ * iommu devices for one iommu unit
+ * @unit_id - The IOMMU unit number. This is not a specific ID but just
+ * a serial number. The serial numbers are treated as IDs of the
+ * IOMMU units
+ *
+ * Return - 0 on success else error code
+ */
 static int _get_iommu_ctxs(struct kgsl_mmu *mmu,
 	struct kgsl_device_iommu_data *data, unsigned int unit_id)
 {
@@ -384,6 +563,12 @@
 	return 0;
 }
 
+/*
+ * kgsl_iommu_init_sync_lock - Init Sync Lock between GPU and CPU
+ * @mmu - Pointer to mmu device
+ *
+ * Return - 0 on success else error code
+ */
 static int kgsl_iommu_init_sync_lock(struct kgsl_mmu *mmu)
 {
 	struct kgsl_iommu *iommu = mmu->device->mmu.priv;
@@ -401,7 +586,7 @@
 		return -ENXIO;
 	}
 
-	
+	/* Get the physical address of the Lock variables */
 	lock_phy_addr = (msm_iommu_lock_initialize()
 			- MSM_SHARED_RAM_BASE + msm_shared_ram_phys);
 
@@ -411,7 +596,7 @@
 		return -ENXIO;
 	}
 
-	
+	/* Align the physical address to PAGE boundary and store the offset */
 	page_offset = (lock_phy_addr & (PAGE_SIZE - 1));
 	lock_phy_addr = (lock_phy_addr & ~(PAGE_SIZE - 1));
 	iommu->sync_lock_desc.physaddr = (unsigned int)lock_phy_addr;
@@ -425,8 +610,8 @@
 	if (status)
 		return status;
 
-	
-	iommu->sync_lock_desc.priv |= KGSL_MEMFLAGS_GLOBAL;
+	/* Map Lock variables to GPU pagetable */
+	iommu->sync_lock_desc.priv |= KGSL_MEMDESC_GLOBAL;
 
 	pagetable = mmu->priv_bank_table ? mmu->priv_bank_table :
 				mmu->defaultpagetable;
@@ -436,11 +621,11 @@
 
 	if (status) {
 		kgsl_mmu_unmap(pagetable, &iommu->sync_lock_desc);
-		iommu->sync_lock_desc.priv &= ~~KGSL_MEMFLAGS_GLOBAL;
+		iommu->sync_lock_desc.priv &= ~KGSL_MEMDESC_GLOBAL;
 		return status;
 	}
 
-	
+	/* Store Lock variables GPU address  */
 	lock_gpu_addr = (iommu->sync_lock_desc.gpuaddr + page_offset);
 
 	kgsl_iommu_sync_lock_vars.flag[PROC_APPS] = (lock_gpu_addr +
@@ -454,12 +639,19 @@
 
 	iommu->sync_lock_vars = &kgsl_iommu_sync_lock_vars;
 
-	
+	/* Flag Sync Lock is Initialized  */
 	iommu->sync_lock_initialized = 1;
 
 	return status;
 }
 
+/*
+ * kgsl_iommu_sync_lock - Acquire Sync Lock between GPU and CPU
+ * @mmu - Pointer to mmu device
+ * @cmds - Pointer to array of commands
+ *
+ * Return - int - number of commands.
+ */
 inline unsigned int kgsl_iommu_sync_lock(struct kgsl_mmu *mmu,
 						unsigned int *cmds)
 {
@@ -480,7 +672,7 @@
 	cmds += adreno_add_idle_cmds(adreno_dev, cmds);
 
 	*cmds++ = cp_type3_packet(CP_WAIT_REG_MEM, 5);
-	
+	/* MEM SPACE = memory, FUNCTION = equals */
 	*cmds++ = 0x13;
 	*cmds++ = lock_vars->flag[PROC_GPU];
 	*cmds++ = 0x1;
@@ -494,7 +686,7 @@
 	cmds += adreno_add_idle_cmds(adreno_dev, cmds);
 
 	*cmds++ = cp_type3_packet(CP_WAIT_REG_MEM, 5);
-	
+	/* MEM SPACE = memory, FUNCTION = equals */
 	*cmds++ = 0x13;
 	*cmds++ = lock_vars->flag[PROC_GPU];
 	*cmds++ = 0x1;
@@ -511,6 +703,13 @@
 	return cmds - start;
 }
 
+/*
+ * kgsl_iommu_sync_unlock - Release Sync Lock between GPU and CPU
+ * @mmu - Pointer to mmu device
+ * @cmds - Pointer to array of commands
+ *
+ * Return - int - number of commands.
+ */
 inline unsigned int kgsl_iommu_sync_unlock(struct kgsl_mmu *mmu,
 					unsigned int *cmds)
 {
@@ -529,7 +728,7 @@
 	*cmds++ = 0;
 
 	*cmds++ = cp_type3_packet(CP_WAIT_REG_MEM, 5);
-	
+	/* MEM SPACE = memory, FUNCTION = equals */
 	*cmds++ = 0x13;
 	*cmds++ = lock_vars->flag[PROC_GPU];
 	*cmds++ = 0x0;
@@ -541,6 +740,14 @@
 	return cmds - start;
 }
 
+/*
+ * kgsl_get_iommu_ctxt - Get device pointer to IOMMU contexts
+ * @mmu - Pointer to mmu device
+ *
+ * Get the device pointers for the IOMMU user and priv contexts of the
+ * kgsl device
+ * Return - 0 on success else error code
+ */
 static int kgsl_get_iommu_ctxt(struct kgsl_mmu *mmu)
 {
 	struct platform_device *pdev =
@@ -550,7 +757,7 @@
 	struct kgsl_iommu *iommu = mmu->device->mmu.priv;
 	int i, ret = 0;
 
-	
+	/* Go through the IOMMU data and get all the context devices */
 	if (KGSL_IOMMU_MAX_UNITS < pdata_dev->iommu_count) {
 		KGSL_CORE_ERR("Too many IOMMU units defined\n");
 		ret = -EINVAL;
@@ -567,6 +774,13 @@
 	return ret;
 }
 
+/*
+ * kgsl_set_register_map - Map the IOMMU registers in the memory descriptors
+ * of the respective iommu units
+ * @mmu - Pointer to mmu structure
+ *
+ * Return - 0 on success else error code
+ */
 static int kgsl_set_register_map(struct kgsl_mmu *mmu)
 {
 	struct platform_device *pdev =
@@ -580,7 +794,7 @@
 	for (; i < pdata_dev->iommu_count; i++) {
 		struct kgsl_device_iommu_data data = pdata_dev->iommu_data[i];
 		iommu_unit = &iommu->iommu_units[i];
-		
+		/* set up the IOMMU register map for the given IOMMU unit */
 		if (!data.physstart || !data.physend) {
 			KGSL_CORE_ERR("The register range for IOMMU unit not"
 					" specified\n");
@@ -599,13 +813,15 @@
 		}
 		iommu_unit->reg_map.size = data.physend - data.physstart + 1;
 		iommu_unit->reg_map.physaddr = data.physstart;
-		memdesc_sg_phys(&iommu_unit->reg_map, data.physstart,
+		ret = memdesc_sg_phys(&iommu_unit->reg_map, data.physstart,
 				iommu_unit->reg_map.size);
+		if (ret)
+			goto err;
 	}
 	iommu->unit_count = pdata_dev->iommu_count;
 	return ret;
 err:
-	
+	/* Unmap any mapped IOMMU regions */
 	for (; i >= 0; i--) {
 		iommu_unit = &iommu->iommu_units[i];
 		iounmap(iommu_unit->reg_map.hostptr);
@@ -615,12 +831,36 @@
 	return ret;
 }
 
-static unsigned int kgsl_iommu_pt_get_base_addr(struct kgsl_pagetable *pt)
+/*
+ * kgsl_iommu_get_pt_base_addr - Get the address of the pagetable that the
+ * IOMMU ttbr0 register is programmed with
+ * @mmu - Pointer to mmu
+ * @pt - kgsl pagetable pointer that contains the IOMMU domain pointer
+ *
+ * Return - actual pagetable address that the ttbr0 register is programmed
+ * with
+ */
+static unsigned int kgsl_iommu_get_pt_base_addr(struct kgsl_mmu *mmu,
+						struct kgsl_pagetable *pt)
 {
+	struct kgsl_iommu *iommu = mmu->priv;
 	struct kgsl_iommu_pt *iommu_pt = pt->priv;
-	return iommu_get_pt_base_addr(iommu_pt->domain);
+	return iommu_get_pt_base_addr(iommu_pt->domain) &
+			(iommu->iommu_reg_list[KGSL_IOMMU_CTX_TTBR0].reg_mask <<
+			iommu->iommu_reg_list[KGSL_IOMMU_CTX_TTBR0].reg_shift);
 }
 
+/*
+ * kgsl_iommu_get_pt_lsb - Return the lsb of the ttbr0 IOMMU register
+ * @mmu - Pointer to mmu structure
+ * @hostptr - Pointer to the IOMMU register map. This is used to match
+ * the iommu device whose lsb value is to be returned
+ * @ctx_id - The context bank whose lsb value is to be returned
+ * Return - returns the lsb which is the last 14 bits of the ttbr0 IOMMU
+ * register. ttbr0 is the actual PTBR of the IOMMU. The last 14 bits
+ * are only programmed once in the beginning when a domain is attached
+ * and do not change.
+ */
 static int kgsl_iommu_get_pt_lsb(struct kgsl_mmu *mmu,
 				unsigned int unit_id,
 				enum kgsl_iommu_context_id ctx_id)
@@ -642,6 +882,9 @@
 				unsigned int context_id)
 {
 	if (mmu->flags & KGSL_FLAGS_STARTED) {
+		/* If the page table is not current then set up the mmu to
+		 * use the newly specified page table
+		 */
 		if (mmu->hwpagetable != pagetable) {
 			unsigned int flags = 0;
 			mmu->hwpagetable = pagetable;
@@ -656,6 +899,11 @@
 
 static int kgsl_iommu_init(struct kgsl_mmu *mmu)
 {
+	/*
+	 * initialize device mmu
+	 *
+	 * call this with the global lock held
+	 */
 	int status = 0;
 	struct kgsl_iommu *iommu;
 
@@ -674,6 +922,19 @@
 	if (status)
 		goto done;
 
+	iommu->iommu_reg_list = kgsl_iommuv1_reg;
+	iommu->ctx_offset = KGSL_IOMMU_CTX_OFFSET_V1;
+
+	if (msm_soc_version_supports_iommu_v1()) {
+		iommu->iommu_reg_list = kgsl_iommuv1_reg;
+		iommu->ctx_offset = KGSL_IOMMU_CTX_OFFSET_V1;
+	} else {
+		iommu->iommu_reg_list = kgsl_iommuv2_reg;
+		iommu->ctx_offset = KGSL_IOMMU_CTX_OFFSET_V2;
+	}
+
+	/* A nop is required in an indirect buffer when switching
+	 * pagetables in-stream */
 	kgsl_sharedmem_writel(&mmu->setstate_memory,
 				KGSL_IOMMU_SETSTATE_NOP_OFFSET,
 				cp_nop_packet(1));
@@ -688,41 +949,53 @@
 	return status;
 }
 
+/*
+ * kgsl_iommu_setup_defaultpagetable - Setup the initial defaultpagetable
+ * for iommu. This function is only called once during first start; successive
+ * starts do not call this function.
+ * @mmu - Pointer to mmu structure
+ *
+ * Create the initial defaultpagetable and setup the iommu mappings to it
+ * Return - 0 on success else error code
+ */
 static int kgsl_iommu_setup_defaultpagetable(struct kgsl_mmu *mmu)
 {
 	int status = 0;
 	int i = 0;
 	struct kgsl_iommu *iommu = mmu->priv;
-	struct kgsl_iommu_pt *iommu_pt;
 	struct kgsl_pagetable *pagetable = NULL;
 
-	if (!cpu_is_msm8960()) {
+	/* If chip is not 8960 then we use the 2nd context bank for pagetable
+	 * switching on the 3D side for which a separate table is allocated */
+	if (!cpu_is_msm8960() && msm_soc_version_supports_iommu_v1()) {
 		mmu->priv_bank_table =
 			kgsl_mmu_getpagetable(KGSL_MMU_PRIV_BANK_TABLE_NAME);
 		if (mmu->priv_bank_table == NULL) {
 			status = -ENOMEM;
 			goto err;
 		}
-		iommu_pt = mmu->priv_bank_table->priv;
 	}
 	mmu->defaultpagetable = kgsl_mmu_getpagetable(KGSL_MMU_GLOBAL_PT);
-	
+	/* Return error if the default pagetable doesn't exist */
 	if (mmu->defaultpagetable == NULL) {
 		status = -ENOMEM;
 		goto err;
 	}
 	pagetable = mmu->priv_bank_table ? mmu->priv_bank_table :
 				mmu->defaultpagetable;
-	
-	for (i = 0; i < iommu->unit_count; i++) {
-		iommu->iommu_units[i].reg_map.priv |= KGSL_MEMFLAGS_GLOBAL;
-		status = kgsl_mmu_map(pagetable,
-			&(iommu->iommu_units[i].reg_map),
-			GSL_PT_PAGE_RV | GSL_PT_PAGE_WV);
-		if (status) {
-			iommu->iommu_units[i].reg_map.priv &=
-							~KGSL_MEMFLAGS_GLOBAL;
-			goto err;
+	/* Map the IOMMU registers only to the defaultpagetable */
+	if (msm_soc_version_supports_iommu_v1()) {
+		for (i = 0; i < iommu->unit_count; i++) {
+			iommu->iommu_units[i].reg_map.priv |=
+						KGSL_MEMDESC_GLOBAL;
+			status = kgsl_mmu_map(pagetable,
+				&(iommu->iommu_units[i].reg_map),
+				GSL_PT_PAGE_RV | GSL_PT_PAGE_WV);
+			if (status) {
+				iommu->iommu_units[i].reg_map.priv &=
+							~KGSL_MEMDESC_GLOBAL;
+				goto err;
+			}
 		}
 	}
 	return status;
@@ -730,7 +1003,7 @@
 	for (i--; i >= 0; i--) {
 		kgsl_mmu_unmap(pagetable,
 				&(iommu->iommu_units[i].reg_map));
-		iommu->iommu_units[i].reg_map.priv &= ~KGSL_MEMFLAGS_GLOBAL;
+		iommu->iommu_units[i].reg_map.priv &= ~KGSL_MEMDESC_GLOBAL;
 	}
 	if (mmu->priv_bank_table) {
 		kgsl_mmu_putpagetable(mmu->priv_bank_table);
@@ -758,20 +1031,20 @@
 		if (status)
 			return -ENOMEM;
 
-		
+		/* Initialize the sync lock between GPU and CPU */
 		if (msm_soc_version_supports_iommu_v1() &&
 			(device->id == KGSL_DEVICE_3D0))
 				kgsl_iommu_init_sync_lock(mmu);
 	}
 
+	/* We use the GPU MMU to control access to IOMMU registers on 8960 with
+	 * a225, hence we still keep the MMU active on 8960 */
 	if (cpu_is_msm8960()) {
 		struct kgsl_mh *mh = &(mmu->device->mh);
 		kgsl_regwrite(mmu->device, MH_MMU_CONFIG, 0x00000001);
 		kgsl_regwrite(mmu->device, MH_MMU_MPU_END,
 			mh->mpu_base +
-			iommu->iommu_units
-				[iommu->unit_count - 1].reg_map.gpuaddr -
-				PAGE_SIZE);
+			iommu->iommu_units[0].reg_map.gpuaddr);
 	} else {
 		kgsl_regwrite(mmu->device, MH_MMU_CONFIG, 0x00000000);
 	}
@@ -793,14 +1066,20 @@
 		KGSL_CORE_ERR("clk enable failed\n");
 		goto done;
 	}
+	/* Get the lsb value of pagetables set in the IOMMU ttbr0 register as
+	 * that value should not change when we change pagetables, so while
+	 * changing pagetables we can use this lsb value of the pagetable w/o
+	 * having to read it again
+	 */
 	for (i = 0; i < iommu->unit_count; i++) {
 		struct kgsl_iommu_unit *iommu_unit = &iommu->iommu_units[i];
-		for (j = 0; j < iommu_unit->dev_count; j++)
-			iommu_unit->dev[j].pt_lsb = KGSL_IOMMMU_PT_LSB(
-						KGSL_IOMMU_GET_IOMMU_REG(
-						iommu_unit->reg_map.hostptr,
+		for (j = 0; j < iommu_unit->dev_count; j++) {
+			iommu_unit->dev[j].pt_lsb = KGSL_IOMMMU_PT_LSB(iommu,
+						KGSL_IOMMU_GET_CTX_REG(iommu,
+						iommu_unit,
 						iommu_unit->dev[j].ctx_id,
 						TTBR0));
+		}
 	}
 
 	kgsl_iommu_disable_clk_on_ts(mmu, 0, false);
@@ -823,6 +1102,9 @@
 	unsigned int range = kgsl_sg_size(memdesc->sg, memdesc->sglen);
 	struct kgsl_iommu_pt *iommu_pt = mmu_specific_pt;
 
+	/* All GPU addresses as assigned are page aligned, but some
+	   functions perturb the gpuaddr with an offset, so apply the
+	   mask here to make sure we have the right address */
 
 	unsigned int gpuaddr = memdesc->gpuaddr &  KGSL_MMU_ALIGN_MASK;
 
@@ -836,7 +1118,11 @@
 			range, ret);
 
 #ifdef CONFIG_KGSL_PER_PROCESS_PAGE_TABLE
-	if (!ret)
+	/*
+	 * Flushing only required if per process pagetables are used. With
+	 * global case, flushing will happen inside iommu_map function
+	 */
+	if (!ret && msm_soc_version_supports_iommu_v1())
 		*tlb_flags = UINT_MAX;
 #endif
 	return 0;
@@ -874,17 +1160,39 @@
 static void kgsl_iommu_stop(struct kgsl_mmu *mmu)
 {
 	struct kgsl_iommu *iommu = mmu->priv;
+	int i, j;
+	/*
+	 *  stop device mmu
+	 *
+	 *  call this with the global lock held
+	 */
 
 	if (mmu->flags & KGSL_FLAGS_STARTED) {
-		kgsl_regwrite(mmu->device, MH_MMU_CONFIG, 0x00000000);
-		
+		/* detach iommu attachment */
 		kgsl_detach_pagetable_iommu_domain(mmu);
 		mmu->hwpagetable = NULL;
 
 		mmu->flags &= ~KGSL_FLAGS_STARTED;
-	}
 
-	
+		if (mmu->fault) {
+			for (i = 0; i < iommu->unit_count; i++) {
+				struct kgsl_iommu_unit *iommu_unit =
+					&iommu->iommu_units[i];
+				for (j = 0; j < iommu_unit->dev_count; j++) {
+					if (iommu_unit->dev[j].fault) {
+						kgsl_iommu_enable_clk(mmu, j);
+						KGSL_IOMMU_SET_CTX_REG(iommu,
+						iommu_unit,
+						iommu_unit->dev[j].ctx_id,
+						RESUME, 1);
+						iommu_unit->dev[j].fault = 0;
+					}
+				}
+			}
+			mmu->fault = 0;
+		}
+	}
+	/* switch off MMU clocks and cancel any events it has queued */
 	iommu->clk_event_queued = false;
 	kgsl_cancel_events(mmu->device, mmu);
 	kgsl_iommu_disable_clk(mmu);
@@ -920,98 +1228,117 @@
 {
 	unsigned int pt_base;
 	struct kgsl_iommu *iommu = mmu->priv;
+	/* We cannot enable or disable the clocks in interrupt context; this
+	 * function is called from interrupt context if there is an axi error */
 	if (in_interrupt())
 		return 0;
-	
+	/* Return the current pt base by reading IOMMU pt_base register */
 	kgsl_iommu_enable_clk(mmu, KGSL_IOMMU_CONTEXT_USER);
-	pt_base = readl_relaxed(iommu->iommu_units[0].reg_map.hostptr +
-			(KGSL_IOMMU_CONTEXT_USER << KGSL_IOMMU_CTX_SHIFT) +
-			KGSL_IOMMU_TTBR0);
+	pt_base = KGSL_IOMMU_GET_CTX_REG(iommu, (&iommu->iommu_units[0]),
+					KGSL_IOMMU_CONTEXT_USER,
+					TTBR0);
 	kgsl_iommu_disable_clk_on_ts(mmu, 0, false);
-	return pt_base & (KGSL_IOMMU_TTBR0_PA_MASK <<
-				KGSL_IOMMU_TTBR0_PA_SHIFT);
+	return pt_base &
+		(iommu->iommu_reg_list[KGSL_IOMMU_CTX_TTBR0].reg_mask <<
+		iommu->iommu_reg_list[KGSL_IOMMU_CTX_TTBR0].reg_shift);
 }
 
+/*
+ * kgsl_iommu_default_setstate - Change the IOMMU pagetable or flush IOMMU tlb
+ * of the primary context bank
+ * @mmu - Pointer to mmu structure
+ * @flags - Flags indicating whether the pagetable has to change or the tlb is
+ * to be flushed or both
+ *
+ * Based on flags set the new pagetable for the IOMMU unit or flush its tlb or
+ * do both by doing direct register writes to the IOMMU registers through the
+ * cpu
+ * Return - void
+ */
 static void kgsl_iommu_default_setstate(struct kgsl_mmu *mmu,
 					uint32_t flags)
 {
 	struct kgsl_iommu *iommu = mmu->priv;
 	int temp;
 	int i;
-	unsigned int pt_base = kgsl_iommu_pt_get_base_addr(
-					mmu->hwpagetable);
+	unsigned int pt_base = kgsl_iommu_get_pt_base_addr(mmu,
+						mmu->hwpagetable);
 	unsigned int pt_val;
 
 	if (kgsl_iommu_enable_clk(mmu, KGSL_IOMMU_CONTEXT_USER)) {
 		KGSL_DRV_ERR(mmu->device, "Failed to enable iommu clocks\n");
 		return;
 	}
-	
-	pt_base &= (KGSL_IOMMU_TTBR0_PA_MASK << KGSL_IOMMU_TTBR0_PA_SHIFT);
+	/* Mask off the lsb of the pt base address since lsb will not change */
+	pt_base &= (iommu->iommu_reg_list[KGSL_IOMMU_CTX_TTBR0].reg_mask <<
+			iommu->iommu_reg_list[KGSL_IOMMU_CTX_TTBR0].reg_shift);
 
-	
-	if (msm_soc_version_supports_iommu_v1())
-		kgsl_idle(mmu->device);
-
-	
+	//if (msm_soc_version_supports_iommu_v1())
+	/* Acquire GPU-CPU sync Lock here */
 	msm_iommu_lock();
 
 	if (flags & KGSL_MMUFLAGS_PTUPDATE) {
 		if (!msm_soc_version_supports_iommu_v1())
 			kgsl_idle(mmu->device);
 		for (i = 0; i < iommu->unit_count; i++) {
+			/* get the lsb value which should not change when
+			 * changing ttbr0 */
 			pt_val = kgsl_iommu_get_pt_lsb(mmu, i,
 						KGSL_IOMMU_CONTEXT_USER);
 			pt_val += pt_base;
 
-			KGSL_IOMMU_SET_IOMMU_REG(
-				iommu->iommu_units[i].reg_map.hostptr,
+			KGSL_IOMMU_SET_CTX_REG(iommu, (&iommu->iommu_units[i]),
 				KGSL_IOMMU_CONTEXT_USER, TTBR0, pt_val);
 
 			mb();
-			temp = KGSL_IOMMU_GET_IOMMU_REG(
-				iommu->iommu_units[i].reg_map.hostptr,
+			temp = KGSL_IOMMU_GET_CTX_REG(iommu,
+				(&iommu->iommu_units[i]),
 				KGSL_IOMMU_CONTEXT_USER, TTBR0);
 		}
 	}
-	
+	/* Flush tlb */
 	if (flags & KGSL_MMUFLAGS_TLBFLUSH) {
 		for (i = 0; i < iommu->unit_count; i++) {
-			KGSL_IOMMU_SET_IOMMU_REG(
-				iommu->iommu_units[i].reg_map.hostptr,
-				KGSL_IOMMU_CONTEXT_USER, CTX_TLBIALL,
-				1);
+			KGSL_IOMMU_SET_CTX_REG(iommu, (&iommu->iommu_units[i]),
+				KGSL_IOMMU_CONTEXT_USER, TLBIALL, 1);
 			mb();
 		}
 	}
 
-	
+	/* Release GPU-CPU sync Lock here */
 	msm_iommu_unlock();
 
-	
+	/* Disable smmu clock */
 	kgsl_iommu_disable_clk_on_ts(mmu, 0, false);
 }
 
-static int kgsl_iommu_get_reg_map_desc(struct kgsl_mmu *mmu,
-					void **reg_map_desc)
+/*
+ * kgsl_iommu_get_reg_gpuaddr - Returns the gpu address of an IOMMU register
+ * @mmu - Pointer to mmu structure
+ * @iommu_unit - The iommu unit for which base address is requested
+ * @ctx_id - The context ID of the IOMMU ctx
+ * @reg - The register for which address is required
+ *
+ * Return - The gpu address of the requested register, or the gpu address of
+ * the IOMMU unit's register block when reg is KGSL_IOMMU_GLOBAL_BASE
+ */
+static unsigned int kgsl_iommu_get_reg_gpuaddr(struct kgsl_mmu *mmu,
+					int iommu_unit, int ctx_id, int reg)
 {
 	struct kgsl_iommu *iommu = mmu->priv;
-	void **reg_desc_ptr;
-	int i;
 
-	reg_desc_ptr = kmalloc(iommu->unit_count *
-			sizeof(struct kgsl_memdesc *), GFP_KERNEL);
-	if (!reg_desc_ptr) {
-		KGSL_CORE_ERR("Failed to kmalloc(%d)\n",
-			iommu->unit_count * sizeof(struct kgsl_memdesc *));
-		return -ENOMEM;
-	}
+	if (KGSL_IOMMU_GLOBAL_BASE == reg)
+		return iommu->iommu_units[iommu_unit].reg_map.gpuaddr;
+	else
+		return iommu->iommu_units[iommu_unit].reg_map.gpuaddr +
+			iommu->iommu_reg_list[reg].reg_offset +
+			(ctx_id << KGSL_IOMMU_CTX_SHIFT) + iommu->ctx_offset;
+}
 
-	for (i = 0; i < iommu->unit_count; i++)
-		reg_desc_ptr[i] = &(iommu->iommu_units[i].reg_map);
-
-	*reg_map_desc = reg_desc_ptr;
-	return i;
+static int kgsl_iommu_get_num_iommu_units(struct kgsl_mmu *mmu)
+{
+	struct kgsl_iommu *iommu = mmu->priv;
+	return iommu->unit_count;
 }
 
 struct kgsl_mmu_ops iommu_ops = {
@@ -1026,7 +1353,10 @@
 	.mmu_enable_clk = kgsl_iommu_enable_clk,
 	.mmu_disable_clk_on_ts = kgsl_iommu_disable_clk_on_ts,
 	.mmu_get_pt_lsb = kgsl_iommu_get_pt_lsb,
-	.mmu_get_reg_map_desc = kgsl_iommu_get_reg_map_desc,
+	.mmu_get_reg_gpuaddr = kgsl_iommu_get_reg_gpuaddr,
+	.mmu_get_num_iommu_units = kgsl_iommu_get_num_iommu_units,
+	.mmu_pt_equal = kgsl_iommu_pt_equal,
+	.mmu_get_pt_base_addr = kgsl_iommu_get_pt_base_addr,
 	.mmu_sync_lock = kgsl_iommu_sync_lock,
 	.mmu_sync_unlock = kgsl_iommu_sync_unlock,
 };
@@ -1036,6 +1366,4 @@
 	.mmu_unmap = kgsl_iommu_unmap,
 	.mmu_create_pagetable = kgsl_iommu_create_pagetable,
 	.mmu_destroy_pagetable = kgsl_iommu_destroy_pagetable,
-	.mmu_pt_equal = kgsl_iommu_pt_equal,
-	.mmu_pt_get_base_addr = kgsl_iommu_pt_get_base_addr,
 };
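
Illustration only, not part of the patch: the reworked ops table above replaces
the old mmu_get_reg_map_desc() array with two hooks - one reporting how many
IOMMU units exist and one resolving a register's GPU address per unit and
context. A minimal sketch of the resulting calling pattern, assuming the
static inline wrappers added to kgsl_mmu.h further down; the function name
example_print_ttbr0_gpuaddrs is invented for illustration.

static void example_print_ttbr0_gpuaddrs(struct kgsl_mmu *mmu)
{
	int i;
	int num_units = kgsl_mmu_get_num_iommu_units(mmu);

	for (i = 0; i < num_units; i++) {
		/* GPU-visible address of the USER context TTBR0 register */
		unsigned int reg_gpuaddr = kgsl_mmu_get_reg_gpuaddr(mmu, i,
						KGSL_IOMMU_CONTEXT_USER,
						KGSL_IOMMU_CTX_TTBR0);

		KGSL_CORE_ERR("iommu unit %d TTBR0 gpuaddr 0x%08x\n",
				i, reg_gpuaddr);
	}
}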
diff --git a/drivers/gpu/msm/kgsl_iommu.h b/drivers/gpu/msm/kgsl_iommu.h
index 3389f08..4507700 100644
--- a/drivers/gpu/msm/kgsl_iommu.h
+++ b/drivers/gpu/msm/kgsl_iommu.h
@@ -1,4 +1,4 @@
-/* Copyright (c) 2012, Code Aurora Forum. All rights reserved.
+/* Copyright (c) 2012, The Linux Foundation. All rights reserved.
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License version 2 and
@@ -15,35 +15,72 @@
 
 #include <mach/iommu.h>
 
-#define KGSL_IOMMU_TTBR0			0x10
-#define KGSL_IOMMU_TTBR1			0x14
-#define KGSL_IOMMU_FSR				0x20
+#define KGSL_IOMMU_CTX_OFFSET_V1	0
+#define KGSL_IOMMU_CTX_OFFSET_V2	0x8000
+#define KGSL_IOMMU_CTX_SHIFT		12
 
-#define KGSL_IOMMU_TTBR0_PA_MASK		0x0003FFFF
-#define KGSL_IOMMU_TTBR0_PA_SHIFT		14
-#define KGSL_IOMMU_CTX_TLBIALL			0x800
-#define KGSL_IOMMU_CTX_SHIFT			12
+enum kgsl_iommu_reg_map {
+	KGSL_IOMMU_GLOBAL_BASE = 0,
+	KGSL_IOMMU_CTX_TTBR0,
+	KGSL_IOMMU_CTX_TTBR1,
+	KGSL_IOMMU_CTX_FSR,
+	KGSL_IOMMU_CTX_TLBIALL,
+	KGSL_IOMMU_CTX_RESUME,
+	KGSL_IOMMU_REG_MAX
+};
 
+struct kgsl_iommu_register_list {
+	unsigned int reg_offset;
+	unsigned int reg_mask;
+	unsigned int reg_shift;
+};
+
+/*
+ * Max number of iommu units that the gpu core can have
+ * On APQ8064, KGSL can control a maximum of 2 IOMMU units.
+ */
 #define KGSL_IOMMU_MAX_UNITS 2
 
+/* Max number of iommu contexts per IOMMU unit */
 #define KGSL_IOMMU_MAX_DEVS_PER_UNIT 2
 
-#define KGSL_IOMMU_SET_IOMMU_REG(base_addr, ctx, REG, val)		\
-		writel_relaxed(val, base_addr +				\
-				(ctx << KGSL_IOMMU_CTX_SHIFT) +		\
-				KGSL_IOMMU_##REG)
+/* Macros to read/write IOMMU registers */
+#define KGSL_IOMMU_SET_CTX_REG(iommu, iommu_unit, ctx, REG, val)	\
+		writel_relaxed(val,					\
+		iommu_unit->reg_map.hostptr +				\
+		iommu->iommu_reg_list[KGSL_IOMMU_CTX_##REG].reg_offset +\
+		(ctx << KGSL_IOMMU_CTX_SHIFT) +				\
+		iommu->ctx_offset)
 
-#define KGSL_IOMMU_GET_IOMMU_REG(base_addr, ctx, REG)			\
-		readl_relaxed(base_addr +				\
-			(ctx << KGSL_IOMMU_CTX_SHIFT) +			\
-			KGSL_IOMMU_##REG)
+#define KGSL_IOMMU_GET_CTX_REG(iommu, iommu_unit, ctx, REG)		\
+		readl_relaxed(						\
+		iommu_unit->reg_map.hostptr +				\
+		iommu->iommu_reg_list[KGSL_IOMMU_CTX_##REG].reg_offset +\
+		(ctx << KGSL_IOMMU_CTX_SHIFT) +				\
+		iommu->ctx_offset)
 
-#define KGSL_IOMMMU_PT_LSB(pt_val)					\
-		(pt_val & ~(KGSL_IOMMU_TTBR0_PA_MASK <<			\
-				KGSL_IOMMU_TTBR0_PA_SHIFT))
+/* Gets the lsb value of pagetable */
+#define KGSL_IOMMMU_PT_LSB(iommu, pt_val) \
+	(pt_val &							\
+	~(iommu->iommu_reg_list[KGSL_IOMMU_CTX_TTBR0].reg_mask <<	\
+	iommu->iommu_reg_list[KGSL_IOMMU_CTX_TTBR0].reg_shift))
 
+/* offset at which a nop command is placed in setstate_memory */
 #define KGSL_IOMMU_SETSTATE_NOP_OFFSET	1024
 
+/*
+ * struct kgsl_iommu_device - Structure holding data about iommu contexts
+ * @dev: Device pointer to iommu context
+ * @attached: Indicates whether this iommu context is presently attached to
+ * a pagetable/domain or not
+ * @pt_lsb: The LSB of IOMMU_TTBR0 register which is the pagetable
+ * register
+ * @ctx_id: This iommu unit's context id. It can be either 0 or 1
+ * @clk_enabled: If set indicates that iommu clocks of this iommu context
+ * are on, else the clocks are off
+ * @fault: Flag which, when set, indicates that this iommu device has caused
+ * a page fault
+ */
 struct kgsl_iommu_device {
 	struct device *dev;
 	bool attached;
@@ -51,25 +88,62 @@
 	enum kgsl_iommu_context_id ctx_id;
 	bool clk_enabled;
 	struct kgsl_device *kgsldev;
+	int fault;
 };
 
+/*
+ * struct kgsl_iommu_unit - Structure holding data about iommu units. An IOMMU
+ * unit is basically a separate IOMMU h/w block with its own IOMMU contexts
+ * @dev: Pointer to array of struct kgsl_iommu_device which has information
+ * about the IOMMU contexts under this IOMMU unit
+ * @dev_count: Number of IOMMU contexts that are valid in the previous field
+ * @reg_map: Memory descriptor which holds the mapped address of this IOMMU
+ * unit's register range
+ */
 struct kgsl_iommu_unit {
 	struct kgsl_iommu_device dev[KGSL_IOMMU_MAX_DEVS_PER_UNIT];
 	unsigned int dev_count;
 	struct kgsl_memdesc reg_map;
 };
 
+/*
+ * struct kgsl_iommu - Structure holding iommu data for kgsl driver
+ * @dev: Array of kgsl_iommu_device which contain information about
+ * iommu contexts owned by graphics cores
+ * @unit_count: Number of IOMMU units that are available for this
+ * instance of the IOMMU driver
+ * @iommu_last_cmd_ts: The timestamp of last command submitted that
+ * accesses iommu registers
+ * @clk_event_queued: Indicates whether an event to disable clocks
+ * is already queued or not
+ * @device: Pointer to kgsl device
+ * @ctx_offset: The context offset to be added to base address when
+ * accessing IOMMU registers
+ * @iommu_reg_list: Array of IOMMU register descriptors { offset, mask, shift }
+ * @sync_lock_vars: Pointer to the IOMMU spinlock for serializing access to the
+ * IOMMU registers
+ * @sync_lock_desc: GPU Memory descriptor for the memory containing the
+ * spinlocks
+ * @sync_lock_initialized: True if the sync_lock feature is enabled
+ */
 struct kgsl_iommu {
 	struct kgsl_iommu_unit iommu_units[KGSL_IOMMU_MAX_UNITS];
 	unsigned int unit_count;
 	unsigned int iommu_last_cmd_ts;
 	bool clk_event_queued;
 	struct kgsl_device *device;
+	unsigned int ctx_offset;
+	struct kgsl_iommu_register_list *iommu_reg_list;
 	struct remote_iommu_petersons_spinlock *sync_lock_vars;
 	struct kgsl_memdesc sync_lock_desc;
 	bool sync_lock_initialized;
 };
 
+/*
+ * struct kgsl_iommu_pt - Iommu pagetable structure private to kgsl driver
+ * @domain: Pointer to the iommu domain that contains the iommu pagetable
+ * @iommu: Pointer to iommu structure
+ */
 struct kgsl_iommu_pt {
 	struct iommu_domain *domain;
 	struct kgsl_iommu *iommu;
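
Illustration only, not part of the patch: the rewritten register macros no
longer hard-code per-register offsets; they index the per-target
iommu_reg_list table and add the v1/v2 context offset. Spelled out as plain
arithmetic (the helper name example_ctx_reg_addr is invented), the CPU
address that KGSL_IOMMU_GET_CTX_REG()/KGSL_IOMMU_SET_CTX_REG() dereference is:

static void *example_ctx_reg_addr(struct kgsl_iommu *iommu,
				struct kgsl_iommu_unit *iommu_unit,
				enum kgsl_iommu_context_id ctx_id,
				enum kgsl_iommu_reg_map reg)
{
	/* base of the unit's mapped register range ... */
	return iommu_unit->reg_map.hostptr +
		/* ... plus the register's offset within a context bank ... */
		iommu->iommu_reg_list[reg].reg_offset +
		/* ... plus the context bank ... */
		(ctx_id << KGSL_IOMMU_CTX_SHIFT) +
		/* ... plus the IOMMU v1/v2 specific context block offset */
		iommu->ctx_offset;
}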
diff --git a/drivers/gpu/msm/kgsl_log.h b/drivers/gpu/msm/kgsl_log.h
index 9c6e317..83d14f7 100644
--- a/drivers/gpu/msm/kgsl_log.h
+++ b/drivers/gpu/msm/kgsl_log.h
@@ -1,4 +1,4 @@
-/* Copyright (c) 2002,2008-2011, Code Aurora Forum. All rights reserved.
+/* Copyright (c) 2002,2008-2011, The Linux Foundation. All rights reserved.
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License version 2 and
@@ -103,8 +103,19 @@
 #define KGSL_PWR_CRIT(_dev, fmt, args...) \
 KGSL_LOG_CRIT(_dev->dev, _dev->pwr_log, fmt, ##args)
 
+#define KGSL_FT_INFO(_dev, fmt, args...) \
+KGSL_LOG_INFO(_dev->dev, _dev->ft_log, fmt, ##args)
+#define KGSL_FT_WARN(_dev, fmt, args...) \
+KGSL_LOG_WARN(_dev->dev, _dev->ft_log, fmt, ##args)
+#define KGSL_FT_ERR(_dev, fmt, args...) \
+KGSL_LOG_ERR(_dev->dev, _dev->ft_log, fmt, ##args)
+#define KGSL_FT_CRIT(_dev, fmt, args...) \
+KGSL_LOG_CRIT(_dev->dev, _dev->ft_log, fmt, ##args)
+
+/* Core error messages - these are for core KGSL functions that have
+   no device associated with them (such as memory) */
 
 #define KGSL_CORE_ERR(fmt, args...) \
 pr_err("kgsl: %s: " fmt, __func__, ##args)
 
-#endif 
+#endif /* __KGSL_LOG_H */
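
For illustration only, not part of the patch: the new KGSL_FT_* macros log
against the device's ft_log level in exactly the same pattern as the existing
per-subsystem macros, so fault-tolerance code can report through them. The
wrapper below and its message text are invented for the example.

static void example_ft_report(struct kgsl_device *device, int ret)
{
	/* routed through device->ft_log, same pattern as KGSL_DRV_ERR() */
	KGSL_FT_ERR(device, "fault tolerance failed: %d\n", ret);
}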
diff --git a/drivers/gpu/msm/kgsl_mmu.c b/drivers/gpu/msm/kgsl_mmu.c
index df74c11..d1f58c4 100644
--- a/drivers/gpu/msm/kgsl_mmu.c
+++ b/drivers/gpu/msm/kgsl_mmu.c
@@ -1,4 +1,4 @@
-/* Copyright (c) 2002,2007-2012, Code Aurora Forum. All rights reserved.
+/* Copyright (c) 2002,2007-2012, The Linux Foundation. All rights reserved.
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License version 2 and
@@ -18,13 +18,14 @@
 #include <linux/slab.h>
 #include <linux/sched.h>
 #include <linux/iommu.h>
+#include <mach/iommu.h>
 #include <mach/socinfo.h>
 
 #include "kgsl.h"
 #include "kgsl_mmu.h"
 #include "kgsl_device.h"
 #include "kgsl_sharedmem.h"
-#include "adreno_postmortem.h"
+#include "adreno.h"
 
 #define KGSL_MMU_ALIGN_SHIFT    13
 #define KGSL_MMU_ALIGN_MASK     (~((1 << KGSL_MMU_ALIGN_SHIFT) - 1))
@@ -36,7 +37,7 @@
 static int kgsl_cleanup_pt(struct kgsl_pagetable *pt)
 {
 	int i;
-	
+	/* For IOMMU only unmap the global structures to global pt */
 	if ((KGSL_MMU_TYPE_NONE != kgsl_mmu_type) &&
 		(KGSL_MMU_TYPE_IOMMU == kgsl_mmu_type) &&
 		(KGSL_MMU_GLOBAL_PT !=  pt->name) &&
@@ -56,7 +57,7 @@
 	int i = 0;
 	int status = 0;
 
-	
+	/* For IOMMU only map the global structures to global pt */
 	if ((KGSL_MMU_TYPE_NONE != kgsl_mmu_type) &&
 		(KGSL_MMU_TYPE_IOMMU == kgsl_mmu_type) &&
 		(KGSL_MMU_GLOBAL_PT !=  pt->name) &&
@@ -311,29 +312,31 @@
 
 unsigned int kgsl_mmu_get_ptsize(void)
 {
+	/*
+	 * For IOMMU, we could do up to 4G virtual range if we wanted to, but
+	 * it makes more sense to return a smaller range and leave the rest of
+	 * the virtual range for future improvements
+	 */
 
 	if (KGSL_MMU_TYPE_GPU == kgsl_mmu_type)
 		return CONFIG_MSM_KGSL_PAGE_TABLE_SIZE;
 	else if (KGSL_MMU_TYPE_IOMMU == kgsl_mmu_type)
-#ifdef CONFIG_KGSL_PER_PROCESS_PAGE_TABLE
-		return CONFIG_MSM_KGSL_PAGE_TABLE_SIZE_FOR_IOMMU;
-#else
-                return SZ_2G - KGSL_PAGETABLE_BASE;
-#endif
-
+		return SZ_2G - KGSL_PAGETABLE_BASE;
 	else
 		return 0;
 }
 
 int
-kgsl_mmu_get_ptname_from_ptbase(unsigned int pt_base)
+kgsl_mmu_get_ptname_from_ptbase(struct kgsl_mmu *mmu, unsigned int pt_base)
 {
 	struct kgsl_pagetable *pt;
 	int ptid = -1;
 
+	if (!mmu->mmu_ops || !mmu->mmu_ops->mmu_pt_equal)
+		return KGSL_MMU_GLOBAL_PT;
 	spin_lock(&kgsl_driver.ptlock);
 	list_for_each_entry(pt, &kgsl_driver.pagetable_list, list) {
-		if (pt->pt_ops->mmu_pt_equal(pt, pt_base)) {
+		if (mmu->mmu_ops->mmu_pt_equal(mmu, pt, pt_base)) {
 			ptid = (int) pt->name;
 			break;
 		}
@@ -344,6 +347,35 @@
 }
 EXPORT_SYMBOL(kgsl_mmu_get_ptname_from_ptbase);
 
+unsigned int
+kgsl_mmu_log_fault_addr(struct kgsl_mmu *mmu, unsigned int pt_base,
+					unsigned int addr)
+{
+	struct kgsl_pagetable *pt;
+	unsigned int ret = 0;
+
+	if (!mmu->mmu_ops || !mmu->mmu_ops->mmu_pt_equal)
+		return KGSL_MMU_GLOBAL_PT;
+	spin_lock(&kgsl_driver.ptlock);
+	list_for_each_entry(pt, &kgsl_driver.pagetable_list, list) {
+		if (mmu->mmu_ops->mmu_pt_equal(mmu, pt, pt_base)) {
+			if ((addr & ~(PAGE_SIZE-1)) == pt->fault_addr) {
+				ret = 1;
+				break;
+			} else {
+				pt->fault_addr = (addr & ~(PAGE_SIZE-1));
+				ret = 0;
+				break;
+			}
+
+		}
+	}
+	spin_unlock(&kgsl_driver.ptlock);
+
+	return ret;
+}
+EXPORT_SYMBOL(kgsl_mmu_log_fault_addr);
+
 int kgsl_mmu_init(struct kgsl_device *device)
 {
 	int status = 0;
@@ -379,7 +411,7 @@
 
 	if (kgsl_mmu_type == KGSL_MMU_TYPE_NONE) {
 		kgsl_regwrite(device, MH_MMU_CONFIG, 0);
-		
+		/* Setup gpuaddr of global mappings */
 		if (!mmu->setstate_memory.gpuaddr)
 			kgsl_setup_pt(NULL);
 		return 0;
@@ -395,6 +427,10 @@
 
 	kgsl_regread(device, MH_AXI_ERROR, &reg);
 	pt_base = kgsl_mmu_get_current_ptbase(&device->mmu);
+	/*
+	 * Read gpu virtual and physical addresses that
+	 * caused the error from the debug data.
+	 */
 	kgsl_regwrite(device, MH_DEBUG_CTRL, 44);
 	kgsl_regread(device, MH_DEBUG_DATA, &gpu_err);
 	kgsl_regwrite(device, MH_DEBUG_CTRL, 45);
@@ -445,7 +481,12 @@
 
 	pagetable->name = name;
 	pagetable->max_entries = KGSL_PAGETABLE_ENTRIES(ptsize);
+	pagetable->fault_addr = 0xFFFFFFFF;
 
+	/*
+	 * Create a separate kgsl pool for IOMMU; global mappings can be mapped
+	 * just once from this pool of the defaultpagetable
+	 */
 	if ((KGSL_MMU_TYPE_IOMMU == kgsl_mmu_get_mmutype()) &&
 		((KGSL_MMU_GLOBAL_PT == name) ||
 		(KGSL_MMU_PRIV_BANK_TABLE_NAME == name))) {
@@ -493,7 +534,7 @@
 	list_add(&pagetable->list, &kgsl_driver.pagetable_list);
 	spin_unlock_irqrestore(&kgsl_driver.ptlock, flags);
 
-	
+	/* Create the sysfs entries */
 	pagetable_add_sysfs_objects(pagetable);
 
 	return pagetable;
@@ -521,6 +562,10 @@
 #ifndef CONFIG_KGSL_PER_PROCESS_PAGE_TABLE
 	name = KGSL_MMU_GLOBAL_PT;
 #endif
+	/* We presently do not support per-process pagetables for IOMMU-v2 */
+	if (!msm_soc_version_supports_iommu_v1())
+		name = KGSL_MMU_GLOBAL_PT;
+
 	pt = kgsl_get_pagetable(name);
 
 	if (pt == NULL)
@@ -539,6 +584,12 @@
 			uint32_t flags)
 {
 	struct kgsl_device *device = mmu->device;
+	struct adreno_device *adreno_dev = ADRENO_DEVICE(device);
+
+	if (!(flags & (KGSL_MMUFLAGS_TLBFLUSH | KGSL_MMUFLAGS_PTUPDATE))
+		&& !adreno_is_a2xx(adreno_dev))
+		return;
+
 	if (KGSL_MMU_TYPE_NONE == kgsl_mmu_type)
 		return;
 	else if (device->ftbl->setstate)
@@ -551,11 +602,11 @@
 void kgsl_mh_start(struct kgsl_device *device)
 {
 	struct kgsl_mh *mh = &device->mh;
-	
+	/* force mmu off for now */
 	kgsl_regwrite(device, MH_MMU_CONFIG, 0);
 	kgsl_idle(device);
 
-	
+	/* define physical memory range accessible by the core */
 	kgsl_regwrite(device, MH_MMU_MPU_BASE, mh->mpu_base);
 	kgsl_regwrite(device, MH_MMU_MPU_END,
 			mh->mpu_base + mh->mpu_range);
@@ -569,13 +620,17 @@
 		kgsl_regwrite(device, MH_CLNT_INTF_CTRL_CONFIG2,
 				mh->mh_intf_cfg2);
 
+	/*
+	 * Interrupts are enabled on a per-device level when
+	 * kgsl_pwrctrl_irq() is called
+	 */
 }
 
 static inline struct gen_pool *
 _get_pool(struct kgsl_pagetable *pagetable, unsigned int flags)
 {
 	if (pagetable->kgsl_pool &&
-		(KGSL_MEMFLAGS_GLOBAL & flags))
+		(KGSL_MEMDESC_GLOBAL & flags))
 		return pagetable->kgsl_pool;
 	return pagetable->pool;
 }
@@ -588,6 +643,7 @@
 	int ret;
 	struct gen_pool *pool;
 	int size;
+	int page_align = ilog2(PAGE_SIZE);
 
 	if (kgsl_mmu_type == KGSL_MMU_TYPE_NONE) {
 		if (memdesc->sglen == 1) {
@@ -609,10 +665,19 @@
 
 	size = kgsl_sg_size(memdesc->sg, memdesc->sglen);
 
-	
+	/* Allocate from kgsl pool if it exists for global mappings */
 	pool = _get_pool(pagetable, memdesc->priv);
 
-	memdesc->gpuaddr = gen_pool_alloc(pool, size);
+	/* Allocate aligned virtual addresses for iommu. This allows
+	 * more efficient pagetable entries if the physical memory
+	 * is also aligned. Don't do this for GPUMMU, because
+	 * the address space is so small.
+	 */
+	if (KGSL_MMU_TYPE_IOMMU == kgsl_mmu_get_mmutype() &&
+	    kgsl_memdesc_get_align(memdesc) > 0)
+		page_align = kgsl_memdesc_get_align(memdesc);
+
+	memdesc->gpuaddr = gen_pool_alloc_aligned(pool, size, page_align);
 	if (memdesc->gpuaddr == 0) {
 		KGSL_CORE_ERR("gen_pool_alloc(%d) failed from pool: %s\n",
 			size,
@@ -634,7 +699,7 @@
 	if (ret)
 		goto err_free_gpuaddr;
 
-	
+	/* Keep track of the statistics for the sysfs files */
 
 	KGSL_STATS_ADD(1, pagetable->stats.entries,
 		       pagetable->stats.max_entries);
@@ -660,6 +725,8 @@
 {
 	struct gen_pool *pool;
 	int size;
+	unsigned int start_addr = 0;
+	unsigned int end_addr = 0;
 
 	if (memdesc->size == 0 || memdesc->gpuaddr == 0)
 		return 0;
@@ -671,13 +738,22 @@
 
 	size = kgsl_sg_size(memdesc->sg, memdesc->sglen);
 
+	start_addr = memdesc->gpuaddr;
+	end_addr = (memdesc->gpuaddr + size);
+
 	if (KGSL_MMU_TYPE_IOMMU != kgsl_mmu_get_mmutype())
 		spin_lock(&pagetable->lock);
 	pagetable->pt_ops->mmu_unmap(pagetable->priv, memdesc,
 					&pagetable->tlb_flags);
+
+	/* If the fault address falls in the buffer being unmapped, clear it */
+	if ((pagetable->fault_addr >= start_addr) &&
+		(pagetable->fault_addr < end_addr))
+		pagetable->fault_addr = 0;
+
 	if (KGSL_MMU_TYPE_IOMMU == kgsl_mmu_get_mmutype())
 		spin_lock(&pagetable->lock);
-	
+	/* Remove the statistics */
 	pagetable->stats.entries--;
 	pagetable->stats.mapped -= size;
 
@@ -686,7 +762,11 @@
 	pool = _get_pool(pagetable, memdesc->priv);
 	gen_pool_free(pool, memdesc->gpuaddr, size);
 
-	if (!(memdesc->priv & KGSL_MEMFLAGS_GLOBAL))
+	/*
+	 * Don't clear the gpuaddr on global mappings because they
+	 * may be in use by other pagetables
+	 */
+	if (!(memdesc->priv & KGSL_MEMDESC_GLOBAL))
 		memdesc->gpuaddr = 0;
 	return 0;
 }
@@ -702,18 +782,18 @@
 		KGSL_CORE_ERR("invalid memdesc\n");
 		goto error;
 	}
-	
+	/* Not all global mappings are needed for all MMU types */
 	if (!memdesc->size)
 		return 0;
 
 	gpuaddr = memdesc->gpuaddr;
-	memdesc->priv |= KGSL_MEMFLAGS_GLOBAL;
+	memdesc->priv |= KGSL_MEMDESC_GLOBAL;
 
 	result = kgsl_mmu_map(pagetable, memdesc, protflags);
 	if (result)
 		goto error;
 
-	
+	/* global mappings must have the same gpu address in all pagetables */
 	if (gpuaddr && gpuaddr != memdesc->gpuaddr) {
 		KGSL_CORE_ERR("pt %p addr mismatch phys 0x%08x"
 			"gpu 0x%0x 0x%08x", pagetable, memdesc->physaddr,
@@ -792,15 +872,13 @@
 
 void kgsl_mmu_set_mmutype(char *mmutype)
 {
-	
+	/* Set the default MMU - GPU on <=8960 and nothing on >= 8064 */
 	kgsl_mmu_type =
 		cpu_is_apq8064() ? KGSL_MMU_TYPE_NONE : KGSL_MMU_TYPE_GPU;
 
-#ifndef CONFIG_MSM_KGSL_DEFAULT_GPUMMU
-	
+	/* Use the IOMMU if it is found */
 	if (iommu_present(&platform_bus_type))
 		kgsl_mmu_type = KGSL_MMU_TYPE_IOMMU;
-#endif
 
 	if (mmutype && !strncmp(mmutype, "gpummu", 6))
 		kgsl_mmu_type = KGSL_MMU_TYPE_GPU;
@@ -811,3 +889,13 @@
 		kgsl_mmu_type = KGSL_MMU_TYPE_NONE;
 }
 EXPORT_SYMBOL(kgsl_mmu_set_mmutype);
+
+int kgsl_mmu_gpuaddr_in_range(unsigned int gpuaddr)
+{
+	if (KGSL_MMU_TYPE_NONE == kgsl_mmu_type)
+		return 1;
+	return ((gpuaddr >= KGSL_PAGETABLE_BASE) &&
+		(gpuaddr < (KGSL_PAGETABLE_BASE + kgsl_mmu_get_ptsize())));
+}
+EXPORT_SYMBOL(kgsl_mmu_gpuaddr_in_range);
+
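
Illustration only, not part of the patch: kgsl_mmu_log_fault_addr() remembers
the last faulting page per pagetable so repeated faults on the same page are
logged only once, while kgsl_mmu_gpuaddr_in_range() bounds-checks an address
against the pagetable window. A sketch of how a page-fault handler might
combine them; the function name and message are invented.

static void example_report_pagefault(struct kgsl_mmu *mmu,
				unsigned int pt_base, unsigned int fault_addr)
{
	/* returns 1 if this page already produced a logged fault */
	if (kgsl_mmu_log_fault_addr(mmu, pt_base, fault_addr))
		return;

	KGSL_CORE_ERR("page fault at 0x%08x (pt %d, %s pagetable range)\n",
		fault_addr,
		kgsl_mmu_get_ptname_from_ptbase(mmu, pt_base),
		kgsl_mmu_gpuaddr_in_range(fault_addr) ? "inside" : "outside");
}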
diff --git a/drivers/gpu/msm/kgsl_mmu.h b/drivers/gpu/msm/kgsl_mmu.h
index d8713d3..377f342 100644
--- a/drivers/gpu/msm/kgsl_mmu.h
+++ b/drivers/gpu/msm/kgsl_mmu.h
@@ -1,4 +1,4 @@
-/* Copyright (c) 2002,2007-2012, Code Aurora Forum. All rights reserved.
+/* Copyright (c) 2002,2007-2012, The Linux Foundation. All rights reserved.
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License version 2 and
@@ -13,6 +13,10 @@
 #ifndef __KGSL_MMU_H
 #define __KGSL_MMU_H
 
+/*
+ * These defines control the split between ttbr1 and ttbr0 pagetables of IOMMU
+ * and what ranges of memory we map to them
+ */
 #define KGSL_IOMMU_GLOBAL_MEM_BASE	0xC0000000
 #define KGSL_IOMMU_GLOBAL_MEM_SIZE	SZ_4M
 #define KGSL_IOMMU_TTBR1_SPLIT		2
@@ -20,6 +24,9 @@
 #define KGSL_MMU_ALIGN_SHIFT    13
 #define KGSL_MMU_ALIGN_MASK     (~((1 << KGSL_MMU_ALIGN_SHIFT) - 1))
 
+/* Identifier for the global page table */
+/* Per process page tables will probably pass in the thread group
+   as an identifier */
 
 #define KGSL_MMU_GLOBAL_PT 0
 #define KGSL_MMU_PRIV_BANK_TABLE_NAME 0xFFFFFFFF
@@ -31,6 +38,10 @@
 #define GSL_PT_PAGE_RV		0x00000002
 #define GSL_PT_PAGE_DIRTY	0x00000004
 
+/* MMU registers - the register locations for all cores are the
+   same.  The method for getting to those locations differs between
+   2D and 3D, but the 2D and 3D register functions do that magic
+   for us */
 
 #define MH_MMU_CONFIG                0x0040
 #define MH_MMU_VA_RANGE              0x0041
@@ -52,6 +63,7 @@
 #define MH_CLNT_INTF_CTRL_CONFIG1    0x0A54
 #define MH_CLNT_INTF_CTRL_CONFIG2    0x0A55
 
+/* MH_MMU_CONFIG bit definitions */
 
 #define MH_MMU_CONFIG__RB_W_CLNT_BEHAVIOR__SHIFT           0x00000004
 #define MH_MMU_CONFIG__CP_W_CLNT_BEHAVIOR__SHIFT           0x00000006
@@ -65,6 +77,7 @@
 #define MH_MMU_CONFIG__TC_R_CLNT_BEHAVIOR__SHIFT           0x00000016
 #define MH_MMU_CONFIG__PA_W_CLNT_BEHAVIOR__SHIFT           0x00000018
 
+/* MMU Flags */
 #define KGSL_MMUFLAGS_TLBFLUSH         0x10000000
 #define KGSL_MMUFLAGS_PTUPDATE         0x20000000
 
@@ -101,6 +114,7 @@
 	} stats;
 	const struct kgsl_mmu_pt_ops *pt_ops;
 	unsigned int tlb_flags;
+	unsigned int fault_addr;
 	void *priv;
 };
 
@@ -126,8 +140,15 @@
 	int (*mmu_get_pt_lsb)(struct kgsl_mmu *mmu,
 				unsigned int unit_id,
 				enum kgsl_iommu_context_id ctx_id);
-	int (*mmu_get_reg_map_desc)(struct kgsl_mmu *mmu,
-				void **reg_map_desc);
+	unsigned int (*mmu_get_reg_gpuaddr)(struct kgsl_mmu *mmu,
+			int iommu_unit_num, int ctx_id, int reg);
+	int (*mmu_get_num_iommu_units)(struct kgsl_mmu *mmu);
+	int (*mmu_pt_equal) (struct kgsl_mmu *mmu,
+			struct kgsl_pagetable *pt,
+			unsigned int pt_base);
+	unsigned int (*mmu_get_pt_base_addr)
+			(struct kgsl_mmu *mmu,
+			struct kgsl_pagetable *pt);
 	unsigned int (*mmu_sync_lock)
 			(struct kgsl_mmu *mmu,
 			unsigned int *cmds);
@@ -146,10 +167,6 @@
 			unsigned int *tlb_flags);
 	void *(*mmu_create_pagetable) (void);
 	void (*mmu_destroy_pagetable) (void *pt);
-	int (*mmu_pt_equal) (struct kgsl_pagetable *pt,
-			unsigned int pt_base);
-	unsigned int (*mmu_pt_get_base_addr)
-			(struct kgsl_pagetable *pt);
 };
 
 #define KGSL_MMU_FLAGS_IOMMU_SYNC BIT(31)
@@ -160,13 +177,14 @@
 	struct kgsl_device     *device;
 	unsigned int     config;
 	struct kgsl_memdesc    setstate_memory;
-	
+	/* current page table object being used by device mmu */
 	struct kgsl_pagetable  *defaultpagetable;
-	
+	/* pagetable object used for priv bank of IOMMU */
 	struct kgsl_pagetable  *priv_bank_table;
 	struct kgsl_pagetable  *hwpagetable;
 	const struct kgsl_mmu_ops *mmu_ops;
 	void *priv;
+	int fault;
 };
 
 #include "kgsl_gpummu.h"
@@ -191,7 +209,10 @@
 unsigned int kgsl_virtaddr_to_physaddr(void *virtaddr);
 void kgsl_setstate(struct kgsl_mmu *mmu, unsigned int context_id,
 			uint32_t flags);
-int kgsl_mmu_get_ptname_from_ptbase(unsigned int pt_base);
+int kgsl_mmu_get_ptname_from_ptbase(struct kgsl_mmu *mmu,
+					unsigned int pt_base);
+unsigned int kgsl_mmu_log_fault_addr(struct kgsl_mmu *mmu,
+			unsigned int pt_base, unsigned int addr);
 int kgsl_mmu_pt_get_flags(struct kgsl_pagetable *pt,
 			enum kgsl_deviceid id);
 void kgsl_mmu_ptpool_destroy(void *ptpool);
@@ -200,7 +221,13 @@
 void kgsl_mmu_set_mmutype(char *mmutype);
 enum kgsl_mmutype kgsl_mmu_get_mmutype(void);
 unsigned int kgsl_mmu_get_ptsize(void);
+int kgsl_mmu_gpuaddr_in_range(unsigned int gpuaddr);
 
+/*
+ * Static inline functions of MMU that simply call the SMMU specific
+ * function using a function pointer. These functions can be thought
+ * of as wrappers around the actual function
+ */
 
 static inline unsigned int kgsl_mmu_get_current_ptbase(struct kgsl_mmu *mmu)
 {
@@ -231,28 +258,21 @@
 		mmu->mmu_ops->mmu_stop(mmu);
 }
 
-static inline int kgsl_mmu_pt_equal(struct kgsl_pagetable *pt,
+static inline int kgsl_mmu_pt_equal(struct kgsl_mmu *mmu,
+			struct kgsl_pagetable *pt,
 			unsigned int pt_base)
 {
-	if (KGSL_MMU_TYPE_NONE == kgsl_mmu_get_mmutype())
+	if (mmu->mmu_ops && mmu->mmu_ops->mmu_pt_equal)
+		return mmu->mmu_ops->mmu_pt_equal(mmu, pt, pt_base);
+	else
 		return 1;
-	else
-		return pt->pt_ops->mmu_pt_equal(pt, pt_base);
 }
 
-static inline unsigned int kgsl_mmu_pt_get_base_addr(struct kgsl_pagetable *pt)
+static inline unsigned int kgsl_mmu_get_pt_base_addr(struct kgsl_mmu *mmu,
+						struct kgsl_pagetable *pt)
 {
-	if (KGSL_MMU_TYPE_NONE == kgsl_mmu_get_mmutype())
-		return 0;
-	else
-		return pt->pt_ops->mmu_pt_get_base_addr(pt);
-}
-
-static inline int kgsl_mmu_get_reg_map_desc(struct kgsl_mmu *mmu,
-						void **reg_map_desc)
-{
-	if (mmu->mmu_ops && mmu->mmu_ops->mmu_get_reg_map_desc)
-		return mmu->mmu_ops->mmu_get_reg_map_desc(mmu, reg_map_desc);
+	if (mmu->mmu_ops && mmu->mmu_ops->mmu_get_pt_base_addr)
+		return mmu->mmu_ops->mmu_get_pt_base_addr(mmu, pt);
 	else
 		return 0;
 }
@@ -285,7 +305,7 @@
 
 static inline unsigned int kgsl_mmu_get_int_mask(void)
 {
-	
+	/* Don't enable gpummu interrupts if iommu is enabled */
 	if (KGSL_MMU_TYPE_GPU == kgsl_mmu_get_mmutype())
 		return KGSL_MMU_INT_MASK;
 	else
@@ -293,10 +313,23 @@
 			MH_INTERRUPT_MASK__AXI_WRITE_ERROR);
 }
 
-static inline int kgsl_mmu_gpuaddr_in_range(unsigned int gpuaddr)
+static inline unsigned int kgsl_mmu_get_reg_gpuaddr(struct kgsl_mmu *mmu,
+							int iommu_unit_num,
+							int ctx_id, int reg)
 {
-	return ((gpuaddr >= KGSL_PAGETABLE_BASE) &&
-		(gpuaddr < (KGSL_PAGETABLE_BASE + kgsl_mmu_get_ptsize())));
+	if (mmu->mmu_ops && mmu->mmu_ops->mmu_get_reg_gpuaddr)
+		return mmu->mmu_ops->mmu_get_reg_gpuaddr(mmu, iommu_unit_num,
+							ctx_id, reg);
+	else
+		return 0;
+}
+
+static inline int kgsl_mmu_get_num_iommu_units(struct kgsl_mmu *mmu)
+{
+	if (mmu->mmu_ops && mmu->mmu_ops->mmu_get_num_iommu_units)
+		return mmu->mmu_ops->mmu_get_num_iommu_units(mmu);
+	else
+		return 0;
 }
 
 static inline int kgsl_mmu_sync_lock(struct kgsl_mmu *mmu,
@@ -319,4 +352,4 @@
 		return 0;
 }
 
-#endif 
+#endif /* __KGSL_MMU_H */
diff --git a/drivers/gpu/msm/kgsl_pwrctrl.c b/drivers/gpu/msm/kgsl_pwrctrl.c
index 3ed1ec8..d489119 100644
--- a/drivers/gpu/msm/kgsl_pwrctrl.c
+++ b/drivers/gpu/msm/kgsl_pwrctrl.c
@@ -1,4 +1,4 @@
-/* Copyright (c) 2010-2012, Code Aurora Forum. All rights reserved.
+/* Copyright (c) 2010-2012, The Linux Foundation. All rights reserved.
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License version 2 and
@@ -17,7 +17,7 @@
 #include <linux/pm_runtime.h>
 #include <mach/msm_iomap.h>
 #include <mach/msm_bus.h>
-#include <linux/fb.h>
+#include <linux/ktime.h>
 
 #include "kgsl.h"
 #include "kgsl_pwrscale.h"
@@ -38,16 +38,6 @@
 	uint map;
 };
 
-struct gpufreq_stats {
-    unsigned long long last_time;
-    unsigned int last_index;
-    unsigned int cur_index;
-};
-
-static spinlock_t gpufreq_stats_lock;
-static unsigned long long gputime_in_state[KGSL_MAX_PWRLEVELS] = {0};
-struct gpufreq_stats gpufreq_stat;
-
 struct clk_pair clks[KGSL_MAX_CLKS] = {
 	{
 		.name = "src_clk",
@@ -71,72 +61,328 @@
 	},
 };
 
-static int gpufreq_stats_update(unsigned int update_time_only, unsigned int last_index, unsigned int cur_index)
+/* Update the elapsed time at a particular clock level
+ * if the device is active (on_time = true). Otherwise
+ * store it as sleep time.
+ */
+static void update_clk_statistics(struct kgsl_device *device,
+				bool on_time)
 {
-    unsigned long long cur_time;
-    spin_lock(&gpufreq_stats_lock);
-    cur_time = get_jiffies_64();
-    if (update_time_only)
-        goto done;
+	struct kgsl_pwrctrl *pwr = &device->pwrctrl;
+	struct kgsl_clk_stats *clkstats = &pwr->clk_stats;
+	ktime_t elapsed;
+	int elapsed_us;
+	if (clkstats->start.tv64 == 0)
+		clkstats->start = ktime_get();
+	clkstats->stop = ktime_get();
+	elapsed = ktime_sub(clkstats->stop, clkstats->start);
+	elapsed_us = ktime_to_us(elapsed);
+	clkstats->elapsed += elapsed_us;
+	if (on_time)
+		clkstats->clock_time[pwr->active_pwrlevel] += elapsed_us;
+	else
+		clkstats->clock_time[pwr->num_pwrlevels - 1] += elapsed_us;
+	clkstats->start = ktime_get();
+}
 
-    if (last_index < KGSL_MAX_PWRLEVELS)
-        gputime_in_state[last_index] = gputime_in_state[last_index] + cur_time - gpufreq_stat.last_time;
+/*
+ * Given a requested power level do bounds checking on the constraints and
+ * return the nearest possible level
+ */
 
-done:
+static inline int _adjust_pwrlevel(struct kgsl_pwrctrl *pwr, int level)
+{
+	int max_pwrlevel = max_t(int, pwr->thermal_pwrlevel, pwr->max_pwrlevel);
+	int min_pwrlevel = max_t(int, pwr->thermal_pwrlevel, pwr->min_pwrlevel);
 
-    gpufreq_stat.cur_index = cur_index;
-    gpufreq_stat.last_index = last_index;
-    gpufreq_stat.last_time = cur_time;
+	if (level < max_pwrlevel)
+		return max_pwrlevel;
+	if (level > min_pwrlevel)
+		return min_pwrlevel;
 
-    spin_unlock(&gpufreq_stats_lock);
-    return 0;
+	return level;
 }
 
 void kgsl_pwrctrl_pwrlevel_change(struct kgsl_device *device,
 				unsigned int new_level)
 {
 	struct kgsl_pwrctrl *pwr = &device->pwrctrl;
-	if (new_level < (pwr->num_pwrlevels - 1) &&
-		new_level >= pwr->thermal_pwrlevel &&
-		new_level != pwr->active_pwrlevel) {
-		struct kgsl_pwrlevel *pwrlevel = &pwr->pwrlevels[new_level];
-		int diff = new_level - pwr->active_pwrlevel;
-		int d = (diff > 0) ? 1 : -1;
-		int level = pwr->active_pwrlevel;
-		pwr->active_pwrlevel = new_level;
-		if ((test_bit(KGSL_PWRFLAGS_CLK_ON, &pwr->power_flags)) ||
-			(device->state == KGSL_STATE_NAP)) {
-			if (pwr->idle_needed == true)
-				device->ftbl->idle(device);
-			while (level != new_level) {
-				level += d;
-				clk_set_rate(pwr->grp_clks[0],
-						pwr->pwrlevels[level].gpu_freq);
-			}
+	struct kgsl_pwrlevel *pwrlevel;
+	int delta;
+	int level;
+
+	/* Adjust the power level to the current constraints */
+	new_level = _adjust_pwrlevel(pwr, new_level);
+
+	if (new_level == pwr->active_pwrlevel)
+		return;
+
+	delta = new_level < pwr->active_pwrlevel ? -1 : 1;
+
+	update_clk_statistics(device, true);
+
+	level = pwr->active_pwrlevel;
+
+	/*
+	 * Set the active powerlevel first in case the clocks are off - if we
+	 * don't do this then the pwrlevel change won't take effect when the
+	 * clocks come back
+	 */
+
+	pwr->active_pwrlevel = new_level;
+
+	if (test_bit(KGSL_PWRFLAGS_CLK_ON, &pwr->power_flags) ||
+		(device->state == KGSL_STATE_NAP)) {
+
+		/*
+		 * On some platforms, instability is caused on
+		 * changing clock freq when the core is busy.
+		 * Idle the gpu core before changing the clock freq.
+		 */
+
+		if (pwr->idle_needed == true)
+			device->ftbl->idle(device);
+
+		/*
+		 * Don't shift by more than one level at a time to
+		 * avoid glitches.
+		 */
+
+		while (level != new_level) {
+			level += delta;
+
+			clk_set_rate(pwr->grp_clks[0],
+				pwr->pwrlevels[level].gpu_freq);
 		}
-		if (test_bit(KGSL_PWRFLAGS_AXI_ON, &pwr->power_flags)) {
-			if (pwr->pcl) {
-				msm_bus_scale_client_update_request(pwr->pcl,
-					pwrlevel->bus_freq);
-			} else if (pwr->ebi1_clk)
-				clk_set_rate(pwr->ebi1_clk, pwrlevel->bus_freq);
-		}
-		trace_kgsl_pwrlevel(device, pwr->active_pwrlevel,
-				    pwrlevel->gpu_freq);
 	}
+
+	pwrlevel = &pwr->pwrlevels[pwr->active_pwrlevel];
+
+	if (test_bit(KGSL_PWRFLAGS_AXI_ON, &pwr->power_flags)) {
+
+		if (pwr->pcl)
+			msm_bus_scale_client_update_request(pwr->pcl,
+				pwrlevel->bus_freq);
+		else if (pwr->ebi1_clk)
+			clk_set_rate(pwr->ebi1_clk, pwrlevel->bus_freq);
+	}
+
+	trace_kgsl_pwrlevel(device, pwr->active_pwrlevel, pwrlevel->gpu_freq);
 }
+
 EXPORT_SYMBOL(kgsl_pwrctrl_pwrlevel_change);
 
-static int __gpuclk_store(int max, struct device *dev,
-						  struct device_attribute *attr,
-						  const char *buf, size_t count)
-{	int ret, i, delta = 5000000;
-	unsigned long val;
+static int kgsl_pwrctrl_thermal_pwrlevel_store(struct device *dev,
+					 struct device_attribute *attr,
+					 const char *buf, size_t count)
+{
 	struct kgsl_device *device = kgsl_device_from_dev(dev);
 	struct kgsl_pwrctrl *pwr;
+	int ret, level;
 
 	if (device == NULL)
 		return 0;
+
+	pwr = &device->pwrctrl;
+
+	ret = sscanf(buf, "%d", &level);
+	if (ret != 1)
+		return count;
+
+	if (level < 0)
+		return count;
+
+	mutex_lock(&device->mutex);
+
+	if (level > pwr->num_pwrlevels - 2)
+		level = pwr->num_pwrlevels - 2;
+
+	pwr->thermal_pwrlevel = level;
+
+	/*
+	 * If there is no power policy, set the clock to the requested thermal
+	 * level - if thermal now happens to be higher than max, then that will
+	 * be limited by the pwrlevel change function.  Otherwise, if there is
+	 * a policy, only change the active clock if it is higher than the new
+	 * thermal level
+	 */
+
+	if (device->pwrscale.policy == NULL ||
+		pwr->thermal_pwrlevel > pwr->active_pwrlevel)
+		kgsl_pwrctrl_pwrlevel_change(device, pwr->thermal_pwrlevel);
+
+	mutex_unlock(&device->mutex);
+
+	return count;
+}
+
+static int kgsl_pwrctrl_thermal_pwrlevel_show(struct device *dev,
+					struct device_attribute *attr,
+					char *buf)
+{
+
+	struct kgsl_device *device = kgsl_device_from_dev(dev);
+	struct kgsl_pwrctrl *pwr;
+	if (device == NULL)
+		return 0;
+	pwr = &device->pwrctrl;
+	return snprintf(buf, PAGE_SIZE, "%d\n", pwr->thermal_pwrlevel);
+}
+
+static int kgsl_pwrctrl_max_pwrlevel_store(struct device *dev,
+					 struct device_attribute *attr,
+					 const char *buf, size_t count)
+{
+	struct kgsl_device *device = kgsl_device_from_dev(dev);
+	struct kgsl_pwrctrl *pwr;
+	int ret, level, max_level;
+
+	if (device == NULL)
+		return 0;
+
+	pwr = &device->pwrctrl;
+
+	ret = sscanf(buf, "%d", &level);
+	if (ret != 1)
+		return count;
+
+	/* If the user specifies a negative number, then don't change anything */
+	if (level < 0)
+		return count;
+
+	mutex_lock(&device->mutex);
+
+	/* You can't set a maximum power level lower than the minimum */
+	if (level > pwr->min_pwrlevel)
+		level = pwr->min_pwrlevel;
+
+	pwr->max_pwrlevel = level;
+
+
+	max_level = max_t(int, pwr->thermal_pwrlevel, pwr->max_pwrlevel);
+
+	/*
+	 * If there is no policy then move to max by default.  Otherwise only
+	 * move to max if the current level happens to be higher than the new max
+	 */
+
+	if (device->pwrscale.policy == NULL ||
+		(max_level > pwr->active_pwrlevel))
+		kgsl_pwrctrl_pwrlevel_change(device, max_level);
+
+	mutex_unlock(&device->mutex);
+
+	return count;
+}
+
+static int kgsl_pwrctrl_max_pwrlevel_show(struct device *dev,
+					struct device_attribute *attr,
+					char *buf)
+{
+
+	struct kgsl_device *device = kgsl_device_from_dev(dev);
+	struct kgsl_pwrctrl *pwr;
+	if (device == NULL)
+		return 0;
+	pwr = &device->pwrctrl;
+	return snprintf(buf, PAGE_SIZE, "%d\n", pwr->max_pwrlevel);
+}
+
+static int kgsl_pwrctrl_min_pwrlevel_store(struct device *dev,
+					 struct device_attribute *attr,
+					 const char *buf, size_t count)
+{	struct kgsl_device *device = kgsl_device_from_dev(dev);
+	struct kgsl_pwrctrl *pwr;
+	int ret, level, min_level;
+
+	if (device == NULL)
+		return 0;
+
+	pwr = &device->pwrctrl;
+
+	ret = sscanf(buf, "%d", &level);
+	if (ret != 1)
+		return count;
+
+	/* Don't do anything on obviously incorrect values */
+	if (level < 0)
+		return count;
+
+	mutex_lock(&device->mutex);
+	if (level > pwr->num_pwrlevels - 2)
+		level = pwr->num_pwrlevels - 2;
+
+	/* You can't set a minimum power level lower than the maximum */
+	if (level < pwr->max_pwrlevel)
+		level = pwr->max_pwrlevel;
+
+	pwr->min_pwrlevel = level;
+
+	min_level = max_t(int, pwr->thermal_pwrlevel, pwr->min_pwrlevel);
+
+	/* Only move the power level higher if minimum is higher than the
+	 * current level
+	 */
+
+	if (min_level < pwr->active_pwrlevel)
+		kgsl_pwrctrl_pwrlevel_change(device, min_level);
+
+	mutex_unlock(&device->mutex);
+
+	return count;
+}
+
+static int kgsl_pwrctrl_min_pwrlevel_show(struct device *dev,
+					struct device_attribute *attr,
+					char *buf)
+{
+	struct kgsl_device *device = kgsl_device_from_dev(dev);
+	struct kgsl_pwrctrl *pwr;
+	if (device == NULL)
+		return 0;
+	pwr = &device->pwrctrl;
+	return snprintf(buf, PAGE_SIZE, "%d\n", pwr->min_pwrlevel);
+}
+
+static int kgsl_pwrctrl_num_pwrlevels_show(struct device *dev,
+					struct device_attribute *attr,
+					char *buf)
+{
+
+	struct kgsl_device *device = kgsl_device_from_dev(dev);
+	struct kgsl_pwrctrl *pwr;
+	if (device == NULL)
+		return 0;
+	pwr = &device->pwrctrl;
+	return snprintf(buf, PAGE_SIZE, "%d\n", pwr->num_pwrlevels - 1);
+}
+
+/* Given a GPU clock value, return the nearest powerlevel */
+
+static int _get_nearest_pwrlevel(struct kgsl_pwrctrl *pwr, unsigned int clock)
+{
+	int i;
+
+	for (i = 0; i < pwr->num_pwrlevels - 1; i++) {
+		if (abs(pwr->pwrlevels[i].gpu_freq - clock) < 5000000)
+			return i;
+	}
+
+	return -ERANGE;
+}
+
+static int kgsl_pwrctrl_max_gpuclk_store(struct device *dev,
+					 struct device_attribute *attr,
+					 const char *buf, size_t count)
+{
+	struct kgsl_device *device = kgsl_device_from_dev(dev);
+	struct kgsl_pwrctrl *pwr;
+	unsigned long val;
+	int ret, level;
+
+	if (device == NULL)
+		return 0;
+
 	pwr = &device->pwrctrl;
 
 	ret = sscanf(buf, "%ld", &val);
@@ -144,40 +390,30 @@
 		return count;
 
 	mutex_lock(&device->mutex);
-	for (i = 0; i < pwr->num_pwrlevels; i++) {
-		if (abs(pwr->pwrlevels[i].gpu_freq - val) < delta) {
-			if (max)
-				pwr->thermal_pwrlevel = i;
-			break;
-		}
-	}
-
-	if (i == pwr->num_pwrlevels)
+	level = _get_nearest_pwrlevel(pwr, val);
+	if (level < 0)
 		goto done;
 
+	pwr->thermal_pwrlevel = level;
 
-	if (pwr->pwrlevels[pwr->active_pwrlevel].gpu_freq >
-	    pwr->pwrlevels[pwr->thermal_pwrlevel].gpu_freq)
+	/*
+	 * if the thermal limit is lower than the current setting,
+	 * move the speed down immediately
+	 */
+
+	if (pwr->thermal_pwrlevel > pwr->active_pwrlevel)
 		kgsl_pwrctrl_pwrlevel_change(device, pwr->thermal_pwrlevel);
-	else if (!max)
-		kgsl_pwrctrl_pwrlevel_change(device, i);
 
 done:
 	mutex_unlock(&device->mutex);
 	return count;
 }
 
-static int kgsl_pwrctrl_max_gpuclk_store(struct device *dev,
-					 struct device_attribute *attr,
-					 const char *buf, size_t count)
-{
-	return __gpuclk_store(1, dev, attr, buf, count);
-}
-
 static int kgsl_pwrctrl_max_gpuclk_show(struct device *dev,
 					struct device_attribute *attr,
 					char *buf)
 {
+
 	struct kgsl_device *device = kgsl_device_from_dev(dev);
 	struct kgsl_pwrctrl *pwr;
 	if (device == NULL)
@@ -191,7 +427,27 @@
 				     struct device_attribute *attr,
 				     const char *buf, size_t count)
 {
-	return __gpuclk_store(0, dev, attr, buf, count);
+	struct kgsl_device *device = kgsl_device_from_dev(dev);
+	struct kgsl_pwrctrl *pwr;
+	unsigned long val;
+	int ret, level;
+
+	if (device == NULL)
+		return 0;
+
+	pwr = &device->pwrctrl;
+
+	ret = sscanf(buf, "%ld", &val);
+	if (ret != 1)
+		return count;
+
+	mutex_lock(&device->mutex);
+	level = _get_nearest_pwrlevel(pwr, val);
+	if (level >= 0)
+		kgsl_pwrctrl_pwrlevel_change(device, level);
+
+	mutex_unlock(&device->mutex);
+	return count;
 }
 
 static int kgsl_pwrctrl_gpuclk_show(struct device *dev,
@@ -277,7 +533,7 @@
 
 	mutex_lock(&device->mutex);
 
-	
+	/* Let the timeout be requested in ms, but convert to jiffies. */
 	val /= div;
 	if (val >= org_interval_timeout)
 		pwr->interval_timeout = val;
@@ -304,20 +560,46 @@
 {
 	int ret;
 	struct kgsl_device *device = kgsl_device_from_dev(dev);
-	struct kgsl_busy *b = &device->pwrctrl.busy;
-	ret = snprintf(buf, 17, "%7d %7d\n",
-				   b->on_time_old, b->time_old);
+	struct kgsl_clk_stats *clkstats = &device->pwrctrl.clk_stats;
+	ret = snprintf(buf, PAGE_SIZE, "%7d %7d\n",
+			clkstats->on_time_old, clkstats->elapsed_old);
 	if (!test_bit(KGSL_PWRFLAGS_AXI_ON, &device->pwrctrl.power_flags)) {
-		b->on_time_old = 0;
-		b->time_old = 0;
+		clkstats->on_time_old = 0;
+		clkstats->elapsed_old = 0;
 	}
 	return ret;
 }
 
+static int kgsl_pwrctrl_gputop_show(struct device *dev,
+					struct device_attribute *attr,
+					char *buf)
+{
+	int ret;
+	struct kgsl_device *device = kgsl_device_from_dev(dev);
+	struct kgsl_clk_stats *clkstats = &device->pwrctrl.clk_stats;
+	int i = 0;
+	char *ptr = buf;
+
+	ret = snprintf(buf, PAGE_SIZE, "%7d %7d ", clkstats->on_time_old,
+					clkstats->elapsed_old);
+	for (i = 0, ptr += ret; i < device->pwrctrl.num_pwrlevels;
+							i++, ptr += ret)
+		ret = snprintf(ptr, PAGE_SIZE, "%7d ",
+						clkstats->old_clock_time[i]);
+
+	if (!test_bit(KGSL_PWRFLAGS_AXI_ON, &device->pwrctrl.power_flags)) {
+		clkstats->on_time_old = 0;
+		clkstats->elapsed_old = 0;
+		for (i = 0; i < KGSL_MAX_PWRLEVELS ; i++)
+			clkstats->old_clock_time[i] = 0;
+	}
+	return (unsigned int) (ptr - buf);
+}
+
 static int kgsl_pwrctrl_gpu_available_frequencies_show(
-				struct device *dev,
-				struct device_attribute *attr,
-				char *buf)
+					struct device *dev,
+					struct device_attribute *attr,
+					char *buf)
 {
 	struct kgsl_device *device = kgsl_device_from_dev(dev);
 	struct kgsl_pwrctrl *pwr;
@@ -333,131 +615,31 @@
 	return num_chars;
 }
 
-
-static int kgsl_pwrctrl_gpubusy_time_show(struct device *dev,
-                    struct device_attribute *attr,
-                    char *buf)
-{
-    int ret;
-    struct kgsl_device *device = kgsl_device_from_dev(dev);
-    s64 system_time, busy_time;
-
-    if(device == NULL)
-        return 0;
-
-    system_time = device->gputime.total;
-    do_div(system_time, 1000);
-    busy_time = device->gputime.busy;
-    do_div(busy_time, 1000);
-    ret = snprintf(buf, 63, "%lld %lld\n", system_time, busy_time);
-
-    return ret;
-}
-
-static int kgsl_pwrctrl_gpubusy_time_in_state_show(struct device *dev,
-        struct device_attribute *attr,
-        char *buf)
-{
-	int i;
-	char* tmp = buf;
-	struct kgsl_device *device = kgsl_device_from_dev(dev);
-	struct platform_device *pdev = NULL;
-	struct kgsl_device_platform_data *pdata = NULL;
-	s64 system_time, busy_time;
-
-	if (device == NULL)
-		return 0;
-
-	pdev = container_of(device->parentdev, struct platform_device, dev);
-	if (pdev == NULL)
-		return 0;
-
-	pdata = pdev->dev.platform_data;
-	if (pdata == NULL)
-		return 0;
-
-	for(i=0;i<pdata->num_levels;i++) {
-		system_time = device->gputime_in_state[i].total;
-		do_div(system_time, 1000);
-		busy_time = device->gputime_in_state[i].busy;
-		do_div(busy_time, 1000);
-		tmp = (char*)( (int)tmp + snprintf(tmp, PAGE_SIZE - (int)(tmp-buf), "%d %lld %lld\n", pdata->pwrlevel[i].gpu_freq, system_time, busy_time));
-	}
-	return (ssize_t)(tmp - buf);
-}
-
-static int kgsl_pwrctrl_gputime_in_state_show(struct device *dev,
-                    struct device_attribute *attr,
-                    char *buf)
-{
-	ssize_t len = 0;
-	int i;
-	struct kgsl_device *device = kgsl_device_from_dev(dev);
-
-	if (device == NULL)
-		return 0;
-
-	if (test_bit(KGSL_PWRFLAGS_CLK_ON, &device->pwrctrl.power_flags) || (device->state == KGSL_STATE_NAP))
-		gpufreq_stats_update(0, device->pwrctrl.active_pwrlevel, device->pwrctrl.active_pwrlevel);
-
-	for (i = 0; i < device->pwrctrl.num_pwrlevels; i++) {
-		len += sprintf(buf + len, "%u %llu\n", device->pwrctrl.pwrlevels[i].gpu_freq,
-					(unsigned long long)jiffies_to_clock_t(gputime_in_state[i]));
-	}
-
-	return len;
-}
-
-static int kgsl_pwrctrl_init_pwrlevel_store(struct device *dev,
-				struct device_attribute *attr,
-				const char *buf, size_t count)
-{
-	char temp[20];
-	unsigned long val;
-	struct kgsl_device *device = kgsl_device_from_dev(dev);
-	struct kgsl_pwrctrl *pwr;
-	int rc;
-
-	if (device == NULL)
-		return 0;
-	pwr = &device->pwrctrl;
-
-	snprintf(temp, sizeof(temp), "%.*s",
-		(int)min(count, sizeof(temp) - 1), buf);
-	rc = strict_strtoul(temp, 0, &val);
-	if (rc)
-		return rc;
-
-	mutex_lock(&device->mutex);
-
-	if (val >=0 && val < pwr->num_pwrlevels - 1)
-		pwr->default_pwrlevel = val;
-
-	mutex_unlock(&device->mutex);
-
-	return count;
-}
-
-static int kgsl_pwrctrl_init_pwrlevel_show(struct device *dev,
-				struct device_attribute *attr,
-				char *buf)
-{
-	struct kgsl_device *device = kgsl_device_from_dev(dev);
-	if (device == NULL)
-		return 0;
-	return snprintf(buf, PAGE_SIZE, "%d\n", device->pwrctrl.default_pwrlevel);
-}
-
 DEVICE_ATTR(gpuclk, 0644, kgsl_pwrctrl_gpuclk_show, kgsl_pwrctrl_gpuclk_store);
-DEVICE_ATTR(max_gpuclk, 0644, kgsl_pwrctrl_max_gpuclk_show, kgsl_pwrctrl_max_gpuclk_store);
+DEVICE_ATTR(max_gpuclk, 0644, kgsl_pwrctrl_max_gpuclk_show,
+	kgsl_pwrctrl_max_gpuclk_store);
 DEVICE_ATTR(pwrnap, 0664, kgsl_pwrctrl_pwrnap_show, kgsl_pwrctrl_pwrnap_store);
-DEVICE_ATTR(idle_timer, 0644, kgsl_pwrctrl_idle_timer_show, kgsl_pwrctrl_idle_timer_store);
-DEVICE_ATTR(gputime_in_state, 0444, kgsl_pwrctrl_gputime_in_state_show, NULL);
-DEVICE_ATTR(gpubusy, 0644, kgsl_pwrctrl_gpubusy_show, NULL);
-DEVICE_ATTR(gpu_available_frequencies, 0444,kgsl_pwrctrl_gpu_available_frequencies_show,NULL);
-DEVICE_ATTR(gpubusy_time, 0644, kgsl_pwrctrl_gpubusy_time_show, NULL);
-DEVICE_ATTR(gpubusy_time_in_state, 0644, kgsl_pwrctrl_gpubusy_time_in_state_show, NULL);
-DEVICE_ATTR(init_pwrlevel, 0644, kgsl_pwrctrl_init_pwrlevel_show, kgsl_pwrctrl_init_pwrlevel_store);
+DEVICE_ATTR(idle_timer, 0644, kgsl_pwrctrl_idle_timer_show,
+	kgsl_pwrctrl_idle_timer_store);
+DEVICE_ATTR(gpubusy, 0444, kgsl_pwrctrl_gpubusy_show,
+	NULL);
+DEVICE_ATTR(gputop, 0444, kgsl_pwrctrl_gputop_show,
+	NULL);
+DEVICE_ATTR(gpu_available_frequencies, 0444,
+	kgsl_pwrctrl_gpu_available_frequencies_show,
+	NULL);
+DEVICE_ATTR(max_pwrlevel, 0644,
+	kgsl_pwrctrl_max_pwrlevel_show,
+	kgsl_pwrctrl_max_pwrlevel_store);
+DEVICE_ATTR(min_pwrlevel, 0644,
+	kgsl_pwrctrl_min_pwrlevel_show,
+	kgsl_pwrctrl_min_pwrlevel_store);
+DEVICE_ATTR(thermal_pwrlevel, 0644,
+	kgsl_pwrctrl_thermal_pwrlevel_show,
+	kgsl_pwrctrl_thermal_pwrlevel_store);
+DEVICE_ATTR(num_pwrlevels, 0444,
+	kgsl_pwrctrl_num_pwrlevels_show,
+	NULL);
 
 static const struct device_attribute *pwrctrl_attr_list[] = {
 	&dev_attr_gpuclk,
@@ -465,11 +647,12 @@
 	&dev_attr_pwrnap,
 	&dev_attr_idle_timer,
 	&dev_attr_gpubusy,
+	&dev_attr_gputop,
 	&dev_attr_gpu_available_frequencies,
-	&dev_attr_gpubusy_time,
-	&dev_attr_gpubusy_time_in_state,
-	&dev_attr_gputime_in_state,
-	&dev_attr_init_pwrlevel,
+	&dev_attr_max_pwrlevel,
+	&dev_attr_min_pwrlevel,
+	&dev_attr_thermal_pwrlevel,
+	&dev_attr_num_pwrlevels,
 	NULL
 };
 
@@ -483,27 +666,37 @@
 	kgsl_remove_device_sysfs_files(device->dev, pwrctrl_attr_list);
 }
 
+static void update_statistics(struct kgsl_device *device)
+{
+	struct kgsl_clk_stats *clkstats = &device->pwrctrl.clk_stats;
+	unsigned int on_time = 0;
+	int i;
+	int num_pwrlevels = device->pwrctrl.num_pwrlevels - 1;
+	/* Per clock level time */
+	for (i = 0; i < num_pwrlevels; i++) {
+		clkstats->old_clock_time[i] = clkstats->clock_time[i];
+		on_time += clkstats->clock_time[i];
+		clkstats->clock_time[i] = 0;
+	}
+	clkstats->old_clock_time[num_pwrlevels] =
+				clkstats->clock_time[num_pwrlevels];
+	clkstats->clock_time[num_pwrlevels] = 0;
+	clkstats->on_time_old = on_time;
+	clkstats->elapsed_old = clkstats->elapsed;
+	clkstats->elapsed = 0;
+}
+
+/* Track the amount of time the gpu is on vs the total system time. *
+ * Regularly update the percentage of busy time displayed by sysfs. */
 static void kgsl_pwrctrl_busy_time(struct kgsl_device *device, bool on_time)
 {
-	struct kgsl_busy *b = &device->pwrctrl.busy;
-	int elapsed;
-	if (b->start.tv_sec == 0)
-		do_gettimeofday(&(b->start));
-	do_gettimeofday(&(b->stop));
-	elapsed = (b->stop.tv_sec - b->start.tv_sec) * 1000000;
-	elapsed += b->stop.tv_usec - b->start.tv_usec;
-	b->time += elapsed;
-	if (on_time)
-		b->on_time += elapsed;
-	
-	if ((b->time > UPDATE_BUSY_VAL) ||
+	struct kgsl_clk_stats *clkstats = &device->pwrctrl.clk_stats;
+	update_clk_statistics(device, on_time);
+	/* Update the output regularly and reset the counters. */
+	if ((clkstats->elapsed > UPDATE_BUSY_VAL) ||
 		!test_bit(KGSL_PWRFLAGS_AXI_ON, &device->pwrctrl.power_flags)) {
-		b->on_time_old = b->on_time;
-		b->time_old = b->time;
-		b->on_time = 0;
-		b->time = 0;
+		update_statistics(device);
 	}
-	do_gettimeofday(&(b->start));
 }
 
 void kgsl_pwrctrl_clk(struct kgsl_device *device, int state,
@@ -515,20 +708,10 @@
 		if (test_and_clear_bit(KGSL_PWRFLAGS_CLK_ON,
 			&pwr->power_flags)) {
 			trace_kgsl_clk(device, state);
-#ifdef CONFIG_MSM_KGSL_GPU_USAGE_SYSTRACE
-			if(device->id == 0) {
-				trace_kgsl_usage(device, state, task_tgid_nr(current), device->gputime.total, device->gputime.busy,
-				pwr->active_pwrlevel, pwr->pwrlevels[pwr->active_pwrlevel].gpu_freq);
-				device->prev_pid= -1;
-			}
-#endif
 			for (i = KGSL_MAX_CLKS - 1; i > 0; i--)
-				if (pwr->grp_clks[i]) {
+				if (pwr->grp_clks[i])
 					clk_disable(pwr->grp_clks[i]);
-					if (i == 0)
-						gpufreq_stats_update(0, pwr->active_pwrlevel, (pwr->num_pwrlevels - 1));
-				}
-			
+			/* High latency clock maintenance. */
 			if ((pwr->pwrlevels[0].gpu_freq > 0) &&
 				(requested_state != KGSL_STATE_NAP)) {
 				clk_set_rate(pwr->grp_clks[0],
@@ -539,20 +722,21 @@
 						clk_unprepare(pwr->grp_clks[i]);
 			}
 			kgsl_pwrctrl_busy_time(device, true);
+		} else if (requested_state == KGSL_STATE_SLEEP) {
+			/* High latency clock maintenance. */
+			if ((pwr->pwrlevels[0].gpu_freq > 0))
+				clk_set_rate(pwr->grp_clks[0],
+					pwr->pwrlevels[pwr->num_pwrlevels - 1].
+					gpu_freq);
+			for (i = KGSL_MAX_CLKS - 1; i > 0; i--)
+				if (pwr->grp_clks[i])
+					clk_unprepare(pwr->grp_clks[i]);
 		}
 	} else if (state == KGSL_PWRFLAGS_ON) {
 		if (!test_and_set_bit(KGSL_PWRFLAGS_CLK_ON,
 			&pwr->power_flags)) {
-
 			trace_kgsl_clk(device, state);
-#ifdef CONFIG_MSM_KGSL_GPU_USAGE_SYSTRACE
-			if(device->id == 0) {
-				trace_kgsl_usage(device, state, task_tgid_nr(current), device->gputime.total, device->gputime.busy,
-				pwr->active_pwrlevel, pwr->pwrlevels[pwr->active_pwrlevel].gpu_freq);
-				device->prev_pid = task_tgid_nr(current);
-			}
-#endif
-			
+			/* High latency clock maintenance. */
 			if (device->state != KGSL_STATE_NAP) {
 				for (i = KGSL_MAX_CLKS - 1; i > 0; i--)
 					if (pwr->grp_clks[i])
@@ -564,12 +748,11 @@
 						[pwr->active_pwrlevel].
 						gpu_freq);
 			}
+			/* as the last step, enable grp_clk so that the GPU
+			 * interrupt can come through */
 			for (i = KGSL_MAX_CLKS - 1; i > 0; i--)
-				if (pwr->grp_clks[i]) {
+				if (pwr->grp_clks[i])
 					clk_enable(pwr->grp_clks[i]);
-					if (i == 0)
-						gpufreq_stats_update(1, KGSL_MAX_PWRLEVELS, pwr->active_pwrlevel);
-				}
 			kgsl_pwrctrl_busy_time(device, false);
 		}
 	}
@@ -617,8 +800,8 @@
 		if (test_and_clear_bit(KGSL_PWRFLAGS_POWER_ON,
 			&pwr->power_flags)) {
 			trace_kgsl_rail(device, state);
-			if (pwr->gpu_dig)
-				regulator_disable(pwr->gpu_dig);
+			if (pwr->gpu_cx)
+				regulator_disable(pwr->gpu_cx);
 			if (pwr->gpu_reg)
 				regulator_disable(pwr->gpu_reg);
 		}
@@ -634,8 +817,8 @@
 							"failed: %d\n",
 							status);
 			}
-			if (pwr->gpu_dig) {
-				int status = regulator_enable(pwr->gpu_dig);
+			if (pwr->gpu_cx) {
+				int status = regulator_enable(pwr->gpu_cx);
 				if (status)
 					KGSL_DRV_ERR(device,
 							"cx regulator_enable "
@@ -678,9 +861,7 @@
 	struct kgsl_pwrctrl *pwr = &device->pwrctrl;
 	struct kgsl_device_platform_data *pdata = pdev->dev.platform_data;
 
-	spin_lock_init(&gpufreq_stats_lock);
-
-	
+	/*acquire clocks */
 	for (i = 0; i < KGSL_MAX_CLKS; i++) {
 		if (pdata->clk_map & clks[i].map) {
 			clk = clk_get(&pdev->dev, clks[i].name);
@@ -689,11 +870,11 @@
 			pwr->grp_clks[i] = clk;
 		}
 	}
-	
+	/* Make sure we have a source clk for freq setting */
 	if (pwr->grp_clks[0] == NULL)
 		pwr->grp_clks[0] = pwr->grp_clks[1];
 
-	
+	/* put the AXI bus into asynchronous mode with the graphics cores */
 	if (pdata->set_grp_async != NULL)
 		pdata->set_grp_async();
 
@@ -704,6 +885,13 @@
 		goto done;
 	}
 	pwr->num_pwrlevels = pdata->num_levels;
+
+	/* Initialize the user and thermal clock constraints */
+
+	pwr->max_pwrlevel = 0;
+	pwr->min_pwrlevel = pdata->num_levels - 2;
+	pwr->thermal_pwrlevel = 0;
+
 	pwr->active_pwrlevel = pdata->init_level;
 	pwr->default_pwrlevel = pdata->init_level;
 	for (i = 0; i < pdata->num_levels; i++) {
@@ -717,7 +905,7 @@
 		pwr->pwrlevels[i].io_fraction =
 			pdata->pwrlevel[i].io_fraction;
 	}
-	
+	/* Do not set_rate for targets in sync with AXI */
 	if (pwr->pwrlevels[0].gpu_freq > 0)
 		clk_set_rate(pwr->grp_clks[0], pwr->
 				pwrlevels[pwr->num_pwrlevels - 1].gpu_freq);
@@ -727,11 +915,11 @@
 		pwr->gpu_reg = NULL;
 
 	if (pwr->gpu_reg) {
-		pwr->gpu_dig = regulator_get(&pdev->dev, "vdd_dig");
-		if (IS_ERR(pwr->gpu_dig))
-			pwr->gpu_dig = NULL;
+		pwr->gpu_cx = regulator_get(&pdev->dev, "vddcx");
+		if (IS_ERR(pwr->gpu_cx))
+			pwr->gpu_cx = NULL;
 	} else
-		pwr->gpu_dig = NULL;
+		pwr->gpu_cx = NULL;
 
 	pwr->power_flags = 0;
 
@@ -762,9 +950,6 @@
 
 	pm_runtime_enable(device->parentdev);
 	register_early_suspend(&device->display_off);
-
-	gpufreq_stats_update(1, pwr->active_pwrlevel, KGSL_MAX_PWRLEVELS);
-
 	return result;
 
 clk_err:
@@ -798,9 +983,9 @@
 		pwr->gpu_reg = NULL;
 	}
 
-	if (pwr->gpu_dig) {
-		regulator_put(pwr->gpu_dig);
-		pwr->gpu_dig = NULL;
+	if (pwr->gpu_cx) {
+		regulator_put(pwr->gpu_cx);
+		pwr->gpu_cx = NULL;
 	}
 
 	for (i = 1; i < KGSL_MAX_CLKS; i++)
@@ -829,14 +1014,17 @@
 			mod_timer(&device->idle_timer,
 					jiffies +
 					device->pwrctrl.interval_timeout);
-			device->pwrctrl.busy.no_nap_cnt++;
-			if (device->pwrctrl.busy.no_nap_cnt > UPDATE_BUSY) {
+			/* If the GPU has been too busy to sleep, make sure
+			 * that is accurately reflected in the % busy numbers. */
+			device->pwrctrl.clk_stats.no_nap_cnt++;
+			if (device->pwrctrl.clk_stats.no_nap_cnt >
+							 UPDATE_BUSY) {
 				kgsl_pwrctrl_busy_time(device, true);
-				device->pwrctrl.busy.no_nap_cnt = 0;
+				device->pwrctrl.clk_stats.no_nap_cnt = 0;
 			}
 		}
 	} else if (device->state & (KGSL_STATE_HUNG |
-					KGSL_STATE_DUMP_AND_RECOVER)) {
+					KGSL_STATE_DUMP_AND_FT)) {
 		kgsl_pwrctrl_request_state(device, KGSL_STATE_NONE);
 	}
 
@@ -854,7 +1042,7 @@
 			kgsl_pwrctrl_request_state(device, KGSL_STATE_SLUMBER);
 		else
 			kgsl_pwrctrl_request_state(device, KGSL_STATE_SLEEP);
-		
+		/* Have work run in a non-interrupt context. */
 		queue_work(device->work_queue, &device->idle_check_ws);
 	}
 }
@@ -875,7 +1063,7 @@
 		break;
 	case KGSL_STATE_INIT:
 	case KGSL_STATE_HUNG:
-	case KGSL_STATE_DUMP_AND_RECOVER:
+	case KGSL_STATE_DUMP_AND_FT:
 		if (test_bit(KGSL_PWRFLAGS_CLK_ON,
 					 &device->pwrctrl.power_flags))
 			break;
@@ -899,9 +1087,9 @@
 		mutex_unlock(&device->mutex);
 		wait_for_completion(&device->hwaccess_gate);
 		mutex_lock(&device->mutex);
-	} else if (device->state == KGSL_STATE_DUMP_AND_RECOVER) {
+	} else if (device->state == KGSL_STATE_DUMP_AND_FT) {
 		mutex_unlock(&device->mutex);
-		wait_for_completion(&device->recovery_gate);
+		wait_for_completion(&device->ft_gate);
 		mutex_lock(&device->mutex);
 	} else if (device->state == KGSL_STATE_SLUMBER)
 		kgsl_pwrctrl_wake(device);
@@ -934,7 +1122,7 @@
 _sleep_accounting(struct kgsl_device *device)
 {
 	kgsl_pwrctrl_busy_time(device, false);
-	device->pwrctrl.busy.start.tv_sec = 0;
+	device->pwrctrl.clk_stats.start = ktime_set(0, 0);
 	device->pwrctrl.time = 0;
 	kgsl_pwrscale_sleep(device);
 }
@@ -942,21 +1130,16 @@
 static int
 _sleep(struct kgsl_device *device)
 {
-	struct kgsl_pwrctrl *pwr = &device->pwrctrl;
 	switch (device->state) {
 	case KGSL_STATE_ACTIVE:
 		if (!device->ftbl->isidle(device)) {
 			kgsl_pwrctrl_request_state(device, KGSL_STATE_NONE);
 			return -EBUSY;
 		}
-		
+		/* fall through */
 	case KGSL_STATE_NAP:
 		kgsl_pwrctrl_irq(device, KGSL_PWRFLAGS_OFF);
 		kgsl_pwrctrl_axi(device, KGSL_PWRFLAGS_OFF);
-		if (pwr->pwrlevels[0].gpu_freq > 0)
-			clk_set_rate(pwr->grp_clks[0],
-				pwr->pwrlevels[pwr->num_pwrlevels - 1].
-				gpu_freq);
 		_sleep_accounting(device);
 		kgsl_pwrctrl_clk(device, KGSL_PWRFLAGS_OFF, KGSL_STATE_SLEEP);
 		kgsl_pwrctrl_set_state(device, KGSL_STATE_SLEEP);
@@ -971,6 +1154,9 @@
 				kgsl_pwrstate_to_str(device->state));
 		break;
 	}
+
+	kgsl_mmu_disable_clk_on_ts(&device->mmu, 0, false);
+
 	return 0;
 }
 
@@ -983,7 +1169,7 @@
 			kgsl_pwrctrl_request_state(device, KGSL_STATE_NONE);
 			return -EBUSY;
 		}
-		
+		/* fall through */
 	case KGSL_STATE_NAP:
 	case KGSL_STATE_SLEEP:
 		del_timer_sync(&device->idle_timer);
@@ -1004,12 +1190,14 @@
 	return 0;
 }
 
+/******************************************************************/
+/* Caller must hold the device mutex. */
 int kgsl_pwrctrl_sleep(struct kgsl_device *device)
 {
 	int status = 0;
 	KGSL_PWR_INFO(device, "sleep device %d\n", device->id);
 
-	
+	/* Work through the legal state transitions */
 	switch (device->requested_state) {
 	case KGSL_STATE_NAP:
 		status = _nap(device);
@@ -1031,6 +1219,8 @@
 }
 EXPORT_SYMBOL(kgsl_pwrctrl_sleep);
 
+/******************************************************************/
+/* Caller must hold the device mutex. */
 void kgsl_pwrctrl_wake(struct kgsl_device *device)
 {
 	int status;
@@ -1043,23 +1233,24 @@
 			KGSL_DRV_ERR(device, "start failed %d\n", status);
 			break;
 		}
-		
+		/* fall through */
 	case KGSL_STATE_SLEEP:
 		kgsl_pwrctrl_axi(device, KGSL_PWRFLAGS_ON);
 		kgsl_pwrscale_wake(device);
-		
+		/* fall through */
 	case KGSL_STATE_NAP:
-		
+		/* Turn on the core clocks */
 		kgsl_pwrctrl_clk(device, KGSL_PWRFLAGS_ON, KGSL_STATE_ACTIVE);
-		
+		/* Enable state before turning on irq */
 		kgsl_pwrctrl_set_state(device, KGSL_STATE_ACTIVE);
 		kgsl_pwrctrl_irq(device, KGSL_PWRFLAGS_ON);
-		
+		/* Re-enable HW access */
 		mod_timer(&device->idle_timer,
 				jiffies + device->pwrctrl.interval_timeout);
 		pm_qos_update_request(&device->pm_qos_req_dma,
-						GPU_SWFI_LATENCY);
+					GPU_SWFI_LATENCY);
 	case KGSL_STATE_ACTIVE:
+		kgsl_pwrctrl_request_state(device, KGSL_STATE_NONE);
 		break;
 	default:
 		KGSL_PWR_WARN(device, "unhandled state %s\n",
@@ -1072,7 +1263,7 @@
 
 void kgsl_pwrctrl_enable(struct kgsl_device *device)
 {
-	
+	/* Order pwrrail/clk sequence based upon platform */
 	kgsl_pwrctrl_pwrrail(device, KGSL_PWRFLAGS_ON);
 	kgsl_pwrctrl_clk(device, KGSL_PWRFLAGS_ON, KGSL_STATE_ACTIVE);
 	kgsl_pwrctrl_axi(device, KGSL_PWRFLAGS_ON);
@@ -1081,7 +1272,7 @@
 
 void kgsl_pwrctrl_disable(struct kgsl_device *device)
 {
-	
+	/* Order pwrrail/clk sequence based upon platform */
 	kgsl_pwrctrl_axi(device, KGSL_PWRFLAGS_OFF);
 	kgsl_pwrctrl_clk(device, KGSL_PWRFLAGS_OFF, KGSL_STATE_SLEEP);
 	kgsl_pwrctrl_pwrrail(device, KGSL_PWRFLAGS_OFF);
@@ -1121,7 +1312,7 @@
 		return "SUSPEND";
 	case KGSL_STATE_HUNG:
 		return "HUNG";
-	case KGSL_STATE_DUMP_AND_RECOVER:
+	case KGSL_STATE_DUMP_AND_FT:
 		return "DNR";
 	case KGSL_STATE_SLUMBER:
 		return "SLUMBER";
diff --git a/drivers/gpu/msm/kgsl_pwrctrl.h b/drivers/gpu/msm/kgsl_pwrctrl.h
index 591582f..8d66505 100644
--- a/drivers/gpu/msm/kgsl_pwrctrl.h
+++ b/drivers/gpu/msm/kgsl_pwrctrl.h
@@ -1,4 +1,4 @@
-/* Copyright (c) 2010-2012, Code Aurora Forum. All rights reserved.
+/* Copyright (c) 2010-2012, The Linux Foundation. All rights reserved.
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License version 2 and
@@ -13,6 +13,9 @@
 #ifndef __KGSL_PWRCTRL_H
 #define __KGSL_PWRCTRL_H
 
+/*****************************************************************************
+** power flags
+*****************************************************************************/
 #define KGSL_PWRFLAGS_ON   1
 #define KGSL_PWRFLAGS_OFF  0
 
@@ -24,16 +27,41 @@
 
 struct platform_device;
 
-struct kgsl_busy {
-	struct timeval start;
-	struct timeval stop;
-	int on_time;
-	int time;
-	int on_time_old;
-	int time_old;
+struct kgsl_clk_stats {
+	unsigned int old_clock_time[KGSL_MAX_PWRLEVELS];
+	unsigned int clock_time[KGSL_MAX_PWRLEVELS];
+	unsigned int on_time_old;
+	ktime_t start;
+	ktime_t stop;
 	unsigned int no_nap_cnt;
+	unsigned int elapsed;
+	unsigned int elapsed_old;
 };
 
+/**
+ * struct kgsl_pwrctrl - Power control settings for a KGSL device
+ * @interrupt_num - The interrupt number for the device
+ * @ebi1_clk - Pointer to the EBI clock structure
+ * @grp_clks - Array of clocks structures that we control
+ * @power_flags - Control flags for power
+ * @pwrlevels - List of supported power levels
+ * @active_pwrlevel - The currently active power level
+ * @thermal_pwrlevel - maximum powerlevel constraint from thermal
+ * @max_pwrlevel - maximum allowable powerlevel per the user
+ * @min_pwrlevel - minimum allowable powerlevel per the user
+ * @num_pwrlevels - number of available power levels
+ * @interval_timeout - timeout in jiffies to be idle before a power event
+ * @strtstp_sleepwake - true if the device supports low latency GPU start/stop
+ * @gpu_reg - pointer to the regulator structure for gpu_reg
+ * @gpu_cx - pointer to the regulator structure for gpu_cx
+ * @pcl - bus scale identifier
+ * @nap_allowed - true if the device supports naps
+ * @idle_needed - true if the device needs an idle before clock change
+ * @irq_name - resource name for the IRQ
+ * @restore_slumber - Flag to indicate that we are in a suspend/restore sequence
+ * @clk_stats - structure of clock statistics
+ */
+
 struct kgsl_pwrctrl {
 	int interrupt_num;
 	struct clk *ebi1_clk;
@@ -43,18 +71,20 @@
 	unsigned int active_pwrlevel;
 	int thermal_pwrlevel;
 	unsigned int default_pwrlevel;
+	unsigned int max_pwrlevel;
+	unsigned int min_pwrlevel;
 	unsigned int num_pwrlevels;
 	unsigned int interval_timeout;
 	bool strtstp_sleepwake;
 	struct regulator *gpu_reg;
-	struct regulator *gpu_dig;
+	struct regulator *gpu_cx;
 	uint32_t pcl;
 	unsigned int nap_allowed;
 	unsigned int idle_needed;
 	const char *irq_name;
 	s64 time;
-	struct kgsl_busy busy;
 	unsigned int restore_slumber;
+	struct kgsl_clk_stats clk_stats;
 };
 
 void kgsl_pwrctrl_irq(struct kgsl_device *device, int state);
@@ -79,4 +109,4 @@
 
 void kgsl_pwrctrl_set_state(struct kgsl_device *device, unsigned int state);
 void kgsl_pwrctrl_request_state(struct kgsl_device *device, unsigned int state);
-#endif 
+#endif /* __KGSL_PWRCTRL_H */
diff --git a/drivers/gpu/msm/kgsl_pwrscale.c b/drivers/gpu/msm/kgsl_pwrscale.c
index 12e1885..dffae70 100644
--- a/drivers/gpu/msm/kgsl_pwrscale.c
+++ b/drivers/gpu/msm/kgsl_pwrscale.c
@@ -1,4 +1,4 @@
-/* Copyright (c) 2010-2012, Code Aurora Forum. All rights reserved.
+/* Copyright (c) 2010-2012, The Linux Foundation. All rights reserved.
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License version 2 and
@@ -39,6 +39,7 @@
 struct kgsl_pwrscale_attribute pwrscale_attr_##_name = \
 __ATTR(_name, _mode, _show, _store)
 
+/* Master list of available policies */
 
 static struct kgsl_pwrscale_policy *kgsl_pwrscale_policies[] = {
 #ifdef CONFIG_MSM_SCM
@@ -59,6 +60,8 @@
 	int i;
 	struct kgsl_pwrscale_policy *policy = NULL;
 
+	/* The special keyword none allows the user to detach all
+	   policies */
 	if (!strncmp("none", buf, 4)) {
 		kgsl_pwrscale_detach_policy(device);
 		return count;
@@ -234,11 +237,9 @@
 void kgsl_pwrscale_busy(struct kgsl_device *device)
 {
 	if (PWRSCALE_ACTIVE(device) && device->pwrscale.policy->busy)
-		if ((!device->pwrscale.gpu_busy) &&
-			(device->requested_state != KGSL_STATE_SLUMBER))
+		if (device->requested_state != KGSL_STATE_SLUMBER)
 			device->pwrscale.policy->busy(device,
 					&device->pwrscale);
-	device->pwrscale.gpu_busy = 1;
 }
 
 void kgsl_pwrscale_idle(struct kgsl_device *device)
@@ -248,7 +249,6 @@
 			device->requested_state != KGSL_STATE_SLEEP)
 			device->pwrscale.policy->idle(device,
 					&device->pwrscale);
-	device->pwrscale.gpu_busy = 0;
 }
 EXPORT_SYMBOL(kgsl_pwrscale_idle);
 
@@ -299,8 +299,14 @@
 {
 	if (device->pwrscale.policy != NULL) {
 		device->pwrscale.policy->close(device, &device->pwrscale);
+
+		/*
+		 * Try to set max pwrlevel which will be limited to thermal by
+		 * kgsl_pwrctrl_pwrlevel_change if thermal is indeed lower
+		 */
+
 		kgsl_pwrctrl_pwrlevel_change(device,
-				device->pwrctrl.thermal_pwrlevel);
+				device->pwrctrl.max_pwrlevel);
 	}
 	device->pwrscale.policy = NULL;
 }
@@ -333,7 +339,7 @@
 
 	device->pwrscale.policy = policy;
 
-	
+	/* Pwrscale is enabled by default at attach time */
 	kgsl_pwrscale_enable(device);
 
 	if (policy) {
diff --git a/drivers/gpu/msm/kgsl_pwrscale.h b/drivers/gpu/msm/kgsl_pwrscale.h
index 34698cd..f17b394 100644
--- a/drivers/gpu/msm/kgsl_pwrscale.h
+++ b/drivers/gpu/msm/kgsl_pwrscale.h
@@ -1,4 +1,4 @@
-/* Copyright (c) 2010-2012, Code Aurora Forum. All rights reserved.
+/* Copyright (c) 2010-2012, The Linux Foundation. All rights reserved.
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License version 2 and
@@ -36,7 +36,6 @@
 	struct kgsl_pwrscale_policy *policy;
 	struct kobject kobj;
 	void *priv;
-	int gpu_busy;
 	int enabled;
 };
 
diff --git a/drivers/gpu/msm/kgsl_pwrscale_idlestats.c b/drivers/gpu/msm/kgsl_pwrscale_idlestats.c
new file mode 100644
index 0000000..c3188a5
--- /dev/null
+++ b/drivers/gpu/msm/kgsl_pwrscale_idlestats.c
@@ -0,0 +1,232 @@
+/* Copyright (c) 2011-2012, The Linux Foundation. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 and
+ * only version 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ */
+
+#include <linux/slab.h>
+#include <linux/timer.h>
+#include <linux/idle_stats_device.h>
+#include <linux/cpufreq.h>
+#include <linux/notifier.h>
+#include <linux/cpumask.h>
+#include <linux/tick.h>
+
+#include "kgsl.h"
+#include "kgsl_pwrscale.h"
+#include "kgsl_device.h"
+
+#define MAX_CORES 4
+struct _cpu_info {
+	spinlock_t lock;
+	struct notifier_block cpu_nb;
+	u64 start[MAX_CORES];
+	u64 end[MAX_CORES];
+	int curr_freq[MAX_CORES];
+	int max_freq[MAX_CORES];
+};
+
+struct idlestats_priv {
+	char name[32];
+	struct msm_idle_stats_device idledev;
+	struct kgsl_device *device;
+	struct msm_idle_pulse pulse;
+	struct _cpu_info cpu_info;
+};
+
+static int idlestats_cpufreq_notifier(
+				struct notifier_block *nb,
+				unsigned long val, void *data)
+{
+	struct _cpu_info *cpu = container_of(nb,
+						struct _cpu_info, cpu_nb);
+	struct cpufreq_freqs *freq = data;
+
+	if (val != CPUFREQ_POSTCHANGE)
+		return 0;
+
+	spin_lock(&cpu->lock);
+	if (freq->cpu < num_possible_cpus())
+		cpu->curr_freq[freq->cpu] = freq->new / 1000;
+	spin_unlock(&cpu->lock);
+
+	return 0;
+}
+
+static void idlestats_get_sample(struct msm_idle_stats_device *idledev,
+	struct msm_idle_pulse *pulse)
+{
+	struct kgsl_power_stats stats;
+	struct idlestats_priv *priv = container_of(idledev,
+		struct idlestats_priv, idledev);
+	struct kgsl_device *device = priv->device;
+	struct kgsl_pwrctrl *pwr = &device->pwrctrl;
+
+	mutex_lock(&device->mutex);
+	/* If the GPU is asleep, don't wake it up - assume that we
+	   are idle */
+
+	if (device->state == KGSL_STATE_ACTIVE) {
+		device->ftbl->power_stats(device, &stats);
+		pulse->busy_start_time = pwr->time - stats.busy_time;
+		pulse->busy_interval = stats.busy_time;
+	} else {
+		pulse->busy_start_time = pwr->time;
+		pulse->busy_interval = 0;
+	}
+	pulse->wait_interval = 0;
+	mutex_unlock(&device->mutex);
+}
+
+static void idlestats_busy(struct kgsl_device *device,
+			struct kgsl_pwrscale *pwrscale)
+{
+	struct idlestats_priv *priv = pwrscale->priv;
+	struct kgsl_power_stats stats;
+	int i, busy, nr_cpu = 1;
+
+	if (priv->pulse.busy_start_time != 0) {
+		priv->pulse.wait_interval = 0;
+		/* Calculate the total CPU busy time for this GPU pulse */
+		for (i = 0; i < num_possible_cpus(); i++) {
+			spin_lock(&priv->cpu_info.lock);
+			if (cpu_online(i)) {
+				priv->cpu_info.end[i] =
+						(u64)ktime_to_us(ktime_get()) -
+						get_cpu_idle_time_us(i, NULL);
+				busy = priv->cpu_info.end[i] -
+						priv->cpu_info.start[i];
+				/* Normalize the busy time by frequency */
+				busy = priv->cpu_info.curr_freq[i] *
+					(busy / priv->cpu_info.max_freq[i]);
+				priv->pulse.wait_interval += busy;
+				nr_cpu++;
+			}
+			spin_unlock(&priv->cpu_info.lock);
+		}
+		priv->pulse.wait_interval /= nr_cpu;
+
+		/* This is called from within a mutex protected function, so
+		   no additional locking required */
+		device->ftbl->power_stats(device, &stats);
+
+		/* If total_time is zero, then we don't have
+		   any interesting statistics to store */
+		if (stats.total_time == 0) {
+			priv->pulse.busy_start_time = 0;
+			return;
+		}
+
+		priv->pulse.busy_interval = stats.busy_time;
+		msm_idle_stats_idle_end(&priv->idledev, &priv->pulse);
+	}
+	priv->pulse.busy_start_time = ktime_to_us(ktime_get());
+}
+
+static void idlestats_idle(struct kgsl_device *device,
+		struct kgsl_pwrscale *pwrscale)
+{
+	int i, nr_cpu;
+	struct idlestats_priv *priv = pwrscale->priv;
+
+	nr_cpu = num_possible_cpus();
+	for (i = 0; i < nr_cpu; i++)
+		if (cpu_online(i))
+			priv->cpu_info.start[i] =
+					(u64)ktime_to_us(ktime_get()) -
+					get_cpu_idle_time_us(i, NULL);
+
+	msm_idle_stats_idle_start(&priv->idledev);
+}
+
+static void idlestats_sleep(struct kgsl_device *device,
+			struct kgsl_pwrscale *pwrscale)
+{
+	struct idlestats_priv *priv = pwrscale->priv;
+	msm_idle_stats_update_event(&priv->idledev,
+		MSM_IDLE_STATS_EVENT_IDLE_TIMER_EXPIRED);
+}
+
+static void idlestats_wake(struct kgsl_device *device,
+			struct kgsl_pwrscale *pwrscale)
+{
+	/* Use highest perf level on wake-up from
+	   sleep for better performance */
+	kgsl_pwrctrl_pwrlevel_change(device, KGSL_PWRLEVEL_TURBO);
+}
+
+static int idlestats_init(struct kgsl_device *device,
+		     struct kgsl_pwrscale *pwrscale)
+{
+	struct idlestats_priv *priv;
+	struct cpufreq_policy cpu_policy;
+	int ret, i;
+
+	priv = pwrscale->priv = kzalloc(sizeof(struct idlestats_priv),
+		GFP_KERNEL);
+	if (pwrscale->priv == NULL)
+		return -ENOMEM;
+
+	snprintf(priv->name, sizeof(priv->name), "idle_stats_%s",
+		 device->name);
+
+	priv->device = device;
+
+	priv->idledev.name = (const char *) priv->name;
+	priv->idledev.get_sample = idlestats_get_sample;
+
+	spin_lock_init(&priv->cpu_info.lock);
+	priv->cpu_info.cpu_nb.notifier_call =
+			idlestats_cpufreq_notifier;
+	ret = cpufreq_register_notifier(&priv->cpu_info.cpu_nb,
+				CPUFREQ_TRANSITION_NOTIFIER);
+	if (ret)
+		goto err;
+	for (i = 0; i < num_possible_cpus(); i++) {
+		cpufreq_frequency_table_cpuinfo(&cpu_policy,
+					cpufreq_frequency_get_table(i));
+		priv->cpu_info.max_freq[i] = cpu_policy.max / 1000;
+		priv->cpu_info.curr_freq[i] = cpu_policy.max / 1000;
+	}
+	ret = msm_idle_stats_register_device(&priv->idledev);
+err:
+	if (ret) {
+		kfree(pwrscale->priv);
+		pwrscale->priv = NULL;
+	}
+
+	return ret;
+}
+
+static void idlestats_close(struct kgsl_device *device,
+		      struct kgsl_pwrscale *pwrscale)
+{
+	struct idlestats_priv *priv = pwrscale->priv;
+
+	if (pwrscale->priv == NULL)
+		return;
+
+	cpufreq_unregister_notifier(&priv->cpu_info.cpu_nb,
+						CPUFREQ_TRANSITION_NOTIFIER);
+	msm_idle_stats_deregister_device(&priv->idledev);
+
+	kfree(pwrscale->priv);
+	pwrscale->priv = NULL;
+}
+
+struct kgsl_pwrscale_policy kgsl_pwrscale_policy_idlestats = {
+	.name = "idlestats",
+	.init = idlestats_init,
+	.idle = idlestats_idle,
+	.busy = idlestats_busy,
+	.sleep = idlestats_sleep,
+	.wake = idlestats_wake,
+	.close = idlestats_close
+};
diff --git a/drivers/gpu/msm/kgsl_pwrscale_msm.c b/drivers/gpu/msm/kgsl_pwrscale_msm.c
index f3948c3..073e474 100644
--- a/drivers/gpu/msm/kgsl_pwrscale_msm.c
+++ b/drivers/gpu/msm/kgsl_pwrscale_msm.c
@@ -1,4 +1,4 @@
-/* Copyright (c) 2012, Code Aurora Forum. All rights reserved.
+/* Copyright (c) 2012, The Linux Foundation. All rights reserved.
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License version 2 and
@@ -17,22 +17,26 @@
 #include "kgsl_pwrscale.h"
 #include "kgsl_device.h"
 #include "a2xx_reg.h"
+#include "kgsl_trace.h"
 
 struct msm_priv {
-	struct kgsl_device *device;
-	int enabled;
-	int handle;
-	unsigned int cur_freq;
-	struct msm_dcvs_idle idle_source;
-	struct msm_dcvs_freq freq_sink;
-	struct msm_dcvs_core_info *core_info;
+	struct kgsl_device		*device;
+	int				enabled;
+	unsigned int			cur_freq;
+	unsigned int			req_level;
+	int				floor_level;
+	struct msm_dcvs_core_info	*core_info;
+	int				gpu_busy;
+	int				dcvs_core_id;
 };
 
-static int msm_idle_enable(struct msm_dcvs_idle *self,
-					enum msm_core_control_event event)
+/* reference to be used in idle and freq callbacks */
+static struct msm_priv *the_msm_priv;
+
+static int msm_idle_enable(int type_core_num,
+		enum msm_core_control_event event)
 {
-	struct msm_priv *priv = container_of(self, struct msm_priv,
-								idle_source);
+	struct msm_priv *priv = the_msm_priv;
 
 	switch (event) {
 	case MSM_DCVS_ENABLE_IDLE_PULSE:
@@ -48,16 +52,17 @@
 	return 0;
 }
 
-static int msm_set_freq(struct msm_dcvs_freq *self,
-						unsigned int freq)
+/* Set the requested frequency if it is within 5MHz (delta) of a
+ * supported frequency.
+ */
+static int msm_set_freq(int core_num, unsigned int freq)
 {
 	int i, delta = 5000000;
-	struct msm_priv *priv = container_of(self, struct msm_priv,
-								freq_sink);
+	struct msm_priv *priv = the_msm_priv;
 	struct kgsl_device *device = priv->device;
 	struct kgsl_pwrctrl *pwr = &device->pwrctrl;
 
-	
+	/* msm_dcvs manager uses frequencies in kHz */
 	freq *= 1000;
 	for (i = 0; i < pwr->num_pwrlevels; i++)
 		if (abs(pwr->pwrlevels[i].gpu_freq - freq) < delta)
@@ -66,19 +71,51 @@
 		return 0;
 
 	mutex_lock(&device->mutex);
-	kgsl_pwrctrl_pwrlevel_change(device, i);
-	priv->cur_freq = pwr->pwrlevels[pwr->active_pwrlevel].gpu_freq;
+	priv->req_level = i;
+	if (priv->req_level <= priv->floor_level) {
+		kgsl_pwrctrl_pwrlevel_change(device, priv->req_level);
+		priv->cur_freq = pwr->pwrlevels[pwr->active_pwrlevel].gpu_freq;
+	}
 	mutex_unlock(&device->mutex);
 
-	
+	/* return current frequency in kHz */
 	return priv->cur_freq / 1000;
 }
 
-static unsigned int msm_get_freq(struct msm_dcvs_freq *self)
+static int msm_set_min_freq(int core_num, unsigned int freq)
 {
-	struct msm_priv *priv = container_of(self, struct msm_priv,
-								freq_sink);
-	
+	int i, delta = 5000000;
+	struct msm_priv *priv = the_msm_priv;
+	struct kgsl_device *device = priv->device;
+	struct kgsl_pwrctrl *pwr = &device->pwrctrl;
+
+	/* msm_dcvs manager uses frequencies in kHz */
+	freq *= 1000;
+	for (i = 0; i < pwr->num_pwrlevels; i++)
+		if (abs(pwr->pwrlevels[i].gpu_freq - freq) < delta)
+			break;
+	if (i == pwr->num_pwrlevels)
+		return 0;
+
+	mutex_lock(&device->mutex);
+	priv->floor_level = i;
+	if (priv->floor_level <= priv->req_level)
+		kgsl_pwrctrl_pwrlevel_change(device, priv->floor_level);
+	else if (priv->floor_level > priv->req_level)
+		kgsl_pwrctrl_pwrlevel_change(device, priv->req_level);
+
+	priv->cur_freq = pwr->pwrlevels[pwr->active_pwrlevel].gpu_freq;
+	mutex_unlock(&device->mutex);
+
+	/* return current frequency in kHz */
+	return priv->cur_freq / 1000;
+}
+
+static unsigned int msm_get_freq(int core_num)
+{
+	struct msm_priv *priv = the_msm_priv;
+
+	/* return current frequency in kHz */
 	return priv->cur_freq / 1000;
 }
 
@@ -86,29 +123,64 @@
 			struct kgsl_pwrscale *pwrscale)
 {
 	struct msm_priv *priv = pwrscale->priv;
-	if (priv->enabled)
-		msm_dcvs_idle(priv->handle, MSM_DCVS_IDLE_EXIT, 0);
+	if (priv->enabled && !priv->gpu_busy) {
+		msm_dcvs_idle(priv->dcvs_core_id, MSM_DCVS_IDLE_EXIT, 0);
+		trace_kgsl_mpdcvs(device, 1);
+		priv->gpu_busy = 1;
+	}
 	return;
 }
 
 static void msm_idle(struct kgsl_device *device,
-			struct kgsl_pwrscale *pwrscale)
+		struct kgsl_pwrscale *pwrscale)
 {
 	struct msm_priv *priv = pwrscale->priv;
-	unsigned int rb_rptr, rb_wptr;
-	kgsl_regread(device, REG_CP_RB_RPTR, &rb_rptr);
-	kgsl_regread(device, REG_CP_RB_WPTR, &rb_wptr);
 
-	if (priv->enabled && (rb_rptr == rb_wptr))
-		msm_dcvs_idle(priv->handle, MSM_DCVS_IDLE_ENTER, 0);
-
+	if (priv->enabled && priv->gpu_busy)
+		if (device->ftbl->isidle(device)) {
+			msm_dcvs_idle(priv->dcvs_core_id,
+					MSM_DCVS_IDLE_ENTER, 0);
+			trace_kgsl_mpdcvs(device, 0);
+			priv->gpu_busy = 0;
+		}
 	return;
 }
 
 static void msm_sleep(struct kgsl_device *device,
 			struct kgsl_pwrscale *pwrscale)
 {
-	
+	struct msm_priv *priv = pwrscale->priv;
+
+	if (priv->enabled && priv->gpu_busy) {
+		msm_dcvs_idle(priv->dcvs_core_id, MSM_DCVS_IDLE_ENTER, 0);
+		trace_kgsl_mpdcvs(device, 0);
+		priv->gpu_busy = 0;
+	}
+
+	return;
+}
+
+static void msm_set_io_fraction(struct kgsl_device *device,
+				unsigned int value)
+{
+	int i;
+	struct kgsl_pwrctrl *pwr = &device->pwrctrl;
+
+	for (i = 0; i < pwr->num_pwrlevels; i++)
+		pwr->pwrlevels[i].io_fraction = value;
+
+}
+
+static void msm_restore_io_fraction(struct kgsl_device *device)
+{
+	int i;
+	struct kgsl_device_platform_data *pdata =
+				kgsl_device_get_drvdata(device);
+	struct kgsl_pwrctrl *pwr = &device->pwrctrl;
+
+	for (i = 0; i < pdata->num_levels; i++)
+		pwr->pwrlevels[i].io_fraction =
+			pdata->pwrlevel[i].io_fraction;
 }
 
 static int msm_init(struct kgsl_device *device,
@@ -116,59 +188,60 @@
 {
 	struct msm_priv *priv;
 	struct msm_dcvs_freq_entry *tbl;
-	int i, ret, low_level;
+	int i, ret = -EINVAL, low_level;
 	struct kgsl_pwrctrl *pwr = &device->pwrctrl;
 	struct platform_device *pdev =
 		container_of(device->parentdev, struct platform_device, dev);
 	struct kgsl_device_platform_data *pdata = pdev->dev.platform_data;
 
-	priv = pwrscale->priv = kzalloc(sizeof(struct msm_priv),
-		GFP_KERNEL);
-	if (pwrscale->priv == NULL)
-		return -ENOMEM;
+	if (the_msm_priv) {
+		priv = pwrscale->priv = the_msm_priv;
+	} else {
+		priv = pwrscale->priv = kzalloc(sizeof(struct msm_priv),
+			GFP_KERNEL);
+		if (pwrscale->priv == NULL)
+			return -ENOMEM;
 
-	priv->core_info = pdata->core_info;
-	tbl = priv->core_info->freq_tbl;
-	
-	low_level = pwr->num_pwrlevels - KGSL_PWRLEVEL_LAST_OFFSET;
-	for (i = 0; i <= low_level; i++)
-		tbl[i].freq =
-			pwr->pwrlevels[low_level - i].gpu_freq / 1000;
-	ret = msm_dcvs_register_core(device->name, 0, priv->core_info);
-	if (ret) {
-		KGSL_PWR_ERR(device, "msm_dcvs_register_core failed");
-		goto err;
+		priv->core_info = pdata->core_info;
+		tbl = priv->core_info->freq_tbl;
+		priv->floor_level = pwr->num_pwrlevels - 1;
+		/* Fill in frequency table from low to high, reversing order. */
+		low_level = pwr->num_pwrlevels - KGSL_PWRLEVEL_LAST_OFFSET;
+		for (i = 0; i <= low_level; i++)
+			tbl[i].freq =
+				pwr->pwrlevels[low_level - i].gpu_freq / 1000;
+		priv->dcvs_core_id =
+				msm_dcvs_register_core(MSM_DCVS_CORE_TYPE_GPU,
+				0,
+				priv->core_info,
+				msm_set_freq, msm_get_freq, msm_idle_enable,
+				msm_set_min_freq,
+				priv->core_info->sensors[0]);
+		if (priv->dcvs_core_id < 0) {
+			KGSL_PWR_ERR(device, "msm_dcvs_register_core failed");
+			goto err;
+		}
+		the_msm_priv = priv;
 	}
-
 	priv->device = device;
-	priv->idle_source.enable = msm_idle_enable;
-	priv->idle_source.core_name = device->name;
-	priv->handle = msm_dcvs_idle_source_register(&priv->idle_source);
-	if (priv->handle < 0) {
-		ret = priv->handle;
-		KGSL_PWR_ERR(device, "msm_dcvs_idle_source_register failed\n");
-		goto err;
-	}
-
-	priv->freq_sink.core_name = device->name;
-	priv->freq_sink.set_frequency = msm_set_freq;
-	priv->freq_sink.get_frequency = msm_get_freq;
-	ret = msm_dcvs_freq_sink_register(&priv->freq_sink);
+	ret = msm_dcvs_freq_sink_start(priv->dcvs_core_id);
 	if (ret >= 0) {
 		if (device->ftbl->isidle(device)) {
-			device->pwrscale.gpu_busy = 0;
-			msm_dcvs_idle(priv->handle, MSM_DCVS_IDLE_ENTER, 0);
+			priv->gpu_busy = 0;
+			msm_dcvs_idle(priv->dcvs_core_id,
+					MSM_DCVS_IDLE_ENTER, 0);
 		} else {
-			device->pwrscale.gpu_busy = 1;
+			priv->gpu_busy = 1;
 		}
+		msm_set_io_fraction(device, 0);
 		return 0;
 	}
 
 	KGSL_PWR_ERR(device, "msm_dcvs_freq_sink_register failed\n");
-	msm_dcvs_idle_source_unregister(&priv->idle_source);
 
 err:
-	kfree(pwrscale->priv);
+	if (!the_msm_priv)
+		kfree(pwrscale->priv);
 	pwrscale->priv = NULL;
 
 	return ret;
@@ -181,10 +254,9 @@
 
 	if (pwrscale->priv == NULL)
 		return;
-	msm_dcvs_idle_source_unregister(&priv->idle_source);
-	msm_dcvs_freq_sink_unregister(&priv->freq_sink);
-	kfree(pwrscale->priv);
+	msm_dcvs_freq_sink_stop(priv->dcvs_core_id);
 	pwrscale->priv = NULL;
+	msm_restore_io_fraction(device);
 }
 
 struct kgsl_pwrscale_policy kgsl_pwrscale_policy_msm = {
diff --git a/drivers/gpu/msm/kgsl_pwrscale_trustzone.c b/drivers/gpu/msm/kgsl_pwrscale_trustzone.c
index 5ae3fe0..aa6861e 100644
--- a/drivers/gpu/msm/kgsl_pwrscale_trustzone.c
+++ b/drivers/gpu/msm/kgsl_pwrscale_trustzone.c
@@ -1,4 +1,4 @@
-/* Copyright (c) 2010-2012, Code Aurora Forum. All rights reserved.
+/* Copyright (c) 2010-2012, The Linux Foundation. All rights reserved.
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License version 2 and
@@ -22,7 +22,6 @@
 #include "kgsl.h"
 #include "kgsl_pwrscale.h"
 #include "kgsl_device.h"
-#include "kgsl_trace.h"
 
 #define TZ_GOVERNOR_PERFORMANCE 0
 #define TZ_GOVERNOR_ONDEMAND    1
@@ -31,41 +30,22 @@
 	int governor;
 	unsigned int no_switch_cnt;
 	unsigned int skip_cnt;
+	struct kgsl_power_stats bin;
 };
 spinlock_t tz_lock;
 
+/* FLOOR is 5msec to capture up to 3 re-draws
+ * per frame for 60fps content.
+ */
+#define FLOOR			5000
 #define SWITCH_OFF		200
 #define SWITCH_OFF_RESET_TH	40
 #define SKIP_COUNTER		500
 #define TZ_RESET_ID		0x3
 #define TZ_UPDATE_ID		0x4
-#define TZ_CMD_ID		0x90
-
-
-#define PARAM_INDEX_WRITE_DOWNTHRESHOLD 100
-#define PARAM_INDEX_WRITE_UPTHRESHOLD 101
-#define PARAM_INDEX_WRITE_MINGAPCOUNT 102
-#define PARAM_INDEX_WRITE_NUMGAPS 103
-#define PARAM_INDEX_WRITE_INITIDLEVECTOR 104
-#define PARAM_INDEX_WRITE_DOWNTHRESHOLD_PERCENT 105
-#define PARAM_INDEX_WRITE_UPTHRESHOLD_PERCENT 106
-#define PARAM_INDEX_WRITE_DOWNTHRESHOLD_COUNT 107
-#define PARAM_INDEX_WRITE_UPTHRESHOLD_COUNT 108
-#define PARAM_INDEX_WRITE_ALGORITHM 109
-
-
-#define PARAM_INDEX_READ_DOWNTHRESHOLD 200
-#define PARAM_INDEX_READ_UPTHRESHOLD 201
-#define PARAM_INDEX_READ_MINGAPCOUNT 202
-#define PARAM_INDEX_READ_NUMGAPS 203
-#define PARAM_INDEX_READ_INITIDLEVECTOR 204
-#define PARAM_INDEX_READ_DOWNTHRESHOLD_PERCENT 205
-#define PARAM_INDEX_READ_UPTHRESHOLD_PERCENT 206
-#define PARAM_INDEX_READ_DOWNTHRESHOLD_COUNT 207
-#define PARAM_INDEX_READ_UPTHRESHOLD_COUNT 208
-#define PARAM_INDEX_READ_ALGORITHM 209
 
 #ifdef CONFIG_MSM_SCM
+/* Trap into the TrustZone, and call funcs there. */
 static int __secure_tz_entry(u32 cmd, u32 val, u32 id)
 {
 	int ret;
@@ -80,7 +60,7 @@
 {
 	return 0;
 }
-#endif 
+#endif /* CONFIG_MSM_SCM */
 
 static ssize_t tz_governor_show(struct kgsl_device *device,
 				struct kgsl_pwrscale *pwrscale,
@@ -118,316 +98,16 @@
 		priv->governor = TZ_GOVERNOR_PERFORMANCE;
 
 	if (priv->governor == TZ_GOVERNOR_PERFORMANCE)
-		kgsl_pwrctrl_pwrlevel_change(device, pwr->thermal_pwrlevel);
+		kgsl_pwrctrl_pwrlevel_change(device, pwr->max_pwrlevel);
 
 	mutex_unlock(&device->mutex);
 	return count;
 }
 
-static ssize_t dcvs_downthreshold_show(struct kgsl_device *device,
-				struct kgsl_pwrscale *pwrscale,
-				char *buf)
-{
-	int val, ret;
-	val = __secure_tz_entry(TZ_CMD_ID, 0, PARAM_INDEX_READ_DOWNTHRESHOLD);
-
-	ret = sprintf(buf, "%d\n", val);
-
-	return ret;
-}
-
-static ssize_t dcvs_downthreshold_store(struct kgsl_device *device,
-				struct kgsl_pwrscale *pwrscale,
-				const char *buf, size_t count)
-{
-	int val, ret;
-
-	ret = sscanf(buf, "%d", &val);
-
-	if (ret != 1)
-		return -EINVAL;
-
-	__secure_tz_entry(TZ_CMD_ID, val, PARAM_INDEX_WRITE_DOWNTHRESHOLD);
-
-	return count;
-}
-
-static ssize_t dcvs_upthreshold_show(struct kgsl_device *device,
-				struct kgsl_pwrscale *pwrscale,
-				char *buf)
-{
-	int val, ret;
-	val = __secure_tz_entry(TZ_CMD_ID, 0, PARAM_INDEX_READ_UPTHRESHOLD);
-
-	ret = sprintf(buf, "%d\n", val);
-
-	return ret;
-}
-
-static ssize_t dcvs_upthreshold_store(struct kgsl_device *device,
-				struct kgsl_pwrscale *pwrscale,
-				const char *buf, size_t count)
-{
-	int val, ret;
-
-	ret = sscanf(buf, "%d", &val);
-
-	if (ret != 1)
-		return -EINVAL;
-
-	__secure_tz_entry(TZ_CMD_ID, val, PARAM_INDEX_WRITE_UPTHRESHOLD);
-
-	return count;
-}
-
-static ssize_t dcvs_down_count_show(struct kgsl_device *device,
-				struct kgsl_pwrscale *pwrscale,
-				char *buf)
-{
-	int val, ret;
-	val = __secure_tz_entry(TZ_CMD_ID, 0, PARAM_INDEX_READ_MINGAPCOUNT);
-
-	ret = sprintf(buf, "%d\n", val);
-
-	return ret;
-}
-
-static ssize_t dcvs_down_count_store(struct kgsl_device *device,
-				struct kgsl_pwrscale *pwrscale,
-				const char *buf, size_t count)
-{
-	int val, ret;
-
-	ret = sscanf(buf, "%d", &val);
-
-	if (ret != 1)
-		return -EINVAL;
-
-	__secure_tz_entry(TZ_CMD_ID, val, PARAM_INDEX_WRITE_MINGAPCOUNT);
-
-	return count;
-}
-
-static ssize_t dcvs_numgaps_show(struct kgsl_device *device,
-				struct kgsl_pwrscale *pwrscale,
-				char *buf)
-{
-	int val, ret;
-	val = __secure_tz_entry(TZ_CMD_ID, 0, PARAM_INDEX_READ_NUMGAPS);
-
-	ret = sprintf(buf, "%d\n", val);
-
-	return ret;
-}
-
-static ssize_t dcvs_numgaps_store(struct kgsl_device *device,
-				struct kgsl_pwrscale *pwrscale,
-				const char *buf, size_t count)
-{
-	int val, ret;
-
-	ret = sscanf(buf, "%d", &val);
-
-	if (ret != 1)
-		return -EINVAL;
-
-	__secure_tz_entry(TZ_CMD_ID, val, PARAM_INDEX_WRITE_NUMGAPS);
-
-	return count;
-}
-
-static ssize_t dcvs_init_idle_vector_show(struct kgsl_device *device,
-				struct kgsl_pwrscale *pwrscale,
-				char *buf)
-{
-	int val, ret;
-	val = __secure_tz_entry(TZ_CMD_ID, 0, PARAM_INDEX_READ_INITIDLEVECTOR);
-
-	ret = sprintf(buf, "%d\n", val);
-
-	return ret;
-}
-
-static ssize_t dcvs_init_idle_vector_store(struct kgsl_device *device,
-				struct kgsl_pwrscale *pwrscale,
-				const char *buf, size_t count)
-{
-	int val, ret;
-
-	ret = sscanf(buf, "%d", &val);
-
-	if (ret != 1)
-		return -EINVAL;
-
-	__secure_tz_entry(TZ_CMD_ID, val, PARAM_INDEX_WRITE_INITIDLEVECTOR);
-
-	return count;
-}
-
-static ssize_t dcvs_algorithm_show(struct kgsl_device *device,
-				struct kgsl_pwrscale *pwrscale,
-				char *buf)
-{
-	int val, ret;
-	val = __secure_tz_entry(TZ_CMD_ID, 0, PARAM_INDEX_READ_ALGORITHM);
-
-	ret = sprintf(buf, "%d\n", val);
-
-	return ret;
-}
-
-static ssize_t dcvs_algorithm_store(struct kgsl_device *device,
-				struct kgsl_pwrscale *pwrscale,
-				const char *buf, size_t count)
-{
-	int val, ret;
-
-	ret = sscanf(buf, "%d", &val);
-
-	if (ret != 1)
-		return -EINVAL;
-
-	__secure_tz_entry(TZ_CMD_ID, val, PARAM_INDEX_WRITE_ALGORITHM);
-
-	return count;
-}
-
-static ssize_t dcvs_upthreshold_percent_show(struct kgsl_device *device,
-					struct kgsl_pwrscale *pwrscale,
-					char *buf)
-{
-	int val, ret;
-	val = __secure_tz_entry(TZ_CMD_ID, 0, PARAM_INDEX_READ_UPTHRESHOLD_PERCENT);
-
-	ret = sprintf(buf, "%d\n", val);
-
-	return ret;
-}
-
-static ssize_t dcvs_upthreshold_percent_store(struct kgsl_device *device,
-				struct kgsl_pwrscale *pwrscale,
-				const char *buf, size_t count)
-{
-	int val, ret;
-
-	ret = sscanf(buf, "%d", &val);
-
-	if (ret != 1)
-		return -EINVAL;
-
-	__secure_tz_entry(TZ_CMD_ID, val, PARAM_INDEX_WRITE_UPTHRESHOLD_PERCENT);
-
-	return count;
-}
-
-static ssize_t dcvs_downthreshold_percent_show(struct kgsl_device *device,
-					struct kgsl_pwrscale *pwrscale,
-					char *buf)
-{
-	int val, ret;
-	val = __secure_tz_entry(TZ_CMD_ID, 0, PARAM_INDEX_READ_DOWNTHRESHOLD_PERCENT);
-
-	ret = sprintf(buf, "%d\n", val);
-
-	return ret;
-}
-
-static ssize_t dcvs_downthreshold_percent_store(struct kgsl_device *device,
-				struct kgsl_pwrscale *pwrscale,
-				const char *buf, size_t count)
-{
-	int val, ret;
-
-	ret = sscanf(buf, "%d", &val);
-
-	if (ret != 1)
-		return -EINVAL;
-
-	__secure_tz_entry(TZ_CMD_ID, val, PARAM_INDEX_WRITE_DOWNTHRESHOLD_PERCENT);
-
-	return count;
-}
-
-static ssize_t dcvs_upthreshold_count_show(struct kgsl_device *device,
-				struct kgsl_pwrscale *pwrscale,
-				char *buf)
-{
-	int val, ret;
-	val = __secure_tz_entry(TZ_CMD_ID, 0, PARAM_INDEX_READ_UPTHRESHOLD_COUNT);
-
-	ret = sprintf(buf, "%d\n", val);
-
-	return ret;
-}
-
-static ssize_t dcvs_upthreshold_count_store(struct kgsl_device *device,
-				struct kgsl_pwrscale *pwrscale,
-				const char *buf, size_t count)
-{
-	int val, ret;
-
-	ret = sscanf(buf, "%d", &val);
-
-	if (ret != 1)
-		return -EINVAL;
-
-	__secure_tz_entry(TZ_CMD_ID, val, PARAM_INDEX_WRITE_UPTHRESHOLD_COUNT);
-
-	return count;
-}
-
-static ssize_t dcvs_downthreshold_count_show(struct kgsl_device *device,
-				struct kgsl_pwrscale *pwrscale,
-				char *buf)
-{
-	int val, ret;
-	val = __secure_tz_entry(TZ_CMD_ID, 0, PARAM_INDEX_READ_DOWNTHRESHOLD_COUNT);
-
-	ret = sprintf(buf, "%d\n", val);
-
-	return ret;
-}
-
-static ssize_t dcvs_downthreshold_count_store(struct kgsl_device *device,
-				struct kgsl_pwrscale *pwrscale,
-				const char *buf, size_t count)
-{
-	int val, ret;
-
-	ret = sscanf(buf, "%d", &val);
-
-	if (ret != 1)
-		return -EINVAL;
-
-	__secure_tz_entry(TZ_CMD_ID, val, PARAM_INDEX_WRITE_DOWNTHRESHOLD_COUNT);
-
-	return count;
-}
-
 PWRSCALE_POLICY_ATTR(governor, 0644, tz_governor_show, tz_governor_store);
-PWRSCALE_POLICY_ATTR(dcvs_downthreshold, 0644, dcvs_downthreshold_show, dcvs_downthreshold_store);
-PWRSCALE_POLICY_ATTR(dcvs_upthreshold, 0644, dcvs_upthreshold_show, dcvs_upthreshold_store);
-PWRSCALE_POLICY_ATTR(dcvs_down_count, 0644, dcvs_down_count_show, dcvs_down_count_store);
-PWRSCALE_POLICY_ATTR(dcvs_numgaps, 0644, dcvs_numgaps_show, dcvs_numgaps_store);
-PWRSCALE_POLICY_ATTR(dcvs_init_idle_vector, 0644, dcvs_init_idle_vector_show, dcvs_init_idle_vector_store);
-PWRSCALE_POLICY_ATTR(dcvs_algorithm, 0644, dcvs_algorithm_show, dcvs_algorithm_store);
-PWRSCALE_POLICY_ATTR(dcvs_upthreshold_percent, 0644, dcvs_upthreshold_percent_show, dcvs_upthreshold_percent_store);
-PWRSCALE_POLICY_ATTR(dcvs_downthreshold_percent, 0644, dcvs_downthreshold_percent_show, dcvs_downthreshold_percent_store);
-PWRSCALE_POLICY_ATTR(dcvs_upthreshold_count, 0644, dcvs_upthreshold_count_show, dcvs_upthreshold_count_store);
-PWRSCALE_POLICY_ATTR(dcvs_downthreshold_count, 0644, dcvs_downthreshold_count_show, dcvs_downthreshold_count_store);
 
 static struct attribute *tz_attrs[] = {
 	&policy_attr_governor.attr,
-	&policy_attr_dcvs_downthreshold.attr,
-	&policy_attr_dcvs_upthreshold.attr,
-	&policy_attr_dcvs_down_count.attr,
-	&policy_attr_dcvs_numgaps.attr,
-	&policy_attr_dcvs_init_idle_vector.attr,
-	&policy_attr_dcvs_algorithm.attr,
-	&policy_attr_dcvs_upthreshold_percent.attr,
-	&policy_attr_dcvs_downthreshold_percent.attr,
-	&policy_attr_dcvs_upthreshold_count.attr,
-	&policy_attr_dcvs_downthreshold_count.attr,
 	NULL
 };
 
@@ -439,12 +119,9 @@
 {
 	struct tz_priv *priv = pwrscale->priv;
 	if (device->state != KGSL_STATE_NAP &&
-		priv->governor == TZ_GOVERNOR_ONDEMAND) {
-		trace_kgsl_pwrlevel(device, device->pwrctrl.default_pwrlevel,
-			device->pwrctrl.pwrlevels[device->pwrctrl.default_pwrlevel].gpu_freq);
+		priv->governor == TZ_GOVERNOR_ONDEMAND)
 		kgsl_pwrctrl_pwrlevel_change(device,
 					device->pwrctrl.default_pwrlevel);
-	}
 }
 
 static void tz_idle(struct kgsl_device *device, struct kgsl_pwrscale *pwrscale)
@@ -452,16 +129,26 @@
 	struct kgsl_pwrctrl *pwr = &device->pwrctrl;
 	struct tz_priv *priv = pwrscale->priv;
 	struct kgsl_power_stats stats;
-	int val, idle, total_time;
+	int val, idle;
 
-
+	/* In "performance" mode the clock speed always stays
+	   the same */
 	if (priv->governor == TZ_GOVERNOR_PERFORMANCE)
 		return;
 
 	device->ftbl->power_stats(device, &stats);
-	if (stats.total_time == 0)
+	priv->bin.total_time += stats.total_time;
+	priv->bin.busy_time += stats.busy_time;
+	/* Do not waste CPU cycles running this algorithm if
+	 * the GPU just started, or if less than FLOOR time
+	 * has passed since the last run.
+	 */
+	if ((stats.total_time == 0) ||
+		(priv->bin.total_time < FLOOR))
 		return;
 
+	/* If the GPU has stayed in turbo mode for a while, *
+	 * stop writing out values. */
 	if (pwr->active_pwrlevel == 0) {
 		if (priv->no_switch_cnt > SWITCH_OFF) {
 			priv->skip_cnt++;
@@ -476,15 +163,11 @@
 		priv->no_switch_cnt = 0;
 	}
 
-	idle = stats.total_time - stats.busy_time;
+	idle = priv->bin.total_time - priv->bin.busy_time;
+	priv->bin.total_time = 0;
+	priv->bin.busy_time = 0;
 	idle = (idle > 0) ? idle : 0;
-
-	
-	total_time = stats.total_time & 0x0FFFFFFF;
-	total_time |= (pwr->active_pwrlevel) << 28;
-
-	val = __secure_tz_entry(TZ_UPDATE_ID, idle, total_time);
-
+	val = __secure_tz_entry(TZ_UPDATE_ID, idle, device->id);
 	if (val)
 		kgsl_pwrctrl_pwrlevel_change(device,
 					     pwr->active_pwrlevel + val);
@@ -501,21 +184,16 @@
 {
 	struct tz_priv *priv = pwrscale->priv;
 
-	trace_kgsl_pwrlevel(device, 0, 0);
-
 	__secure_tz_entry(TZ_RESET_ID, 0, device->id);
 	priv->no_switch_cnt = 0;
+	priv->bin.total_time = 0;
+	priv->bin.busy_time = 0;
 }
 
+#ifdef CONFIG_MSM_SCM
 static int tz_init(struct kgsl_device *device, struct kgsl_pwrscale *pwrscale)
 {
 	struct tz_priv *priv;
-	int ret;
-
-	
-	if (!(cpu_is_msm8x60() || cpu_is_msm8960() || cpu_is_apq8064() ||
-		cpu_is_msm8930() || cpu_is_msm8930aa() || cpu_is_msm8627()))
-		return -EINVAL;
 
 	priv = pwrscale->priv = kzalloc(sizeof(struct tz_priv), GFP_KERNEL);
 	if (pwrscale->priv == NULL)
@@ -525,15 +203,14 @@
 	spin_lock_init(&tz_lock);
 	kgsl_pwrscale_policy_add_files(device, pwrscale, &tz_attr_group);
 
-	ret = __secure_tz_entry(TZ_CMD_ID, 0, PARAM_INDEX_WRITE_ALGORITHM);
-
-	if(ret == 1)
-		pr_info("Using HTC GPU DCVS algorithm\n");
-	else
-		pr_info("Using QCT GPU DCVS algorithm\n");
-
 	return 0;
 }
+#else
+static int tz_init(struct kgsl_device *device, struct kgsl_pwrscale *pwrscale)
+{
+	return -EINVAL;
+}
+#endif /* CONFIG_MSM_SCM */
 
 static void tz_close(struct kgsl_device *device, struct kgsl_pwrscale *pwrscale)
 {
diff --git a/drivers/gpu/msm/kgsl_sharedmem.c b/drivers/gpu/msm/kgsl_sharedmem.c
index d4a8f92..a345e58 100644
--- a/drivers/gpu/msm/kgsl_sharedmem.c
+++ b/drivers/gpu/msm/kgsl_sharedmem.c
@@ -1,4 +1,4 @@
-/* Copyright (c) 2002,2007-2012, Code Aurora Forum. All rights reserved.
+/* Copyright (c) 2002,2007-2012, The Linux Foundation. All rights reserved.
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License version 2 and
@@ -24,8 +24,7 @@
 #include "kgsl_cffdump.h"
 #include "kgsl_device.h"
 
-struct ion_client* kgsl_client = NULL;
-
+/* An attribute for showing per-process memory statistics */
 struct kgsl_mem_entry_attribute {
 	struct attribute attr;
 	int memtype;
@@ -43,23 +42,13 @@
 	.show = _show, \
 }
 
-#ifdef CONFIG_MSM_KGSL_GPU_USAGE
-static ssize_t
-gpubusy_show(struct kgsl_process_private *priv, int type, char *buf)
-{
-	char* tmp = buf;
-	int i;
-
-	tmp = (char*)((int)tmp + snprintf(tmp, PAGE_SIZE, "%lld %lld", priv->gputime.total, priv->gputime.busy));
-	for(i=0;i<KGSL_MAX_PWRLEVELS;i++)
-		tmp = (char*)( (int)tmp + snprintf(tmp, PAGE_SIZE - (int)(tmp-buf), " %lld %lld", priv->gputime_in_state[i].total, priv->gputime_in_state[i].busy));
-	tmp = (char*)((int)tmp + snprintf(tmp, PAGE_SIZE, "\n"));
-	return (ssize_t)(tmp - buf);
-}
-
-static struct kgsl_mem_entry_attribute gpubusy = __MEM_ENTRY_ATTR(0, gpubusy, gpubusy_show);
-#endif
-
+/*
+ * A structure to hold the attributes for a particular memory type.
+ * For each memory type in each process we store the current and maximum
+ * memory usage and display the counts in sysfs.  This structure and
+ * the following macro allow us to simplify the definition for those
+ * adding new memory types
+ */
 
 struct mem_entry_stats {
 	int memtype;
@@ -77,9 +66,16 @@
 }
 
 
+/*
+ * One page allocation for a guard region to protect against over-zealous
+ * GPU pre-fetch
+ */
 
 static struct page *kgsl_guard_page;
 
+/**
+ * Given a kobj, find the process structure attached to it
+ */
 
 static struct kgsl_process_private *
 _get_priv_from_kobj(struct kobject *kobj)
@@ -101,6 +97,9 @@
 	return NULL;
 }
 
+/**
+ * Show the current amount of memory allocated for the given memtype
+ */
 
 static ssize_t
 mem_entry_show(struct kgsl_process_private *priv, int type, char *buf)
@@ -108,6 +107,10 @@
 	return snprintf(buf, PAGE_SIZE, "%d\n", priv->stats[type].cur);
 }
 
+/**
+ * Show the maximum memory allocated for the given memtype through the life of
+ * the process
+ */
 
 static ssize_t
 mem_entry_max_show(struct kgsl_process_private *priv, int type, char *buf)
@@ -174,9 +177,6 @@
 			&mem_stats[i].max_attr.attr);
 	}
 
-#ifdef CONFIG_MSM_KGSL_GPU_USAGE
-	sysfs_remove_file(&private->kobj, &gpubusy.attr);
-#endif
 	kobject_put(&private->kobj);
 }
 
@@ -193,15 +193,14 @@
 		return;
 
 	for (i = 0; i < ARRAY_SIZE(mem_stats); i++) {
+		/* We need to check the value of sysfs_create_file, but we
+		 * don't really care if it passed or not */
 
 		ret = sysfs_create_file(&private->kobj,
 			&mem_stats[i].attr.attr);
 		ret = sysfs_create_file(&private->kobj,
 			&mem_stats[i].max_attr.attr);
 	}
-#ifdef CONFIG_MSM_KGSL_GPU_USAGE
-	ret = sysfs_create_file(&private->kobj, &gpubusy.attr);
-#endif
 }
 
 static int kgsl_drv_memstat_show(struct device *dev,
@@ -314,82 +313,46 @@
 }
 #endif
 
-static int kgsl_ion_alloc_vmfault(struct kgsl_memdesc *memdesc,
-				struct vm_area_struct *vma,
-				struct vm_fault *vmf)
-{
-	unsigned long offset, pfn;
-	int ret;
-
-	offset = ((unsigned long) vmf->virtual_address - vma->vm_start) >> PAGE_SHIFT;
-
-	pfn = (memdesc->sg[0].dma_address >> PAGE_SHIFT) + offset;
-	ret = vm_insert_pfn(vma, (unsigned long) vmf->virtual_address, pfn);
-
-	if (ret == -ENOMEM || ret == -EAGAIN)
-		return VM_FAULT_OOM;
-	else if (ret == -EFAULT)
-		return VM_FAULT_SIGBUS;
-
-	return 0;
-}
-
-static int kgsl_ion_alloc_vmflags(struct kgsl_memdesc *memdesc)
-{
-	return VM_RESERVED | VM_DONTEXPAND;
-}
-
-static void kgsl_ion_alloc_free(struct kgsl_memdesc *memdesc)
-{
-	kgsl_driver.stats.pre_alloc -= memdesc->size;
-	if (memdesc->handle)
-		ion_free(kgsl_client, memdesc->handle);
-
-	if (memdesc->hostptr) {
-		iounmap(memdesc->hostptr);
-		kgsl_driver.stats.vmalloc -= memdesc->size;
-	}
-
-	if (memdesc->private)
-		kgsl_process_sub_stats(memdesc->private, KGSL_MEM_ENTRY_PRE_ALLOC, memdesc->size);
-	else
-		kgsl_driver.stats.pre_alloc_kernel -= memdesc->size;
-}
-
-static int kgsl_ion_alloc_map_kernel(struct kgsl_memdesc *memdesc)
-{
-	if (!memdesc->hostptr) {
-		memdesc->hostptr = ioremap(memdesc->sg[0].dma_address, memdesc->sg[0].length);
-		if(IS_ERR_OR_NULL(memdesc->hostptr)) {
-			KGSL_CORE_ERR("kgsl: ion ioremap failed\n");
-			return -ENOMEM;
-		}
-		KGSL_STATS_ADD(memdesc->size, kgsl_driver.stats.vmalloc,
-                kgsl_driver.stats.vmalloc_max);
-	}
-
-	return 0;
-}
-
 static int kgsl_page_alloc_vmfault(struct kgsl_memdesc *memdesc,
 				struct vm_area_struct *vma,
 				struct vm_fault *vmf)
 {
-	unsigned long offset;
-	struct page *page;
-	int i;
+	int i, pgoff;
+	struct scatterlist *s = memdesc->sg;
+	unsigned int offset;
 
-	offset = (unsigned long) vmf->virtual_address - vma->vm_start;
+	offset = ((unsigned long) vmf->virtual_address - vma->vm_start);
 
-	i = offset >> PAGE_SHIFT;
-	page = sg_page(&memdesc->sg[i]);
-	if (page == NULL)
+	if (offset >= memdesc->size)
 		return VM_FAULT_SIGBUS;
 
-	get_page(page);
+	pgoff = offset >> PAGE_SHIFT;
 
-	vmf->page = page;
-	return 0;
+	/*
+	 * The sglist might be comprised of mixed blocks of memory depending
+	 * on how many 64K pages were allocated.  This means we have to do math
+	 * to find the actual 4K page to map in user space
+	 */
+
+	for (i = 0; i < memdesc->sglen; i++) {
+		int npages = s->length >> PAGE_SHIFT;
+
+		if (pgoff < npages) {
+			struct page *page = sg_page(s);
+
+			page = nth_page(page, pgoff);
+
+			get_page(page);
+			vmf->page = page;
+
+			return 0;
+		}
+
+		pgoff -= npages;
+		s = sg_next(s);
+	}
+
+	return VM_FAULT_SIGBUS;
 }
 
 static int kgsl_page_alloc_vmflags(struct kgsl_memdesc *memdesc)
@@ -403,8 +366,8 @@
 	struct scatterlist *sg;
 	int sglen = memdesc->sglen;
 
-	
-	if (memdesc->flags & KGSL_MEMDESC_GUARD_PAGE)
+	/* Don't free the guard page if it was used */
+	if (memdesc->priv & KGSL_MEMDESC_GUARD_PAGE)
 		sglen--;
 
 	kgsl_driver.stats.page_alloc -= memdesc->size;
@@ -415,12 +378,7 @@
 	}
 	if (memdesc->sg)
 		for_each_sg(memdesc->sg, sg, sglen, i)
-			__free_page(sg_page(sg));
-
-	if (memdesc->private)
-		kgsl_process_sub_stats(memdesc->private, KGSL_MEM_ENTRY_PAGE_ALLOC, memdesc->size);
-	else
-		kgsl_driver.stats.page_alloc_kernel -= memdesc->size;
+			__free_pages(sg_page(sg), get_order(sg->length));
 }
 
 static int kgsl_contiguous_vmflags(struct kgsl_memdesc *memdesc)
@@ -428,29 +386,46 @@
 	return VM_RESERVED | VM_IO | VM_PFNMAP | VM_DONTEXPAND;
 }
 
+/*
+ * kgsl_page_alloc_map_kernel - Map the memory in memdesc to kernel address
+ * space
+ *
+ * @memdesc - The memory descriptor which contains information about the memory
+ *
+ * Return: 0 on success else error code
+ */
 static int kgsl_page_alloc_map_kernel(struct kgsl_memdesc *memdesc)
 {
 	if (!memdesc->hostptr) {
 		pgprot_t page_prot = pgprot_writecombine(PAGE_KERNEL);
 		struct page **pages = NULL;
 		struct scatterlist *sg;
+		int npages = PAGE_ALIGN(memdesc->size) >> PAGE_SHIFT;
 		int sglen = memdesc->sglen;
-		int i;
+		int i, count = 0;
 
-		
-		if (memdesc->flags & KGSL_MEMDESC_GUARD_PAGE)
+		/* Don't map the guard page if it exists */
+		if (memdesc->priv & KGSL_MEMDESC_GUARD_PAGE)
 			sglen--;
 
-		
-		pages = vmalloc(sglen * sizeof(struct page *));
+		/* create a list of pages to call vmap */
+		pages = vmalloc(npages * sizeof(struct page *));
 		if (!pages) {
 			KGSL_CORE_ERR("vmalloc(%d) failed\n",
-				sglen * sizeof(struct page *));
+				npages * sizeof(struct page *));
 			return -ENOMEM;
 		}
-		for_each_sg(memdesc->sg, sg, sglen, i)
-			pages[i] = sg_page(sg);
-		memdesc->hostptr = vmap(pages, sglen,
+
+		for_each_sg(memdesc->sg, sg, sglen, i) {
+			struct page *page = sg_page(sg);
+			int j;
+
+			for (j = 0; j < sg->length >> PAGE_SHIFT; j++)
+				pages[count++] = page++;
+		}
+
+
+		memdesc->hostptr = vmap(pages, count,
 					VM_IOREMAP, page_prot);
 		KGSL_STATS_ADD(memdesc->size, kgsl_driver.stats.vmalloc,
 				kgsl_driver.stats.vmalloc_max);
@@ -493,6 +468,20 @@
 	free_contiguous_memory_by_paddr(memdesc->physaddr);
 }
 
+static int kgsl_ebimem_map_kernel(struct kgsl_memdesc *memdesc)
+{
+	if (!memdesc->hostptr) {
+		memdesc->hostptr = ioremap(memdesc->physaddr, memdesc->size);
+		if (!memdesc->hostptr) {
+			KGSL_CORE_ERR("ioremap failed, addr:0x%p, size:0x%x\n",
+				memdesc->hostptr, memdesc->size);
+			return -ENOMEM;
+		}
+	}
+
+	return 0;
+}
+
 static void kgsl_coherent_free(struct kgsl_memdesc *memdesc)
 {
 	kgsl_driver.stats.coherent -= memdesc->size;
@@ -500,6 +489,7 @@
 			  memdesc->hostptr, memdesc->physaddr);
 }
 
+/* Global - also used by kgsl_drm.c */
 struct kgsl_memdesc_ops kgsl_page_alloc_ops = {
 	.free = kgsl_page_alloc_free,
 	.vmflags = kgsl_page_alloc_vmflags,
@@ -508,19 +498,11 @@
 };
 EXPORT_SYMBOL(kgsl_page_alloc_ops);
 
-struct kgsl_memdesc_ops kgsl_ion_alloc_ops = {
-	.free = kgsl_ion_alloc_free,
-	.vmflags = kgsl_ion_alloc_vmflags,
-	.vmfault = kgsl_ion_alloc_vmfault,
-	.map_kernel_mem = kgsl_ion_alloc_map_kernel,
-};
-EXPORT_SYMBOL(kgsl_ion_alloc_ops);
-
-
 static struct kgsl_memdesc_ops kgsl_ebimem_ops = {
 	.free = kgsl_ebimem_free,
 	.vmflags = kgsl_contiguous_vmflags,
 	.vmfault = kgsl_contiguous_vmfault,
+	.map_kernel_mem = kgsl_ebimem_map_kernel,
 };
 
 static struct kgsl_memdesc_ops kgsl_coherent_ops = {
@@ -553,87 +535,158 @@
 			struct kgsl_pagetable *pagetable,
 			size_t size, unsigned int protflags)
 {
-	int i, order, ret = 0;
-	int sglen = PAGE_ALIGN(size) / PAGE_SIZE;
+	int pcount = 0, order, ret = 0;
+	int j, len, page_size, sglen_alloc, sglen = 0;
 	struct page **pages = NULL;
 	pgprot_t page_prot = pgprot_writecombine(PAGE_KERNEL);
 	void *ptr;
+	unsigned int align;
 
+	align = (memdesc->flags & KGSL_MEMALIGN_MASK) >> KGSL_MEMALIGN_SHIFT;
+
+	page_size = (align >= ilog2(SZ_64K) && size >= SZ_64K)
+			? SZ_64K : PAGE_SIZE;
+	/* update align flags for what we actually use */
+	kgsl_memdesc_set_align(memdesc, ilog2(page_size));
+
+	/*
+	 * There needs to be enough room in the sg structure to be able to
+	 * service the allocation entirely with PAGE_SIZE sized chunks
+	 */
+
+	sglen_alloc = PAGE_ALIGN(size) >> PAGE_SHIFT;
+
+	/*
+	 * Add guard page to the end of the allocation when the
+	 * IOMMU is in use.
+	 */
 
 	if (kgsl_mmu_get_mmutype() == KGSL_MMU_TYPE_IOMMU)
-		sglen++;
+		sglen_alloc++;
 
 	memdesc->size = size;
 	memdesc->pagetable = pagetable;
-	memdesc->priv = KGSL_MEMFLAGS_CACHED;
 	memdesc->ops = &kgsl_page_alloc_ops;
 
-	memdesc->sg = kgsl_sg_alloc(sglen);
+	memdesc->sg = kgsl_sg_alloc(sglen_alloc);
 
 	if (memdesc->sg == NULL) {
 		KGSL_CORE_ERR("vmalloc(%d) failed\n",
-			sglen * sizeof(struct scatterlist));
+			sglen_alloc * sizeof(struct scatterlist));
 		ret = -ENOMEM;
 		goto done;
 	}
 
+	/*
+	 * Allocate space to store the list of pages to send to vmap.
+	 * This is an array of pointers so we can track 1024 pages per page of
+	 * allocation which means we can handle up to an 8MB buffer request with
+	 * two pages; well within the acceptable limits for using kmalloc.
+	 */
 
-	pages = kmalloc(sglen * sizeof(struct page *), GFP_KERNEL);
+	pages = kmalloc(sglen_alloc * sizeof(struct page *), GFP_KERNEL);
 
 	if (pages == NULL) {
 		KGSL_CORE_ERR("kmalloc (%d) failed\n",
-			sglen * sizeof(struct page *));
+			sglen_alloc * sizeof(struct page *));
 		ret = -ENOMEM;
 		goto done;
 	}
 
 	kmemleak_not_leak(memdesc->sg);
 
-	memdesc->sglen = sglen;
-	sg_init_table(memdesc->sg, sglen);
+	memdesc->sglen_alloc = sglen_alloc;
+	sg_init_table(memdesc->sg, sglen_alloc);
 
-	for (i = 0; i < PAGE_ALIGN(size) / PAGE_SIZE; i++) {
+	len = size;
 
+	while (len > 0) {
+		struct page *page;
+		unsigned int gfp_mask = GFP_KERNEL | __GFP_HIGHMEM |
+			__GFP_NOWARN;
+		int j;
 
-		pages[i] = alloc_page(GFP_KERNEL | __GFP_HIGHMEM);
-		if (pages[i] == NULL) {
+		/* don't waste space at the end of the allocation */
+		if (len < page_size)
+			page_size = PAGE_SIZE;
+
+		if (page_size != PAGE_SIZE)
+			gfp_mask |= __GFP_COMP;
+
+		page = alloc_pages(gfp_mask, get_order(page_size));
+
+		if (page == NULL) {
+			if (page_size != PAGE_SIZE) {
+				page_size = PAGE_SIZE;
+				continue;
+			}
+
+			KGSL_CORE_ERR(
+				"Out of memory: only allocated %dKB of %dKB requested\n",
+				(size - len) >> 10, size >> 10);
+
 			ret = -ENOMEM;
-			memdesc->sglen = i;
 			goto done;
 		}
 
-		sg_set_page(&memdesc->sg[i], pages[i], PAGE_SIZE, 0);
+		for (j = 0; j < page_size >> PAGE_SHIFT; j++)
+			pages[pcount++] = nth_page(page, j);
+
+		sg_set_page(&memdesc->sg[sglen++], page, page_size, 0);
+		len -= page_size;
 	}
 
-	
+	/* Add the guard page to the end of the sglist */
 
 	if (kgsl_mmu_get_mmutype() == KGSL_MMU_TYPE_IOMMU) {
+		/*
+		 * It doesn't matter if we use GFP_ZERO here, this never
+		 * gets mapped, and we only allocate it once in the life
+		 * of the system
+		 */
 
 		if (kgsl_guard_page == NULL)
 			kgsl_guard_page = alloc_page(GFP_KERNEL | __GFP_ZERO |
 				__GFP_HIGHMEM);
 
 		if (kgsl_guard_page != NULL) {
-			sg_set_page(&memdesc->sg[sglen - 1], kgsl_guard_page,
+			sg_set_page(&memdesc->sg[sglen++], kgsl_guard_page,
 				PAGE_SIZE, 0);
-			memdesc->flags |= KGSL_MEMDESC_GUARD_PAGE;
-		} else
-			memdesc->sglen--;
+			memdesc->priv |= KGSL_MEMDESC_GUARD_PAGE;
+		}
 	}
 
+	memdesc->sglen = sglen;
 
-	ptr = vmap(pages, i, VM_IOREMAP, page_prot);
+	/*
+	 * All memory that goes to the user has to be zeroed out before it gets
+	 * exposed to userspace. This means that the memory has to be mapped in
+	 * the kernel, zeroed (memset) and then unmapped.  This also means that
+	 * the dcache has to be flushed to ensure coherency between the kernel
+	 * and user pages. We used to pass __GFP_ZERO to alloc_page which mapped
+	 * and user pages. We used to pass __GFP_ZERO to alloc_page which mapped,
+	 * zeroed and unmapped each individual page, and then we had to turn
+	 * This was killing us for performance. Instead, we found it is much
+	 * faster to allocate the pages without GFP_ZERO, map the entire range,
+	 * memset it, flush the range and then unmap - this results in a factor
+	 * of 4 improvement for speed for large buffers.  There is a small
+	 * increase in speed for small buffers, but only on the order of a few
+	 * microseconds at best.  The only downside is that there needs to be
+	 * enough temporary space in vmalloc to accommodate the map. This
+	 * shouldn't be a problem, but if it happens, fall back to a much slower
+	 * path
+	 */
+
+	ptr = vmap(pages, pcount, VM_IOREMAP, page_prot);
 
 	if (ptr != NULL) {
 		memset(ptr, 0, memdesc->size);
 		dmac_flush_range(ptr, ptr + memdesc->size);
 		vunmap(ptr);
 	} else {
-		int j;
+		/* Very, very, very slow path */
 
-		
-
-		for (j = 0; j < i; j++) {
+		for (j = 0; j < pcount; j++) {
 			ptr = kmap_atomic(pages[j]);
 			memset(ptr, 0, PAGE_SIZE);
 			dmac_flush_range(ptr, ptr + PAGE_SIZE);
@@ -649,6 +702,9 @@
 	if (ret)
 		goto done;
 
+	KGSL_STATS_ADD(size, kgsl_driver.stats.page_alloc,
+		kgsl_driver.stats.page_alloc_max);
+
 	order = get_order(size);
 
 	if (order < 16)
@@ -657,9 +713,6 @@
 done:
 	kfree(pages);
 
-	KGSL_STATS_ADD(size, kgsl_driver.stats.page_alloc,
-		kgsl_driver.stats.page_alloc_max);
-
 	if (ret)
 		kgsl_sharedmem_free(memdesc);
 
@@ -675,191 +728,35 @@
 
 	size = ALIGN(size, PAGE_SIZE * 2);
 
-	kgsl_driver.stats.page_alloc_kernel += size;
 	ret =  _kgsl_sharedmem_page_alloc(memdesc, pagetable, size,
 		GSL_PT_PAGE_RV | GSL_PT_PAGE_WV);
 	if (!ret)
 		ret = kgsl_page_alloc_map_kernel(memdesc);
-	if (ret) {
-		
-		kgsl_driver.stats.page_alloc_kernel += size;
+	if (ret)
 		kgsl_sharedmem_free(memdesc);
-	}
 	return ret;
 }
 EXPORT_SYMBOL(kgsl_sharedmem_page_alloc);
 
 int
 kgsl_sharedmem_page_alloc_user(struct kgsl_memdesc *memdesc,
-				struct kgsl_process_private *private,
 			    struct kgsl_pagetable *pagetable,
-			    size_t size, int flags)
+			    size_t size)
 {
 	unsigned int protflags;
-	int ret = 0;
 
 	if (size == 0)
 		return -EINVAL;
 
 	protflags = GSL_PT_PAGE_RV;
-	if (!(flags & KGSL_MEMFLAGS_GPUREADONLY))
+	if (!(memdesc->flags & KGSL_MEMFLAGS_GPUREADONLY))
 		protflags |= GSL_PT_PAGE_WV;
 
-	ret = _kgsl_sharedmem_page_alloc(memdesc, pagetable, size,
-			protflags);
-
-	if (ret == 0 && private)
-		kgsl_process_add_stats(private, KGSL_MEM_ENTRY_PAGE_ALLOC, size);
-
-	return ret;
+	return _kgsl_sharedmem_page_alloc(memdesc, pagetable, size,
+		protflags);
 }
 EXPORT_SYMBOL(kgsl_sharedmem_page_alloc_user);
 
-static int
-_kgsl_sharedmem_ion_alloc(struct kgsl_memdesc *memdesc,
-				struct kgsl_pagetable *pagetable,
-				size_t size, unsigned int protflags)
-{
-	int order, ret = 0;
-	int sglen = 1;
-	void *ptr;
-	struct ion_handle *handle = NULL;
-	ion_phys_addr_t pa = 0;
-	size_t len = 0;
-
-
-	
-	
-
-	memdesc->size = size;
-	memdesc->pagetable = pagetable;
-	memdesc->priv = KGSL_MEMFLAGS_CACHED;
-	memdesc->ops = &kgsl_ion_alloc_ops;
-
-	memdesc->sg = kgsl_sg_alloc(sglen);
-
-	if (memdesc->sg == NULL) {
-		KGSL_CORE_ERR("kgsl_sg_alloc vmalloc(%d) failed\n",
-		sglen * sizeof(struct scatterlist));
-		ret = -ENOMEM;
-		goto done;
-	}
-
-	kmemleak_not_leak(memdesc->sg);
-
-	memdesc->sglen = sglen;
-	sg_init_table(memdesc->sg, sglen);
-
-	if (kgsl_client == NULL)
-		kgsl_client = msm_ion_client_create(-1, "KGSL");
-
-	handle = ion_alloc(kgsl_client, size, SZ_4K, 0x1 << ION_SF_HEAP_ID);
-	if (IS_ERR_OR_NULL(handle)) {
-		ret = -ENOMEM;
-		goto done;
-	}
-
-	if (ion_phys(kgsl_client, handle, &pa, &len)) {
-		KGSL_CORE_ERR("kgsl: ion_phys() failed\n");
-		ret = -ENOMEM;
-		goto done;
-	}
-
-	memdesc->handle = handle;
-
-	memdesc->sg[0].length = memdesc->size;
-	memdesc->sg[0].offset = 0;
-	memdesc->sg[0].dma_address = pa;
-
-	
-	
-
-
-	ptr = ioremap(pa, memdesc->size);
-
-	if (ptr != NULL) {
-		memset(ptr, 0, memdesc->size);
-		dmac_flush_range(ptr, ptr + memdesc->size);
-		iounmap(ptr);
-	}
-
-	outer_cache_range_op_sg(memdesc->sg, memdesc->sglen, KGSL_CACHE_OP_FLUSH);
-
-	ret = kgsl_mmu_map(pagetable, memdesc, protflags);
-
-	if (ret) {
-		KGSL_CORE_ERR("kgsl: kgsl_mmu_map failed\n");
-		ret = -ENOMEM;
-		goto done;
-	}
-
-	order = get_order(size);
-
-	if (order < 16)
-		kgsl_driver.stats.histogram[order]++;
-
-done:
-	KGSL_STATS_ADD(size, kgsl_driver.stats.pre_alloc, kgsl_driver.stats.pre_alloc_max);
-
-	if (ret)
-		kgsl_sharedmem_free(memdesc);
-
-	return ret;
-}
-
-int
-kgsl_sharedmem_ion_alloc(struct kgsl_memdesc *memdesc,
-                struct kgsl_pagetable *pagetable,
-                size_t size)
-{
-	int ret;
-
-	BUG_ON(size == 0);
-	size = PAGE_ALIGN(size);
-
-	kgsl_driver.stats.pre_alloc_kernel += size;
-	ret = _kgsl_sharedmem_ion_alloc(memdesc, pagetable, size,
-		GSL_PT_PAGE_RV | GSL_PT_PAGE_WV);
-
-	if (!ret)
-		ret = kgsl_ion_alloc_map_kernel(memdesc);
-
-	if (ret) {
-		
-		kgsl_driver.stats.pre_alloc_kernel += size;
-		kgsl_sharedmem_free(memdesc);
-	}
-	return ret;
-}
-EXPORT_SYMBOL(kgsl_sharedmem_ion_alloc);
-
-int
-kgsl_sharedmem_ion_alloc_user(struct kgsl_memdesc *memdesc,
-                struct kgsl_process_private *private,
-				struct kgsl_pagetable *pagetable,
-				size_t size, int flags)
-{
-	unsigned int protflags;
-	int ret = 0;
-
-	BUG_ON(size == 0);
-
-	size = PAGE_ALIGN(size);
-
-	protflags = GSL_PT_PAGE_RV;
-	if (!(flags & KGSL_MEMFLAGS_GPUREADONLY))
-		protflags |= GSL_PT_PAGE_WV;
-
-	ret = _kgsl_sharedmem_ion_alloc(memdesc, pagetable, size,
-			protflags);
-
-	if (ret == 0 && private)
-		kgsl_process_add_stats(private, KGSL_MEM_ENTRY_PRE_ALLOC, size);
-
-	return ret;
-}
-EXPORT_SYMBOL(kgsl_sharedmem_ion_alloc_user);
-
 int
 kgsl_sharedmem_alloc_coherent(struct kgsl_memdesc *memdesc, size_t size)
 {
@@ -882,7 +779,7 @@
 	if (result)
 		goto err;
 
-	
+	/* Record statistics */
 
 	KGSL_STATS_ADD(size, kgsl_driver.stats.coherent,
 		       kgsl_driver.stats.coherent_max);
@@ -906,7 +803,7 @@
 	if (memdesc->ops && memdesc->ops->free)
 		memdesc->ops->free(memdesc);
 
-	kgsl_sg_free(memdesc->sg, memdesc->sglen);
+	kgsl_sg_free(memdesc->sg, memdesc->sglen_alloc);
 
 	memset(memdesc, 0, sizeof(*memdesc));
 }
@@ -953,7 +850,7 @@
 int
 kgsl_sharedmem_ebimem_user(struct kgsl_memdesc *memdesc,
 			struct kgsl_pagetable *pagetable,
-			size_t size, int flags)
+			size_t size)
 {
 	size = ALIGN(size, PAGE_SIZE);
 	return _kgsl_sharedmem_ebimem(memdesc, pagetable, size);
@@ -1039,6 +936,15 @@
 }
 EXPORT_SYMBOL(kgsl_sharedmem_set);
 
+/*
+ * kgsl_sharedmem_map_vma - Map a user vma to physical memory
+ *
+ * @vma - The user vma to map
+ * @memdesc - The memory descriptor which contains information about the
+ * physical memory
+ *
+ * Return: 0 on success else error code
+ */
 int
 kgsl_sharedmem_map_vma(struct vm_area_struct *vma,
 			const struct kgsl_memdesc *memdesc)
@@ -1059,3 +965,42 @@
 	return 0;
 }
 EXPORT_SYMBOL(kgsl_sharedmem_map_vma);
+
+static const char * const memtype_str[] = {
+	[KGSL_MEMTYPE_OBJECTANY] = "any(0)",
+	[KGSL_MEMTYPE_FRAMEBUFFER] = "framebuffer",
+	[KGSL_MEMTYPE_RENDERBUFFER] = "renderbuffer",
+	[KGSL_MEMTYPE_ARRAYBUFFER] = "arraybuffer",
+	[KGSL_MEMTYPE_ELEMENTARRAYBUFFER] = "elementarraybuffer",
+	[KGSL_MEMTYPE_VERTEXARRAYBUFFER] = "vertexarraybuffer",
+	[KGSL_MEMTYPE_TEXTURE] = "texture",
+	[KGSL_MEMTYPE_SURFACE] = "surface",
+	[KGSL_MEMTYPE_EGL_SURFACE] = "egl_surface",
+	[KGSL_MEMTYPE_GL] = "gl",
+	[KGSL_MEMTYPE_CL] = "cl",
+	[KGSL_MEMTYPE_CL_BUFFER_MAP] = "cl_buffer_map",
+	[KGSL_MEMTYPE_CL_BUFFER_NOMAP] = "cl_buffer_nomap",
+	[KGSL_MEMTYPE_CL_IMAGE_MAP] = "cl_image_map",
+	[KGSL_MEMTYPE_CL_IMAGE_NOMAP] = "cl_image_nomap",
+	[KGSL_MEMTYPE_CL_KERNEL_STACK] = "cl_kernel_stack",
+	[KGSL_MEMTYPE_COMMAND] = "command",
+	[KGSL_MEMTYPE_2D] = "2d",
+	[KGSL_MEMTYPE_EGL_IMAGE] = "egl_image",
+	[KGSL_MEMTYPE_EGL_SHADOW] = "egl_shadow",
+	[KGSL_MEMTYPE_MULTISAMPLE] = "egl_multisample",
+	/* KGSL_MEMTYPE_KERNEL handled below, to avoid huge array */
+};
+
+void kgsl_get_memory_usage(char *name, size_t name_size, unsigned int memflags)
+{
+	unsigned char type;
+
+	type = (memflags & KGSL_MEMTYPE_MASK) >> KGSL_MEMTYPE_SHIFT;
+	if (type == KGSL_MEMTYPE_KERNEL)
+		strlcpy(name, "kernel", name_size);
+	else if (type < ARRAY_SIZE(memtype_str) && memtype_str[type] != NULL)
+		strlcpy(name, memtype_str[type], name_size);
+	else
+		snprintf(name, name_size, "unknown(%3d)", type);
+}
+EXPORT_SYMBOL(kgsl_get_memory_usage);
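(Aside, not part of the diff: kgsl_get_memory_usage() above is the helper the new trace events use to turn memdesc flags into a short label. A minimal sketch of a caller follows; the mem_entry pointer and the log line are purely illustrative.)

	char usage[16];

	/* decode the KGSL_MEMTYPE bits of the flags into a readable name */
	kgsl_get_memory_usage(usage, sizeof(usage), entry->memdesc.flags);
	pr_info("kgsl: gpuaddr=0x%08x size=%d usage=%s\n",
		entry->memdesc.gpuaddr, entry->memdesc.size, usage);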
diff --git a/drivers/gpu/msm/kgsl_sharedmem.h b/drivers/gpu/msm/kgsl_sharedmem.h
index 9c7eb70..3109ef2 100644
--- a/drivers/gpu/msm/kgsl_sharedmem.h
+++ b/drivers/gpu/msm/kgsl_sharedmem.h
@@ -1,4 +1,4 @@
-/* Copyright (c) 2002,2007-2012, Code Aurora Forum. All rights reserved.
+/* Copyright (c) 2002,2007-2012, The Linux Foundation. All rights reserved.
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License version 2 and
@@ -19,7 +19,8 @@
 #include "kgsl_mmu.h"
 #include <linux/slab.h>
 #include <linux/kmemleak.h>
-#include <linux/sched.h>
+
+#include "kgsl_log.h"
 
 struct kgsl_device;
 struct kgsl_process_private;
@@ -28,33 +29,20 @@
 #define KGSL_CACHE_OP_FLUSH     0x02
 #define KGSL_CACHE_OP_CLEAN     0x03
 
-#define KGSL_MEMFLAGS_CACHED    0x00000001
-#define KGSL_MEMFLAGS_GLOBAL    0x00000002
-
 extern struct kgsl_memdesc_ops kgsl_page_alloc_ops;
 
-int kgsl_sharedmem_ion_alloc(struct kgsl_memdesc *memdesc,
-				struct kgsl_pagetable *pagetable, size_t size);
-
-int kgsl_sharedmem_ion_alloc_user(struct kgsl_memdesc *memdesc,
-				struct kgsl_process_private *private,
-				struct kgsl_pagetable *pagetable,
-				size_t size, int flags);
-
-
 int kgsl_sharedmem_page_alloc(struct kgsl_memdesc *memdesc,
 			   struct kgsl_pagetable *pagetable, size_t size);
 
 int kgsl_sharedmem_page_alloc_user(struct kgsl_memdesc *memdesc,
-				struct kgsl_process_private *private,
 				struct kgsl_pagetable *pagetable,
-				size_t size, int flags);
+				size_t size);
 
 int kgsl_sharedmem_alloc_coherent(struct kgsl_memdesc *memdesc, size_t size);
 
 int kgsl_sharedmem_ebimem_user(struct kgsl_memdesc *memdesc,
 			     struct kgsl_pagetable *pagetable,
-			     size_t size, int flags);
+			     size_t size);
 
 int kgsl_sharedmem_ebimem(struct kgsl_memdesc *memdesc,
 			struct kgsl_pagetable *pagetable,
@@ -82,8 +70,42 @@
 int kgsl_sharedmem_init_sysfs(void);
 void kgsl_sharedmem_uninit_sysfs(void);
 
+/*
+ * kgsl_memdesc_get_align - Get alignment flags from a memdesc
+ * @memdesc - the memdesc
+ *
+ * Returns the alignment requested, as power of 2 exponent.
+ */
+static inline int
+kgsl_memdesc_get_align(const struct kgsl_memdesc *memdesc)
+{
+	return (memdesc->flags & KGSL_MEMALIGN_MASK) >> KGSL_MEMALIGN_SHIFT;
+}
+
+/*
+ * kgsl_memdesc_set_align - Set alignment flags of a memdesc
+ * @memdesc - the memdesc
+ * @align - alignment requested, as a power of 2 exponent.
+ */
+static inline int
+kgsl_memdesc_set_align(struct kgsl_memdesc *memdesc, unsigned int align)
+{
+	if (align > 32) {
+		KGSL_CORE_ERR("Alignment too big, restricting to 2^32\n");
+		align = 32;
+	}
+
+	memdesc->flags &= ~KGSL_MEMALIGN_MASK;
+	memdesc->flags |= (align << KGSL_MEMALIGN_SHIFT) & KGSL_MEMALIGN_MASK;
+	return 0;
+}
+
 static inline unsigned int kgsl_get_sg_pa(struct scatterlist *sg)
 {
+	/*
+	 * Try sg_dma_address first to support ion carveout
+	 * regions which do not work with sg_phys().
+	 */
 	unsigned int pa = sg_dma_address(sg);
 	if (pa == 0)
 		pa = sg_phys(sg);
@@ -94,6 +116,12 @@
 kgsl_sharedmem_map_vma(struct vm_area_struct *vma,
 			const struct kgsl_memdesc *memdesc);
 
+/*
+ * For relatively small sglists, it is preferable to use kzalloc
+ * rather than going down the vmalloc rat hole.  If the size of
+ * the sglist is < PAGE_SIZE use kzalloc otherwise fallback to
+ * vmalloc
+ */
 
 static inline void *kgsl_sg_alloc(unsigned int sglen)
 {
@@ -116,7 +144,7 @@
 		unsigned int physaddr, unsigned int size)
 {
 	memdesc->sg = kgsl_sg_alloc(1);
-	if (memdesc->sg == NULL)
+	if (!memdesc->sg)
 		return -ENOMEM;
 
 	kmemleak_not_leak(memdesc->sg);
@@ -133,37 +161,25 @@
 kgsl_allocate(struct kgsl_memdesc *memdesc,
 		struct kgsl_pagetable *pagetable, size_t size)
 {
-	int ret = 1;
 	if (kgsl_mmu_get_mmutype() == KGSL_MMU_TYPE_NONE)
 		return kgsl_sharedmem_ebimem(memdesc, pagetable, size);
-
-	if(size >= SZ_4M)
-		ret = kgsl_sharedmem_ion_alloc(memdesc, pagetable, size);
-
-	if(ret)
-		return kgsl_sharedmem_page_alloc(memdesc, pagetable, size);
-	return ret;
+	memdesc->flags |= (KGSL_MEMTYPE_KERNEL << KGSL_MEMTYPE_SHIFT);
+	return kgsl_sharedmem_page_alloc(memdesc, pagetable, size);
 }
 
 static inline int
 kgsl_allocate_user(struct kgsl_memdesc *memdesc,
-		struct kgsl_process_private *private,
 		struct kgsl_pagetable *pagetable,
 		size_t size, unsigned int flags)
 {
-	int ret = 1;
-	char task_comm[TASK_COMM_LEN];
+	int ret;
+
+	memdesc->flags = flags;
 
 	if (kgsl_mmu_get_mmutype() == KGSL_MMU_TYPE_NONE)
-		return kgsl_sharedmem_ebimem_user(memdesc, pagetable, size,
-						  flags);
-	if(size >= SZ_4M)
-		ret = kgsl_sharedmem_ion_alloc_user(memdesc, private, pagetable, size, flags);
-	else if ( size >= SZ_1M && strcmp("om.htc.launcher", get_task_comm(task_comm, current->group_leader)) == 0 )
-		ret = kgsl_sharedmem_ion_alloc_user(memdesc, private, pagetable, size, flags);
-
-	if(ret)
-		return kgsl_sharedmem_page_alloc_user(memdesc, private, pagetable, size, flags);
+		ret = kgsl_sharedmem_ebimem_user(memdesc, pagetable, size);
+	else
+		ret = kgsl_sharedmem_page_alloc_user(memdesc, pagetable, size);
 
 	return ret;
 }
@@ -174,6 +190,8 @@
 	int ret  = kgsl_sharedmem_alloc_coherent(memdesc, size);
 	if (!ret && (kgsl_mmu_get_mmutype() == KGSL_MMU_TYPE_NONE))
 		memdesc->gpuaddr = memdesc->physaddr;
+
+	memdesc->flags |= (KGSL_MEMTYPE_KERNEL << KGSL_MEMTYPE_SHIFT);
 	return ret;
 }
 
@@ -188,4 +206,4 @@
 
 	return size;
 }
-#endif 
+#endif /* __KGSL_SHAREDMEM_H */
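(Quick illustration of the new align helpers; this snippet is a sketch, not code from the patch. The alignment travels in memdesc->flags as a power-of-two exponent, so a 64KB request is stored as 16.)

	struct kgsl_memdesc desc = { .flags = 0 };

	/* ilog2(SZ_64K) == 16 lands in the KGSL_MEMALIGN bits of flags */
	kgsl_memdesc_set_align(&desc, ilog2(SZ_64K));
	WARN_ON(kgsl_memdesc_get_align(&desc) != 16);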
diff --git a/drivers/gpu/msm/kgsl_snapshot.c b/drivers/gpu/msm/kgsl_snapshot.c
index 9704e2b..a5aa42f 100644
--- a/drivers/gpu/msm/kgsl_snapshot.c
+++ b/drivers/gpu/msm/kgsl_snapshot.c
@@ -1,4 +1,4 @@
-/* Copyright (c) 2012, Code Aurora Forum. All rights reserved.
+/* Copyright (c) 2012, The Linux Foundation. All rights reserved.
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License version 2 and
@@ -23,6 +23,7 @@
 #include "kgsl_sharedmem.h"
 #include "kgsl_snapshot.h"
 
+/* Placeholder for the list of memory objects frozen after a hang */
 
 struct kgsl_snapshot_object {
 	unsigned int gpuaddr;
@@ -35,10 +36,10 @@
 };
 
 struct snapshot_obj_itr {
-	void *buf;      
-	int pos;        
-	loff_t offset;  
-	size_t remain;  
+	void *buf;      /* Buffer pointer to write to */
+	int pos;        /* Current position in the sequence */
+	loff_t offset;  /* file offset to start writing from */
+	size_t remain;  /* Bytes remaining in buffer */
 	size_t write;   /* Bytes written so far */
 };
 
@@ -60,13 +61,13 @@
 	if ((itr->pos + size) <= itr->offset)
 		goto done;
 
-	
+	/* Handle the case that offset is in the middle of the buffer */
 
 	if (itr->offset > itr->pos) {
 		src += (itr->offset - itr->pos);
 		size -= (itr->offset - itr->pos);
 
-		
+		/* Advance pos to the offset start */
 		itr->pos = itr->offset;
 	}
 
@@ -84,6 +85,7 @@
 	return size;
 }
 
+/* idr_for_each function to count the number of contexts */
 
 static int snapshot_context_count(int id, void *ptr, void *data)
 {
@@ -93,6 +95,10 @@
 	return 0;
 }
 
+/*
+ * To simplify the iterator loop use a global pointer instead of trying
+ * to pass around double star references to the snapshot data
+ */
 
 static void *_ctxtptr;
 
@@ -104,6 +110,9 @@
 
 	header->id = id;
 
+	/* Future-proof for per-context timestamps - for now, just
+	 * return the global timestamp for all contexts
+	 */
 
 	header->timestamp_queued = kgsl_readtimestamp(device, context,
 						      KGSL_TIMESTAMP_QUEUED);
@@ -115,6 +124,7 @@
 	return 0;
 }
 
+/* Snapshot the Linux specific information */
 static int snapshot_os(struct kgsl_device *device,
 	void *snapshot, int remain, void *priv)
 {
@@ -126,12 +136,14 @@
 	int ctxtcount = 0;
 	int size = sizeof(*header);
 
+	/* Figure out how many active contexts there are - these will
+	 * be appended on the end of the structure */
 
 	idr_for_each(&device->context_idr, snapshot_context_count, &ctxtcount);
 
 	size += ctxtcount * sizeof(struct kgsl_snapshot_linux_context);
 
-	
+	/* Make sure there is enough room for the data */
 	if (remain < size) {
 		SNAPSHOT_ERR_NOMEM(device, "OS");
 		return 0;
@@ -143,27 +155,28 @@
 
 	header->state = hang ? SNAPSHOT_STATE_HUNG : SNAPSHOT_STATE_RUNNING;
 
-	
+	/* Get the kernel build information */
 	strlcpy(header->release, utsname()->release, sizeof(header->release));
 	strlcpy(header->version, utsname()->version, sizeof(header->version));
 
-	
+	/* Get the Unix time for the timestamp */
 	header->seconds = get_seconds();
 
-	
+	/* Remember the power information */
 	header->power_flags = pwr->power_flags;
 	header->power_level = pwr->active_pwrlevel;
 	header->power_interval_timeout = pwr->interval_timeout;
 	header->grpclk = kgsl_get_clkrate(pwr->grp_clks[0]);
 	header->busclk = kgsl_get_clkrate(pwr->ebi1_clk);
 
-	
+	/* Future proof for per-context timestamps */
 	header->current_context = -1;
 
-	
+	/* Get the current PT base */
 	header->ptbase = kgsl_mmu_get_current_ptbase(&device->mmu);
-	
-	pid = header->pid = kgsl_mmu_get_ptname_from_ptbase(header->ptbase);
+	/* And the PID for the task leader */
+	pid = header->pid = kgsl_mmu_get_ptname_from_ptbase(&device->mmu,
+								header->ptbase);
 
 	task = find_task_by_vpid(pid);
 
@@ -172,13 +185,23 @@
 
 	header->ctxtcount = ctxtcount;
 
-	
+	/* append information for each context */
 	_ctxtptr = snapshot + sizeof(*header);
 	idr_for_each(&device->context_idr, snapshot_context_info, NULL);
 
-	
+	/* Return the size of the data segment */
 	return size;
 }
+/*
+ * kgsl_snapshot_dump_indexed_regs - helper function to dump indexed registers
+ * @device - the device to dump registers from
+ * @snapshot - pointer to the start of the region of memory for the snapshot
+ * @remain - a pointer to the number of bytes remaining in the snapshot
+ * @priv - A pointer to the kgsl_snapshot_indexed_registers data
+ *
+ * Given an indexed register cmd/data pair and a count, dump each indexed
+ * register
+ */
 
 static int kgsl_snapshot_dump_indexed_regs(struct kgsl_device *device,
 	void *snapshot, int remain, void *priv)
@@ -220,6 +243,10 @@
 	sect.magic = SNAPSHOT_SECTION_MAGIC;
 	sect.id = KGSL_SNAPSHOT_SECTION_GPU_OBJECT;
 
+	/*
+	 * Header size is in dwords, object size is in bytes -
+	 * round up if the object size isn't dword aligned
+	 */
 
 	sect.size = GPU_OBJ_HEADER_SZ + ALIGN(obj->size, 4);
 
@@ -241,7 +268,7 @@
 	if (ret == 0)
 		return 0;
 
-	
+	/* Pad the end to a dword boundary if we need to */
 
 	if (obj->size % 4) {
 		unsigned int dummy = 0;
@@ -262,25 +289,15 @@
 	kfree(obj);
 }
 
-#if 0
-int kgsl_snapshot_have_object(struct kgsl_device *device, unsigned int ptbase,
-	unsigned int gpuaddr, unsigned int size)
-{
-	struct kgsl_snapshot_object *obj;
-
-	list_for_each_entry(obj, &device->snapshot_obj_list, node) {
-		if (obj->ptbase != ptbase)
-			continue;
-
-		if ((gpuaddr >= obj->gpuaddr) &&
-			((gpuaddr + size) <= (obj->gpuaddr + obj->size)))
-			return 1;
-	}
-
-	return 0;
-}
-#endif
-
+/* kgsl_snapshot_have_object - Return 1 if the object has been processed
+ * @device - the device that is being snapshotted
+ * @ptbase - the pagetable base of the object to freeze
+ * @gpuaddr - The gpu address of the object to freeze
+ * @size - the size of the object (may not always be the size of the region)
+ *
+ * Return 1 if the object is already in the list - this can save us from
+ * having to parse the same thing over again.
+ */
 int kgsl_snapshot_have_object(struct kgsl_device *device, unsigned int ptbase,
 	unsigned int gpuaddr, unsigned int size)
 {
@@ -298,6 +315,17 @@
 	return 0;
 }
 
+/* kgsl_snapshot_get_object - Mark a GPU buffer to be frozen
+ * @device - the device that is being snapshotted
+ * @ptbase - the pagetable base of the object to freeze
+ * @gpuaddr - The gpu address of the object to freeze
+ * @size - the size of the object (may not always be the size of the region)
+ * @type - the type of object being saved (shader, vbo, etc)
+ *
+ * Mark and freeze a GPU buffer object.  This will prevent it from being
+ * freed until it can be copied out as part of the snapshot dump.  Returns the
+ * size of the object being frozen
+ */
 
 int kgsl_snapshot_get_object(struct kgsl_device *device, unsigned int ptbase,
 	unsigned int gpuaddr, unsigned int size, unsigned int type)
@@ -306,27 +334,33 @@
 	struct kgsl_snapshot_object *obj;
 	int offset;
 
-	entry = kgsl_get_mem_entry(ptbase, gpuaddr, size);
+	entry = kgsl_get_mem_entry(device, ptbase, gpuaddr, size);
 
 	if (entry == NULL) {
 		KGSL_DRV_ERR(device, "Unable to find GPU buffer %8.8X\n",
 				gpuaddr);
-		return 0;
+		return -EINVAL;
 	}
 
-	
+	/* We can't freeze external memory, because we don't own it */
 	if (entry->memtype != KGSL_MEM_ENTRY_KERNEL) {
 		KGSL_DRV_ERR(device,
 			"Only internal GPU buffers can be frozen\n");
-		return 0;
+		return -EINVAL;
 	}
 
+	/*
+	 * size indicates the number of bytes in the region to save. This might
+	 * not always be the entire size of the region because some buffers are
+	 * sub-allocated from a larger region.  However, if size 0 was passed
+	 * that's a flag that the caller wants to capture the entire buffer
+	 */
 
 	if (size == 0) {
 		size = entry->memdesc.size;
 		offset = 0;
 
-		
+		/* Adjust the gpuaddr to the start of the object */
 		gpuaddr = entry->memdesc.gpuaddr;
 	} else {
 		offset = gpuaddr - entry->memdesc.gpuaddr;
@@ -335,13 +369,13 @@
 	if (size + offset > entry->memdesc.size) {
 		KGSL_DRV_ERR(device, "Invalid size for GPU buffer %8.8X\n",
 				gpuaddr);
-		return 0;
+		return -EINVAL;
 	}
 
-	
+	/* If the buffer is already on the list, skip it */
 	list_for_each_entry(obj, &device->snapshot_obj_list, node) {
 		if (obj->gpuaddr == gpuaddr && obj->ptbase == ptbase) {
-			
+			/* If the size is different, use the new size */
 			if (obj->size != size)
 				obj->size = size;
 
@@ -352,17 +386,17 @@
 	if (kgsl_memdesc_map(&entry->memdesc) == NULL) {
 		KGSL_DRV_ERR(device, "Unable to map GPU buffer %X\n",
 				gpuaddr);
-		return 0;
+		return -EINVAL;
 	}
 
 	obj = kzalloc(sizeof(*obj), GFP_KERNEL);
 
 	if (obj == NULL) {
 		KGSL_DRV_ERR(device, "Unable to allocate memory\n");
-		return 0;
+		return -EINVAL;
 	}
 
-	
+	/* Ref count the mem entry */
 	kgsl_mem_entry_get(entry);
 
 	obj->type = type;
@@ -374,6 +408,13 @@
 
 	list_add(&obj->node, &device->snapshot_obj_list);
 
+	/*
+	 * Return the size of the entire mem entry that was frozen - this gets
+	 * used for tracking how much memory is frozen for a hang.  Also, mark
+	 * the memory entry as frozen. If the entry was already marked as
+	 * frozen, then another buffer already got to it.  In that case, return
+	 * 0 so it doesn't get counted twice
+	 */
 
 	if (entry->flags & KGSL_MEM_ENTRY_FROZEN)
 		return 0;
@@ -384,21 +425,39 @@
 }
 EXPORT_SYMBOL(kgsl_snapshot_get_object);
 
+/*
+ * kgsl_snapshot_dump_regs - helper function to dump device registers
+ * @device - the device to dump registers from
+ * @snapshot - pointer to the start of the region of memory for the snapshot
+ * @remain - a pointer to the number of bytes remaining in the snapshot
+ * @priv - A pointer to the kgsl_snapshot_registers data
+ *
+ * Given an array of register range pairs (start, end [inclusive]), dump the
+ * registers into a snapshot register section.  The snapshot region stores a
+ * pair of dwords for each register - the word address of the register, and
+ * the value.
+ */
 int kgsl_snapshot_dump_regs(struct kgsl_device *device, void *snapshot,
 	int remain, void *priv)
 {
+	struct kgsl_snapshot_registers_list *list = priv;
+
 	struct kgsl_snapshot_regs *header = snapshot;
-	struct kgsl_snapshot_registers *regs = priv;
+	struct kgsl_snapshot_registers *regs;
 	unsigned int *data = snapshot + sizeof(*header);
-	int count = 0, i, j;
+	int count = 0, i, j, k;
 
-	
+	/* Figure out how many registers we are going to dump */
 
-	for (i = 0; i < regs->count; i++) {
-		int start = regs->regs[i * 2];
-		int end = regs->regs[i * 2 + 1];
+	for (i = 0; i < list->count; i++) {
+		regs = &(list->registers[i]);
 
-		count += (end - start + 1);
+		for (j = 0; j < regs->count; j++) {
+			int start = regs->regs[j * 2];
+			int end = regs->regs[j * 2 + 1];
+
+			count += (end - start + 1);
+		}
 	}
 
 	if (remain < (count * 8) + sizeof(*header)) {
@@ -406,22 +465,26 @@
 		return 0;
 	}
 
-	for (i = 0; i < regs->count; i++) {
-		unsigned int start = regs->regs[i * 2];
-		unsigned int end = regs->regs[i * 2 + 1];
 
-		for (j = start; j <= end; j++) {
-			unsigned int val;
+	for (i = 0; i < list->count; i++) {
+		regs = &(list->registers[i]);
+		for (j = 0; j < regs->count; j++) {
+			unsigned int start = regs->regs[j * 2];
+			unsigned int end = regs->regs[j * 2 + 1];
 
-			kgsl_regread(device, j, &val);
-			*data++ = j;
-			*data++ = val;
+			for (k = start; k <= end; k++) {
+				unsigned int val;
+
+				kgsl_regread(device, k, &val);
+				*data++ = k;
+				*data++ = val;
+			}
 		}
 	}
 
 	header->count = count;
 
-	
+	/* Return the size of the section */
 	return (count * 8) + sizeof(*header);
 }
 EXPORT_SYMBOL(kgsl_snapshot_dump_regs);
@@ -443,15 +506,28 @@
 }
 EXPORT_SYMBOL(kgsl_snapshot_indexed_registers);
 
+/*
+ * kgsl_device_snapshot - construct a device snapshot
+ * @device - device to snapshot
+ * @hang - set to 1 if the snapshot was triggered following a hang
+ * Given a device, construct a binary snapshot dump of the current device state
+ * and store it in the device snapshot memory.
+ */
 int kgsl_device_snapshot(struct kgsl_device *device, int hang)
 {
 	struct kgsl_snapshot_header *header = device->snapshot;
 	int remain = device->snapshot_maxsize - sizeof(*header);
 	void *snapshot;
-	struct platform_device *pdev =
-		container_of(device->parentdev, struct platform_device, dev);
-	struct kgsl_device_platform_data *pdata = pdev->dev.platform_data;
+	struct timespec boot;
 
+	/*
+	 * The first hang is always the one we are interested in. To
+	 * avoid a subsequent hang blowing away the first, the snapshot
+	 * is frozen until it is dumped via sysfs.
+	 *
+	 * Note that triggered snapshots are always taken regardless
+	 * of the state and never frozen.
+	 */
 
 	if (hang && device->snapshot_frozen == 1)
 		return 0;
@@ -472,34 +548,40 @@
 
 	header->gpuid = kgsl_gpuid(device, &header->chipid);
 
-	
+	/* Get a pointer to the first section (right after the header) */
 	snapshot = ((void *) device->snapshot) + sizeof(*header);
 
-	
+	/* Build the Linux specific header */
 	snapshot = kgsl_snapshot_add_section(device, KGSL_SNAPSHOT_SECTION_OS,
 		snapshot, &remain, snapshot_os, (void *) hang);
 
-	
+	/* Get the device specific sections */
 	if (device->ftbl->snapshot)
 		snapshot = device->ftbl->snapshot(device, snapshot, &remain,
 			hang);
 
-	device->snapshot_timestamp = get_seconds();
+	/*
+	 * The timestamp is the seconds since boot so it is easier to match to
+	 * the kernel log
+	 */
+
+	getboottime(&boot);
+	device->snapshot_timestamp = get_seconds() - boot.tv_sec;
 	device->snapshot_size = (int) (snapshot - device->snapshot);
 
-	
+	/* Freeze the snapshot on a hang until it gets read */
 	device->snapshot_frozen = (hang) ? 1 : 0;
 
-	
-	KGSL_DRV_ERR(device,"snapshot created at va %p pa %x size %d\n",
-			device->snapshot, pdata->snapshot_address,
-			device->snapshot_size);
+	/* log buffer info to aid in ramdump fault tolerance */
+	KGSL_DRV_ERR(device, "snapshot created at pa %lx size %d\n",
+			__pa(device->snapshot),	device->snapshot_size);
 	if (hang)
 		sysfs_notify(&device->snapshot_kobj, NULL, "timestamp");
 	return 0;
 }
 EXPORT_SYMBOL(kgsl_device_snapshot);
 
+/* An attribute for showing snapshot details */
 struct kgsl_snapshot_attribute {
 	struct attribute attr;
 	ssize_t (*show)(struct kgsl_device *device, char *buf);
@@ -513,6 +595,7 @@
 #define kobj_to_device(a) \
 container_of(a, struct kgsl_device, snapshot_kobj)
 
+/* Dump the sysfs binary data to the user */
 static ssize_t snapshot_show(struct file *filep, struct kobject *kobj,
 	struct bin_attribute *attr, char *buf, loff_t off,
 	size_t count)
@@ -526,11 +609,11 @@
 	if (device == NULL)
 		return 0;
 
-	
+	/* Return nothing if we haven't taken a snapshot yet */
 	if (device->snapshot_timestamp == 0)
 		return 0;
 
-	
+	/* Get the mutex to keep things from changing while we are dumping */
 	mutex_lock(&device->mutex);
 
 	obj_itr_init(&itr, buf, off, count);
@@ -575,11 +658,13 @@
 	return itr.write;
 }
 
+/* Show the timestamp of the last collected snapshot */
 static ssize_t timestamp_show(struct kgsl_device *device, char *buf)
 {
-	return snprintf(buf, PAGE_SIZE, "%x\n", device->snapshot_timestamp);
+	return snprintf(buf, PAGE_SIZE, "%d\n", device->snapshot_timestamp);
 }
 
+/* manually trigger a new snapshot to be collected */
 static ssize_t trigger_store(struct kgsl_device *device, const char *buf,
 	size_t count)
 {
@@ -592,23 +677,6 @@
 	return count;
 }
 
-static ssize_t no_panic_show(struct kgsl_device *device, char *buf)
-{
-	return snprintf(buf, PAGE_SIZE, "%x\n", device->snapshot_no_panic);
-}
-
-static ssize_t no_panic_store(struct kgsl_device *device, const char *buf,
-	size_t count)
-{
-	if (device && count > 0) {
-		mutex_lock(&device->mutex);
-		device->snapshot_no_panic = simple_strtol(buf, NULL, 10);
-		mutex_unlock(&device->mutex);
-	}
-
-	return count;
-}
-
 static struct bin_attribute snapshot_attr = {
 	.attr.name = "dump",
 	.attr.mode = 0444,
@@ -625,7 +693,6 @@
 
 SNAPSHOT_ATTR(trigger, 0600, NULL, trigger_store);
 SNAPSHOT_ATTR(timestamp, 0444, timestamp_show, NULL);
-SNAPSHOT_ATTR(no_panic, 0644, no_panic_show, no_panic_store);
 
 static void snapshot_sysfs_release(struct kobject *kobj)
 {
@@ -672,21 +739,19 @@
 	.release = snapshot_sysfs_release,
 };
 
+/* kgsl_device_snapshot_init - Add resources for the device GPU snapshot
+ * @device - The device to initialize
+ *
+ * Allocate memory for a GPU snapshot for the specified device,
+ * and create the sysfs files to manage it
+ */
 
 int kgsl_device_snapshot_init(struct kgsl_device *device)
 {
 	int ret;
-	struct platform_device *pdev =
-		container_of(device->parentdev, struct platform_device, dev);
-	struct kgsl_device_platform_data *pdata = pdev->dev.platform_data;
 
-	if (device->snapshot == NULL) {
-		if(pdata->snapshot_address) {
-			device->snapshot = ioremap(pdata->snapshot_address, KGSL_SNAPSHOT_MEMSIZE);
-			KGSL_DRV_INFO(device, "snapshot created at va %p pa %x\n", device->snapshot, pdata->snapshot_address);
-		} else
-			device->snapshot = kzalloc(KGSL_SNAPSHOT_MEMSIZE, GFP_KERNEL);
-	}
+	if (device->snapshot == NULL)
+		device->snapshot = kzalloc(KGSL_SNAPSHOT_MEMSIZE, GFP_KERNEL);
 
 	if (device->snapshot == NULL)
 		return -ENOMEM;
@@ -710,16 +775,18 @@
 		goto done;
 
 	ret  = sysfs_create_file(&device->snapshot_kobj, &attr_timestamp.attr);
-	if (ret)
-		goto done;
-
-	ret  = sysfs_create_file(&device->snapshot_kobj, &attr_no_panic.attr);
 
 done:
 	return ret;
 }
 EXPORT_SYMBOL(kgsl_device_snapshot_init);
 
+/* kgsl_device_snapshot_close - Take down snapshot memory for a device
+ * @device - Pointer to the kgsl_device
+ *
+ * Remove the sysfs files and free the memory allocated for the GPU
+ * snapshot
+ */
 
 void kgsl_device_snapshot_close(struct kgsl_device *device)
 {
diff --git a/drivers/gpu/msm/kgsl_snapshot.h b/drivers/gpu/msm/kgsl_snapshot.h
index baee17d..327d18a 100644
--- a/drivers/gpu/msm/kgsl_snapshot.h
+++ b/drivers/gpu/msm/kgsl_snapshot.h
@@ -1,4 +1,4 @@
-/* Copyright (c) 2012, Code Aurora Forum. All rights reserved.
+/* Copyright (c) 2012, The Linux Foundation. All rights reserved.
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License version 2 and
@@ -16,25 +16,33 @@
 
 #include <linux/types.h>
 
+/* Snapshot header */
 
+/* High word is static, low word is snapshot version ID */
 #define SNAPSHOT_MAGIC 0x504D0002
 
+/* GPU ID scheme:
+ * [16:31] - core identifier (0x0002 for 2D or 0x0003 for 3D)
+ * [00:15] - GPU specific identifier
+ */
 
 struct kgsl_snapshot_header {
-	__u32 magic; 
-	__u32 gpuid; 
-	
-	__u32 chipid; 
+	__u32 magic; /* Magic identifier */
+	__u32 gpuid; /* GPU ID - see above */
+	/* Added in snapshot version 2 */
+	__u32 chipid; /* Chip ID from the GPU */
 } __packed;
 
+/* Section header */
 #define SNAPSHOT_SECTION_MAGIC 0xABCD
 
 struct kgsl_snapshot_section_header {
-	__u16 magic; 
-	__u16 id;    
-	__u32 size;  
+	__u16 magic; /* Magic identifier */
+	__u16 id;    /* Type of section */
+	__u32 size;  /* Size of the section including this header */
 } __packed;
 
+/* Section identifiers */
 #define KGSL_SNAPSHOT_SECTION_OS           0x0101
 #define KGSL_SNAPSHOT_SECTION_REGS         0x0201
 #define KGSL_SNAPSHOT_SECTION_RB           0x0301
@@ -47,89 +55,105 @@
 
 #define KGSL_SNAPSHOT_SECTION_END          0xFFFF
 
+/* OS sub-section header */
 #define KGSL_SNAPSHOT_OS_LINUX             0x0001
 
+/* Linux OS specific information */
 
 #define SNAPSHOT_STATE_HUNG 0
 #define SNAPSHOT_STATE_RUNNING 1
 
 struct kgsl_snapshot_linux {
-	int osid;                   
-	int state;		    
-	__u32 seconds;		    
-	__u32 power_flags;            
-	__u32 power_level;            
-	__u32 power_interval_timeout; 
-	__u32 grpclk;                 
-	__u32 busclk;		    
-	__u32 ptbase;		    
-	__u32 pid;		    
-	__u32 current_context;	    
-	__u32 ctxtcount;	    
-	unsigned char release[32];  
-	unsigned char version[32];  
-	unsigned char comm[16];	    
+	int osid;                   /* subsection OS identifier */
+	int state;		    /* 1 if the thread is running, 0 for hung */
+	__u32 seconds;		    /* Unix timestamp for the snapshot */
+	__u32 power_flags;            /* Current power flags */
+	__u32 power_level;            /* Current power level */
+	__u32 power_interval_timeout; /* Power interval timeout */
+	__u32 grpclk;                 /* Current GP clock value */
+	__u32 busclk;		    /* Current busclk value */
+	__u32 ptbase;		    /* Current ptbase */
+	__u32 pid;		    /* PID of the process that owns the PT */
+	__u32 current_context;	    /* ID of the current context */
+	__u32 ctxtcount;	    /* Number of contexts appended to section */
+	unsigned char release[32];  /* kernel release */
+	unsigned char version[32];  /* kernel version */
+	unsigned char comm[16];	    /* Name of the process that owns the PT */
 } __packed;
 
+/*
+ * This structure contains a record of an active context.
+ * These are appended one after another in the OS section below
+ * the header above
+ */
 
 struct kgsl_snapshot_linux_context {
-	__u32 id;			
-	__u32 timestamp_queued;		
-	__u32 timestamp_retired;	
+	__u32 id;			/* The context ID */
+	__u32 timestamp_queued;		/* The last queued timestamp */
+	__u32 timestamp_retired;	/* The last timestamp retired by HW */
 };
 
+/* Ringbuffer sub-section header */
 struct kgsl_snapshot_rb {
-	int start;  
-	int end;    
-	int rbsize; 
-	int wptr;   
-	int rptr;   
-	int count;  
+	int start;  /* dword at the start of the dump */
+	int end;    /* dword at the end of the dump */
+	int rbsize; /* Size (in dwords) of the ringbuffer */
+	int wptr;   /* Current index of the CPU write pointer */
+	int rptr;   /* Current index of the GPU read pointer */
+	int count;  /* Number of dwords in the dump */
 } __packed;
 
+/* Indirect buffer sub-section header */
 struct kgsl_snapshot_ib {
-	__u32 gpuaddr; 
-	__u32 ptbase;  
-	int size;    
+	__u32 gpuaddr; /* GPU address of the IB */
+	__u32 ptbase;  /* Base for the pagetable the GPU address is valid in */
+	int size;    /* Size of the IB */
 } __packed;
 
+/* Register sub-section header */
 struct kgsl_snapshot_regs {
-	__u32 count; 
+	__u32 count; /* Number of register pairs in the section */
 } __packed;
 
+/* Indexed register sub-section header */
 struct kgsl_snapshot_indexed_regs {
-	__u32 index_reg; 
-	__u32 data_reg;  
-	int start;     
-	int count;     
+	__u32 index_reg; /* Offset of the index register for this section */
+	__u32 data_reg;  /* Offset of the data register for this section */
+	int start;     /* Starting index */
+	int count;     /* Number of dwords in the data */
 } __packed;
 
+/* Istore sub-section header */
 struct kgsl_snapshot_istore {
-	int count;   
+	int count;   /* Number of instructions in the istore */
 } __packed;
 
+/* Debug data sub-section header */
 
+/* A2XX debug sections */
 #define SNAPSHOT_DEBUG_SX         1
 #define SNAPSHOT_DEBUG_CP         2
 #define SNAPSHOT_DEBUG_SQ         3
 #define SNAPSHOT_DEBUG_SQTHREAD   4
 #define SNAPSHOT_DEBUG_MIU        5
 
+/* A3XX debug sections */
 #define SNAPSHOT_DEBUG_VPC_MEMORY 6
 #define SNAPSHOT_DEBUG_CP_MEQ     7
 #define SNAPSHOT_DEBUG_CP_PM4_RAM 8
 #define SNAPSHOT_DEBUG_CP_PFP_RAM 9
 #define SNAPSHOT_DEBUG_CP_ROQ     10
 #define SNAPSHOT_DEBUG_SHADER_MEMORY 11
+#define SNAPSHOT_DEBUG_CP_MERCIU 12
 
 struct kgsl_snapshot_debug {
-	int type;    
-	int size;   
+	int type;    /* Type identifier for the attached data */
+	int size;   /* Size of the section in dwords */
 } __packed;
 
 struct kgsl_snapshot_debugbus {
-	int id;	   
-	int count; 
+	int id;	   /* Debug bus ID */
+	int count; /* Number of dwords in the dump */
 } __packed;
 
 #define SNAPSHOT_GPU_OBJECT_SHADER  1
@@ -137,22 +161,40 @@
 #define SNAPSHOT_GPU_OBJECT_GENERIC 3
 
 struct kgsl_snapshot_gpu_object {
-	int type;      
-	__u32 gpuaddr; 
-	__u32 ptbase;  
-	int size;    
+	int type;      /* Type of GPU object */
+	__u32 gpuaddr; /* GPU address of the object */
+	__u32 ptbase;  /* Base for the pagetable the GPU address is valid in */
+	int size;    /* Size of the object (in dwords) */
 };
 
 #ifdef __KERNEL__
 
+/* Allocate 512K for each device snapshot */
 #define KGSL_SNAPSHOT_MEMSIZE (512 * 1024)
 
 struct kgsl_device;
+/*
+ * A helper macro to print out "not enough memory" messages - this
+ * makes it easy to standardize the messages as well as cut down on
+ * the number of strings in the binary
+ */
 
 #define SNAPSHOT_ERR_NOMEM(_d, _s) \
 	KGSL_DRV_ERR((_d), \
 	"snapshot: not enough snapshot memory for section %s\n", (_s))
 
+/*
+ * kgsl_snapshot_add_section - Add a new section to the GPU snapshot
+ * @device - the KGSL device being snapshotted
+ * @id - the section id
+ * @snapshot - pointer to the memory for the snapshot
+ * @remain - pointer to the number of bytes left in the snapshot region
+ * @func - Function pointer to fill the section
+ * @priv - Priv pointer to pass to the function
+ *
+ * Set up a KGSL snapshot header by filling the memory with the callback
+ * function and adding the standard section header
+ */
 
 static inline void *kgsl_snapshot_add_section(struct kgsl_device *device,
 	u16 id, void *snapshot, int *remain,
@@ -162,15 +204,25 @@
 	void *data = snapshot + sizeof(*header);
 	int ret = 0;
 
+	/*
+	 * Sanity check to make sure there is enough for the header.  The
+	 * callback will check to make sure there is enough for the rest
+	 * of the data.  If there isn't enough room then don't advance the
+	 * pointer.
+	 */
 
 	if (*remain < sizeof(*header))
 		return snapshot;
 
-	
+	/* It is legal to have no function (i.e. - make an empty section) */
 
 	if (func) {
 		ret = func(device, data, *remain, priv);
 
+		/*
+		 * If there wasn't enough room for the data then don't bother
+		 * setting up the header.
+		 */
 
 		if (ret == 0)
 			return snapshot;
@@ -180,34 +232,82 @@
 	header->id = id;
 	header->size = ret + sizeof(*header);
 
-	
+	/* Decrement the room left in the snapshot region */
 	*remain -= header->size;
-	
+	/* Advance the pointer past this section to the start of the next one */
 	return snapshot + header->size;
 }
 
+/* A common helper function to dump a range of registers.  This will be used in
+ * the GPU specific devices like this:
+ *
+ * struct kgsl_snapshot_registers_list list;
+ * struct kgsl_snapshot_registers priv[2];
+ *
+ * priv[0].regs = registers_array;
+ * priv[0].count = num_registers;
+ * priv[1].regs = registers_array_new;
+ * priv[1].count = num_registers_new;
+ *
+ * list.registers = priv;
+ * list.count = 2;
+ *
+ * kgsl_snapshot_add_section(device, KGSL_SNAPSHOT_SECTION_REGS, snapshot,
+ *	remain, kgsl_snapshot_dump_regs, &list).
+ *
+ * Pass in a struct pointing to a list of register definitions as described
+ * below:
+ *
+ * Pass in an array of register range pairs in the form of:
+ * start reg, stop reg
+ * All the registers between start and stop inclusive will be dumped
+ */
 
 struct kgsl_snapshot_registers {
-	unsigned int *regs;  
-	int count;	     
+	unsigned int *regs;  /* Pointer to the array of register ranges */
+	int count;	     /* Number of entries in the array */
+};
+
+struct kgsl_snapshot_registers_list {
+	/* Pointer to an array of register lists */
+	struct kgsl_snapshot_registers *registers;
+	/* Number of register lists in the array */
+	int count;
 };
 
 int kgsl_snapshot_dump_regs(struct kgsl_device *device, void *snapshot,
 	int remain, void *priv);
 
+/*
+ * A common helper function to dump a set of indexed registers. Use it
+ * like this:
+ *
+ * struct kgsl_snapshot_indexed_registers priv;
+ * priv.index = REG_INDEX;
+ * priv.data = REG_DATA;
+ * priv.count = num_registers
+ *
+ * kgsl_snapshot_add_section(device, KGSL_SNAPSHOT_SECTION_INDEXED_REGS,
+ *	snapshot, remain, kgsl_snapshot_dump_indexed_regs, &priv).
+ *
+ * The callback function will write an index from 0 to priv.count to
+ * the index register and read the data from the data register.
+ */
 
 struct kgsl_snapshot_indexed_registers {
-	unsigned int index; 
-	unsigned int data;  
-	unsigned int start;	
-	unsigned int count; 
+	unsigned int index; /* Offset of the index register */
+	unsigned int data;  /* Offset of the data register */
+	unsigned int start;	/* Index to start with */
+	unsigned int count; /* Number of values to read from the pair */
 };
 
+/* Helper function to snapshot a section of indexed registers */
 
 void *kgsl_snapshot_indexed_registers(struct kgsl_device *device,
 	void *snapshot, int *remain, unsigned int index,
 	unsigned int data, unsigned int start, unsigned int count);
 
+/* Freeze a GPU buffer so it can be dumped in the snapshot */
 int kgsl_snapshot_get_object(struct kgsl_device *device, unsigned int ptbase,
 	unsigned int gpuaddr, unsigned int size, unsigned int type);
 
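(The structures above also define the layout of the binary read back from the snapshot "dump" sysfs file: a kgsl_snapshot_header followed by sections, each led by a kgsl_snapshot_section_header whose size field includes the header itself. A hedged userspace sketch of walking that stream; buf is assumed to be a const char * holding the whole dump and len its length.)

	const struct kgsl_snapshot_header *hdr = (const void *) buf;
	const char *ptr = buf + sizeof(*hdr);

	if (hdr->magic != SNAPSHOT_MAGIC)
		return -1;

	while (ptr + sizeof(struct kgsl_snapshot_section_header) <= buf + len) {
		const struct kgsl_snapshot_section_header *sect =
			(const void *) ptr;

		if (sect->magic != SNAPSHOT_SECTION_MAGIC ||
		    sect->id == KGSL_SNAPSHOT_SECTION_END)
			break;

		/* sect->size counts the header too, so this hops to the next section */
		printf("section 0x%04x, %u bytes\n", sect->id, sect->size);
		ptr += sect->size;
	}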
diff --git a/drivers/gpu/msm/kgsl_sync.c b/drivers/gpu/msm/kgsl_sync.c
new file mode 100644
index 0000000..d9ab081
--- /dev/null
+++ b/drivers/gpu/msm/kgsl_sync.c
@@ -0,0 +1,218 @@
+/* Copyright (c) 2012-2013, The Linux Foundation. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 and
+ * only version 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ */
+
+#include <linux/file.h>
+#include <linux/slab.h>
+#include <linux/uaccess.h>
+
+#include "kgsl_sync.h"
+
+struct sync_pt *kgsl_sync_pt_create(struct sync_timeline *timeline,
+	unsigned int timestamp)
+{
+	struct sync_pt *pt;
+	pt = sync_pt_create(timeline, (int) sizeof(struct kgsl_sync_pt));
+	if (pt) {
+		struct kgsl_sync_pt *kpt = (struct kgsl_sync_pt *) pt;
+		kpt->timestamp = timestamp;
+	}
+	return pt;
+}
+
+/*
+ * This should only be called on sync_pts which have been created but
+ * not added to a fence.
+ */
+void kgsl_sync_pt_destroy(struct sync_pt *pt)
+{
+	sync_pt_free(pt);
+}
+
+static struct sync_pt *kgsl_sync_pt_dup(struct sync_pt *pt)
+{
+	struct kgsl_sync_pt *kpt = (struct kgsl_sync_pt *) pt;
+	return kgsl_sync_pt_create(pt->parent, kpt->timestamp);
+}
+
+static int kgsl_sync_pt_has_signaled(struct sync_pt *pt)
+{
+	struct kgsl_sync_pt *kpt = (struct kgsl_sync_pt *) pt;
+	struct kgsl_sync_timeline *ktimeline =
+		 (struct kgsl_sync_timeline *) pt->parent;
+	unsigned int ts = kpt->timestamp;
+	unsigned int last_ts = ktimeline->last_timestamp;
+	if (timestamp_cmp(last_ts, ts) >= 0) {
+		/* signaled */
+		return 1;
+	}
+	return 0;
+}
+
+static int kgsl_sync_pt_compare(struct sync_pt *a, struct sync_pt *b)
+{
+	struct kgsl_sync_pt *kpt_a = (struct kgsl_sync_pt *) a;
+	struct kgsl_sync_pt *kpt_b = (struct kgsl_sync_pt *) b;
+	unsigned int ts_a = kpt_a->timestamp;
+	unsigned int ts_b = kpt_b->timestamp;
+	return timestamp_cmp(ts_a, ts_b);
+}
+
+struct kgsl_fence_event_priv {
+	struct kgsl_context *context;
+	unsigned int timestamp;
+};
+
+/**
+ * kgsl_fence_event_cb - Event callback for a fence timestamp event
+ * @device - The KGSL device that expired the timestamp
+ * @priv - private data for the event
+ * @context_id - the context id that goes with the timestamp
+ * @timestamp - the timestamp that triggered the event
+ *
+ * Signal a fence following the expiration of a timestamp
+ */
+
+static inline void kgsl_fence_event_cb(struct kgsl_device *device,
+	void *priv, u32 context_id, u32 timestamp)
+{
+	struct kgsl_fence_event_priv *ev = priv;
+	kgsl_sync_timeline_signal(ev->context->timeline, ev->timestamp);
+	kgsl_context_put(ev->context);
+	kfree(ev);
+}
+
+/**
+ * kgsl_add_fence_event - Create a new fence event
+ * @device - KGSL device to create the event on
+ * @timestamp - Timestamp to trigger the event
+ * @data - Return fence fd stored in struct kgsl_timestamp_event_fence
+ * @len - length of the fence event
+ * @owner - driver instance that owns this event
+ * @returns 0 on success or error code on error
+ *
+ * Create a fence and register an event to signal the fence when
+ * the timestamp expires
+ */
+
+int kgsl_add_fence_event(struct kgsl_device *device,
+	u32 context_id, u32 timestamp, void __user *data, int len,
+	struct kgsl_device_private *owner)
+{
+	struct kgsl_fence_event_priv *event;
+	struct kgsl_timestamp_event_fence priv;
+	struct kgsl_context *context;
+	struct sync_pt *pt;
+	struct sync_fence *fence = NULL;
+	int ret = -EINVAL;
+
+	if (len != sizeof(priv))
+		return -EINVAL;
+
+	context = kgsl_find_context(owner, context_id);
+	if (context == NULL)
+		return -EINVAL;
+
+	event = kzalloc(sizeof(*event), GFP_KERNEL);
+	if (event == NULL)
+		return -ENOMEM;
+	event->context = context;
+	event->timestamp = timestamp;
+	kgsl_context_get(context);
+
+	pt = kgsl_sync_pt_create(context->timeline, timestamp);
+	if (pt == NULL) {
+		KGSL_DRV_ERR(device, "kgsl_sync_pt_create failed\n");
+		ret = -ENOMEM;
+		goto fail_pt;
+	}
+
+	fence = sync_fence_create("kgsl-fence", pt);
+	if (fence == NULL) {
+		/* only destroy pt when not added to fence */
+		kgsl_sync_pt_destroy(pt);
+		KGSL_DRV_ERR(device, "sync_fence_create failed\n");
+		ret = -ENOMEM;
+		goto fail_fence;
+	}
+
+	priv.fence_fd = get_unused_fd_flags(0);
+	if (priv.fence_fd < 0) {
+		KGSL_DRV_ERR(device, "invalid fence fd\n");
+		ret = -EINVAL;
+		goto fail_fd;
+	}
+	sync_fence_install(fence, priv.fence_fd);
+
+	if (copy_to_user(data, &priv, sizeof(priv))) {
+		ret = -EFAULT;
+		goto fail_copy_fd;
+	}
+
+	ret = kgsl_add_event(device, context_id, timestamp,
+			kgsl_fence_event_cb, event, owner);
+	if (ret)
+		goto fail_event;
+
+	return 0;
+
+fail_event:
+fail_copy_fd:
+	/* clean up sync_fence_install */
+	sync_fence_put(fence);
+	put_unused_fd(priv.fence_fd);
+fail_fd:
+	/* clean up sync_fence_create */
+	sync_fence_put(fence);
+fail_fence:
+fail_pt:
+	kgsl_context_put(context);
+	kfree(event);
+	return ret;
+}
+
+static const struct sync_timeline_ops kgsl_sync_timeline_ops = {
+	.dup = kgsl_sync_pt_dup,
+	.has_signaled = kgsl_sync_pt_has_signaled,
+	.compare = kgsl_sync_pt_compare,
+};
+
+int kgsl_sync_timeline_create(struct kgsl_context *context)
+{
+	struct kgsl_sync_timeline *ktimeline;
+
+	context->timeline = sync_timeline_create(&kgsl_sync_timeline_ops,
+		(int) sizeof(struct kgsl_sync_timeline), "kgsl-timeline");
+	if (context->timeline == NULL)
+		return -EINVAL;
+
+	ktimeline = (struct kgsl_sync_timeline *) context->timeline;
+	ktimeline->last_timestamp = 0;
+
+	return 0;
+}
+
+void kgsl_sync_timeline_signal(struct sync_timeline *timeline,
+	unsigned int timestamp)
+{
+	struct kgsl_sync_timeline *ktimeline =
+		(struct kgsl_sync_timeline *) timeline;
+
+	if (timestamp_cmp(timestamp, ktimeline->last_timestamp) > 0)
+		ktimeline->last_timestamp = timestamp;
+	sync_timeline_signal(timeline);
+}
+
+void kgsl_sync_timeline_destroy(struct kgsl_context *context)
+{
+	sync_timeline_destroy(context->timeline);
+}
diff --git a/drivers/gpu/msm/kgsl_sync.h b/drivers/gpu/msm/kgsl_sync.h
new file mode 100644
index 0000000..06b3ad0
--- /dev/null
+++ b/drivers/gpu/msm/kgsl_sync.h
@@ -0,0 +1,75 @@
+/* Copyright (c) 2012, The Linux Foundation. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 and
+ * only version 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ */
+#ifndef __KGSL_SYNC_H
+#define __KGSL_SYNC_H
+
+#include <linux/sync.h>
+#include "kgsl_device.h"
+
+struct kgsl_sync_timeline {
+	struct sync_timeline timeline;
+	unsigned int last_timestamp;
+};
+
+struct kgsl_sync_pt {
+	struct sync_pt pt;
+	unsigned int timestamp;
+};
+
+#if defined(CONFIG_SYNC)
+struct sync_pt *kgsl_sync_pt_create(struct sync_timeline *timeline,
+	unsigned int timestamp);
+void kgsl_sync_pt_destroy(struct sync_pt *pt);
+int kgsl_add_fence_event(struct kgsl_device *device,
+	u32 context_id, u32 timestamp, void __user *data, int len,
+	struct kgsl_device_private *owner);
+int kgsl_sync_timeline_create(struct kgsl_context *context);
+void kgsl_sync_timeline_signal(struct sync_timeline *timeline,
+	unsigned int timestamp);
+void kgsl_sync_timeline_destroy(struct kgsl_context *context);
+#else
+static inline struct sync_pt
+*kgsl_sync_pt_create(struct sync_timeline *timeline, unsigned int timestamp)
+{
+	return NULL;
+}
+
+static inline void kgsl_sync_pt_destroy(struct sync_pt *pt)
+{
+}
+
+static inline int kgsl_add_fence_event(struct kgsl_device *device,
+	u32 context_id, u32 timestamp, void __user *data, int len,
+	struct kgsl_device_private *owner)
+{
+	return -EINVAL;
+}
+
+static inline int kgsl_sync_timeline_create(struct kgsl_context *context)
+{
+	context->timeline = NULL;
+	return 0;
+}
+
+static inline void
+kgsl_sync_timeline_signal(struct sync_timeline *timeline,
+	unsigned int timestamp)
+{
+}
+
+static inline void kgsl_sync_timeline_destroy(struct kgsl_context *context)
+{
+}
+#endif
+
+#endif /* __KGSL_SYNC_H */
diff --git a/drivers/gpu/msm/kgsl_trace.c b/drivers/gpu/msm/kgsl_trace.c
index 8bdf996..e432729 100644
--- a/drivers/gpu/msm/kgsl_trace.c
+++ b/drivers/gpu/msm/kgsl_trace.c
@@ -1,4 +1,4 @@
-/* Copyright (c) 2011, Code Aurora Forum. All rights reserved.
+/* Copyright (c) 2011, The Linux Foundation. All rights reserved.
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License version 2 and
@@ -14,5 +14,6 @@
 #include "kgsl.h"
 #include "kgsl_device.h"
 
+/* Instantiate tracepoints */
 #define CREATE_TRACE_POINTS
 #include "kgsl_trace.h"
diff --git a/drivers/gpu/msm/kgsl_trace.h b/drivers/gpu/msm/kgsl_trace.h
index 0df8409..c54445c 100644
--- a/drivers/gpu/msm/kgsl_trace.h
+++ b/drivers/gpu/msm/kgsl_trace.h
@@ -1,4 +1,4 @@
-/* Copyright (c) 2011-2012, Code Aurora Forum. All rights reserved.
+/* Copyright (c) 2011-2013, The Linux Foundation. All rights reserved.
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License version 2 and
@@ -28,6 +28,9 @@
 struct kgsl_ringbuffer_issueibcmds;
 struct kgsl_device_waittimestamp;
 
+/*
+ * Tracepoint for kgsl issue ib commands
+ */
 TRACE_EVENT(kgsl_issueibcmds,
 
 	TP_PROTO(struct kgsl_device *device,
@@ -75,6 +78,9 @@
 	)
 );
 
+/*
+ * Tracepoint for kgsl readtimestamp
+ */
 TRACE_EVENT(kgsl_readtimestamp,
 
 	TP_PROTO(struct kgsl_device *device,
@@ -107,6 +113,9 @@
 	)
 );
 
+/*
+ * Tracepoint for kgsl waittimestamp entry
+ */
 TRACE_EVENT(kgsl_waittimestamp_entry,
 
 	TP_PROTO(struct kgsl_device *device,
@@ -143,6 +152,9 @@
 	)
 );
 
+/*
+ * Tracepoint for kgsl waittimestamp exit
+ */
 TRACE_EVENT(kgsl_waittimestamp_exit,
 
 	TP_PROTO(struct kgsl_device *device, unsigned int curr_ts,
@@ -212,47 +224,6 @@
 	TP_ARGS(device, on)
 );
 
-#ifdef CONFIG_MSM_KGSL_GPU_USAGE_SYSTRACE
-TRACE_EVENT(kgsl_usage,
-
-	TP_PROTO(struct kgsl_device *device, int on, int pid, s64 total_time, s64 busy_time,
-		unsigned int pwrlevel, unsigned int freq),
-
-	TP_ARGS(device, on, pid, total_time, busy_time, pwrlevel, freq),
-
-	TP_STRUCT__entry(
-		__string(device_name, device->name)
-		__field(int, on)
-		__field(int, pid)
-		__field(s64, total_time)
-		__field(s64, busy_time)
-		__field(unsigned int, pwrlevel)
-		__field(unsigned int, freq)
-	),
-
-	TP_fast_assign(
-		__assign_str(device_name, device->name);
-		__entry->on = on;
-		__entry->pid = pid;
-		__entry->total_time =total_time;
-		__entry->busy_time = busy_time;
-		__entry->pwrlevel = pwrlevel;
-		__entry->freq = freq;
-	),
-
-	TP_printk(
-		"d_name=%s %s pid=%d total=%lld busy=%lld pwrlevel=%d freq=%d",
-		__get_str(device_name),
-		__entry->on ? "on" : "off",
-		__entry->pid,
-		__entry->total_time,
-		__entry->busy_time,
-		__entry->pwrlevel,
-		__entry->freq
-	)
-);
-#endif
-
 TRACE_EVENT(kgsl_pwrlevel,
 
 	TP_PROTO(struct kgsl_device *device, unsigned int pwrlevel,
@@ -280,6 +251,29 @@
 	)
 );
 
+TRACE_EVENT(kgsl_mpdcvs,
+
+	TP_PROTO(struct kgsl_device *device, unsigned int state),
+
+	TP_ARGS(device, state),
+
+	TP_STRUCT__entry(
+		__string(device_name, device->name)
+		__field(unsigned int, state)
+	),
+
+	TP_fast_assign(
+		__assign_str(device_name, device->name);
+		__entry->state = state;
+	),
+
+	TP_printk(
+		"d_name=%s %s",
+		__get_str(device_name),
+		__entry->state ? "BUSY" : "IDLE"
+	)
+);
+
 DECLARE_EVENT_CLASS(kgsl_pwrstate_template,
 	TP_PROTO(struct kgsl_device *device, unsigned int state),
 
@@ -321,16 +315,22 @@
 	TP_STRUCT__entry(
 		__field(unsigned int, gpuaddr)
 		__field(unsigned int, size)
+		__field(unsigned int, tgid)
+		__array(char, usage, 16)
 	),
 
 	TP_fast_assign(
 		__entry->gpuaddr = mem_entry->memdesc.gpuaddr;
 		__entry->size = mem_entry->memdesc.size;
+		__entry->tgid = mem_entry->priv->pid;
+		kgsl_get_memory_usage(__entry->usage, sizeof(__entry->usage),
+				     mem_entry->memdesc.flags);
 	),
 
 	TP_printk(
-		"gpuaddr=0x%08x size=%d",
-		__entry->gpuaddr, __entry->size
+		"gpuaddr=0x%08x size=%d tgid=%d usage=%s",
+		__entry->gpuaddr, __entry->size, __entry->tgid,
+		__entry->usage
 	)
 );
 
@@ -345,6 +345,8 @@
 		__field(unsigned int, size)
 		__field(int, fd)
 		__field(int, type)
+		__field(unsigned int, tgid)
+		__array(char, usage, 16)
 	),
 
 	TP_fast_assign(
@@ -352,12 +354,16 @@
 		__entry->size = mem_entry->memdesc.size;
 		__entry->fd = fd;
 		__entry->type = mem_entry->memtype;
+		__entry->tgid = mem_entry->priv->pid;
+		kgsl_get_memory_usage(__entry->usage, sizeof(__entry->usage),
+				     mem_entry->memdesc.flags);
 	),
 
 	TP_printk(
-		"gpuaddr=0x%08x size=%d type=%d fd=%d",
+		"gpuaddr=0x%08x size=%d type=%d fd=%d tgid=%d usage %s",
 		__entry->gpuaddr, __entry->size,
-		__entry->type, __entry->fd
+		__entry->type, __entry->fd, __entry->tgid,
+		__entry->usage
 	)
 );
 
@@ -372,17 +378,23 @@
 		__field(unsigned int, size)
 		__field(int, type)
 		__field(int, fd)
+		__field(unsigned int, tgid)
+		__array(char, usage, 16)
 	),
 
 	TP_fast_assign(
 		__entry->gpuaddr = mem_entry->memdesc.gpuaddr;
 		__entry->size = mem_entry->memdesc.size;
 		__entry->type = mem_entry->memtype;
+		__entry->tgid = mem_entry->priv->pid;
+		kgsl_get_memory_usage(__entry->usage, sizeof(__entry->usage),
+				     mem_entry->memdesc.flags);
 	),
 
 	TP_printk(
-		"gpuaddr=0x%08x size=%d type=%d",
-		__entry->gpuaddr, __entry->size, __entry->type
+		"gpuaddr=0x%08x size=%d type=%d tgid=%d usage=%s",
+		__entry->gpuaddr, __entry->size, __entry->type,
+		__entry->tgid, __entry->usage
 	)
 );
 
@@ -398,6 +410,7 @@
 		__field(unsigned int, gpuaddr)
 		__field(unsigned int, size)
 		__field(int, type)
+		__array(char, usage, 16)
 		__field(unsigned int, drawctxt_id)
 		__field(unsigned int, curr_ts)
 		__field(unsigned int, free_ts)
@@ -407,6 +420,8 @@
 		__assign_str(device_name, device->name);
 		__entry->gpuaddr = mem_entry->memdesc.gpuaddr;
 		__entry->size = mem_entry->memdesc.size;
+		kgsl_get_memory_usage(__entry->usage, sizeof(__entry->usage),
+				     mem_entry->memdesc.flags);
 		__entry->drawctxt_id = id;
 		__entry->type = mem_entry->memtype;
 		__entry->curr_ts = curr_ts;
@@ -414,12 +429,13 @@
 	),
 
 	TP_printk(
-		"d_name=%s gpuaddr=0x%08x size=%d type=%d ctx=%u"
+		"d_name=%s gpuaddr=0x%08x size=%d type=%d usage=%s ctx=%u"
 		" curr_ts=0x%x free_ts=0x%x",
 		__get_str(device_name),
 		__entry->gpuaddr,
 		__entry->size,
 		__entry->type,
+		__entry->usage,
 		__entry->drawctxt_id,
 		__entry->curr_ts,
 		__entry->free_ts
@@ -513,12 +529,48 @@
 	),
 
 	TP_printk(
-		"d_name=%s page=0x%08x pt=%d op=%s\n",
+		"d_name=%s page=0x%08x pt=%d op=%s",
 		__get_str(device_name), __entry->page, __entry->pt,
 		__get_str(op)
 	)
 );
 
-#endif 
+TRACE_EVENT(kgsl_register_event,
+		TP_PROTO(unsigned int id, unsigned int timestamp),
+		TP_ARGS(id, timestamp),
+		TP_STRUCT__entry(
+			__field(unsigned int, id)
+			__field(unsigned int, timestamp)
+		),
+		TP_fast_assign(
+			__entry->id = id;
+			__entry->timestamp = timestamp;
+		),
+		TP_printk(
+			"ctx=%d ts=%d",
+			__entry->id, __entry->timestamp)
+);
 
+TRACE_EVENT(kgsl_fire_event,
+		TP_PROTO(unsigned int id, unsigned int ts,
+			unsigned int age),
+		TP_ARGS(id, ts, age),
+		TP_STRUCT__entry(
+			__field(unsigned int, id)
+			__field(unsigned int, ts)
+			__field(unsigned int, age)
+		),
+		TP_fast_assign(
+			__entry->id = id;
+			__entry->ts = ts;
+			__entry->age = age;
+		),
+		TP_printk(
+			"ctx=%d ts=%d age=%u",
+			__entry->id, __entry->ts, __entry->age)
+);
+
+#endif /* _KGSL_TRACE_H */
+
+/* This part must be outside protection */
 #include <trace/define_trace.h>
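(The kgsl_register_event/kgsl_fire_event tracepoints defined above are presumably emitted from the event handling code built as kgsl_events.o; the call sites below are only an assumption, meant to show the trace_* helpers that TRACE_EVENT generates.)

	/* when an event is queued against a context/timestamp */
	trace_kgsl_register_event(context_id, timestamp);

	/* when the timestamp expires and the callback fires; age is
	 * however long the event sat on the list */
	trace_kgsl_fire_event(context_id, timestamp, age);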
diff --git a/drivers/gpu/msm/z180.c b/drivers/gpu/msm/z180.c
index fd03d5e..484630d 100644
--- a/drivers/gpu/msm/z180.c
+++ b/drivers/gpu/msm/z180.c
@@ -1,4 +1,4 @@
-/* Copyright (c) 2002,2007-2012, Code Aurora Forum. All rights reserved.
+/* Copyright (c) 2002,2007-2012, The Linux Foundation. All rights reserved.
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License version 2 and
@@ -25,9 +25,6 @@
 #define DRIVER_VERSION_MAJOR   3
 #define DRIVER_VERSION_MINOR   1
 
-#define Z180_DEVICE(device) \
-		KGSL_CONTAINER_OF(device, struct z180_device, dev)
-
 #define GSL_VGC_INT_MASK \
 	 (REG_VGC_IRQSTATUS__MH_MASK | \
 	  REG_VGC_IRQSTATUS__G2D_MASK | \
@@ -41,16 +38,12 @@
 #define VGV3_CONTROL_MARKADD_FSHIFT 0
 #define VGV3_CONTROL_MARKADD_FMASK 0xfff
 
-#define Z180_PACKET_SIZE 15
 #define Z180_MARKER_SIZE 10
 #define Z180_CALL_CMD     0x1000
 #define Z180_MARKER_CMD   0x8000
 #define Z180_STREAM_END_CMD 0x9000
 #define Z180_STREAM_PACKET 0x7C000176
 #define Z180_STREAM_PACKET_CALL 0x7C000275
-#define Z180_PACKET_COUNT 8
-#define Z180_RB_SIZE (Z180_PACKET_SIZE*Z180_PACKET_COUNT \
-			  *sizeof(uint32_t))
 
 #define NUMTEXUNITS             4
 #define TEXUNITREGCOUNT         25
@@ -69,6 +62,7 @@
 
 #define Z180_INVALID_CONTEXT UINT_MAX
 
+/* z180 MH arbiter config*/
 #define Z180_CFG_MHARB \
 	(0x10 \
 		| (0 << MH_ARBITER_CONFIG__SAME_PAGE_GRANULARITY__SHIFT) \
@@ -140,6 +134,9 @@
 			.mharb = Z180_CFG_MHARB,
 			.mh_intf_cfg1 = 0x00032f07,
 			.mh_intf_cfg2 = 0x004b274f,
+			/* turn off memory protection unit by setting
+			   acceptable physical address range to include
+			   all pages. */
 			.mpu_base = 0x00000000,
 			.mpu_range =  0xFFFFF000,
 		},
@@ -164,6 +161,9 @@
 			.mharb = Z180_CFG_MHARB,
 			.mh_intf_cfg1 = 0x00032f07,
 			.mh_intf_cfg2 = 0x004b274f,
+			/* turn off memory protection unit by setting
+			   acceptable physical address range to include
+			   all pages. */
 			.mpu_base = 0x00000000,
 			.mpu_range =  0xFFFFF000,
 		},
@@ -406,6 +406,10 @@
 	}
 	cmd = ibdesc[0].gpuaddr;
 	sizedwords = ibdesc[0].sizedwords;
+	/*
+	 * Get a kernel mapping to the IB for monkey patching.
+	 * See the end of this function.
+	 */
 	entry = kgsl_sharedmem_find_region(dev_priv->process_priv, cmd,
 		sizedwords);
 	if (entry == NULL) {
@@ -414,6 +418,11 @@
 		result = -EINVAL;
 		goto error;
 	}
+	/*
+	 * This only creates a kernel mapping if one doesn't already exist;
+	 * otherwise the existing mapping is reused. The 2d userspace reuses
+	 * IBs, so we are unlikely to create too many mappings.
+	 */
 	if (kgsl_gpuaddr_to_vaddr(&entry->memdesc, cmd) == NULL) {
 		KGSL_DRV_ERR(device,
 			     "Cannot make kernel mapping for gpuaddr 0x%x\n",
@@ -424,7 +433,7 @@
 
 	KGSL_CMD_INFO(device, "ctxt %d ibaddr 0x%08x sizedwords %d\n",
 		context->id, cmd, sizedwords);
-	
+	/* context switch */
 	if ((context->id != (int)z180_dev->ringbuffer.prevctx) ||
 	    (ctrl & KGSL_CONTEXT_CTX_SWITCH)) {
 		KGSL_CMD_INFO(device, "context switch %d -> %d\n",
@@ -458,10 +467,10 @@
 	addcmd(&z180_dev->ringbuffer, old_timestamp, cmd + ofs, cnt);
 	kgsl_pwrscale_busy(device);
 
-	
+	/* Make sure the next ringbuffer entry has a marker */
 	addmarker(&z180_dev->ringbuffer, z180_dev->current_timestamp);
 
-	
+	/* monkey patch the IB so that it jumps back to the ringbuffer */
 	kgsl_sharedmem_writel(&entry->memdesc,
 		      ((sizedwords + 1) * sizeof(unsigned int)),
 		      rb_gpuaddr(z180_dev, z180_dev->current_timestamp));
@@ -469,7 +478,7 @@
 			      ((sizedwords + 2) * sizeof(unsigned int)),
 			      nextcnt);
 
-	
+	/* Sync memory before activating the hardware for the new command */
 	mb();
 
 	cmd = (int)(((2) & VGV3_CONTROL_MARKADD_FMASK)
@@ -550,7 +559,7 @@
 
 	kgsl_pwrctrl_enable(device);
 
-	
+	/* Set interrupts to 0 to ensure a good state */
 	z180_regwrite(device, (ADDR_VGC_IRQENABLE >> 2), 0x0);
 
 	kgsl_mh_start(device);
@@ -581,7 +590,7 @@
 
 	kgsl_mmu_stop(&device->mmu);
 
-	
+	/* Disable the clocks before the power rail. */
 	kgsl_pwrctrl_irq(device, KGSL_PWRFLAGS_OFF);
 
 	kgsl_pwrctrl_disable(device);
@@ -658,6 +667,9 @@
 	return 0;
 }
 
+/* Not all Z180 registers are directly accessible.
+ * The _z180_(read|write)_simple functions below handle the ones that are.
+ */
 static void _z180_regread_simple(struct kgsl_device *device,
 				unsigned int offsetwords,
 				unsigned int *value)
@@ -668,6 +680,8 @@
 
 	reg = (unsigned int *)(device->reg_virt + (offsetwords << 2));
 
+	/* ensure this read finishes before the next one,
+	 * i.e. act like normal readl() */
 	*value = __raw_readl(reg);
 	rmb();
 
@@ -683,11 +697,18 @@
 
 	reg = (unsigned int *)(device->reg_virt + (offsetwords << 2));
 	kgsl_cffdump_regwrite(device->id, offsetwords << 2, value);
+	/* ensure previous writes post before this one,
+	 * i.e. act like normal writel() */
 	wmb();
 	__raw_writel(value, reg);
 }
 
 
+/* The MH registers must be accessed through a two-step (read|write)
+ * process. These registers may be accessed from interrupt context during
+ * the handling of MH or MMU error interrupts, so a spinlock is used
+ * to ensure that the two-step sequence is not interrupted.
+ */
 static void _z180_regread_mmu(struct kgsl_device *device,
 			     unsigned int offsetwords,
 			     unsigned int *value)
@@ -723,6 +744,9 @@
 	spin_unlock_irqrestore(&z180_dev->cmdwin_lock, flags);
 }
 
+/* The rest of the driver shouldn't have to care whether it is accessing MMU
+ * registers or normal registers, so dispatch to the right accessor here.
+ */
 static void z180_regread(struct kgsl_device *device,
 			unsigned int offsetwords,
 			unsigned int *value)
@@ -776,7 +800,7 @@
 {
 	struct z180_device *z180_dev = Z180_DEVICE(device);
 	(void)context;
-	
+	/* get current EOP timestamp */
 	return z180_dev->timestamp;
 }
 
@@ -787,7 +811,7 @@
 {
 	int status = -EINVAL;
 
-	
+	/* Don't wait forever; cap the wait at 10 seconds for now */
 	if (msecs == -1)
 		msecs = 10 * MSEC_PER_SEC;
 
@@ -816,6 +840,7 @@
 	else if (timeout == 0) {
 		status = -ETIMEDOUT;
 		kgsl_pwrctrl_set_state(device, KGSL_STATE_HUNG);
+		kgsl_postmortem_dump(device, 0);
 	} else
 		status = timeout;
 
@@ -858,7 +883,7 @@
 
 static void z180_irqctrl(struct kgsl_device *device, int state)
 {
-	
+	/* Control interrupts for Z180 and the Z180 MMU */
 
 	if (state) {
 		z180_regwrite(device, (ADDR_VGC_IRQENABLE >> 2), 3);
@@ -875,12 +900,16 @@
 	if (chipid != NULL)
 		*chipid = 0;
 
+	/* Standard KGSL gpuid format:
+	 * top word is 0x0002 for 2D or 0x0003 for 3D
+	 * bottom word is a core-specific identifier
+	 */
 
 	return (0x0002 << 16) | 180;
 }
 
 static const struct kgsl_functable z180_functable = {
-	
+	/* Mandatory functions */
 	.regread = z180_regread,
 	.regwrite = z180_regwrite,
 	.idle = z180_idle,
@@ -898,10 +927,11 @@
 	.irqctrl = z180_irqctrl,
 	.gpuid = z180_gpuid,
 	.irq_handler = z180_irq_handler,
-	
+	/* Optional functions */
 	.drawctxt_create = NULL,
 	.drawctxt_destroy = z180_drawctxt_destroy,
 	.ioctl = NULL,
+	.postmortem_dump = z180_dump,
 };
 
 static struct platform_device_id z180_id_table[] = {
diff --git a/drivers/gpu/msm/z180.h b/drivers/gpu/msm/z180.h
index 7f4ab7f..268aac3 100644
--- a/drivers/gpu/msm/z180.h
+++ b/drivers/gpu/msm/z180.h
@@ -1,4 +1,4 @@
-/* Copyright (c) 2008-2011, Code Aurora Forum. All rights reserved.
+/* Copyright (c) 2008-2012, The Linux Foundation. All rights reserved.
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License version 2 and
@@ -19,8 +19,16 @@
 #define DEVICE_2D0_NAME "kgsl-2d0"
 #define DEVICE_2D1_NAME "kgsl-2d1"
 
+#define Z180_PACKET_SIZE 15
+#define Z180_PACKET_COUNT 8
+#define Z180_RB_SIZE (Z180_PACKET_SIZE*Z180_PACKET_COUNT \
+			  *sizeof(uint32_t))
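+/* Convert a kgsl_device pointer into its containing z180_device */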
+#define Z180_DEVICE(device) \
+		KGSL_CONTAINER_OF(device, struct z180_device, dev)
+
 #define Z180_DEFAULT_PWRSCALE_POLICY  NULL
 
+/* Wait a maximum of 10 seconds when trying to idle the core */
 #define Z180_IDLE_TIMEOUT (10 * 1000)
 
 struct z180_ringbuffer {
@@ -29,11 +37,13 @@
 };
 
 struct z180_device {
-	struct kgsl_device dev;    
+	struct kgsl_device dev;    /* Must be first field in this struct */
 	int current_timestamp;
 	int timestamp;
 	struct z180_ringbuffer ringbuffer;
 	spinlock_t cmdwin_lock;
 };
 
-#endif 
+int z180_dump(struct kgsl_device *device, int manual);
+
+#endif /* __Z180_H */
diff --git a/drivers/gpu/msm/z180_postmortem.c b/drivers/gpu/msm/z180_postmortem.c
new file mode 100644
index 0000000..c1e5f07
--- /dev/null
+++ b/drivers/gpu/msm/z180_postmortem.c
@@ -0,0 +1,230 @@
+/* Copyright (c) 2012, The Linux Foundation. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 and
+ * only version 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ */
+
+#include "kgsl.h"
+#include "kgsl_device.h"
+#include "z180.h"
+#include "z180_reg.h"
+
+#define Z180_STREAM_PACKET_CALL 0x7C000275
+
+/* Postmortem Dump formatted Output parameters */
+
+/* Number of Words per dump data line */
+#define WORDS_PER_LINE 8
+
+/* Number of spaces per dump data line */
+#define NUM_SPACES (WORDS_PER_LINE - 1)
+
+/*
+ * Output dump data is formatted as string, hence number of chars
+ * per line for line string allocation
+ */
+#define CHARS_PER_LINE  \
+	((WORDS_PER_LINE * (2*sizeof(unsigned int))) + NUM_SPACES + 1)
+
+/* Z180 registers (byte offsets) to be dumped */
+static const unsigned int regs_to_dump[] = {
+		ADDR_VGC_VERSION,
+		ADDR_VGC_SYSSTATUS,
+		ADDR_VGC_IRQSTATUS,
+		ADDR_VGC_IRQENABLE,
+		ADDR_VGC_IRQ_ACTIVE_CNT,
+		ADDR_VGC_CLOCKEN,
+		ADDR_VGC_MH_DATA_ADDR,
+		ADDR_VGC_GPR0,
+		ADDR_VGC_GPR1,
+		ADDR_VGC_BUSYCNT,
+		ADDR_VGC_FIFOFREE,
+};
+
+/**
+ * z180_dump_regs - Dumps all of the Z180 external registers. Each output
+ * line is prefixed with the register's word offset.
+ * @device: kgsl_device pointer to the Z180 core
+ */
+static void z180_dump_regs(struct kgsl_device *device)
+{
+	unsigned int i;
+	unsigned int reg_val;
+
+	KGSL_LOG_DUMP(device, "Z180 Register Dump\n");
+	for (i = 0; i < ARRAY_SIZE(regs_to_dump); i++) {
+		kgsl_regread(device,
+				regs_to_dump[i]/sizeof(unsigned int), &reg_val);
+		KGSL_LOG_DUMP(device, "REG: %04X: %08X\n",
+				regs_to_dump[i]/sizeof(unsigned int), reg_val);
+	}
+}
+
+/**
+ * z180_dump_ringbuffer - Dumps the Z180 core's ringbuffer contents
+ * @device: kgsl_device pointer to the z180 core
+ */
+static void z180_dump_ringbuffer(struct kgsl_device *device)
+{
+	unsigned int rb_size;
+	unsigned int *rb_hostptr;
+	unsigned int rb_words;
+	unsigned int rb_gpuaddr;
+	struct z180_device *z180_dev = Z180_DEVICE(device);
+	unsigned int i;
+	char linebuf[CHARS_PER_LINE];
+
+	KGSL_LOG_DUMP(device, "Z180 ringbuffer dump\n");
+
+	rb_hostptr = (unsigned int *) z180_dev->ringbuffer.cmdbufdesc.hostptr;
+
+	rb_size = Z180_RB_SIZE;
+	rb_gpuaddr = z180_dev->ringbuffer.cmdbufdesc.gpuaddr;
+
+	rb_words = rb_size/sizeof(unsigned int);
+
+	KGSL_LOG_DUMP(device, "ringbuffer size: %u\n", rb_size);
+
+	KGSL_LOG_DUMP(device, "rb_words: %d\n", rb_words);
+
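+	/* Dump the whole ringbuffer, WORDS_PER_LINE words per output line */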
+	for (i = 0; i < rb_words; i += WORDS_PER_LINE) {
+		hex_dump_to_buffer(rb_hostptr+i,
+				rb_size - i*sizeof(unsigned int),
+				WORDS_PER_LINE*sizeof(unsigned int),
+				sizeof(unsigned int), linebuf,
+				sizeof(linebuf), false);
+		KGSL_LOG_DUMP(device, "RB: %04X: %s\n",
+				rb_gpuaddr + i*sizeof(unsigned int), linebuf);
+	}
+}
+
+
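+/**
+ * z180_dump_ib - Dumps the IB referenced by the current ringbuffer slot
+ * @device: kgsl_device pointer to the Z180 core
+ */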
+static void z180_dump_ib(struct kgsl_device *device)
+{
+	unsigned int rb_size;
+	unsigned int *rb_hostptr;
+	unsigned int rb_words;
+	unsigned int rb_gpuaddr;
+	unsigned int ib_gpuptr = 0;
+	unsigned int ib_size = 0;
+	void *ib_hostptr = NULL;
+	int rb_slot_num = -1;
+	struct z180_device *z180_dev = Z180_DEVICE(device);
+	struct kgsl_mem_entry *entry = NULL;
+	unsigned int pt_base;
+	unsigned int i;
+	unsigned int j;
+	char linebuf[CHARS_PER_LINE];
+	unsigned int current_ib_slot;
+	unsigned int len;
+	unsigned int rowsize;
+	KGSL_LOG_DUMP(device, "Z180 IB dump\n");
+
+	rb_hostptr = (unsigned int *) z180_dev->ringbuffer.cmdbufdesc.hostptr;
+
+	rb_size = Z180_RB_SIZE;
+	rb_gpuaddr = z180_dev->ringbuffer.cmdbufdesc.gpuaddr;
+
+	rb_words = rb_size/sizeof(unsigned int);
+
+	KGSL_LOG_DUMP(device, "Ringbuffer size (bytes): %u\n", rb_size);
+
+	KGSL_LOG_DUMP(device, "rb_words: %d\n", rb_words);
+
+	pt_base = kgsl_mmu_get_current_ptbase(&device->mmu);
+
+	/* Dump the current IB */
+	for (i = 0; i < rb_words; i++) {
+		if (rb_hostptr[i] == Z180_STREAM_PACKET_CALL) {
+
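+			/*
+			 * Each Z180_STREAM_PACKET_CALL marks a ringbuffer
+			 * slot; skip everything except the slot matching
+			 * the current timestamp.
+			 */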
+			rb_slot_num++;
+			current_ib_slot =
+				z180_dev->current_timestamp % Z180_PACKET_COUNT;
+			if (rb_slot_num != current_ib_slot)
+				continue;
+
+			ib_gpuptr = rb_hostptr[i+1];
+
+			entry = kgsl_get_mem_entry(device, pt_base, ib_gpuptr,
+							1);
+
+			if (entry == NULL) {
+				KGSL_LOG_DUMP(device,
+				"IB mem entry not found for ringbuffer slot#: %d\n",
+				rb_slot_num);
+				continue;
+			}
+
+			ib_hostptr = kgsl_memdesc_map(&entry->memdesc);
+
+			if (ib_hostptr == NULL) {
+				KGSL_LOG_DUMP(device,
+				"Could not map IB to kernel memory, Ringbuffer Slot: %d\n",
+				rb_slot_num);
+				continue;
+			}
+
+			ib_size = entry->memdesc.size;
+			KGSL_LOG_DUMP(device,
+				"IB size: %d bytes, IB size in words: %d\n",
+				ib_size,
+				ib_size/sizeof(unsigned int));
+
+			for (j = 0; j < ib_size; j += WORDS_PER_LINE) {
+				len = ib_size - j*sizeof(unsigned int);
+				rowsize = WORDS_PER_LINE*sizeof(unsigned int);
+				hex_dump_to_buffer(ib_hostptr+j, len, rowsize,
+						sizeof(unsigned int), linebuf,
+						sizeof(linebuf), false);
+				KGSL_LOG_DUMP(device, "IB%d: %04X: %s\n",
+						rb_slot_num,
+						(rb_gpuaddr +
+						j*sizeof(unsigned int)),
+						linebuf);
+			}
+			KGSL_LOG_DUMP(device, "IB Dump Finished\n");
+		}
+	}
+}
+
+
+/**
+ * z180_dump - Dumps the Z180 ringbuffer and registers (and IBs when
+ * pm_ib_enabled is set) for postmortem analysis
+ * @device: kgsl_device pointer to the Z180 core
+ * @manual: nonzero if the dump was requested manually; an automatic dump
+ * triggers a BUG to capture a stack trace
+ */
+int z180_dump(struct kgsl_device *device, int manual)
+{
+	struct z180_device *z180_dev = Z180_DEVICE(device);
+
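+	/* Make sure all prior memory writes have posted before dumping state */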
+	mb();
+
+	KGSL_LOG_DUMP(device, "Retired Timestamp: %d\n", z180_dev->timestamp);
+	KGSL_LOG_DUMP(device,
+			"Current Timestamp: %d\n", z180_dev->current_timestamp);
+
+	/* Dump ringbuffer */
+	z180_dump_ringbuffer(device);
+
+	/* Dump registers */
+	z180_dump_regs(device);
+
+	/* Dump IBs, if asked for */
+	if (device->pm_ib_enabled)
+		z180_dump_ib(device);
+
+	/* Get the stack trace if the dump was automatic */
+	if (!manual)
+		BUG_ON(1);
+
+	return 0;
+}
+
diff --git a/drivers/gpu/msm/z180_reg.h b/drivers/gpu/msm/z180_reg.h
index 382d0c5..81f1fdc 100644
--- a/drivers/gpu/msm/z180_reg.h
+++ b/drivers/gpu/msm/z180_reg.h
@@ -1,4 +1,4 @@
-/* Copyright (c) 2002,2007-2011, Code Aurora Forum. All rights reserved.
+/* Copyright (c) 2002,2007-2012, The Linux Foundation. All rights reserved.
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License version 2 and
@@ -45,5 +45,12 @@
 #define ADDR_VGV3_NEXTADDR               0x0075
 #define ADDR_VGV3_NEXTCMD                0x0076
 #define ADDR_VGV3_WRITEADDR              0x0072
+#define ADDR_VGC_VERSION				 0x400
+#define ADDR_VGC_SYSSTATUS				 0x410
+#define ADDR_VGC_CLOCKEN				 0x508
+#define ADDR_VGC_GPR0					 0x520
+#define ADDR_VGC_GPR1					 0x528
+#define ADDR_VGC_BUSYCNT				 0x530
+#define ADDR_VGC_FIFOFREE				 0x7c0
 
-#endif 
+#endif /* __Z180_REG_H */
diff --git a/drivers/gpu/msm/z180_trace.c b/drivers/gpu/msm/z180_trace.c
index c5349db..9d971ee 100644
--- a/drivers/gpu/msm/z180_trace.c
+++ b/drivers/gpu/msm/z180_trace.c
@@ -1,4 +1,4 @@
-/* Copyright (c) 2011, Code Aurora Forum. All rights reserved.
+/* Copyright (c) 2011, The Linux Foundation. All rights reserved.
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License version 2 and
@@ -15,5 +15,6 @@
 #include "z180.h"
 #include "z180_reg.h"
 
+/* Instantiate tracepoints */
 #define CREATE_TRACE_POINTS
 #include "z180_trace.h"
diff --git a/drivers/gpu/msm/z180_trace.h b/drivers/gpu/msm/z180_trace.h
index 3536655..4f65b9b 100644
--- a/drivers/gpu/msm/z180_trace.h
+++ b/drivers/gpu/msm/z180_trace.h
@@ -1,4 +1,4 @@
-/* Copyright (c) 2011, Code Aurora Forum. All rights reserved.
+/* Copyright (c) 2011, The Linux Foundation. All rights reserved.
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License version 2 and
@@ -25,6 +25,9 @@
 
 struct kgsl_device;
 
+/*
+ * Tracepoint for z180 irq. Includes status info
+ */
 TRACE_EVENT(kgsl_z180_irq_status,
 
 	TP_PROTO(struct kgsl_device *device, unsigned int status),
@@ -51,6 +54,7 @@
 	)
 );
 
-#endif 
+#endif /* _Z180_TRACE_H */
 
+/* This part must be outside protection */
 #include <trace/define_trace.h>