msm-3.4 (commit 35cca8ba3ee0e6a2085dbcac48fb2ccbaa72ba98) video/gpu/iommu .. and all the hacks that go with it
diff --git a/drivers/gpu/drm/i915/i915_reg.h b/drivers/gpu/drm/i915/i915_reg.h
index b114875..9d24d65 100644
--- a/drivers/gpu/drm/i915/i915_reg.h
+++ b/drivers/gpu/drm/i915/i915_reg.h
@@ -3737,10 +3737,6 @@
 # define GEN6_RCPBUNIT_CLOCK_GATE_DISABLE		(1 << 12)
 # define GEN6_RCCUNIT_CLOCK_GATE_DISABLE		(1 << 11)
 
-#define GEN6_UCGCTL2				0x9404
-# define GEN6_RCPBUNIT_CLOCK_GATE_DISABLE		(1 << 12)
-# define GEN6_RCCUNIT_CLOCK_GATE_DISABLE		(1 << 11)
-
 #define GEN6_RPNSWREQ				0xA008
 #define   GEN6_TURBO_DISABLE			(1<<31)
 #define   GEN6_FREQUENCY(x)			((x)<<25)
diff --git a/drivers/gpu/drm/radeon/radeon_mode.h b/drivers/gpu/drm/radeon/radeon_mode.h
index 48dae40..f7eb5d8 100644
--- a/drivers/gpu/drm/radeon/radeon_mode.h
+++ b/drivers/gpu/drm/radeon/radeon_mode.h
@@ -439,9 +439,6 @@
 	struct radeon_i2c_chan *ddc_bus;
 	/* some systems have an hdmi and vga port with a shared ddc line */
 	bool shared_ddc;
-	/* for some Radeon chip families we apply an additional EDID header
-	   check as part of the DDC probe */
-	bool requires_extended_probe;
 	bool use_digital;
 	/* we need to mind the EDID between detect
 	   and get modes due to analog/digital/tvencoder */
@@ -529,8 +526,7 @@
 				u8 val);
 extern void radeon_router_select_ddc_port(struct radeon_connector *radeon_connector);
 extern void radeon_router_select_cd_port(struct radeon_connector *radeon_connector);
-extern bool radeon_ddc_probe(struct radeon_connector *radeon_connector,
-			bool requires_extended_probe);
+extern bool radeon_ddc_probe(struct radeon_connector *radeon_connector);
 extern int radeon_ddc_get_modes(struct radeon_connector *radeon_connector);
 
 extern struct drm_encoder *radeon_best_encoder(struct drm_connector *connector);
diff --git a/drivers/gpu/ion/Kconfig b/drivers/gpu/ion/Kconfig
index 5bb254b..39133b5 100644
--- a/drivers/gpu/ion/Kconfig
+++ b/drivers/gpu/ion/Kconfig
@@ -16,3 +16,12 @@
 	depends on ARCH_MSM && ION
 	help
 	  Choose this option if you wish to use ion on an MSM target.
+
+config ION_LEAK_CHECK
+	bool "Check for leaked Ion buffers (debugging)"
+	depends on ION
+	help
+	  Choose this option if you wish to enable checking for leaked
+	  ion buffers at runtime. Choosing this option will also add a
+	  debugfs node under the ion directory that can be used to
+	  enable/disable the leak checking.
diff --git a/drivers/gpu/ion/Makefile b/drivers/gpu/ion/Makefile
index 51349f6..60a6b81 100644
--- a/drivers/gpu/ion/Makefile
+++ b/drivers/gpu/ion/Makefile
@@ -1,4 +1,4 @@
 obj-$(CONFIG_ION) +=	ion.o ion_heap.o ion_system_heap.o ion_carveout_heap.o ion_iommu_heap.o ion_cp_heap.o
-obj-$(CONFIG_CMA) += ion_cma_heap.o
+obj-$(CONFIG_CMA) += ion_cma_heap.o ion_cma_secure_heap.o
 obj-$(CONFIG_ION_TEGRA) += tegra/
 obj-$(CONFIG_ION_MSM) += msm/
diff --git a/drivers/gpu/ion/ion.c b/drivers/gpu/ion/ion.c
index d005605..d3434d8 100644
--- a/drivers/gpu/ion/ion.c
+++ b/drivers/gpu/ion/ion.c
@@ -2,7 +2,7 @@
  * drivers/gpu/ion/ion.c
  *
  * Copyright (C) 2011 Google, Inc.
- * Copyright (c) 2011-2012, The Linux Foundation. All rights reserved.
+ * Copyright (c) 2011-2013, The Linux Foundation. All rights reserved.
  *
  * This software is licensed under the terms of the GNU General Public
  * License version 2, as published by the Free Software Foundation, and
@@ -34,10 +34,11 @@
 #include <linux/debugfs.h>
 #include <linux/dma-buf.h>
 #include <linux/msm_ion.h>
+#include <trace/events/kmem.h>
 
 #include <mach/iommu_domains.h>
 #include "ion_priv.h"
-#define DEBUG
 
 /**
  * struct ion_device - the metadata of the ion device node
@@ -105,6 +106,12 @@
 	unsigned int iommu_map_cnt;
 };
 
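+/*
+ * A buffer's user mappings are faulted in page by page only when the
+ * buffer is cached and the client has not asked to do its own cache
+ * maintenance (ION_FLAG_CACHED_NEEDS_SYNC).
+ */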
+bool ion_buffer_fault_user_mappings(struct ion_buffer *buffer)
+{
+	return ((buffer->flags & ION_FLAG_CACHED) &&
+		!(buffer->flags & ION_FLAG_CACHED_NEEDS_SYNC));
+}
+
 static void ion_iommu_release(struct kref *kref);
 
 /* this function should only be called while dev->lock is held */
@@ -188,6 +195,8 @@
 	return NULL;
 }
 
+static int ion_buffer_alloc_dirty(struct ion_buffer *buffer);
+
 /* this function should only be called while dev->lock is held */
 static struct ion_buffer *ion_buffer_create(struct ion_heap *heap,
 				     struct ion_device *dev,
@@ -197,13 +206,15 @@
 {
 	struct ion_buffer *buffer;
 	struct sg_table *table;
-	int ret;
+	struct scatterlist *sg;
+	int i, ret;
 
 	buffer = kzalloc(sizeof(struct ion_buffer), GFP_KERNEL);
 	if (!buffer)
 		return ERR_PTR(-ENOMEM);
 
 	buffer->heap = heap;
+	buffer->flags = flags;
 	kref_init(&buffer->ref);
 
 	ret = heap->ops->allocate(heap, buffer, len, align, flags);
@@ -214,19 +225,54 @@
 
 	buffer->dev = dev;
 	buffer->size = len;
-	buffer->flags = flags;
 
-	table = buffer->heap->ops->map_dma(buffer->heap, buffer);
+	table = heap->ops->map_dma(heap, buffer);
 	if (IS_ERR_OR_NULL(table)) {
 		heap->ops->free(buffer);
 		kfree(buffer);
 		return ERR_PTR(PTR_ERR(table));
 	}
 	buffer->sg_table = table;
+	if (ion_buffer_fault_user_mappings(buffer)) {
+		for_each_sg(buffer->sg_table->sgl, sg, buffer->sg_table->nents,
+			    i) {
+			if (sg_dma_len(sg) == PAGE_SIZE)
+				continue;
+			pr_err("%s: cached mappings that will be faulted in "
+			       "must have pagewise sg_lists\n", __func__);
+			ret = -EINVAL;
+			goto err;
+		}
 
+		ret = ion_buffer_alloc_dirty(buffer);
+		if (ret)
+			goto err;
+	}
+
+	buffer->dev = dev;
+	buffer->size = len;
+	INIT_LIST_HEAD(&buffer->vmas);
 	mutex_init(&buffer->lock);
+	/* this will set up dma addresses for the sglist -- it is not
+	   technically correct as per the dma api -- a specific
+	   device isn't really taking ownership here.  However, in practice on
+	   our systems the only dma_address space is physical addresses.
+	   Additionally, we can't afford the overhead of invalidating every
+	   allocation via dma_map_sg. The implicit contract here is that
+	   memory coming from the heaps is ready for dma, i.e. if it has a
+	   cached mapping, that mapping has been invalidated */
+	for_each_sg(buffer->sg_table->sgl, sg, buffer->sg_table->nents, i) {
+		if (sg_dma_address(sg) == 0)
+			sg_dma_address(sg) = sg_phys(sg);
+	}
 	ion_buffer_add(dev, buffer);
 	return buffer;
+
+err:
+	heap->ops->unmap_dma(heap, buffer);
+	heap->ops->free(buffer);
+	kfree(buffer);
+	return ERR_PTR(ret);
 }
 
 /**
@@ -261,6 +307,12 @@
 	mutex_unlock(&buffer->lock);
 }
 
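+/*
+ * Called at buffer destruction to let the heap drop any secure
+ * protection still held on the buffer.
+ */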
+static void ion_delayed_unsecure(struct ion_buffer *buffer)
+{
+	if (buffer->heap->ops->unsecure_buffer)
+		buffer->heap->ops->unsecure_buffer(buffer, 1);
+}
+
 static void ion_buffer_destroy(struct kref *kref)
 {
 	struct ion_buffer *buffer = container_of(kref, struct ion_buffer, ref);
@@ -268,14 +320,16 @@
 
 	if (WARN_ON(buffer->kmap_cnt > 0))
 		buffer->heap->ops->unmap_kernel(buffer->heap, buffer);
-
 	buffer->heap->ops->unmap_dma(buffer->heap, buffer);
 
+	ion_delayed_unsecure(buffer);
 	ion_iommu_delayed_unmap(buffer);
 	buffer->heap->ops->free(buffer);
 	mutex_lock(&dev->lock);
 	rb_erase(&buffer->node, &dev->buffers);
 	mutex_unlock(&dev->lock);
+	if (buffer->flags & ION_FLAG_CACHED)
+		kfree(buffer->dirty);
 	kfree(buffer);
 }
 
@@ -402,7 +456,7 @@
 	struct ion_handle *handle;
 	struct ion_device *dev = client->dev;
 	struct ion_buffer *buffer = NULL;
-	unsigned long secure_allocation = flags & ION_SECURE;
+	unsigned long secure_allocation = flags & ION_FLAG_SECURE;
 	const unsigned int MAX_DBG_STR_LEN = 64;
 	char dbg_str[MAX_DBG_STR_LEN];
 	unsigned int dbg_str_idx = 0;
@@ -410,6 +464,16 @@
 	dbg_str[0] = '\0';
 
 	/*
+	 * For now, we don't want to fault in pages individually since
+	 * clients are already doing manual cache maintenance. In
+	 * other words, the implicit caching infrastructure is in
+	 * place (in code) but should not be used.
+	 */
+	flags |= ION_FLAG_CACHED_NEEDS_SYNC;
+
+	pr_debug("%s: len %d align %d heap_mask %u flags %x\n", __func__, len,
+		 align, heap_mask, flags);
+	/*
 	 * traverse the list of heaps available in this system in priority
 	 * order.  If the heap type is supported by the client, and matches the
 	 * request of the caller allocate from it.  Repeat until allocate has
@@ -431,11 +495,18 @@
 			continue;
 		/* Do not allow un-secure heap if secure is specified */
 		if (secure_allocation &&
-			(heap->type != (enum ion_heap_type) ION_HEAP_TYPE_CP))
+		    !ion_heap_allow_secure_allocation(heap->type))
 			continue;
+		trace_ion_alloc_buffer_start(client->name, heap->name, len,
+					     heap_mask, flags);
 		buffer = ion_buffer_create(heap, dev, len, align, flags);
+		trace_ion_alloc_buffer_end(client->name, heap->name, len,
+					   heap_mask, flags);
 		if (!IS_ERR_OR_NULL(buffer))
 			break;
+
+		trace_ion_alloc_buffer_fallback(client->name, heap->name, len,
+					    heap_mask, flags, PTR_ERR(buffer));
 		if (dbg_str_idx < MAX_DBG_STR_LEN) {
 			unsigned int len_left = MAX_DBG_STR_LEN-dbg_str_idx-1;
 			int ret_value = snprintf(&dbg_str[dbg_str_idx],
@@ -454,10 +525,15 @@
 	}
 	mutex_unlock(&dev->lock);
 
-	if (buffer == NULL)
+	if (buffer == NULL) {
+		trace_ion_alloc_buffer_fail(client->name, dbg_str, len,
+					    heap_mask, flags, -ENODEV);
 		return ERR_PTR(-ENODEV);
+	}
 
 	if (IS_ERR(buffer)) {
+		trace_ion_alloc_buffer_fail(client->name, dbg_str, len,
+					    heap_mask, flags, PTR_ERR(buffer));
 		pr_debug("ION is unable to allocate 0x%x bytes (alignment: "
 			 "0x%x) from heap(s) %sfor client %s with heap "
 			 "mask 0x%x\n",
@@ -627,6 +703,19 @@
 	struct ion_iommu_map *iommu_map;
 	int ret = 0;
 
+	if (IS_ERR_OR_NULL(client)) {
+		pr_err("%s: client pointer is invalid\n", __func__);
+		return -EINVAL;
+	}
+	if (IS_ERR_OR_NULL(handle)) {
+		pr_err("%s: handle pointer is invalid\n", __func__);
+		return -EINVAL;
+	}
+	if (IS_ERR_OR_NULL(handle->buffer)) {
+		pr_err("%s: buffer pointer is invalid\n", __func__);
+		return -EINVAL;
+	}
+
 	if (ION_IS_CACHED(flags)) {
 		pr_err("%s: Cannot map iommu as cached.\n", __func__);
 		return -EINVAL;
@@ -687,6 +776,8 @@
 
 			if (iommu_map->flags & ION_IOMMU_UNMAP_DELAYED)
 				kref_get(&iommu_map->ref);
+		} else {
+			ret = PTR_ERR(iommu_map);
 		}
 	} else {
 		if (iommu_map->flags != iommu_flags) {
@@ -732,6 +823,19 @@
 	struct ion_iommu_map *iommu_map;
 	struct ion_buffer *buffer;
 
+	if (IS_ERR_OR_NULL(client)) {
+		pr_err("%s: client pointer is invalid\n", __func__);
+		return;
+	}
+	if (IS_ERR_OR_NULL(handle)) {
+		pr_err("%s: handle pointer is invalid\n", __func__);
+		return;
+	}
+	if (IS_ERR_OR_NULL(handle->buffer)) {
+		pr_err("%s: buffer pointer is invalid\n", __func__);
+		return;
+	}
+
 	mutex_lock(&client->lock);
 	buffer = handle->buffer;
 
@@ -865,7 +969,7 @@
 		if (type == ION_HEAP_TYPE_SYSTEM_CONTIG ||
 			type == ION_HEAP_TYPE_CARVEOUT ||
 			type == (enum ion_heap_type) ION_HEAP_TYPE_CP)
-			seq_printf(s, " : %12lx", handle->buffer->priv_phys);
+			seq_printf(s, " : %12pa", &handle->buffer->priv_phys);
 		else
 			seq_printf(s, " : %12s", "N/A");
 
@@ -975,10 +1079,91 @@
 	return client;
 }
 
+/**
+ * ion_mark_dangling_buffers_locked() - Mark dangling buffers
+ * @dev:	the ion device whose buffers will be searched
+ *
+ * Sets marked=1 for all known buffers associated with `dev' that no
+ * longer have a handle pointing to them. dev->lock should be held
+ * across a call to this function (and should only be unlocked after
+ * checking for marked buffers).
+ */
+static void ion_mark_dangling_buffers_locked(struct ion_device *dev)
+{
+	struct rb_node *n, *n2;
+	/* mark all buffers as 1 */
+	for (n = rb_first(&dev->buffers); n; n = rb_next(n)) {
+		struct ion_buffer *buf = rb_entry(n, struct ion_buffer,
+						node);
+
+		buf->marked = 1;
+	}
+
+	/* now see which buffers we can access */
+	for (n = rb_first(&dev->clients); n; n = rb_next(n)) {
+		struct ion_client *client = rb_entry(n, struct ion_client,
+						node);
+
+		mutex_lock(&client->lock);
+		for (n2 = rb_first(&client->handles); n2; n2 = rb_next(n2)) {
+			struct ion_handle *handle
+				= rb_entry(n2, struct ion_handle, node);
+
+			handle->buffer->marked = 0;
+
+		}
+		mutex_unlock(&client->lock);
+
+	}
+}
+
+#ifdef CONFIG_ION_LEAK_CHECK
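+/*
+ * When enabled via the check_leaks_on_destroy debugfs node, every client
+ * teardown scans the device for buffers that no handle references any
+ * more and reports them as leaks.
+ */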
+static u32 ion_debug_check_leaks_on_destroy;
+
+static int ion_check_for_and_print_leaks(struct ion_device *dev)
+{
+	struct rb_node *n;
+	int num_leaks = 0;
+
+	if (!ion_debug_check_leaks_on_destroy)
+		return 0;
+
+	/* check for leaked buffers (those that no longer have a
+	 * handle pointing to them) */
+	ion_mark_dangling_buffers_locked(dev);
+
+	/* Any buffer still marked as 1 indicates a leaked handle somewhere */
+	for (n = rb_first(&dev->buffers); n; n = rb_next(n)) {
+		struct ion_buffer *buf = rb_entry(n, struct ion_buffer,
+						node);
+
+		if (buf->marked == 1) {
+			pr_info("Leaked ion buffer at %p\n", buf);
+			num_leaks++;
+		}
+	}
+	return num_leaks;
+}
+static void setup_ion_leak_check(struct dentry *debug_root)
+{
+	debugfs_create_bool("check_leaks_on_destroy", 0664, debug_root,
+			&ion_debug_check_leaks_on_destroy);
+}
+#else
+static int ion_check_for_and_print_leaks(struct ion_device *dev)
+{
+	return 0;
+}
+static void setup_ion_leak_check(struct dentry *debug_root)
+{
+}
+#endif
+
 void ion_client_destroy(struct ion_client *client)
 {
 	struct ion_device *dev = client->dev;
 	struct rb_node *n;
+	int num_leaks;
 
 	pr_debug("%s: %d\n", __func__, __LINE__);
 	while ((n = rb_first(&client->handles))) {
@@ -991,8 +1176,21 @@
 		put_task_struct(client->task);
 	rb_erase(&client->node, &dev->clients);
 	debugfs_remove_recursive(client->debug_root);
+
+	num_leaks = ion_check_for_and_print_leaks(dev);
+
 	mutex_unlock(&dev->lock);
 
+	if (num_leaks) {
+		struct task_struct *current_task = current;
+		char current_task_name[TASK_COMM_LEN];
+		get_task_comm(current_task_name, current_task);
+		WARN(1, "%s: Detected %d leaked ion buffer%s.\n",
+			__func__, num_leaks, num_leaks == 1 ? "" : "s");
+		pr_info("task name at time of leak: %s, pid: %d\n",
+			current_task_name, current_task->pid);
+	}
+
 	kfree(client->name);
 	kfree(client);
 }
@@ -1062,12 +1260,47 @@
 }
 EXPORT_SYMBOL(ion_sg_table);
 
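+/*
+ * Build an sg_table describing a physically contiguous region as
+ * fixed-size chunks (e.g. one entry per page for cached buffers) so
+ * that cache maintenance and faulting can be done per chunk.
+ */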
+struct sg_table *ion_create_chunked_sg_table(phys_addr_t buffer_base,
+					size_t chunk_size, size_t total_size)
+{
+	struct sg_table *table;
+	int i, n_chunks, ret;
+	struct scatterlist *sg;
+
+	table = kzalloc(sizeof(struct sg_table), GFP_KERNEL);
+	if (!table)
+		return ERR_PTR(-ENOMEM);
+
+	n_chunks = DIV_ROUND_UP(total_size, chunk_size);
+	pr_debug("creating sg_table with %d chunks\n", n_chunks);
+
+	ret = sg_alloc_table(table, n_chunks, GFP_KERNEL);
+	if (ret)
+		goto err0;
+
+	for_each_sg(table->sgl, sg, table->nents, i) {
+		dma_addr_t addr = buffer_base + i * chunk_size;
+		sg_dma_address(sg) = addr;
+		sg_dma_len(sg) = chunk_size;
+	}
+
+	return table;
+err0:
+	kfree(table);
+	return ERR_PTR(ret);
+}
+
+static void ion_buffer_sync_for_device(struct ion_buffer *buffer,
+				       struct device *dev,
+				       enum dma_data_direction direction);
+
 static struct sg_table *ion_map_dma_buf(struct dma_buf_attachment *attachment,
 					enum dma_data_direction direction)
 {
 	struct dma_buf *dmabuf = attachment->dmabuf;
 	struct ion_buffer *buffer = dmabuf->priv;
 
+	ion_buffer_sync_for_device(buffer, attachment->dev, direction);
 	return buffer->sg_table;
 }
 
@@ -1077,40 +1310,119 @@
 {
 }
 
-static void ion_vma_open(struct vm_area_struct *vma)
+static int ion_buffer_alloc_dirty(struct ion_buffer *buffer)
 {
-	struct ion_buffer *buffer = vma->vm_private_data;
+	unsigned long pages = buffer->sg_table->nents;
+	unsigned long length = (pages + BITS_PER_LONG - 1)/BITS_PER_LONG;
 
-	pr_debug("%s: %d\n", __func__, __LINE__);
+	buffer->dirty = kzalloc(length * sizeof(unsigned long), GFP_KERNEL);
+	if (!buffer->dirty)
+		return -ENOMEM;
+	return 0;
+}
+
+struct ion_vma_list {
+	struct list_head list;
+	struct vm_area_struct *vma;
+};
+
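+/*
+ * Sync only the pages userspace has dirtied (tracked by the fault
+ * handler) back to the device, then zap the user mappings so the next
+ * CPU access faults the page in again and re-syncs it.
+ */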
+static void ion_buffer_sync_for_device(struct ion_buffer *buffer,
+				       struct device *dev,
+				       enum dma_data_direction dir)
+{
+	struct scatterlist *sg;
+	int i;
+	struct ion_vma_list *vma_list;
+
+	pr_debug("%s: syncing for device %s\n", __func__,
+		 dev ? dev_name(dev) : "null");
+
+	if (!ion_buffer_fault_user_mappings(buffer))
+		return;
 
 	mutex_lock(&buffer->lock);
-	buffer->umap_cnt++;
+	for_each_sg(buffer->sg_table->sgl, sg, buffer->sg_table->nents, i) {
+		if (!test_bit(i, buffer->dirty))
+			continue;
+		dma_sync_sg_for_device(dev, sg, 1, dir);
+		clear_bit(i, buffer->dirty);
+	}
+	list_for_each_entry(vma_list, &buffer->vmas, list) {
+		struct vm_area_struct *vma = vma_list->vma;
+
+		zap_page_range(vma, vma->vm_start, vma->vm_end - vma->vm_start,
+			       NULL);
+	}
 	mutex_unlock(&buffer->lock);
 }
 
-static void ion_vma_close(struct vm_area_struct *vma)
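+/*
+ * Fault handler for cached buffers mapped with VM_MIXEDMAP: sync the
+ * faulting page for CPU access, mark it dirty so it is flushed before
+ * the next device access, and insert it into the user mapping.
+ */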
+int ion_vm_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
 {
 	struct ion_buffer *buffer = vma->vm_private_data;
-
-	pr_debug("%s: %d\n", __func__, __LINE__);
+	struct scatterlist *sg;
+	int i;
 
 	mutex_lock(&buffer->lock);
-	buffer->umap_cnt--;
+	set_bit(vmf->pgoff, buffer->dirty);
+
+	for_each_sg(buffer->sg_table->sgl, sg, buffer->sg_table->nents, i) {
+		if (i != vmf->pgoff)
+			continue;
+		dma_sync_sg_for_cpu(NULL, sg, 1, DMA_BIDIRECTIONAL);
+		vm_insert_page(vma, (unsigned long)vmf->virtual_address,
+			       sg_page(sg));
+		break;
+	}
+	mutex_unlock(&buffer->lock);
+	return VM_FAULT_NOPAGE;
+}
+
+static void ion_vm_open(struct vm_area_struct *vma)
+{
+	struct ion_buffer *buffer = vma->vm_private_data;
+	struct ion_vma_list *vma_list;
+
+	vma_list = kmalloc(sizeof(struct ion_vma_list), GFP_KERNEL);
+	if (!vma_list)
+		return;
+	vma_list->vma = vma;
+	mutex_lock(&buffer->lock);
+	list_add(&vma_list->list, &buffer->vmas);
+	mutex_unlock(&buffer->lock);
+	pr_debug("%s: adding %p\n", __func__, vma);
+}
+
+static void ion_vm_close(struct vm_area_struct *vma)
+{
+	struct ion_buffer *buffer = vma->vm_private_data;
+	struct ion_vma_list *vma_list, *tmp;
+
+	pr_debug("%s\n", __func__);
+	mutex_lock(&buffer->lock);
+	list_for_each_entry_safe(vma_list, tmp, &buffer->vmas, list) {
+		if (vma_list->vma != vma)
+			continue;
+		list_del(&vma_list->list);
+		kfree(vma_list);
+		pr_debug("%s: deleting %p\n", __func__, vma);
+		break;
+	}
 	mutex_unlock(&buffer->lock);
 
 	if (buffer->heap->ops->unmap_user)
 		buffer->heap->ops->unmap_user(buffer->heap, buffer);
 }
 
-static struct vm_operations_struct ion_vm_ops = {
-	.open = ion_vma_open,
-	.close = ion_vma_close,
+struct vm_operations_struct ion_vma_ops = {
+	.open = ion_vm_open,
+	.close = ion_vm_close,
+	.fault = ion_vm_fault,
 };
 
 static int ion_mmap(struct dma_buf *dmabuf, struct vm_area_struct *vma)
 {
 	struct ion_buffer *buffer = dmabuf->priv;
-	int ret;
+	int ret = 0;
 
 	if (!buffer->heap->ops->map_user) {
 		pr_err("%s: this heap does not define a method for mapping "
@@ -1118,25 +1430,26 @@
 		return -EINVAL;
 	}
 
+	if (ion_buffer_fault_user_mappings(buffer)) {
+		vma->vm_private_data = buffer;
+		vma->vm_ops = &ion_vma_ops;
+		vma->vm_flags |= VM_MIXEDMAP;
+		ion_vm_open(vma);
+		return 0;
+	}
+
+	if (!(buffer->flags & ION_FLAG_CACHED))
+		vma->vm_page_prot = pgprot_writecombine(vma->vm_page_prot);
+
 	mutex_lock(&buffer->lock);
 	/* now map it to userspace */
 	ret = buffer->heap->ops->map_user(buffer->heap, buffer, vma);
+	mutex_unlock(&buffer->lock);
 
-	if (ret) {
-		mutex_unlock(&buffer->lock);
+	if (ret)
 		pr_err("%s: failure mapping buffer to userspace\n",
 		       __func__);
-	} else {
-		buffer->umap_cnt++;
-		mutex_unlock(&buffer->lock);
 
-		vma->vm_ops = &ion_vm_ops;
-		/*
-		 * move the buffer into the vm_private_data so we can access it
-		 * from vma_open/close
-		 */
-		vma->vm_private_data = buffer;
-	}
 	return ret;
 }
 
@@ -1205,33 +1518,6 @@
 	.kunmap = ion_dma_buf_kunmap,
 };
 
-static int ion_share_set_flags(struct ion_client *client,
-				struct ion_handle *handle,
-				unsigned long flags)
-{
-	struct ion_buffer *buffer;
-	bool valid_handle;
-	unsigned long ion_flags = 0;
-	if (flags & O_DSYNC)
-		ion_flags = ION_SET_UNCACHED(ion_flags);
-	else
-		ion_flags = ION_SET_CACHED(ion_flags);
-
-
-	mutex_lock(&client->lock);
-	valid_handle = ion_handle_validate(client, handle);
-	mutex_unlock(&client->lock);
-	if (!valid_handle) {
-		WARN(1, "%s: invalid handle passed to set_flags.\n", __func__);
-		return -EINVAL;
-	}
-
-	buffer = handle->buffer;
-
-	return 0;
-}
-
-
 int ion_share_dma_buf(struct ion_client *client, struct ion_handle *handle)
 {
 	struct ion_buffer *buffer;
@@ -1299,6 +1585,30 @@
 }
 EXPORT_SYMBOL(ion_import_dma_buf);
 
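+/* Flush an ion-exported dma-buf for device access; backs ION_IOC_SYNC. */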
+static int ion_sync_for_device(struct ion_client *client, int fd)
+{
+	struct dma_buf *dmabuf;
+	struct ion_buffer *buffer;
+
+	dmabuf = dma_buf_get(fd);
+	if (IS_ERR_OR_NULL(dmabuf))
+		return PTR_ERR(dmabuf);
+
+	/* if this memory came from ion */
+	if (dmabuf->ops != &dma_buf_ops) {
+		pr_err("%s: can not sync dmabuf from another exporter\n",
+		       __func__);
+		dma_buf_put(dmabuf);
+		return -EINVAL;
+	}
+	buffer = dmabuf->priv;
+
+	dma_sync_sg_for_device(NULL, buffer->sg_table->sgl,
+			       buffer->sg_table->nents, DMA_BIDIRECTIONAL);
+	dma_buf_put(dmabuf);
+	return 0;
+}
+
 static long ion_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
 {
 	struct ion_client *client = filp->private_data;
@@ -1342,14 +1652,9 @@
 	case ION_IOC_SHARE:
 	{
 		struct ion_fd_data data;
-		int ret;
 		if (copy_from_user(&data, (void __user *)arg, sizeof(data)))
 			return -EFAULT;
 
-		ret = ion_share_set_flags(client, data.handle, filp->f_flags);
-		if (ret)
-			return ret;
-
 		data.fd = ion_share_dma_buf(client, data.handle);
 		if (copy_to_user((void __user *)arg, &data, sizeof(data)))
 			return -EFAULT;
@@ -1376,6 +1681,15 @@
 			return ret;
 		break;
 	}
+	case ION_IOC_SYNC:
+	{
+		struct ion_fd_data data;
+		if (copy_from_user(&data, (void __user *)arg,
+				   sizeof(struct ion_fd_data)))
+			return -EFAULT;
+		ion_sync_for_device(client, data.fd);
+		break;
+	}
 	case ION_IOC_CUSTOM:
 	{
 		struct ion_device *dev = client->dev;
@@ -1397,9 +1711,6 @@
 	case ION_IOC_CLEAN_INV_CACHES:
 		return client->dev->custom_ioctl(client,
 						ION_IOC_CLEAN_INV_CACHES, arg);
-	case ION_IOC_GET_FLAGS:
-		return client->dev->custom_ioctl(client,
-						ION_IOC_GET_FLAGS, arg);
 	default:
 		return -ENOTTY;
 	}
@@ -1542,6 +1853,10 @@
 {
 	struct ion_device *dev = heap->dev;
 	struct rb_node *n;
+	size_t size;
+
+	if (!heap->ops->phys)
+		return;
 
 	for (n = rb_first(&dev->buffers); n; n = rb_next(n)) {
 		struct ion_buffer *buffer =
@@ -1554,9 +1869,11 @@
 					   "Part of memory map will not be logged\n");
 				break;
 			}
-			data->addr = buffer->priv_phys;
-			data->addr_end = buffer->priv_phys + buffer->size-1;
-			data->size = buffer->size;
+
+			buffer->heap->ops->phys(buffer->heap, buffer,
+						&(data->addr), &size);
+			data->size = (unsigned long) size;
+			data->addr_end = data->addr + data->size - 1;
 			data->client_name = ion_debug_locate_owner(dev, buffer);
 			ion_debug_mem_map_add(mem_map, data);
 		}
@@ -1674,6 +1991,73 @@
 	mutex_unlock(&dev->lock);
 }
 
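+/*
+ * Secure a single buffer, as opposed to ion_secure_heap() below which
+ * protects a whole heap. Only heaps that support per-handle securing
+ * are allowed here.
+ */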
+int ion_secure_handle(struct ion_client *client, struct ion_handle *handle,
+			int version, void *data, int flags)
+{
+	int ret = -EINVAL;
+	struct ion_heap *heap;
+	struct ion_buffer *buffer;
+
+	mutex_lock(&client->lock);
+	if (!ion_handle_validate(client, handle)) {
+		WARN(1, "%s: invalid handle passed to secure.\n", __func__);
+		goto out_unlock;
+	}
+
+	buffer = handle->buffer;
+	heap = buffer->heap;
+
+	if (!ion_heap_allow_handle_secure(heap->type)) {
+		pr_err("%s: cannot secure buffer from non secure heap\n",
+			__func__);
+		goto out_unlock;
+	}
+
+	BUG_ON(!buffer->heap->ops->secure_buffer);
+	/*
+	 * Protect the handle via the client lock to ensure we aren't
+	 * racing with free
+	 */
+	ret = buffer->heap->ops->secure_buffer(buffer, version, data, flags);
+
+out_unlock:
+	mutex_unlock(&client->lock);
+	return ret;
+}
+
+int ion_unsecure_handle(struct ion_client *client, struct ion_handle *handle)
+{
+	int ret = -EINVAL;
+	struct ion_heap *heap;
+	struct ion_buffer *buffer;
+
+	mutex_lock(&client->lock);
+	if (!ion_handle_validate(client, handle)) {
+		WARN(1, "%s: invalid handle passed to unsecure.\n", __func__);
+		goto out_unlock;
+	}
+
+	buffer = handle->buffer;
+	heap = buffer->heap;
+
+	if (!ion_heap_allow_handle_secure(heap->type)) {
+		pr_err("%s: cannot unsecure buffer from non secure heap\n",
+			__func__);
+		goto out_unlock;
+	}
+
+	BUG_ON(!buffer->heap->ops->unsecure_buffer);
+	/*
+	 * Protect the handle via the client lock to ensure we aren't
+	 * racing with free
+	 */
+	ret = buffer->heap->ops->unsecure_buffer(buffer, 0);
+
+out_unlock:
+	mutex_unlock(&client->lock);
+	return ret;
+}
+
 int ion_secure_heap(struct ion_device *dev, int heap_id, int version,
 			void *data)
 {
@@ -1687,7 +2071,7 @@
 	mutex_lock(&dev->lock);
 	for (n = rb_first(&dev->heaps); n != NULL; n = rb_next(n)) {
 		struct ion_heap *heap = rb_entry(n, struct ion_heap, node);
-		if (heap->type != (enum ion_heap_type) ION_HEAP_TYPE_CP)
+		if (!ion_heap_allow_heap_secure(heap->type))
 			continue;
 		if (ION_HEAP(heap->id) != heap_id)
 			continue;
@@ -1715,7 +2099,7 @@
 	mutex_lock(&dev->lock);
 	for (n = rb_first(&dev->heaps); n != NULL; n = rb_next(n)) {
 		struct ion_heap *heap = rb_entry(n, struct ion_heap, node);
-		if (heap->type != (enum ion_heap_type) ION_HEAP_TYPE_CP)
+		if (!ion_heap_allow_heap_secure(heap->type))
 			continue;
 		if (ION_HEAP(heap->id) != heap_id)
 			continue;
@@ -1734,37 +2118,14 @@
 {
 	struct ion_device *dev = s->private;
 	struct rb_node *n;
-	struct rb_node *n2;
 
-	/* mark all buffers as 1 */
 	seq_printf(s, "%16.s %16.s %16.s %16.s\n", "buffer", "heap", "size",
 		"ref cnt");
+
 	mutex_lock(&dev->lock);
-	for (n = rb_first(&dev->buffers); n; n = rb_next(n)) {
-		struct ion_buffer *buf = rb_entry(n, struct ion_buffer,
-						     node);
+	ion_mark_dangling_buffers_locked(dev);
 
-		buf->marked = 1;
-	}
-
-	/* now see which buffers we can access */
-	for (n = rb_first(&dev->clients); n; n = rb_next(n)) {
-		struct ion_client *client = rb_entry(n, struct ion_client,
-						     node);
-
-		mutex_lock(&client->lock);
-		for (n2 = rb_first(&client->handles); n2; n2 = rb_next(n2)) {
-			struct ion_handle *handle = rb_entry(n2,
-						struct ion_handle, node);
-
-			handle->buffer->marked = 0;
-
-		}
-		mutex_unlock(&client->lock);
-
-	}
-
-	/* And anyone still marked as a 1 means a leaked handle somewhere */
+	/* Any buffer still marked as 1 indicates a leaked handle somewhere */
 	for (n = rb_first(&dev->buffers); n; n = rb_next(n)) {
 		struct ion_buffer *buf = rb_entry(n, struct ion_buffer,
 						     node);
@@ -1825,6 +2186,8 @@
 	idev->clients = RB_ROOT;
 	debugfs_create_file("check_leaked_fds", 0664, idev->debug_root, idev,
 			    &debug_leak_fops);
+
+	setup_ion_leak_check(idev->debug_root);
 	return idev;
 }
 
@@ -1845,8 +2208,8 @@
 		ret = memblock_reserve(data->heaps[i].base,
 				       data->heaps[i].size);
 		if (ret)
-			pr_err("memblock reserve of %x@%lx failed\n",
+			pr_err("memblock reserve of %x@%pa failed\n",
 			       data->heaps[i].size,
-			       data->heaps[i].base);
+			       &data->heaps[i].base);
 	}
 }
diff --git a/drivers/gpu/ion/ion_carveout_heap.c b/drivers/gpu/ion/ion_carveout_heap.c
index a808cc9..9610dfe 100644
--- a/drivers/gpu/ion/ion_carveout_heap.c
+++ b/drivers/gpu/ion/ion_carveout_heap.c
@@ -2,7 +2,7 @@
  * drivers/gpu/ion/ion_carveout_heap.c
  *
  * Copyright (C) 2011 Google, Inc.
- * Copyright (c) 2011-2012, The Linux Foundation. All rights reserved.
+ * Copyright (c) 2011-2013, The Linux Foundation. All rights reserved.
  *
  * This software is licensed under the terms of the GNU General Public
  * License version 2, as published by the Free Software Foundation, and
@@ -23,6 +23,7 @@
 #include <linux/mm.h>
 #include <linux/scatterlist.h>
 #include <linux/slab.h>
+#include <linux/vmalloc.h>
 #include <linux/iommu.h>
 #include <linux/seq_file.h>
 #include "ion_priv.h"
@@ -111,26 +112,13 @@
 struct sg_table *ion_carveout_heap_map_dma(struct ion_heap *heap,
 					      struct ion_buffer *buffer)
 {
-	struct sg_table *table;
-	int ret;
+	size_t chunk_size = buffer->size;
 
-	table = kzalloc(sizeof(struct sg_table), GFP_KERNEL);
-	if (!table)
-		return ERR_PTR(-ENOMEM);
+	if (ION_IS_CACHED(buffer->flags))
+		chunk_size = PAGE_SIZE;
 
-	ret = sg_alloc_table(table, 1, GFP_KERNEL);
-	if (ret)
-		goto err0;
-
-	table->sgl->length = buffer->size;
-	table->sgl->offset = 0;
-	table->sgl->dma_address = buffer->priv_phys;
-
-	return table;
-
-err0:
-	kfree(table);
-	return ERR_PTR(ret);
+	return ion_create_chunked_sg_table(buffer->priv_phys, chunk_size,
+					buffer->size);
 }
 
 void ion_carveout_heap_unmap_dma(struct ion_heap *heap,
@@ -240,25 +228,78 @@
 			void *vaddr, unsigned int offset, unsigned int length,
 			unsigned int cmd)
 {
-	void (*outer_cache_op)(phys_addr_t, phys_addr_t);
+	void (*outer_cache_op)(phys_addr_t, phys_addr_t) = NULL;
 	struct ion_carveout_heap *carveout_heap =
 	     container_of(heap, struct  ion_carveout_heap, heap);
+	unsigned int size_to_vmap, total_size;
+	int i, j;
+	void *ptr = NULL;
+	ion_phys_addr_t buff_phys = buffer->priv_phys;
 
-	switch (cmd) {
-	case ION_IOC_CLEAN_CACHES:
-		dmac_clean_range(vaddr, vaddr + length);
-		outer_cache_op = outer_clean_range;
-		break;
-	case ION_IOC_INV_CACHES:
-		dmac_inv_range(vaddr, vaddr + length);
-		outer_cache_op = outer_inv_range;
-		break;
-	case ION_IOC_CLEAN_INV_CACHES:
-		dmac_flush_range(vaddr, vaddr + length);
-		outer_cache_op = outer_flush_range;
-		break;
-	default:
-		return -EINVAL;
+	if (!vaddr) {
+		/*
+		 * Split the vmalloc space into smaller regions in
+		 * order to clean and/or invalidate the cache.
+		 */
+		size_to_vmap = ((VMALLOC_END - VMALLOC_START)/8);
+		total_size = buffer->size;
+
+		for (i = 0; i < total_size; i += size_to_vmap) {
+			size_to_vmap = min(size_to_vmap, total_size - i);
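+			/*
+			 * If ioremap of this chunk fails, halve the chunk
+			 * size and retry a few times before giving up.
+			 */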
+			for (j = 0; j < 10 && size_to_vmap; ++j) {
+				ptr = ioremap(buff_phys, size_to_vmap);
+				if (ptr) {
+					switch (cmd) {
+					case ION_IOC_CLEAN_CACHES:
+						dmac_clean_range(ptr,
+							ptr + size_to_vmap);
+						outer_cache_op =
+							outer_clean_range;
+						break;
+					case ION_IOC_INV_CACHES:
+						dmac_inv_range(ptr,
+							ptr + size_to_vmap);
+						outer_cache_op =
+							outer_inv_range;
+						break;
+					case ION_IOC_CLEAN_INV_CACHES:
+						dmac_flush_range(ptr,
+							ptr + size_to_vmap);
+						outer_cache_op =
+							outer_flush_range;
+						break;
+					default:
+						return -EINVAL;
+					}
+					buff_phys += size_to_vmap;
+					break;
+				} else {
+					size_to_vmap >>= 1;
+				}
+			}
+			if (!ptr) {
+				pr_err("Couldn't io-remap the memory\n");
+				return -EINVAL;
+			}
+			iounmap(ptr);
+		}
+	} else {
+		switch (cmd) {
+		case ION_IOC_CLEAN_CACHES:
+			dmac_clean_range(vaddr, vaddr + length);
+			outer_cache_op = outer_clean_range;
+			break;
+		case ION_IOC_INV_CACHES:
+			dmac_inv_range(vaddr, vaddr + length);
+			outer_cache_op = outer_inv_range;
+			break;
+		case ION_IOC_CLEAN_INV_CACHES:
+			dmac_flush_range(vaddr, vaddr + length);
+			outer_cache_op = outer_flush_range;
+			break;
+		default:
+			return -EINVAL;
+		}
 	}
 
 	if (carveout_heap->has_outer_cache) {
@@ -296,8 +337,11 @@
 			const char *client_name = "(null)";
 
 			if (last_end < data->addr) {
-				seq_printf(s, "%16.s %14lx %14lx %14lu (%lx)\n",
-					   "FREE", last_end, data->addr-1,
+				phys_addr_t da;
+
+				da = data->addr-1;
+				seq_printf(s, "%16.s %14pa %14pa %14lu (%lx)\n",
+					   "FREE", &last_end, &da,
 					   data->addr-last_end,
 					   data->addr-last_end);
 			}
@@ -305,9 +349,9 @@
 			if (data->client_name)
 				client_name = data->client_name;
 
-			seq_printf(s, "%16.s %14lx %14lx %14lu (%lx)\n",
-				   client_name, data->addr,
-				   data->addr_end,
+			seq_printf(s, "%16.s %14pa %14pa %14lu (%lx)\n",
+				   client_name, &data->addr,
+				   &data->addr_end,
 				   data->size, data->size);
 			last_end = data->addr_end+1;
 		}
@@ -357,7 +401,7 @@
 		goto out1;
 	}
 
-	sglist = kmalloc(sizeof(*sglist), GFP_KERNEL);
+	sglist = vmalloc(sizeof(*sglist));
 	if (!sglist)
 		goto out1;
 
@@ -376,18 +420,19 @@
 
 	if (extra) {
 		unsigned long extra_iova_addr = data->iova_addr + buffer->size;
-		ret = msm_iommu_map_extra(domain, extra_iova_addr, extra,
-					  SZ_4K, prot);
+		unsigned long phys_addr = sg_phys(sglist);
+		ret = msm_iommu_map_extra(domain, extra_iova_addr, phys_addr,
+					extra, SZ_4K, prot);
 		if (ret)
 			goto out2;
 	}
-	kfree(sglist);
+	vfree(sglist);
 	return ret;
 
 out2:
 	iommu_unmap_range(domain, data->iova_addr, buffer->size);
 out1:
-	kfree(sglist);
+	vfree(sglist);
 	msm_free_iova_address(data->iova_addr, domain_num, partition_num,
 				data->mapped_size);
 
diff --git a/drivers/gpu/ion/ion_cma_heap.c b/drivers/gpu/ion/ion_cma_heap.c
index bef6b6f..4f12e38 100644
--- a/drivers/gpu/ion/ion_cma_heap.c
+++ b/drivers/gpu/ion/ion_cma_heap.c
@@ -127,8 +127,8 @@
 	struct device *dev = heap->priv;
 	struct ion_cma_buffer_info *info = buffer->priv_virt;
 
-	dev_dbg(dev, "Return buffer %p physical address 0x%x\n", buffer,
-		info->handle);
+	dev_dbg(dev, "Return buffer %p physical address 0x%pa\n", buffer,
+		&info->handle);
 
 	*addr = info->handle;
 	*len = buffer->size;
@@ -228,8 +228,9 @@
 
 	extra_iova_addr = data->iova_addr + buffer->size;
 	if (extra) {
-		ret = msm_iommu_map_extra(domain, extra_iova_addr, extra, SZ_4K,
-						prot);
+		unsigned long phys_addr = sg_phys(table->sgl);
+		ret = msm_iommu_map_extra(domain, extra_iova_addr, phys_addr,
+					extra, SZ_4K, prot);
 		if (ret)
 			goto out2;
 	}
@@ -280,15 +281,30 @@
 
 	switch (cmd) {
 	case ION_IOC_CLEAN_CACHES:
-		dmac_clean_range(vaddr, vaddr + length);
+		if (!vaddr)
+			dma_sync_sg_for_device(NULL, buffer->sg_table->sgl,
+				buffer->sg_table->nents, DMA_TO_DEVICE);
+		else
+			dmac_clean_range(vaddr, vaddr + length);
 		outer_cache_op = outer_clean_range;
 		break;
 	case ION_IOC_INV_CACHES:
-		dmac_inv_range(vaddr, vaddr + length);
+		if (!vaddr)
+			dma_sync_sg_for_cpu(NULL, buffer->sg_table->sgl,
+				buffer->sg_table->nents, DMA_FROM_DEVICE);
+		else
+			dmac_inv_range(vaddr, vaddr + length);
 		outer_cache_op = outer_inv_range;
 		break;
 	case ION_IOC_CLEAN_INV_CACHES:
-		dmac_flush_range(vaddr, vaddr + length);
+		if (!vaddr) {
+			dma_sync_sg_for_device(NULL, buffer->sg_table->sgl,
+				buffer->sg_table->nents, DMA_TO_DEVICE);
+			dma_sync_sg_for_cpu(NULL, buffer->sg_table->sgl,
+				buffer->sg_table->nents, DMA_FROM_DEVICE);
+		} else {
+			dmac_flush_range(vaddr, vaddr + length);
+		}
 		outer_cache_op = outer_flush_range;
 		break;
 	default:
@@ -304,6 +320,35 @@
 	return 0;
 }
 
+static int ion_cma_print_debug(struct ion_heap *heap, struct seq_file *s,
+			const struct rb_root *mem_map)
+{
+	if (mem_map) {
+		struct rb_node *n;
+
+		seq_printf(s, "\nMemory Map\n");
+		seq_printf(s, "%16.s %14.s %14.s %14.s\n",
+			   "client", "start address", "end address",
+			   "size (hex)");
+
+		for (n = rb_first(mem_map); n; n = rb_next(n)) {
+			struct mem_map_data *data =
+					rb_entry(n, struct mem_map_data, node);
+			const char *client_name = "(null)";
+
+			if (data->client_name)
+				client_name = data->client_name;
+
+			seq_printf(s, "%16.s %14pa %14pa %14lu (%lx)\n",
+				   client_name, &data->addr,
+				   &data->addr_end,
+				   data->size, data->size);
+		}
+	}
+	return 0;
+}
+
 static struct ion_heap_ops ion_cma_ops = {
 	.allocate = ion_cma_allocate,
 	.free = ion_cma_free,
@@ -316,6 +361,7 @@
 	.map_iommu = ion_cma_map_iommu,
 	.unmap_iommu = ion_cma_unmap_iommu,
 	.cache_op = ion_cma_cache_ops,
+	.print_debug = ion_cma_print_debug,
 };
 
 struct ion_heap *ion_cma_heap_create(struct ion_platform_heap *data)
diff --git a/drivers/gpu/ion/ion_cma_secure_heap.c b/drivers/gpu/ion/ion_cma_secure_heap.c
new file mode 100644
index 0000000..0fbcfbf
--- /dev/null
+++ b/drivers/gpu/ion/ion_cma_secure_heap.c
@@ -0,0 +1,386 @@
+/*
+ * drivers/gpu/ion/ion_cma_secure_heap.c
+ *
+ * Copyright (C) Linaro 2012
+ * Author: <benjamin.gaignard@linaro.org> for ST-Ericsson.
+ * Copyright (c) 2013, The Linux Foundation. All rights reserved.
+ *
+ * This software is licensed under the terms of the GNU General Public
+ * License version 2, as published by the Free Software Foundation, and
+ * may be copied, distributed, and modified under those terms.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ */
+
+#include <linux/device.h>
+#include <linux/ion.h>
+#include <linux/slab.h>
+#include <linux/errno.h>
+#include <linux/err.h>
+#include <linux/dma-mapping.h>
+#include <linux/msm_ion.h>
+#include <mach/iommu_domains.h>
+
+#include <asm/cacheflush.h>
+
+/* for ion_heap_ops structure */
+#include "ion_priv.h"
+#include "msm/ion_cp_common.h"
+
+#define ION_CMA_ALLOCATE_FAILED NULL
+
+struct ion_secure_cma_buffer_info {
+	/*
+	 * This needs to come first for compatibility with the secure buffer API
+	 */
+	struct ion_cp_buffer secure;
+	void *cpu_addr;
+	dma_addr_t handle;
+	struct sg_table *table;
+	bool is_cached;
+};
+
+static int cma_heap_has_outer_cache;
+/*
+ * Create scatter-list for the already allocated DMA buffer.
+ * This function could be replaced by dma_common_get_sgtable
+ * as soon as it becomes available.
+ */
+int ion_secure_cma_get_sgtable(struct device *dev, struct sg_table *sgt,
+			void *cpu_addr, dma_addr_t handle, size_t size)
+{
+	struct page *page = virt_to_page(cpu_addr);
+	int ret;
+
+	ret = sg_alloc_table(sgt, 1, GFP_KERNEL);
+	if (unlikely(ret))
+		return ret;
+
+	sg_set_page(sgt->sgl, page, PAGE_ALIGN(size), 0);
+	sg_dma_address(sgt->sgl) = handle;
+	return 0;
+}
+
+/* ION CMA heap operations functions */
+static struct ion_secure_cma_buffer_info *__ion_secure_cma_allocate(
+			    struct ion_heap *heap, struct ion_buffer *buffer,
+			    unsigned long len, unsigned long align,
+			    unsigned long flags)
+{
+	struct device *dev = heap->priv;
+	struct ion_secure_cma_buffer_info *info;
+	DEFINE_DMA_ATTRS(attrs);
+	dma_set_attr(DMA_ATTR_NO_KERNEL_MAPPING, &attrs);
+
+	dev_dbg(dev, "Request buffer allocation len %ld\n", len);
+
+	info = kzalloc(sizeof(struct ion_secure_cma_buffer_info), GFP_KERNEL);
+	if (!info) {
+		dev_err(dev, "Can't allocate buffer info\n");
+		return ION_CMA_ALLOCATE_FAILED;
+	}
+
+	info->cpu_addr = dma_alloc_attrs(dev, len, &(info->handle), 0, &attrs);
+
+	if (!info->cpu_addr) {
+		dev_err(dev, "Fail to allocate buffer\n");
+		goto err;
+	}
+
+	info->table = kmalloc(sizeof(struct sg_table), GFP_KERNEL);
+	if (!info->table) {
+		dev_err(dev, "Fail to allocate sg table\n");
+		goto err;
+	}
+
+	ion_secure_cma_get_sgtable(dev,
+			info->table, info->cpu_addr, info->handle, len);
+
+	info->secure.buffer = info->handle;
+
+	/* keep this for memory release */
+	buffer->priv_virt = info;
+	dev_dbg(dev, "Allocate buffer %p\n", buffer);
+	return info;
+
+err:
+	kfree(info);
+	return ION_CMA_ALLOCATE_FAILED;
+}
+
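+/*
+ * Allocations from this heap must be secure and uncached; anything
+ * else is rejected.
+ */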
+static int ion_secure_cma_allocate(struct ion_heap *heap,
+			    struct ion_buffer *buffer,
+			    unsigned long len, unsigned long align,
+			    unsigned long flags)
+{
+	unsigned long secure_allocation = flags & ION_FLAG_SECURE;
+	struct ion_secure_cma_buffer_info *buf = NULL;
+
+	if (!secure_allocation) {
+		pr_err("%s: non-secure allocation disallowed from heap %s %lx\n",
+			__func__, heap->name, flags);
+		return -ENOMEM;
+	}
+
+	if (ION_IS_CACHED(flags)) {
+		pr_err("%s: cannot allocate cached memory from secure heap %s\n",
+			__func__, heap->name);
+		return -ENOMEM;
+	}
+
+	buf = __ion_secure_cma_allocate(heap, buffer, len, align, flags);
+
+	if (buf) {
+		buf->secure.want_delayed_unsecure = 0;
+		atomic_set(&buf->secure.secure_cnt, 0);
+		mutex_init(&buf->secure.lock);
+		buf->secure.is_secure = 1;
+		return 0;
+	} else {
+		return -ENOMEM;
+	}
+}
+
+static void ion_secure_cma_free(struct ion_buffer *buffer)
+{
+	struct device *dev = buffer->heap->priv;
+	struct ion_secure_cma_buffer_info *info = buffer->priv_virt;
+
+	dev_dbg(dev, "Release buffer %p\n", buffer);
+	/* release memory */
+	dma_free_coherent(dev, buffer->size, info->cpu_addr, info->handle);
+	/* release sg table */
+	kfree(info->table);
+	kfree(info);
+}
+
+static int ion_secure_cma_phys(struct ion_heap *heap, struct ion_buffer *buffer,
+			ion_phys_addr_t *addr, size_t *len)
+{
+	struct device *dev = heap->priv;
+	struct ion_secure_cma_buffer_info *info = buffer->priv_virt;
+
+	dev_dbg(dev, "Return buffer %p physical address 0x%pa\n", buffer,
+		&info->handle);
+
+	*addr = info->handle;
+	*len = buffer->size;
+
+	return 0;
+}
+
+struct sg_table *ion_secure_cma_heap_map_dma(struct ion_heap *heap,
+					 struct ion_buffer *buffer)
+{
+	struct ion_secure_cma_buffer_info *info = buffer->priv_virt;
+
+	return info->table;
+}
+
+void ion_secure_cma_heap_unmap_dma(struct ion_heap *heap,
+			       struct ion_buffer *buffer)
+{
+	return;
+}
+
+static int ion_secure_cma_mmap(struct ion_heap *mapper,
+			struct ion_buffer *buffer,
+			struct vm_area_struct *vma)
+{
+	pr_info("%s: mmapping from secure heap %s disallowed\n",
+		__func__, mapper->name);
+	return -EINVAL;
+}
+
+static void *ion_secure_cma_map_kernel(struct ion_heap *heap,
+				struct ion_buffer *buffer)
+{
+	pr_info("%s: kernel mapping from secure heap %s disallowed\n",
+		__func__, heap->name);
+	return NULL;
+}
+
+static void ion_secure_cma_unmap_kernel(struct ion_heap *heap,
+				 struct ion_buffer *buffer)
+{
+	return;
+}
+
+int ion_secure_cma_map_iommu(struct ion_buffer *buffer,
+				struct ion_iommu_map *data,
+				unsigned int domain_num,
+				unsigned int partition_num,
+				unsigned long align,
+				unsigned long iova_length,
+				unsigned long flags)
+{
+	int ret = 0;
+	struct iommu_domain *domain;
+	unsigned long extra;
+	unsigned long extra_iova_addr;
+	struct ion_secure_cma_buffer_info *info = buffer->priv_virt;
+	struct sg_table *table = info->table;
+	int prot = IOMMU_WRITE | IOMMU_READ;
+
+	data->mapped_size = iova_length;
+
+	if (!msm_use_iommu()) {
+		data->iova_addr = info->handle;
+		return 0;
+	}
+
+	extra = iova_length - buffer->size;
+
+	ret = msm_allocate_iova_address(domain_num, partition_num,
+						data->mapped_size, align,
+						&data->iova_addr);
+
+	if (ret)
+		goto out;
+
+	domain = msm_get_iommu_domain(domain_num);
+
+	if (!domain) {
+		ret = -EINVAL;
+		goto out1;
+	}
+
+	ret = iommu_map_range(domain, data->iova_addr, table->sgl,
+				buffer->size, prot);
+
+	if (ret) {
+		pr_err("%s: could not map %lx in domain %p\n",
+			__func__, data->iova_addr, domain);
+		goto out1;
+	}
+
+	extra_iova_addr = data->iova_addr + buffer->size;
+	if (extra) {
+		unsigned long phys_addr = sg_phys(table->sgl);
+		ret = msm_iommu_map_extra(domain, extra_iova_addr, phys_addr,
+					extra, SZ_4K, prot);
+		if (ret)
+			goto out2;
+	}
+	return ret;
+
+out2:
+	iommu_unmap_range(domain, data->iova_addr, buffer->size);
+out1:
+	msm_free_iova_address(data->iova_addr, domain_num, partition_num,
+				data->mapped_size);
+out:
+	return ret;
+}
+
+void ion_secure_cma_unmap_iommu(struct ion_iommu_map *data)
+{
+	unsigned int domain_num;
+	unsigned int partition_num;
+	struct iommu_domain *domain;
+
+	if (!msm_use_iommu())
+		return;
+
+	domain_num = iommu_map_domain(data);
+	partition_num = iommu_map_partition(data);
+
+	domain = msm_get_iommu_domain(domain_num);
+
+	if (!domain) {
+		WARN(1, "Could not get domain %d. Corruption?\n", domain_num);
+		return;
+	}
+
+	iommu_unmap_range(domain, data->iova_addr, data->mapped_size);
+	msm_free_iova_address(data->iova_addr, domain_num, partition_num,
+				data->mapped_size);
+
+	return;
+}
+
+int ion_secure_cma_cache_ops(struct ion_heap *heap,
+			struct ion_buffer *buffer, void *vaddr,
+			unsigned int offset, unsigned int length,
+			unsigned int cmd)
+{
+	pr_info("%s: cache operations disallowed from secure heap %s\n",
+		__func__, heap->name);
+	return -EINVAL;
+}
+
+static int ion_secure_cma_print_debug(struct ion_heap *heap, struct seq_file *s,
+			const struct rb_root *mem_map)
+{
+	if (mem_map) {
+		struct rb_node *n;
+
+		seq_printf(s, "\nMemory Map\n");
+		seq_printf(s, "%16.s %14.s %14.s %14.s\n",
+			   "client", "start address", "end address",
+			   "size (hex)");
+
+		for (n = rb_first(mem_map); n; n = rb_next(n)) {
+			struct mem_map_data *data =
+					rb_entry(n, struct mem_map_data, node);
+			const char *client_name = "(null)";
+
+			if (data->client_name)
+				client_name = data->client_name;
+
+			seq_printf(s, "%16.s %14pa %14pa %14lu (%lx)\n",
+				   client_name, &data->addr,
+				   &data->addr_end,
+				   data->size, data->size);
+		}
+	}
+	return 0;
+}
+
+static struct ion_heap_ops ion_secure_cma_ops = {
+	.allocate = ion_secure_cma_allocate,
+	.free = ion_secure_cma_free,
+	.map_dma = ion_secure_cma_heap_map_dma,
+	.unmap_dma = ion_secure_cma_heap_unmap_dma,
+	.phys = ion_secure_cma_phys,
+	.map_user = ion_secure_cma_mmap,
+	.map_kernel = ion_secure_cma_map_kernel,
+	.unmap_kernel = ion_secure_cma_unmap_kernel,
+	.map_iommu = ion_secure_cma_map_iommu,
+	.unmap_iommu = ion_secure_cma_unmap_iommu,
+	.cache_op = ion_secure_cma_cache_ops,
+	.print_debug = ion_secure_cma_print_debug,
+	.secure_buffer = ion_cp_secure_buffer,
+	.unsecure_buffer = ion_cp_unsecure_buffer,
+};
+
+struct ion_heap *ion_secure_cma_heap_create(struct ion_platform_heap *data)
+{
+	struct ion_heap *heap;
+
+	heap = kzalloc(sizeof(struct ion_heap), GFP_KERNEL);
+
+	if (!heap)
+		return ERR_PTR(-ENOMEM);
+
+	heap->ops = &ion_secure_cma_ops;
+	/* set the device as the heap's private data; later it will be
+	 * used to make the link with the reserved CMA memory */
+	heap->priv = data->priv;
+	heap->type = ION_HEAP_TYPE_SECURE_DMA;
+	cma_heap_has_outer_cache = data->has_outer_cache;
+	return heap;
+}
+
+void ion_secure_cma_heap_destroy(struct ion_heap *heap)
+{
+	kfree(heap);
+}
diff --git a/drivers/gpu/ion/ion_cp_heap.c b/drivers/gpu/ion/ion_cp_heap.c
index 41ff28d..88addab 100644
--- a/drivers/gpu/ion/ion_cp_heap.c
+++ b/drivers/gpu/ion/ion_cp_heap.c
@@ -23,11 +23,12 @@
 #include <linux/mm.h>
 #include <linux/scatterlist.h>
 #include <linux/slab.h>
+#include <linux/vmalloc.h>
 #include <linux/memory_alloc.h>
 #include <linux/seq_file.h>
-#include <linux/fmem.h>
 #include <linux/iommu.h>
 #include <linux/dma-mapping.h>
+#include <trace/events/kmem.h>
 
 #include <asm/mach/map.h>
 
@@ -68,8 +69,6 @@
  *		user space.
  * @iommu_iova: saved iova when mapping full heap at once.
  * @iommu_partition: partition used to map full heap.
- * @reusable: indicates if the memory should be reused via fmem.
- * @reserved_vrange: reserved virtual address range for use with fmem
  * @iommu_map_all:	Indicates whether we should map whole heap into IOMMU.
  * @iommu_2x_map_domain: Indicates the domain to use for overmapping.
  * @has_outer_cache:    set to 1 if outer cache is used, 0 otherwise.
@@ -93,7 +92,6 @@
 	unsigned long umap_count;
 	unsigned long iommu_iova[MAX_DOMAINS];
 	unsigned long iommu_partition[MAX_DOMAINS];
-	int reusable;
 	void *reserved_vrange;
 	int iommu_map_all;
 	int iommu_2x_map_domain;
@@ -103,7 +101,7 @@
 	size_t heap_size;
 	dma_addr_t handle;
 	int cma;
-	int disallow_non_secure_allocation;
+	int allow_non_secure_allocation;
 };
 
 enum {
@@ -111,17 +109,8 @@
 	HEAP_PROTECTED = 1,
 };
 
-#define DMA_ALLOC_RETRIES	5
+#define DMA_ALLOC_TRIES	5
 
-static int ion_cp_protect_mem(unsigned int phy_base, unsigned int size,
-			unsigned int permission_type, int version,
-			void *data);
-
-static int ion_cp_unprotect_mem(unsigned int phy_base, unsigned int size,
-				unsigned int permission_type, int version,
-				void *data);
-
-#if 0
 static int allocate_heap_memory(struct ion_heap *heap)
 {
 	struct device *dev = heap->priv;
@@ -136,14 +125,16 @@
 	if (cp_heap->cpu_addr)
 		return 0;
 
-	while (!cp_heap->cpu_addr && (++tries < DMA_ALLOC_RETRIES)) {
+	while (!cp_heap->cpu_addr && (++tries < DMA_ALLOC_TRIES)) {
 		cp_heap->cpu_addr = dma_alloc_attrs(dev,
 						cp_heap->heap_size,
 						&(cp_heap->handle),
 						0,
 						&attrs);
-		if (!cp_heap->cpu_addr)
+		if (!cp_heap->cpu_addr) {
+			trace_ion_cp_alloc_retry(tries);
 			msleep(20);
+		}
 	}
 
 	if (!cp_heap->cpu_addr)
@@ -170,7 +161,7 @@
 out:
 	return ION_CP_ALLOCATE_FAIL;
 }
-#endif
+
 static void free_heap_memory(struct ion_heap *heap)
 {
 	struct device *dev = heap->priv;
@@ -197,19 +188,12 @@
 	return cp_heap->kmap_cached_count + cp_heap->kmap_uncached_count;
 }
 
-#if 0
 static int ion_on_first_alloc(struct ion_heap *heap)
 {
 	struct ion_cp_heap *cp_heap =
 		container_of(heap, struct ion_cp_heap, heap);
 	int ret_value;
 
-	if (cp_heap->reusable) {
-		ret_value = fmem_set_state(FMEM_C_STATE);
-		if (ret_value)
-			return 1;
-	}
-
 	if (cp_heap->cma) {
 		ret_value = allocate_heap_memory(heap);
 		if (ret_value)
@@ -217,18 +201,12 @@
 	}
 	return 0;
 }
-#endif
 
 static void ion_on_last_free(struct ion_heap *heap)
 {
 	struct ion_cp_heap *cp_heap =
 		container_of(heap, struct ion_cp_heap, heap);
 
-	if (cp_heap->reusable)
-		if (fmem_set_state(FMEM_T_STATE) != 0)
-			pr_err("%s: unable to transition heap to T-state\n",
-				__func__);
-
 	if (cp_heap->cma)
 		free_heap_memory(heap);
 }
@@ -246,31 +224,6 @@
 
 	if (atomic_inc_return(&cp_heap->protect_cnt) == 1) {
 		/* Make sure we are in C state when the heap is protected. */
-		if (cp_heap->reusable && !cp_heap->allocated_bytes) {
-			ret_value = fmem_set_state(FMEM_C_STATE);
-			if (ret_value)
-				goto out;
-		}
-
-		ret_value = ion_cp_protect_mem(cp_heap->secure_base,
-				cp_heap->secure_size, cp_heap->permission_type,
-				version, data);
-		if (ret_value) {
-			pr_err("Failed to protect memory for heap %s - "
-				"error code: %d\n", heap->name, ret_value);
-
-			if (cp_heap->reusable && !cp_heap->allocated_bytes) {
-				if (fmem_set_state(FMEM_T_STATE) != 0)
-					pr_err("%s: unable to transition heap to T-state\n",
-						__func__);
-			}
-			atomic_dec(&cp_heap->protect_cnt);
-		} else {
-			cp_heap->heap_protected = HEAP_PROTECTED;
-			pr_debug("Protected heap %s @ 0x%lx\n",
-				heap->name, cp_heap->base);
-		}
-#if 0
 		if (!cp_heap->allocated_bytes)
 			if (ion_on_first_alloc(heap))
 				goto out;
@@ -288,10 +241,9 @@
 			atomic_dec(&cp_heap->protect_cnt);
 		} else {
 			cp_heap->heap_protected = HEAP_PROTECTED;
-			pr_debug("Protected heap %s @ 0x%lx\n",
-				heap->name, cp_heap->base);
+			pr_debug("Protected heap %s @ 0x%pa\n",
+				heap->name, &cp_heap->base);
 		}
-#endif
 	}
 out:
 	pr_debug("%s: protect count is %d\n", __func__,
@@ -337,8 +289,8 @@
 				      unsigned long flags)
 {
 	unsigned long offset;
-	unsigned long secure_allocation = flags & ION_SECURE;
-	unsigned long force_contig = flags & ION_FORCE_CONTIGUOUS;
+	unsigned long secure_allocation = flags & ION_FLAG_SECURE;
+	unsigned long force_contig = flags & ION_FLAG_FORCE_CONTIGUOUS;
 
 	struct ion_cp_heap *cp_heap =
 		container_of(heap, struct ion_cp_heap, heap);
@@ -352,31 +304,25 @@
 	}
 
 	if (!force_contig && !secure_allocation &&
-	     cp_heap->disallow_non_secure_allocation) {
+	     !cp_heap->allow_non_secure_allocation) {
 		mutex_unlock(&cp_heap->lock);
 		pr_debug("%s: non-secure allocation disallowed from this heap\n",
 			__func__);
 		return ION_CP_ALLOCATE_FAIL;
 	}
 
-	if (secure_allocation &&
-	    (cp_heap->umap_count > 0 || cp_heap->kmap_cached_count > 0)) {
-		mutex_unlock(&cp_heap->lock);
-		pr_err("ION cannot allocate secure memory from heap with "
-			"outstanding mappings: User space: %lu, kernel space "
-			"(cached): %lu\n", cp_heap->umap_count,
-					   cp_heap->kmap_cached_count);
-		return ION_CP_ALLOCATE_FAIL;
+	/*
+	 * The check above already checked for non-secure allocations when the
+	 * heap is protected. HEAP_PROTECTED implies that this must be a secure
+	 * allocation. If the heap is protected and there are userspace or
+	 * cached kernel mappings, something has gone wrong in the security
+	 * model.
+	 */
+	if (cp_heap->heap_protected == HEAP_PROTECTED) {
+		BUG_ON(cp_heap->umap_count != 0);
+		BUG_ON(cp_heap->kmap_cached_count != 0);
 	}
 
-	if (cp_heap->reusable && !cp_heap->allocated_bytes) {
-		if (fmem_set_state(FMEM_C_STATE) != 0) {
-			mutex_unlock(&cp_heap->lock);
-			return ION_RESERVED_ALLOCATE_FAIL;
-		}
-	}
-
-#if 0
 	/*
 	 * if this is the first reusable allocation, transition
 	 * the heap
@@ -387,7 +333,6 @@
 			return ION_RESERVED_ALLOCATE_FAIL;
 		}
 
-#endif
 	cp_heap->allocated_bytes += size;
 	mutex_unlock(&cp_heap->lock);
 
@@ -482,7 +427,9 @@
 				  struct ion_buffer *buffer,
 				  ion_phys_addr_t *addr, size_t *len)
 {
-	*addr = buffer->priv_phys;
+	struct ion_cp_buffer *buf = buffer->priv_virt;
+
+	*addr = buf->buffer;
 	*len = buffer->size;
 	return 0;
 }
@@ -492,39 +439,58 @@
 				      unsigned long size, unsigned long align,
 				      unsigned long flags)
 {
-	buffer->priv_phys = ion_cp_allocate(heap, size, align, flags);
-	return buffer->priv_phys == ION_CP_ALLOCATE_FAIL ? -ENOMEM : 0;
+	struct ion_cp_buffer *buf;
+	phys_addr_t addr;
+
+	/*
+	 * we never want Ion to fault pages in for us with this
+	 * heap. We want to set up the mappings ourselves in .map_user
+	 */
+	flags |= ION_FLAG_CACHED_NEEDS_SYNC;
+
+	buf = kzalloc(sizeof(*buf), GFP_KERNEL);
+	if (!buf)
+		return -ENOMEM;
+
+	addr = ion_cp_allocate(heap, size, align, flags);
+	if (addr == ION_CP_ALLOCATE_FAIL) {
+		kfree(buf);
+		return -ENOMEM;
+	}
+
+	buf->buffer = addr;
+	buf->want_delayed_unsecure = 0;
+	atomic_set(&buf->secure_cnt, 0);
+	mutex_init(&buf->lock);
+	buf->is_secure = flags & ION_FLAG_SECURE ? 1 : 0;
+	buffer->priv_virt = buf;
+
+	return 0;
 }
 
 static void ion_cp_heap_free(struct ion_buffer *buffer)
 {
 	struct ion_heap *heap = buffer->heap;
+	struct ion_cp_buffer *buf = buffer->priv_virt;
 
-	ion_cp_free(heap, buffer->priv_phys, buffer->size);
-	buffer->priv_phys = ION_CP_ALLOCATE_FAIL;
+	ion_cp_free(heap, buf->buffer, buffer->size);
+	WARN_ON(atomic_read(&buf->secure_cnt));
+	WARN_ON(atomic_read(&buf->map_cnt));
+	kfree(buf);
+
+	buffer->priv_virt = NULL;
 }
 
 struct sg_table *ion_cp_heap_create_sg_table(struct ion_buffer *buffer)
 {
-	struct sg_table *table;
-	int ret;
+	size_t chunk_size = buffer->size;
+	struct ion_cp_buffer *buf = buffer->priv_virt;
 
-	table = kzalloc(sizeof(struct sg_table), GFP_KERNEL);
-	if (!table)
-		return ERR_PTR(-ENOMEM);
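+	/*
+	 * Cached buffers need page-sized chunks for per-page cache
+	 * maintenance; secure buffers whose size is a multiple of 1MB
+	 * are described in 1MB chunks.
+	 */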
+	if (ION_IS_CACHED(buffer->flags))
+		chunk_size = PAGE_SIZE;
+	else if (buf->is_secure && IS_ALIGNED(buffer->size, SZ_1M))
+		chunk_size = SZ_1M;
 
-	ret = sg_alloc_table(table, 1, GFP_KERNEL);
-	if (ret)
-		goto err0;
-
-	table->sgl->length = buffer->size;
-	table->sgl->offset = 0;
-	table->sgl->dma_address = buffer->priv_phys;
-
-	return table;
-err0:
-	kfree(table);
-	return ERR_PTR(ret);
+	return ion_create_chunked_sg_table(buf->buffer, chunk_size,
+					buffer->size);
 }
 
 struct sg_table *ion_cp_heap_map_dma(struct ion_heap *heap,
@@ -568,33 +534,12 @@
 	return ret_value;
 }
 
-void *ion_map_fmem_buffer(struct ion_buffer *buffer, unsigned long phys_base,
-				void *virt_base, unsigned long flags)
-{
-	int ret;
-	unsigned int offset = buffer->priv_phys - phys_base;
-	unsigned long start = ((unsigned long)virt_base) + offset;
-	const struct mem_type *type = ION_IS_CACHED(flags) ?
-				get_mem_type(MT_DEVICE_CACHED) :
-				get_mem_type(MT_DEVICE);
-
-	if (phys_base > buffer->priv_phys)
-		return NULL;
-
-
-	ret = ioremap_pages(start, buffer->priv_phys, buffer->size, type);
-
-	if (!ret)
-		return (void *)start;
-	else
-		return NULL;
-}
-
 void *ion_cp_heap_map_kernel(struct ion_heap *heap, struct ion_buffer *buffer)
 {
 	struct ion_cp_heap *cp_heap =
 		container_of(heap, struct ion_cp_heap, heap);
 	void *ret_value = NULL;
+	struct ion_cp_buffer *buf = buffer->priv_virt;
 
 	mutex_lock(&cp_heap->lock);
 	if ((cp_heap->heap_protected == HEAP_NOT_PROTECTED) ||
@@ -606,33 +551,35 @@
 			return NULL;
 		}
 
-		if (cp_heap->reusable) {
-			ret_value = ion_map_fmem_buffer(buffer, cp_heap->base,
-				cp_heap->reserved_vrange, buffer->flags);
-		} else if (cp_heap->cma) {
+		if (cp_heap->cma) {
 			int npages = PAGE_ALIGN(buffer->size) / PAGE_SIZE;
 			struct page **pages = vmalloc(
 						sizeof(struct page *) * npages);
 			int i;
 			pgprot_t pgprot;
 
+			if (!pages) {
+				mutex_unlock(&cp_heap->lock);
+				return ERR_PTR(-ENOMEM);
+			}
+
 			if (ION_IS_CACHED(buffer->flags))
 				pgprot = PAGE_KERNEL;
 			else
 				pgprot = pgprot_writecombine(PAGE_KERNEL);
 
 			for (i = 0; i < npages; i++) {
-				pages[i] = phys_to_page(buffer->priv_phys +
+				pages[i] = phys_to_page(buf->buffer +
 						i * PAGE_SIZE);
 			}
 			ret_value = vmap(pages, npages, VM_IOREMAP, pgprot);
 			vfree(pages);
 		} else {
 			if (ION_IS_CACHED(buffer->flags))
-				ret_value = ioremap_cached(buffer->priv_phys,
+				ret_value = ioremap_cached(buf->buffer,
 							   buffer->size);
 			else
-				ret_value = ioremap(buffer->priv_phys,
+				ret_value = ioremap(buf->buffer,
 						    buffer->size);
 		}
 
@@ -643,6 +590,7 @@
 				++cp_heap->kmap_cached_count;
 			else
 				++cp_heap->kmap_uncached_count;
+			atomic_inc(&buf->map_cnt);
 		}
 	}
 	mutex_unlock(&cp_heap->lock);
@@ -654,10 +602,9 @@
 {
 	struct ion_cp_heap *cp_heap =
 		container_of(heap, struct ion_cp_heap, heap);
+	struct ion_cp_buffer *buf = buffer->priv_virt;
 
-	if (cp_heap->reusable)
-		unmap_kernel_range((unsigned long)buffer->vaddr, buffer->size);
-	else if (cp_heap->cma)
+	if (cp_heap->cma)
 		vunmap(buffer->vaddr);
 	else
 		__arm_iounmap(buffer->vaddr);
@@ -669,6 +616,8 @@
 		--cp_heap->kmap_cached_count;
 	else
 		--cp_heap->kmap_uncached_count;
+
+	atomic_dec(&buf->map_cnt);
 	ion_cp_release_region(cp_heap);
 	mutex_unlock(&cp_heap->lock);
 
@@ -681,9 +630,10 @@
 	int ret_value = -EAGAIN;
 	struct ion_cp_heap *cp_heap =
 		container_of(heap, struct ion_cp_heap, heap);
+	struct ion_cp_buffer *buf = buffer->priv_virt;
 
 	mutex_lock(&cp_heap->lock);
-	if (cp_heap->heap_protected == HEAP_NOT_PROTECTED) {
+	if (cp_heap->heap_protected == HEAP_NOT_PROTECTED && !buf->is_secure) {
 		if (ion_cp_request_region(cp_heap)) {
 			mutex_unlock(&cp_heap->lock);
 			return -EINVAL;
@@ -694,14 +644,17 @@
 							vma->vm_page_prot);
 
 		ret_value =  remap_pfn_range(vma, vma->vm_start,
-			__phys_to_pfn(buffer->priv_phys) + vma->vm_pgoff,
+			__phys_to_pfn(buf->buffer) + vma->vm_pgoff,
 			vma->vm_end - vma->vm_start,
 			vma->vm_page_prot);
 
-		if (ret_value)
+		if (ret_value) {
 			ion_cp_release_region(cp_heap);
-		else
+		} else {
+			atomic_inc(&buf->map_cnt);
 			++cp_heap->umap_count;
+		}
+
 	}
 	mutex_unlock(&cp_heap->lock);
 	return ret_value;
@@ -712,9 +665,11 @@
 {
 	struct ion_cp_heap *cp_heap =
 			container_of(heap, struct ion_cp_heap, heap);
+	struct ion_cp_buffer *buf = buffer->priv_virt;
 
 	mutex_lock(&cp_heap->lock);
 	--cp_heap->umap_count;
+	atomic_dec(&buf->map_cnt);
 	ion_cp_release_region(cp_heap);
 	mutex_unlock(&cp_heap->lock);
 }
@@ -723,29 +678,82 @@
 			void *vaddr, unsigned int offset, unsigned int length,
 			unsigned int cmd)
 {
-	void (*outer_cache_op)(phys_addr_t, phys_addr_t);
+	void (*outer_cache_op)(phys_addr_t, phys_addr_t) = NULL;
 	struct ion_cp_heap *cp_heap =
-	     container_of(heap, struct  ion_cp_heap, heap);
+		container_of(heap, struct  ion_cp_heap, heap);
+	unsigned int size_to_vmap, total_size;
+	struct ion_cp_buffer *buf = buffer->priv_virt;
+	int i, j;
+	void *ptr = NULL;
+	ion_phys_addr_t buff_phys = buf->buffer;
 
-	switch (cmd) {
-	case ION_IOC_CLEAN_CACHES:
-		dmac_clean_range(vaddr, vaddr + length);
-		outer_cache_op = outer_clean_range;
-		break;
-	case ION_IOC_INV_CACHES:
-		dmac_inv_range(vaddr, vaddr + length);
-		outer_cache_op = outer_inv_range;
-		break;
-	case ION_IOC_CLEAN_INV_CACHES:
-		dmac_flush_range(vaddr, vaddr + length);
-		outer_cache_op = outer_flush_range;
-		break;
-	default:
-		return -EINVAL;
+	if (!vaddr) {
+		/*
+		 * Split the vmalloc space into smaller regions in
+		 * order to clean and/or invalidate the cache.
+		 */
+		size_to_vmap = (VMALLOC_END - VMALLOC_START)/8;
+		total_size = buffer->size;
+		for (i = 0; i < total_size; i += size_to_vmap) {
+			size_to_vmap = min(size_to_vmap, total_size - i);
+			for (j = 0; j < 10 && size_to_vmap; ++j) {
+				ptr = ioremap(buff_phys, size_to_vmap);
+				if (ptr) {
+					switch (cmd) {
+					case ION_IOC_CLEAN_CACHES:
+						dmac_clean_range(ptr,
+							ptr + size_to_vmap);
+						outer_cache_op =
+							outer_clean_range;
+						break;
+					case ION_IOC_INV_CACHES:
+						dmac_inv_range(ptr,
+							ptr + size_to_vmap);
+						outer_cache_op =
+							outer_inv_range;
+						break;
+					case ION_IOC_CLEAN_INV_CACHES:
+						dmac_flush_range(ptr,
+							ptr + size_to_vmap);
+						outer_cache_op =
+							outer_flush_range;
+						break;
+					default:
+						iounmap(ptr);
+						return -EINVAL;
+					}
+					buff_phys += size_to_vmap;
+					break;
+				} else {
+					size_to_vmap >>= 1;
+				}
+			}
+			if (!ptr) {
+				pr_err("Couldn't io-remap the memory\n");
+				return -EINVAL;
+			}
+			iounmap(ptr);
+		}
+	} else {
+		switch (cmd) {
+		case ION_IOC_CLEAN_CACHES:
+			dmac_clean_range(vaddr, vaddr + length);
+			outer_cache_op = outer_clean_range;
+			break;
+		case ION_IOC_INV_CACHES:
+			dmac_inv_range(vaddr, vaddr + length);
+			outer_cache_op = outer_inv_range;
+			break;
+		case ION_IOC_CLEAN_INV_CACHES:
+			dmac_flush_range(vaddr, vaddr + length);
+			outer_cache_op = outer_flush_range;
+			break;
+		default:
+			return -EINVAL;
+		}
 	}
 
 	if (cp_heap->has_outer_cache) {
-		unsigned long pstart = buffer->priv_phys + offset;
+		unsigned long pstart = buf->buffer + offset;
 		outer_cache_op(pstart, pstart + length);
 	}
 	return 0;
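
When no kernel vaddr is supplied, the rewritten cache path above never maps the whole buffer at once: it walks the physical range in windows of at most (VMALLOC_END - VMALLOC_START)/8 bytes, halves the window (up to ten attempts) whenever ioremap() cannot find space, performs the dmac_*_range() maintenance on the temporary mapping, and unmaps it again. A condensed sketch of that retry-halving walk for the clean case; kernel context assumed, helper name illustrative only:

#include <linux/io.h>
#include <linux/kernel.h>
#include <asm/cacheflush.h>

/*
 * Illustrative only: clean the D-cache for a physical range that has no
 * kernel mapping, using short-lived ioremap() windows that shrink when
 * vmalloc space is fragmented (condensed variant of the loop above).
 */
static int example_clean_phys_range(phys_addr_t base, size_t size)
{
	size_t window = (VMALLOC_END - VMALLOC_START) / 8;
	size_t done = 0;

	while (done < size) {
		size_t len = min(window, size - done);
		void *ptr = NULL;
		int attempt;

		for (attempt = 0; attempt < 10 && len; attempt++) {
			ptr = ioremap(base + done, len);
			if (ptr)
				break;
			len >>= 1;	/* vmalloc space fragmented: retry smaller */
		}
		if (!ptr)
			return -ENOMEM;

		dmac_clean_range(ptr, ptr + len);	/* L1 maintenance only */
		iounmap(ptr);
		done += len;
	}
	return 0;
}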
@@ -775,7 +783,6 @@
 	seq_printf(s, "umapping count: %lx\n", umap_count);
 	seq_printf(s, "kmapping count: %lx\n", kmap_count);
 	seq_printf(s, "heap protected: %s\n", heap_protected ? "Yes" : "No");
-	seq_printf(s, "reusable: %s\n", cp_heap->reusable  ? "Yes" : "No");
 
 	if (mem_map) {
 		unsigned long base = cp_heap->base;
@@ -795,8 +802,11 @@
 			const char *client_name = "(null)";
 
 			if (last_end < data->addr) {
-				seq_printf(s, "%16.s %14lx %14lx %14lu (%lx)\n",
-					   "FREE", last_end, data->addr-1,
+				phys_addr_t da;
+
+				da = data->addr-1;
+				seq_printf(s, "%16.s %14pa %14pa %14lu (%lx)\n",
+					   "FREE", &last_end, &da,
 					   data->addr-last_end,
 					   data->addr-last_end);
 			}
@@ -804,9 +814,9 @@
 			if (data->client_name)
 				client_name = data->client_name;
 
-			seq_printf(s, "%16.s %14lx %14lx %14lu (%lx)\n",
-				   client_name, data->addr,
-				   data->addr_end,
+			seq_printf(s, "%16.s %14pa %14pa %14lu (%lx)\n",
+				   client_name, &data->addr,
+				   &data->addr_end,
 				   data->size, data->size);
 			last_end = data->addr_end+1;
 		}
@@ -902,6 +912,7 @@
 		}
 		if (domain_num == cp_heap->iommu_2x_map_domain)
 			ret_value = msm_iommu_map_extra(domain, temp_iova,
+							cp_heap->base,
 							cp_heap->total_size,
 							SZ_64K, prot);
 		if (ret_value)
@@ -933,25 +944,26 @@
 	struct ion_cp_heap *cp_heap =
 		container_of(buffer->heap, struct ion_cp_heap, heap);
 	int prot = IOMMU_WRITE | IOMMU_READ;
+	struct ion_cp_buffer *buf = buffer->priv_virt;
 	prot |= ION_IS_CACHED(flags) ? IOMMU_CACHE : 0;
 
 	data->mapped_size = iova_length;
 
 	if (!msm_use_iommu()) {
-		data->iova_addr = buffer->priv_phys;
+		data->iova_addr = buf->buffer;
 		return 0;
 	}
 
 	if (cp_heap->iommu_iova[domain_num]) {
 		/* Already mapped. */
-		unsigned long offset = buffer->priv_phys - cp_heap->base;
+		unsigned long offset = buf->buffer - cp_heap->base;
 		data->iova_addr = cp_heap->iommu_iova[domain_num] + offset;
 		return 0;
 	} else if (cp_heap->iommu_map_all) {
 		ret = iommu_map_all(domain_num, cp_heap, partition_num, prot);
 		if (!ret) {
 			unsigned long offset =
-					buffer->priv_phys - cp_heap->base;
+					buf->buffer - cp_heap->base;
 			data->iova_addr =
 				cp_heap->iommu_iova[domain_num] + offset;
 			cp_heap->iommu_partition[domain_num] = partition_num;
@@ -994,8 +1006,9 @@
 
 	if (extra) {
 		unsigned long extra_iova_addr = data->iova_addr + buffer->size;
-		ret = msm_iommu_map_extra(domain, extra_iova_addr, extra,
-					  SZ_4K, prot);
+		unsigned long phys_addr = sg_phys(buffer->sg_table->sgl);
+		ret = msm_iommu_map_extra(domain, extra_iova_addr, phys_addr,
+					extra, SZ_4K, prot);
 		if (ret)
 			goto out2;
 	}
@@ -1061,6 +1074,8 @@
 	.unsecure_heap = ion_cp_unsecure_heap,
 	.map_iommu = ion_cp_heap_map_iommu,
 	.unmap_iommu = ion_cp_heap_unmap_iommu,
+	.secure_buffer = ion_cp_secure_buffer,
+	.unsecure_buffer = ion_cp_unsecure_buffer,
 };
 
 struct ion_heap *ion_cp_heap_create(struct ion_platform_heap *heap_data)
@@ -1092,8 +1107,6 @@
 	if (heap_data->extra_data) {
 		struct ion_cp_heap_pdata *extra_data =
 				heap_data->extra_data;
-		cp_heap->reusable = extra_data->reusable;
-		cp_heap->reserved_vrange = extra_data->virt_addr;
 		cp_heap->permission_type = extra_data->permission_type;
 		if (extra_data->secure_size) {
 			cp_heap->secure_base = extra_data->secure_base;
@@ -1112,8 +1125,8 @@
 		cp_heap->iommu_2x_map_domain =
 				extra_data->iommu_2x_map_domain;
 		cp_heap->cma = extra_data->is_cma;
-		cp_heap->disallow_non_secure_allocation =
-			extra_data->no_nonsecure_alloc;
+		cp_heap->allow_non_secure_allocation =
+			extra_data->allow_nonsecure_alloc;
 
 	}
 
@@ -1163,108 +1176,4 @@
 	*size = cp_heap->total_size;
 }
 
-/*  SCM related code for locking down memory for content protection */
 
-#define SCM_CP_LOCK_CMD_ID	0x1
-#define SCM_CP_PROTECT		0x1
-#define SCM_CP_UNPROTECT	0x0
-
-struct cp_lock_msg {
-	unsigned int start;
-	unsigned int end;
-	unsigned int permission_type;
-	unsigned char lock;
-} __attribute__ ((__packed__));
-
-static int ion_cp_protect_mem_v1(unsigned int phy_base, unsigned int size,
-			      unsigned int permission_type)
-{
-	struct cp_lock_msg cmd;
-	cmd.start = phy_base;
-	cmd.end = phy_base + size;
-	cmd.permission_type = permission_type;
-	cmd.lock = SCM_CP_PROTECT;
-
-	return scm_call(SCM_SVC_CP, SCM_CP_LOCK_CMD_ID,
-			&cmd, sizeof(cmd), NULL, 0);
-}
-
-static int ion_cp_unprotect_mem_v1(unsigned int phy_base, unsigned int size,
-				unsigned int permission_type)
-{
-	struct cp_lock_msg cmd;
-	cmd.start = phy_base;
-	cmd.end = phy_base + size;
-	cmd.permission_type = permission_type;
-	cmd.lock = SCM_CP_UNPROTECT;
-
-	return scm_call(SCM_SVC_CP, SCM_CP_LOCK_CMD_ID,
-			&cmd, sizeof(cmd), NULL, 0);
-}
-
-#define V2_CHUNK_SIZE	SZ_1M
-
-static int ion_cp_change_mem_v2(unsigned int phy_base, unsigned int size,
-			      void *data, int lock)
-{
-	enum cp_mem_usage usage = (enum cp_mem_usage) data;
-	unsigned long *chunk_list;
-	int nchunks;
-	int ret;
-	int i;
-
-	if (usage < 0 || usage >= MAX_USAGE)
-		return -EINVAL;
-
-	if (!IS_ALIGNED(size, V2_CHUNK_SIZE)) {
-		pr_err("%s: heap size is not aligned to %x\n",
-			__func__, V2_CHUNK_SIZE);
-		return -EINVAL;
-	}
-
-	nchunks = size / V2_CHUNK_SIZE;
-
-	chunk_list = allocate_contiguous_ebi(sizeof(unsigned long)*nchunks,
-						SZ_4K, 0);
-	if (!chunk_list)
-		return -ENOMEM;
-
-	for (i = 0; i < nchunks; i++)
-		chunk_list[i] = phy_base + i * V2_CHUNK_SIZE;
-
-	ret = ion_cp_change_chunks_state(memory_pool_node_paddr(chunk_list),
-					nchunks, V2_CHUNK_SIZE, usage, lock);
-
-	free_contiguous_memory(chunk_list);
-	return ret;
-}
-
-static int ion_cp_protect_mem(unsigned int phy_base, unsigned int size,
-			      unsigned int permission_type, int version,
-			      void *data)
-{
-	switch (version) {
-	case ION_CP_V1:
-		return ion_cp_protect_mem_v1(phy_base, size, permission_type);
-	case ION_CP_V2:
-		return ion_cp_change_mem_v2(phy_base, size, data,
-						SCM_CP_PROTECT);
-	default:
-		return -EINVAL;
-	}
-}
-
-static int ion_cp_unprotect_mem(unsigned int phy_base, unsigned int size,
-			      unsigned int permission_type, int version,
-			      void *data)
-{
-	switch (version) {
-	case ION_CP_V1:
-		return ion_cp_unprotect_mem_v1(phy_base, size, permission_type);
-	case ION_CP_V2:
-		return ion_cp_change_mem_v2(phy_base, size, data,
-						SCM_CP_UNPROTECT);
-	default:
-		return -EINVAL;
-	}
-}
diff --git a/drivers/gpu/ion/ion_heap.c b/drivers/gpu/ion/ion_heap.c
index 98c1a8c..510b9ce 100644
--- a/drivers/gpu/ion/ion_heap.c
+++ b/drivers/gpu/ion/ion_heap.c
@@ -2,7 +2,7 @@
  * drivers/gpu/ion/ion_heap.c
  *
  * Copyright (C) 2011 Google, Inc.
- * Copyright (c) 2011-2012, The Linux Foundation. All rights reserved.
+ * Copyright (c) 2011-2013, The Linux Foundation. All rights reserved.
  *
  * This software is licensed under the terms of the GNU General Public
  * License version 2, as published by the Free Software Foundation, and
@@ -18,13 +18,12 @@
 #include <linux/err.h>
 #include <linux/ion.h>
 #include "ion_priv.h"
-#include <linux/msm_ion.h>
 
 struct ion_heap *ion_heap_create(struct ion_platform_heap *heap_data)
 {
 	struct ion_heap *heap = NULL;
 
-	switch ((int) heap_data->type) {
+	switch (heap_data->type) {
 	case ION_HEAP_TYPE_SYSTEM_CONTIG:
 		heap = ion_system_contig_heap_create(heap_data);
 		break;
@@ -34,17 +33,6 @@
 	case ION_HEAP_TYPE_CARVEOUT:
 		heap = ion_carveout_heap_create(heap_data);
 		break;
-	case ION_HEAP_TYPE_IOMMU:
-		heap = ion_iommu_heap_create(heap_data);
-		break;
-	case ION_HEAP_TYPE_CP:
-		heap = ion_cp_heap_create(heap_data);
-		break;
-#ifdef CONFIG_CMA
-	case ION_HEAP_TYPE_DMA:
-		heap = ion_cma_heap_create(heap_data);
-		break;
-#endif
 	default:
 		pr_err("%s: Invalid heap type %d\n", __func__,
 		       heap_data->type);
@@ -52,9 +40,9 @@
 	}
 
 	if (IS_ERR_OR_NULL(heap)) {
-		pr_err("%s: error creating heap %s type %d base %lu size %u\n",
+		pr_err("%s: error creating heap %s type %d base %pa size %u\n",
 		       __func__, heap_data->name, heap_data->type,
-		       heap_data->base, heap_data->size);
+		       &heap_data->base, heap_data->size);
 		return ERR_PTR(-EINVAL);
 	}
 
@@ -69,7 +57,7 @@
 	if (!heap)
 		return;
 
-	switch ((int) heap->type) {
+	switch (heap->type) {
 	case ION_HEAP_TYPE_SYSTEM_CONTIG:
 		ion_system_contig_heap_destroy(heap);
 		break;
@@ -79,17 +67,6 @@
 	case ION_HEAP_TYPE_CARVEOUT:
 		ion_carveout_heap_destroy(heap);
 		break;
-	case ION_HEAP_TYPE_IOMMU:
-		ion_iommu_heap_destroy(heap);
-		break;
-	case ION_HEAP_TYPE_CP:
-		ion_cp_heap_destroy(heap);
-		break;
-#ifdef CONFIG_CMA
-	case ION_HEAP_TYPE_DMA:
-		ion_cma_heap_destroy(heap);
-		break;
-#endif
 	default:
 		pr_err("%s: Invalid heap type %d\n", __func__,
 		       heap->type);
diff --git a/drivers/gpu/ion/ion_iommu_heap.c b/drivers/gpu/ion/ion_iommu_heap.c
index 3a32390..512ebf3 100644
--- a/drivers/gpu/ion/ion_iommu_heap.c
+++ b/drivers/gpu/ion/ion_iommu_heap.c
@@ -48,8 +48,6 @@
 
 #define MAX_VMAP_RETRIES 10
 
-atomic_t v = ATOMIC_INIT(0);
-
 static const unsigned int orders[] = {8, 4, 0};
 static const int num_orders = ARRAY_SIZE(orders);
 
@@ -72,13 +70,21 @@
 	int i;
 
 	for (i = 0; i < num_orders; i++) {
+		gfp_t gfp;
 		if (size < order_to_size(orders[i]))
 			continue;
 		if (max_order < orders[i])
 			continue;
 
-		page = alloc_pages(GFP_KERNEL | __GFP_HIGHMEM | __GFP_COMP,
-				orders[i]);
+		gfp = __GFP_HIGHMEM;
+
+		if (orders[i]) {
+			gfp |= __GFP_COMP | __GFP_NORETRY |
+			       __GFP_NO_KSWAPD | __GFP_NOWARN;
+		} else {
+			gfp |= GFP_KERNEL;
+		}
+		page = alloc_pages(gfp, orders[i]);
 		if (!page)
 			continue;
 
@@ -107,7 +113,7 @@
 		void *ptr = NULL;
 		unsigned int npages_to_vmap, total_pages, num_large_pages = 0;
 		long size_remaining = PAGE_ALIGN(size);
-		unsigned int max_order = orders[0];
+		unsigned int max_order = ION_IS_CACHED(flags) ? 0 : orders[0];
 
 		data = kmalloc(sizeof(*data), GFP_KERNEL);
 		if (!data)
@@ -199,9 +205,6 @@
 						DMA_BIDIRECTIONAL);
 
 		buffer->priv_virt = data;
-
-		atomic_add(data->size, &v);
-
 		return 0;
 
 	} else {
@@ -246,19 +249,10 @@
 	sg_free_table(table);
 	kfree(table);
 	table = 0;
-
-	atomic_sub(data->size, &v);
-
 	kfree(data->pages);
 	kfree(data);
 }
 
-int ion_iommu_heap_dump_size(void)
-{
-	int ret = atomic_read(&v);
-	return ret;
-}
-
 void *ion_iommu_heap_map_kernel(struct ion_heap *heap,
 				struct ion_buffer *buffer)
 {
@@ -269,7 +263,7 @@
 		return NULL;
 
 	if (!ION_IS_CACHED(buffer->flags))
-		page_prot = pgprot_noncached(page_prot);
+		page_prot = pgprot_writecombine(page_prot);
 
 	buffer->vaddr = vmap(data->pages, data->nrpages, VM_IOREMAP, page_prot);
 
@@ -340,6 +334,14 @@
 	data->mapped_size = iova_length;
 	extra = iova_length - buffer->size;
 
+	/* Use the biggest alignment to allow bigger IOMMU mappings.
+	 * Use the first entry since the first entry will always be the
+	 * biggest entry. To take advantage of bigger mapping sizes both the
+	 * VA and PA addresses have to be aligned to the biggest size.
+	 */
+	if (buffer->sg_table->sgl->length > align)
+		align = buffer->sg_table->sgl->length;
+
 	ret = msm_allocate_iova_address(domain_num, partition_num,
 						data->mapped_size, align,
 						&data->iova_addr);
@@ -365,8 +367,9 @@
 
 	if (extra) {
 		unsigned long extra_iova_addr = data->iova_addr + buffer->size;
-		ret = msm_iommu_map_extra(domain, extra_iova_addr, extra, SZ_4K,
-					  prot);
+		unsigned long phys_addr = sg_phys(buffer->sg_table->sgl);
+		ret = msm_iommu_map_extra(domain, extra_iova_addr, phys_addr,
+					extra, SZ_4K, prot);
 		if (ret)
 			goto out2;
 	}
@@ -418,15 +421,30 @@
 
 	switch (cmd) {
 	case ION_IOC_CLEAN_CACHES:
-		dmac_clean_range(vaddr, vaddr + length);
+		if (!vaddr)
+			dma_sync_sg_for_device(NULL, buffer->sg_table->sgl,
+				buffer->sg_table->nents, DMA_TO_DEVICE);
+		else
+			dmac_clean_range(vaddr, vaddr + length);
 		outer_cache_op = outer_clean_range;
 		break;
 	case ION_IOC_INV_CACHES:
-		dmac_inv_range(vaddr, vaddr + length);
+		if (!vaddr)
+			dma_sync_sg_for_cpu(NULL, buffer->sg_table->sgl,
+				buffer->sg_table->nents, DMA_FROM_DEVICE);
+		else
+			dmac_inv_range(vaddr, vaddr + length);
 		outer_cache_op = outer_inv_range;
 		break;
 	case ION_IOC_CLEAN_INV_CACHES:
-		dmac_flush_range(vaddr, vaddr + length);
+		if (!vaddr) {
+			dma_sync_sg_for_device(NULL, buffer->sg_table->sgl,
+				buffer->sg_table->nents, DMA_TO_DEVICE);
+			dma_sync_sg_for_cpu(NULL, buffer->sg_table->sgl,
+				buffer->sg_table->nents, DMA_FROM_DEVICE);
+		} else {
+			dmac_flush_range(vaddr, vaddr + length);
+		}
 		outer_cache_op = outer_flush_range;
 		break;
 	default:
diff --git a/drivers/gpu/ion/ion_priv.h b/drivers/gpu/ion/ion_priv.h
index d494f7a..28ef1a5 100644
--- a/drivers/gpu/ion/ion_priv.h
+++ b/drivers/gpu/ion/ion_priv.h
@@ -2,7 +2,7 @@
  * drivers/gpu/ion/ion_priv.h
  *
  * Copyright (C) 2011 Google, Inc.
- * Copyright (c) 2011-2012, The Linux Foundation. All rights reserved.
+ * Copyright (c) 2011-2013, The Linux Foundation. All rights reserved.
  *
  * This software is licensed under the terms of the GNU General Public
  * License version 2, as published by the Free Software Foundation, and
@@ -98,7 +98,8 @@
 	void *vaddr;
 	int dmap_cnt;
 	struct sg_table *sg_table;
-	int umap_cnt;
+	unsigned long *dirty;
+	struct list_head vmas;
 	unsigned int iommu_map_cnt;
 	struct rb_root iommu_maps;
 	int marked;
@@ -147,6 +148,9 @@
 			   const struct rb_root *mem_map);
 	int (*secure_heap)(struct ion_heap *heap, int version, void *data);
 	int (*unsecure_heap)(struct ion_heap *heap, int version, void *data);
+	int (*secure_buffer)(struct ion_buffer *buffer, int version,
+				void *data, int flags);
+	int (*unsecure_buffer)(struct ion_buffer *buffer, int force_unsecure);
 };
 
 /**
@@ -177,6 +181,15 @@
 };
 
 /**
+ * ion_buffer_fault_user_mappings - fault in user mappings of this buffer
+ * @buffer:		buffer
+ *
+ * indicates whether userspace mappings of this buffer will be faulted
+ * in; this can affect how buffers are allocated from the heap.
+ */
+bool ion_buffer_fault_user_mappings(struct ion_buffer *buffer);
+
+/**
  * struct mem_map_data - represents information about the memory map for a heap
  * @node:		rb node used to store in the tree of mem_map_data
  * @addr:		start address of memory region.
@@ -187,8 +200,8 @@
  */
 struct mem_map_data {
 	struct rb_node node;
-	unsigned long addr;
-	unsigned long addr_end;
+	ion_phys_addr_t addr;
+	ion_phys_addr_t addr_end;
 	unsigned long size;
 	const char *client_name;
 };
@@ -259,6 +272,9 @@
 #ifdef CONFIG_CMA
 struct ion_heap *ion_cma_heap_create(struct ion_platform_heap *);
 void ion_cma_heap_destroy(struct ion_heap *);
+
+struct ion_heap *ion_secure_cma_heap_create(struct ion_platform_heap *);
+void ion_secure_cma_heap_destroy(struct ion_heap *);
 #endif
 
 struct ion_heap *msm_get_contiguous_heap(void);
@@ -313,4 +329,28 @@
 
 void ion_mem_map_show(struct ion_heap *heap);
 
+
+
+int ion_secure_handle(struct ion_client *client, struct ion_handle *handle,
+			int version, void *data, int flags);
+
+int ion_unsecure_handle(struct ion_client *client, struct ion_handle *handle);
+
+int ion_heap_allow_secure_allocation(enum ion_heap_type type);
+
+int ion_heap_allow_heap_secure(enum ion_heap_type type);
+
+int ion_heap_allow_handle_secure(enum ion_heap_type type);
+
+/**
+ * ion_create_chunked_sg_table - helper function to create sg table
+ * with specified chunk size
+ * @buffer_base:	The starting address used for the sg dma address
+ * @chunk_size:		The size of each entry in the sg table
+ * @total_size:		The total size of the sg table (i.e. the sum of the
+ *			entries). This will be rounded up to the nearest
+ *			multiple of `chunk_size'
+ */
+struct sg_table *ion_create_chunked_sg_table(phys_addr_t buffer_base,
+					size_t chunk_size, size_t total_size);
 #endif /* _ION_PRIV_H */
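
ion_cp_heap_create_sg_table() above now delegates to ion_create_chunked_sg_table(): cached buffers get PAGE_SIZE chunks, secure buffers that are 1 MB aligned get SZ_1M chunks, and everything else a single entry covering the whole buffer. The helper itself lives in ion.c and is outside this excerpt; a minimal sketch of what the documented contract implies (one entry per chunk, dma addresses stepping up from buffer_base, total rounded up to a chunk multiple) might look like:

#include <linux/err.h>
#include <linux/kernel.h>
#include <linux/scatterlist.h>
#include <linux/slab.h>

/* Sketch only; not the in-tree implementation. */
static struct sg_table *example_chunked_sg_table(phys_addr_t buffer_base,
						 size_t chunk_size,
						 size_t total_size)
{
	struct sg_table *table;
	struct scatterlist *sg;
	int nents = DIV_ROUND_UP(total_size, chunk_size);
	int i, ret;

	table = kzalloc(sizeof(*table), GFP_KERNEL);
	if (!table)
		return ERR_PTR(-ENOMEM);

	ret = sg_alloc_table(table, nents, GFP_KERNEL);
	if (ret) {
		kfree(table);
		return ERR_PTR(ret);
	}

	/* Equal-sized chunks; the caller sized total_size appropriately. */
	for_each_sg(table->sgl, sg, table->nents, i) {
		sg->length = chunk_size;
		sg->offset = 0;
		sg->dma_address = buffer_base + i * chunk_size;
	}
	return table;
}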
diff --git a/drivers/gpu/ion/ion_system_heap.c b/drivers/gpu/ion/ion_system_heap.c
index 980174e..ceb30a4 100644
--- a/drivers/gpu/ion/ion_system_heap.c
+++ b/drivers/gpu/ion/ion_system_heap.c
@@ -2,7 +2,7 @@
  * drivers/gpu/ion/ion_system_heap.c
  *
  * Copyright (C) 2011 Google, Inc.
- * Copyright (c) 2011-2012, The Linux Foundation. All rights reserved.
+ * Copyright (c) 2011-2013, The Linux Foundation. All rights reserved.
  *
  * This software is licensed under the terms of the GNU General Public
  * License version 2, as published by the Free Software Foundation, and
@@ -15,7 +15,10 @@
  *
  */
 
+#include <asm/page.h>
+#include <linux/dma-mapping.h>
 #include <linux/err.h>
+#include <linux/highmem.h>
 #include <linux/ion.h>
 #include <linux/mm.h>
 #include <linux/scatterlist.h>
@@ -28,12 +31,44 @@
 #include <mach/memory.h>
 #include <asm/cacheflush.h>
 #include <linux/msm_ion.h>
+#include <linux/dma-mapping.h>
 
 static atomic_t system_heap_allocated;
 static atomic_t system_contig_heap_allocated;
 static unsigned int system_heap_has_outer_cache;
 static unsigned int system_heap_contig_has_outer_cache;
 
+struct page_info {
+	struct page *page;
+	unsigned long order;
+	struct list_head list;
+};
+
+static struct page_info *alloc_largest_available(unsigned long size,
+						 bool split_pages)
+{
+	static unsigned int orders[] = {8, 4, 0};
+	struct page *page;
+	struct page_info *info;
+	int i;
+
+	for (i = 0; i < ARRAY_SIZE(orders); i++) {
+		if (size < (1 << orders[i]) * PAGE_SIZE)
+			continue;
+		page = alloc_pages(GFP_HIGHUSER | __GFP_ZERO |
+				   __GFP_NOWARN | __GFP_NORETRY, orders[i]);
+		if (!page)
+			continue;
+		if (split_pages)
+			split_page(page, orders[i]);
+		info = kmap(page);
+		info->page = page;
+		info->order = orders[i];
+		return info;
+	}
+	return NULL;
+}
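
alloc_largest_available() above walks the descending order list {8, 4, 0} and takes the largest block that still fits the remaining request, so large buffers end up as a few high-order blocks plus a tail of single pages. A standalone sketch of that carving (user-space C, 4 KB pages assumed; the allocation-failure fallback to smaller orders is ignored) for a 300-page request, which splits into one order-8, two order-4, and twelve order-0 blocks:

#include <stdio.h>

int main(void)
{
	const unsigned long page_size = 4096;
	const unsigned int orders[] = {8, 4, 0};
	/* PAGE_ALIGN()ed request, as in ion_system_heap_allocate() */
	unsigned long remaining = 300 * page_size;
	unsigned int i;

	while (remaining > 0) {
		for (i = 0; i < 3; i++) {
			unsigned long chunk = (1UL << orders[i]) * page_size;

			if (remaining < chunk)
				continue;	/* order too big, try the next one */
			remaining -= chunk;
			printf("took order-%u block (%lu KB), %lu KB left\n",
			       orders[i], chunk / 1024, remaining / 1024);
			break;
		}
	}
	return 0;
}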
+
 static int ion_system_heap_allocate(struct ion_heap *heap,
 				     struct ion_buffer *buffer,
 				     unsigned long size, unsigned long align,
@@ -41,31 +76,73 @@
 {
 	struct sg_table *table;
 	struct scatterlist *sg;
-	int i, j;
-	int npages = PAGE_ALIGN(size) / PAGE_SIZE;
+	int ret;
+	struct list_head pages;
+	struct page_info *info, *tmp_info;
+	int i = 0;
+	long size_remaining = PAGE_ALIGN(size);
+	bool split_pages = ion_buffer_fault_user_mappings(buffer);
+
+
+	INIT_LIST_HEAD(&pages);
+	while (size_remaining > 0) {
+		info = alloc_largest_available(size_remaining, split_pages);
+		if (!info)
+			goto err;
+		list_add_tail(&info->list, &pages);
+		size_remaining -= (1 << info->order) * PAGE_SIZE;
+		i++;
+	}
 
 	table = kmalloc(sizeof(struct sg_table), GFP_KERNEL);
 	if (!table)
-		return -ENOMEM;
-	i = sg_alloc_table(table, npages, GFP_KERNEL);
-	if (i)
-		goto err0;
-	for_each_sg(table->sgl, sg, table->nents, i) {
-		struct page *page;
-		page = alloc_page(GFP_KERNEL|__GFP_ZERO);
-		if (!page)
-			goto err1;
-		sg_set_page(sg, page, PAGE_SIZE, 0);
+		goto err;
+
+	if (split_pages)
+		ret = sg_alloc_table(table, PAGE_ALIGN(size) / PAGE_SIZE,
+				     GFP_KERNEL);
+	else
+		ret = sg_alloc_table(table, i, GFP_KERNEL);
+
+	if (ret)
+		goto err1;
+
+	sg = table->sgl;
+	list_for_each_entry_safe(info, tmp_info, &pages, list) {
+		struct page *page = info->page;
+
+		if (split_pages) {
+			for (i = 0; i < (1 << info->order); i++) {
+				sg_set_page(sg, page + i, PAGE_SIZE, 0);
+				sg = sg_next(sg);
+			}
+		} else {
+			sg_set_page(sg, page, (1 << info->order) * PAGE_SIZE,
+				    0);
+			sg = sg_next(sg);
+		}
+		list_del(&info->list);
+		kunmap(page);
 	}
+
+	dma_sync_sg_for_device(NULL, table->sgl, table->nents,
+			       DMA_BIDIRECTIONAL);
+
 	buffer->priv_virt = table;
 	atomic_add(size, &system_heap_allocated);
 	return 0;
 err1:
-	for_each_sg(table->sgl, sg, i, j)
-		__free_page(sg_page(sg));
-	sg_free_table(table);
-err0:
 	kfree(table);
+err:
+	list_for_each_entry(info, &pages, list) {
+		if (split_pages)
+			for (i = 0; i < (1 << info->order); i++)
+				__free_page(info->page + i);
+		else
+			__free_pages(info->page, info->order);
+
+		kunmap(info->page);
+	}
 	return -ENOMEM;
 }
 
@@ -76,7 +153,7 @@
 	struct sg_table *table = buffer->priv_virt;
 
 	for_each_sg(table->sgl, sg, table->nents, i)
-		__free_page(sg_page(sg));
+		__free_pages(sg_page(sg), get_order(sg_dma_len(sg)));
 	if (buffer->sg_table)
 		sg_free_table(buffer->sg_table);
 	kfree(buffer->sg_table);
@@ -98,25 +175,33 @@
 void *ion_system_heap_map_kernel(struct ion_heap *heap,
 				 struct ion_buffer *buffer)
 {
-	if (!ION_IS_CACHED(buffer->flags)) {
-		pr_err("%s: cannot map system heap uncached\n", __func__);
-		return ERR_PTR(-EINVAL);
-	} else {
-		struct scatterlist *sg;
-		int i;
-		void *vaddr;
-		struct sg_table *table = buffer->priv_virt;
-		struct page **pages = kmalloc(
-					sizeof(struct page *) * table->nents,
-					GFP_KERNEL);
+	struct scatterlist *sg;
+	int i, j;
+	void *vaddr;
+	pgprot_t pgprot;
+	struct sg_table *table = buffer->priv_virt;
+	int npages = PAGE_ALIGN(buffer->size) / PAGE_SIZE;
+	struct page **pages = kzalloc(sizeof(struct page *) * npages,
+				     GFP_KERNEL);
+	struct page **tmp = pages;
 
-		for_each_sg(table->sgl, sg, table->nents, i)
-			pages[i] = sg_page(sg);
-		vaddr = vmap(pages, table->nents, VM_MAP, PAGE_KERNEL);
-		kfree(pages);
+	if (buffer->flags & ION_FLAG_CACHED)
+		pgprot = PAGE_KERNEL;
+	else
+		pgprot = pgprot_writecombine(PAGE_KERNEL);
 
-		return vaddr;
+	for_each_sg(table->sgl, sg, table->nents, i) {
+		int npages_this_entry = PAGE_ALIGN(sg_dma_len(sg)) / PAGE_SIZE;
+		struct page *page = sg_page(sg);
+		BUG_ON(i >= npages);
+		for (j = 0; j < npages_this_entry; j++) {
+			*(tmp++) = page++;
+		}
 	}
+	vaddr = vmap(pages, npages, VM_MAP, pgprot);
+	kfree(pages);
+
+	return vaddr;
 }
 
 void ion_system_heap_unmap_kernel(struct ion_heap *heap,
@@ -154,26 +239,27 @@
 int ion_system_heap_map_user(struct ion_heap *heap, struct ion_buffer *buffer,
 			     struct vm_area_struct *vma)
 {
+	struct sg_table *table = buffer->priv_virt;
+	unsigned long addr = vma->vm_start;
+	unsigned long offset = vma->vm_pgoff;
+	struct scatterlist *sg;
+	int i;
+
 	if (!ION_IS_CACHED(buffer->flags)) {
 		pr_err("%s: cannot map system heap uncached\n", __func__);
 		return -EINVAL;
-	} else {
-		struct sg_table *table = buffer->priv_virt;
-		unsigned long addr = vma->vm_start;
-		unsigned long offset = vma->vm_pgoff;
-		struct scatterlist *sg;
-		int i;
-
-		for_each_sg(table->sgl, sg, table->nents, i) {
-			if (offset) {
-				offset--;
-				continue;
-			}
-			vm_insert_page(vma, addr, sg_page(sg));
-			addr += PAGE_SIZE;
-		}
-		return 0;
 	}
+
+	for_each_sg(table->sgl, sg, table->nents, i) {
+		if (offset) {
+			offset--;
+			continue;
+		}
+		remap_pfn_range(vma, addr, page_to_pfn(sg_page(sg)),
+				sg_dma_len(sg), vma->vm_page_prot);
+		addr += sg_dma_len(sg);
+	}
+	return 0;
 }
 
 int ion_system_heap_cache_ops(struct ion_heap *heap, struct ion_buffer *buffer,
@@ -184,15 +270,30 @@
 
 	switch (cmd) {
 	case ION_IOC_CLEAN_CACHES:
-		dmac_clean_range(vaddr, vaddr + length);
+		if (!vaddr)
+			dma_sync_sg_for_device(NULL, buffer->sg_table->sgl,
+				buffer->sg_table->nents, DMA_TO_DEVICE);
+		else
+			dmac_clean_range(vaddr, vaddr + length);
 		outer_cache_op = outer_clean_range;
 		break;
 	case ION_IOC_INV_CACHES:
-		dmac_inv_range(vaddr, vaddr + length);
+		if (!vaddr)
+			dma_sync_sg_for_cpu(NULL, buffer->sg_table->sgl,
+				buffer->sg_table->nents, DMA_FROM_DEVICE);
+		else
+			dmac_inv_range(vaddr, vaddr + length);
 		outer_cache_op = outer_inv_range;
 		break;
 	case ION_IOC_CLEAN_INV_CACHES:
-		dmac_flush_range(vaddr, vaddr + length);
+		if (!vaddr) {
+			dma_sync_sg_for_device(NULL, buffer->sg_table->sgl,
+				buffer->sg_table->nents, DMA_TO_DEVICE);
+			dma_sync_sg_for_cpu(NULL, buffer->sg_table->sgl,
+				buffer->sg_table->nents, DMA_FROM_DEVICE);
+		} else {
+			dmac_flush_range(vaddr, vaddr + length);
+		}
 		outer_cache_op = outer_flush_range;
 		break;
 	default:
@@ -255,6 +356,14 @@
 	data->mapped_size = iova_length;
 	extra = iova_length - buffer->size;
 
+	/* Use the biggest alignment to allow bigger IOMMU mappings.
+	 * Use the first entry since the first entry will always be the
+	 * biggest entry. To take advantage of bigger mapping sizes both the
+	 * VA and PA addresses have to be aligned to the biggest size.
+	 */
+	if (table->sgl->length > align)
+		align = table->sgl->length;
+
 	ret = msm_allocate_iova_address(domain_num, partition_num,
 						data->mapped_size, align,
 						&data->iova_addr);
@@ -280,8 +389,9 @@
 
 	extra_iova_addr = data->iova_addr + buffer->size;
 	if (extra) {
-		ret = msm_iommu_map_extra(domain, extra_iova_addr, extra, SZ_4K,
-					  prot);
+		unsigned long phys_addr = sg_phys(table->sgl);
+		ret = msm_iommu_map_extra(domain, extra_iova_addr, phys_addr,
+					extra, SZ_4K, prot);
 		if (ret)
 			goto out2;
 	}
@@ -357,7 +467,7 @@
 }
 
 struct sg_table *ion_system_contig_heap_map_dma(struct ion_heap *heap,
-						   struct ion_buffer *buffer)
+						struct ion_buffer *buffer)
 {
 	struct sg_table *table;
 	int ret;
@@ -375,6 +485,13 @@
 	return table;
 }
 
+void ion_system_contig_heap_unmap_dma(struct ion_heap *heap,
+				      struct ion_buffer *buffer)
+{
+	sg_free_table(buffer->sg_table);
+	kfree(buffer->sg_table);
+}
+
 int ion_system_contig_heap_map_user(struct ion_heap *heap,
 				    struct ion_buffer *buffer,
 				    struct vm_area_struct *vma)
@@ -483,7 +600,7 @@
 	}
 	page = virt_to_page(buffer->vaddr);
 
-	sglist = kmalloc(sizeof(*sglist), GFP_KERNEL);
+	sglist = vmalloc(sizeof(*sglist));
 	if (!sglist)
 		goto out1;
 
@@ -500,32 +617,45 @@
 
 	if (extra) {
 		unsigned long extra_iova_addr = data->iova_addr + buffer->size;
-		ret = msm_iommu_map_extra(domain, extra_iova_addr, extra, SZ_4K,
-					  prot);
+		unsigned long phys_addr = sg_phys(sglist);
+		ret = msm_iommu_map_extra(domain, extra_iova_addr, phys_addr,
+					extra, SZ_4K, prot);
 		if (ret)
 			goto out2;
 	}
-	kfree(sglist);
+	vfree(sglist);
 	return ret;
 out2:
 	iommu_unmap_range(domain, data->iova_addr, buffer->size);
 
 out1:
-	kfree(sglist);
+	vfree(sglist);
 	msm_free_iova_address(data->iova_addr, domain_num, partition_num,
 						data->mapped_size);
 out:
 	return ret;
 }
 
+void *ion_system_contig_heap_map_kernel(struct ion_heap *heap,
+	struct ion_buffer *buffer)
+{
+	return buffer->priv_virt;
+}
+
+void ion_system_contig_heap_unmap_kernel(struct ion_heap *heap,
+	struct ion_buffer *buffer)
+{
+	return;
+}
+
 static struct ion_heap_ops kmalloc_ops = {
 	.allocate = ion_system_contig_heap_allocate,
 	.free = ion_system_contig_heap_free,
 	.phys = ion_system_contig_heap_phys,
 	.map_dma = ion_system_contig_heap_map_dma,
-	.unmap_dma = ion_system_heap_unmap_dma,
-	.map_kernel = ion_system_heap_map_kernel,
-	.unmap_kernel = ion_system_heap_unmap_kernel,
+	.unmap_dma = ion_system_contig_heap_unmap_dma,
+	.map_kernel = ion_system_contig_heap_map_kernel,
+	.unmap_kernel = ion_system_contig_heap_unmap_kernel,
 	.map_user = ion_system_contig_heap_map_user,
 	.cache_op = ion_system_contig_heap_cache_ops,
 	.print_debug = ion_system_contig_print_debug,
diff --git a/drivers/gpu/ion/msm/ion_cp_common.c b/drivers/gpu/ion/msm/ion_cp_common.c
index 41e0a04..7d54cfa 100644
--- a/drivers/gpu/ion/msm/ion_cp_common.c
+++ b/drivers/gpu/ion/msm/ion_cp_common.c
@@ -1,6 +1,6 @@
 /*
  * Copyright (C) 2011 Google, Inc
- * Copyright (c) 2011-2012, The Linux Foundation. All rights reserved.
+ * Copyright (c) 2011-2013, The Linux Foundation. All rights reserved.
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License version 2 and
@@ -12,12 +12,15 @@
  * GNU General Public License for more details.
  */
 
+#include <linux/memory_alloc.h>
 #include <linux/types.h>
 #include <mach/scm.h>
 
+#include "../ion_priv.h"
 #include "ion_cp_common.h"
 
 #define MEM_PROTECT_LOCK_ID	0x05
+#define MEM_PROTECT_LOCK_ID2 0x0A
 
 struct cp2_mem_chunks {
 	unsigned int *chunk_list;
@@ -25,28 +28,275 @@
 	unsigned int chunk_size;
 } __attribute__ ((__packed__));
 
-struct cp2_lock_req {
+struct cp2_lock2_req {
 	struct cp2_mem_chunks chunks;
 	unsigned int mem_usage;
 	unsigned int lock;
+	unsigned int flags;
 } __attribute__ ((__packed__));
 
+/*  SCM related code for locking down memory for content protection */
+
+#define SCM_CP_LOCK_CMD_ID	0x1
+#define SCM_CP_PROTECT		0x1
+#define SCM_CP_UNPROTECT	0x0
+
+struct cp_lock_msg {
+	unsigned int start;
+	unsigned int end;
+	unsigned int permission_type;
+	unsigned char lock;
+} __attribute__ ((__packed__));
+
+static int ion_cp_protect_mem_v1(unsigned int phy_base, unsigned int size,
+			      unsigned int permission_type)
+{
+	struct cp_lock_msg cmd;
+	cmd.start = phy_base;
+	cmd.end = phy_base + size;
+	cmd.permission_type = permission_type;
+	cmd.lock = SCM_CP_PROTECT;
+
+	return scm_call(SCM_SVC_MP, SCM_CP_LOCK_CMD_ID,
+			&cmd, sizeof(cmd), NULL, 0);
+}
+
+static int ion_cp_unprotect_mem_v1(unsigned int phy_base, unsigned int size,
+				unsigned int permission_type)
+{
+	struct cp_lock_msg cmd;
+	cmd.start = phy_base;
+	cmd.end = phy_base + size;
+	cmd.permission_type = permission_type;
+	cmd.lock = SCM_CP_UNPROTECT;
+
+	return scm_call(SCM_SVC_MP, SCM_CP_LOCK_CMD_ID,
+			&cmd, sizeof(cmd), NULL, 0);
+}
+
+#define V2_CHUNK_SIZE	SZ_1M
+
+static int ion_cp_change_mem_v2(unsigned int phy_base, unsigned int size,
+			      void *data, int lock)
+{
+	enum cp_mem_usage usage = (enum cp_mem_usage) data;
+	unsigned long *chunk_list;
+	int nchunks;
+	int ret;
+	int i;
+
+	if (usage < 0 || usage >= MAX_USAGE)
+		return -EINVAL;
+
+	if (!IS_ALIGNED(size, V2_CHUNK_SIZE)) {
+		pr_err("%s: heap size is not aligned to %x\n",
+			__func__, V2_CHUNK_SIZE);
+		return -EINVAL;
+	}
+
+	nchunks = size / V2_CHUNK_SIZE;
+
+	chunk_list = allocate_contiguous_ebi(sizeof(unsigned long)*nchunks,
+						SZ_4K, 0);
+	if (!chunk_list)
+		return -ENOMEM;
+
+	for (i = 0; i < nchunks; i++)
+		chunk_list[i] = phy_base + i * V2_CHUNK_SIZE;
+
+	ret = ion_cp_change_chunks_state(memory_pool_node_paddr(chunk_list),
+					nchunks, V2_CHUNK_SIZE, usage, lock);
+
+	free_contiguous_memory(chunk_list);
+	return ret;
+}
+
+int ion_cp_protect_mem(unsigned int phy_base, unsigned int size,
+			      unsigned int permission_type, int version,
+			      void *data)
+{
+	switch (version) {
+	case ION_CP_V1:
+		return ion_cp_protect_mem_v1(phy_base, size, permission_type);
+	case ION_CP_V2:
+		return ion_cp_change_mem_v2(phy_base, size, data,
+						SCM_CP_PROTECT);
+	default:
+		return -EINVAL;
+	}
+}
+
+int ion_cp_unprotect_mem(unsigned int phy_base, unsigned int size,
+			      unsigned int permission_type, int version,
+			      void *data)
+{
+	switch (version) {
+	case ION_CP_V1:
+		return ion_cp_unprotect_mem_v1(phy_base, size, permission_type);
+	case ION_CP_V2:
+		return ion_cp_change_mem_v2(phy_base, size, data,
+						SCM_CP_UNPROTECT);
+	default:
+		return -EINVAL;
+	}
+}
+
 int ion_cp_change_chunks_state(unsigned long chunks, unsigned int nchunks,
 				unsigned int chunk_size,
 				enum cp_mem_usage usage,
 				int lock)
 {
-	struct cp2_lock_req request;
+	struct cp2_lock2_req request;
+	u32 resp;
 
 	request.mem_usage = usage;
 	request.lock = lock;
+	request.flags = 0;
 
 	request.chunks.chunk_list = (unsigned int *)chunks;
 	request.chunks.chunk_list_size = nchunks;
 	request.chunks.chunk_size = chunk_size;
 
-	return scm_call(SCM_SVC_CP, MEM_PROTECT_LOCK_ID,
-			&request, sizeof(request), NULL, 0);
+	return scm_call(SCM_SVC_MP, MEM_PROTECT_LOCK_ID2,
+			&request, sizeof(request), &resp, sizeof(resp));
 
 }
 
+/* Must be protected by ion_cp_buffer lock */
+static int __ion_cp_protect_buffer(struct ion_buffer *buffer, int version,
+					void *data, int flags)
+{
+	struct ion_cp_buffer *buf = buffer->priv_virt;
+	int ret_value = 0;
+
+	if (atomic_inc_return(&buf->secure_cnt) == 1) {
+		ret_value = ion_cp_protect_mem(buf->buffer,
+				buffer->size, 0,
+				version, data);
+
+		if (ret_value) {
+			pr_err("Failed to secure buffer %p, error %d\n",
+				buffer, ret_value);
+			atomic_dec(&buf->secure_cnt);
+		} else {
+			pr_debug("Protected buffer %p from %pa (size %x)\n",
+				buffer, &buf->buffer,
+				buffer->size);
+			buf->want_delayed_unsecure |=
+				flags & ION_UNSECURE_DELAYED ? 1 : 0;
+			buf->data = data;
+			buf->version = version;
+		}
+	}
+	pr_debug("buffer %p protect count %d\n", buffer,
+		atomic_read(&buf->secure_cnt));
+	BUG_ON(atomic_read(&buf->secure_cnt) < 0);
+	return ret_value;
+}
+
+/* Must be protected by ion_cp_buffer lock */
+static int __ion_cp_unprotect_buffer(struct ion_buffer *buffer, int version,
+					void *data, int force_unsecure)
+{
+	struct ion_cp_buffer *buf = buffer->priv_virt;
+	int ret_value = 0;
+
+	if (force_unsecure) {
+		if (!buf->is_secure || atomic_read(&buf->secure_cnt) == 0)
+			return 0;
+
+		if (atomic_read(&buf->secure_cnt) != 1) {
+			WARN(1, "Forcing unsecure of buffer with outstanding secure count %d!\n",
+				atomic_read(&buf->secure_cnt));
+			atomic_set(&buf->secure_cnt, 1);
+		}
+	}
+
+	if (atomic_dec_and_test(&buf->secure_cnt)) {
+		ret_value = ion_cp_unprotect_mem(
+			buf->buffer, buffer->size,
+			0, version, data);
+
+		if (ret_value) {
+			pr_err("Failed to unsecure buffer %p, error %d\n",
+				buffer, ret_value);
+			/*
+			 * If the force unsecure is happening, the buffer
+			 * is being destroyed. We failed to unsecure the
+			 * buffer even though the memory is given back.
+			 * Just die now rather than discovering later what
+			 * happens when trying to use the secured memory as
+			 * unsecured...
+			 */
+			BUG_ON(force_unsecure);
+			/* Bump the count back up one to try again later */
+			atomic_inc(&buf->secure_cnt);
+		} else {
+			buf->version = -1;
+			buf->data = NULL;
+		}
+	}
+	pr_debug("buffer %p unprotect count %d\n", buffer,
+		atomic_read(&buf->secure_cnt));
+	BUG_ON(atomic_read(&buf->secure_cnt) < 0);
+	return ret_value;
+}
+
+int ion_cp_secure_buffer(struct ion_buffer *buffer, int version, void *data,
+				int flags)
+{
+	int ret_value;
+	struct ion_cp_buffer *buf = buffer->priv_virt;
+
+	mutex_lock(&buf->lock);
+	if (!buf->is_secure) {
+		pr_err("%s: buffer %p was not allocated as secure\n",
+			__func__, buffer);
+		ret_value = -EINVAL;
+		goto out_unlock;
+	}
+
+	if (ION_IS_CACHED(buffer->flags)) {
+		pr_err("%s: buffer %p was allocated as cached\n",
+			__func__, buffer);
+		ret_value = -EINVAL;
+		goto out_unlock;
+	}
+
+	if (atomic_read(&buf->map_cnt)) {
+		pr_err("%s: cannot secure buffer %p with outstanding mappings. Total count: %d",
+			__func__, buffer, atomic_read(&buf->map_cnt));
+		ret_value = -EINVAL;
+		goto out_unlock;
+	}
+
+	if (atomic_read(&buf->secure_cnt)) {
+		if (buf->version != version || buf->data != data) {
+			pr_err("%s: Trying to re-secure buffer with different values",
+				__func__);
+			pr_err("Last secured version: %d Currrent %d\n",
+				buf->version, version);
+			pr_err("Last secured data: %p current %p\n",
+				buf->data, data);
+			ret_value = -EINVAL;
+			goto out_unlock;
+		}
+	}
+	ret_value = __ion_cp_protect_buffer(buffer, version, data, flags);
+
+out_unlock:
+	mutex_unlock(&buf->lock);
+	return ret_value;
+}
+
+int ion_cp_unsecure_buffer(struct ion_buffer *buffer, int force_unsecure)
+{
+	int ret_value = 0;
+	struct ion_cp_buffer *buf = buffer->priv_virt;
+
+	mutex_lock(&buf->lock);
+	ret_value = __ion_cp_unprotect_buffer(buffer, buf->version, buf->data,
+						force_unsecure);
+	mutex_unlock(&buf->lock);
+	return ret_value;
+}
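
ion_cp_secure_buffer() and ion_cp_unsecure_buffer() are wired up as the new .secure_buffer/.unsecure_buffer heap ops earlier in this patch, and msm_ion.c below exposes them to clients as msm_ion_secure_buffer()/msm_ion_unsecure_buffer(). A hedged usage sketch from a kernel client; the five-argument ion_alloc() and the ION_HEAP() bitmask macro are assumptions about this tree rather than something shown in the diff, and error handling is trimmed:

#include <linux/err.h>
#include <linux/msm_ion.h>

static int example_secure_use(struct ion_client *client,
			      enum cp_mem_usage usage)
{
	struct ion_handle *handle;
	int ret;

	/* Must be uncached and come from a secure-capable heap. */
	handle = ion_alloc(client, SZ_1M, SZ_1M,
			   ION_HEAP(ION_CP_MM_HEAP_ID), ION_FLAG_SECURE);
	if (IS_ERR_OR_NULL(handle))
		return -ENOMEM;

	/*
	 * Lock the buffer down via trustzone; this fails while kernel or
	 * user mappings are still outstanding (map_cnt != 0).
	 */
	ret = msm_ion_secure_buffer(client, handle, usage, 0);
	if (ret)
		goto free;

	/* ... hand the physical range to the secure master here ... */

	msm_ion_unsecure_buffer(client, handle);
free:
	ion_free(client, handle);
	return ret;
}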
diff --git a/drivers/gpu/ion/msm/ion_cp_common.h b/drivers/gpu/ion/msm/ion_cp_common.h
index eec66e6..8ae19be 100644
--- a/drivers/gpu/ion/msm/ion_cp_common.h
+++ b/drivers/gpu/ion/msm/ion_cp_common.h
@@ -20,6 +20,26 @@
 #define ION_CP_V1	1
 #define ION_CP_V2	2
 
+struct ion_cp_buffer {
+	phys_addr_t buffer;
+	atomic_t secure_cnt;
+	int is_secure;
+	int want_delayed_unsecure;
+	/*
+	 * Currently all user/kernel mappings are protected by the heap lock.
+	 * This is sufficient to protect the map count as well. The lock
+	 * should be used to protect map_cnt if the whole heap lock is
+	 * ever removed.
+	 */
+	atomic_t map_cnt;
+	/*
+	 * protects secure_cnt for securing.
+	 */
+	struct mutex lock;
+	int version;
+	void *data;
+};
+
 #if defined(CONFIG_ION_MSM)
 /*
  * ion_cp2_protect_mem - secures memory via trustzone
@@ -37,6 +57,18 @@
 			unsigned int chunk_size, enum cp_mem_usage usage,
 			int lock);
 
+int ion_cp_protect_mem(unsigned int phy_base, unsigned int size,
+			unsigned int permission_type, int version,
+			void *data);
+
+int ion_cp_unprotect_mem(unsigned int phy_base, unsigned int size,
+				unsigned int permission_type, int version,
+				void *data);
+
+int ion_cp_secure_buffer(struct ion_buffer *buffer, int version, void *data,
+				int flags);
+
+int ion_cp_unsecure_buffer(struct ion_buffer *buffer, int force_unsecure);
 #else
 static inline int ion_cp_change_chunks_state(unsigned long chunks,
 			unsigned int nchunks, unsigned int chunk_size,
@@ -44,6 +76,32 @@
 {
 	return -ENODEV;
 }
+
+static inline int ion_cp_protect_mem(unsigned int phy_base, unsigned int size,
+			unsigned int permission_type, int version,
+			void *data)
+{
+	return -ENODEV;
+}
+
+static inline int ion_cp_unprotect_mem(unsigned int phy_base, unsigned int size,
+				unsigned int permission_type, int version,
+				void *data)
+{
+	return -ENODEV;
+}
+
+static inline int ion_cp_secure_buffer(struct ion_buffer *buffer, int version,
+				void *data, int flags)
+{
+	return -ENODEV;
+}
+
+static inline int ion_cp_unsecure_buffer(struct ion_buffer *buffer,
+				int force_unsecure)
+{
+	return -ENODEV;
+}
 #endif
 
 #endif
diff --git a/drivers/gpu/ion/msm/msm_ion.c b/drivers/gpu/ion/msm/msm_ion.c
index ab5d09b..4b55875 100644
--- a/drivers/gpu/ion/msm/msm_ion.c
+++ b/drivers/gpu/ion/msm/msm_ion.c
@@ -1,4 +1,4 @@
-/* Copyright (c) 2011-2012, The Linux Foundation. All rights reserved.
+/* Copyright (c) 2011-2013, The Linux Foundation. All rights reserved.
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License version 2 and
@@ -19,11 +19,13 @@
 #include <linux/memory_alloc.h>
 #include <linux/fmem.h>
 #include <linux/of.h>
+#include <linux/of_platform.h>
 #include <linux/mm.h>
 #include <linux/mm_types.h>
 #include <linux/sched.h>
 #include <linux/rwsem.h>
 #include <linux/uaccess.h>
+#include <linux/memblock.h>
 #include <mach/ion.h>
 #include <mach/msm_memtypes.h>
 #include "../ion_priv.h"
@@ -44,6 +46,7 @@
 };
 
 
+#ifdef CONFIG_OF
 static struct ion_heap_desc ion_heap_meta[] = {
 	{
 		.id	= ION_SYSTEM_HEAP_ID,
@@ -51,8 +54,13 @@
 		.name	= ION_VMALLOC_HEAP_NAME,
 	},
 	{
+		.id	= ION_SYSTEM_CONTIG_HEAP_ID,
+		.type	= ION_HEAP_TYPE_SYSTEM_CONTIG,
+		.name	= ION_KMALLOC_HEAP_NAME,
+	},
+	{
 		.id	= ION_CP_MM_HEAP_ID,
-		.type	= ION_HEAP_TYPE_CP,
+		.type	= ION_HEAP_TYPE_SECURE_DMA,
 		.name	= ION_MM_HEAP_NAME,
 		.permission_type = IPT_TYPE_MM_CARVEOUT,
 	},
@@ -88,6 +96,16 @@
 		.name	= ION_AUDIO_HEAP_NAME,
 	},
 	{
+		.id	= ION_PIL1_HEAP_ID,
+		.type	= ION_HEAP_TYPE_CARVEOUT,
+		.name	= ION_PIL1_HEAP_NAME,
+	},
+	{
+		.id	= ION_PIL2_HEAP_ID,
+		.type	= ION_HEAP_TYPE_CARVEOUT,
+		.name	= ION_PIL2_HEAP_NAME,
+	},
+	{
 		.id	= ION_CP_WB_HEAP_ID,
 		.type	= ION_HEAP_TYPE_CP,
 		.name	= ION_WB_HEAP_NAME,
@@ -97,7 +115,13 @@
 		.type	= ION_HEAP_TYPE_CARVEOUT,
 		.name	= ION_CAMERA_HEAP_NAME,
 	},
+	{
+		.id	= ION_ADSP_HEAP_ID,
+		.type	= ION_HEAP_TYPE_DMA,
+		.name	= ION_ADSP_HEAP_NAME,
+	}
 };
+#endif
 
 struct ion_client *msm_ion_client_create(unsigned int heap_mask,
 					const char *name)
@@ -130,6 +154,22 @@
 }
 EXPORT_SYMBOL(msm_ion_unsecure_heap_2_0);
 
+int msm_ion_secure_buffer(struct ion_client *client, struct ion_handle *handle,
+				enum cp_mem_usage usage,
+				int flags)
+{
+	return ion_secure_handle(client, handle, ION_CP_V2,
+				(void *)usage, flags);
+}
+EXPORT_SYMBOL(msm_ion_secure_buffer);
+
+int msm_ion_unsecure_buffer(struct ion_client *client,
+				struct ion_handle *handle)
+{
+	return ion_unsecure_handle(client, handle);
+}
+EXPORT_SYMBOL(msm_ion_unsecure_buffer);
+
 int msm_ion_do_cache_op(struct ion_client *client, struct ion_handle *handle,
 			void *vaddr, unsigned long len, unsigned int cmd)
 {
@@ -137,7 +177,7 @@
 }
 EXPORT_SYMBOL(msm_ion_do_cache_op);
 
-static unsigned long msm_ion_get_base(unsigned long size, int memory_type,
+static ion_phys_addr_t msm_ion_get_base(unsigned long size, int memory_type,
 				    unsigned int align)
 {
 	switch (memory_type) {
@@ -305,10 +345,10 @@
 static int is_heap_overlapping(const struct ion_platform_heap *heap1,
 				const struct ion_platform_heap *heap2)
 {
-	unsigned long heap1_base = heap1->base;
-	unsigned long heap2_base = heap2->base;
-	unsigned long heap1_end = heap1->base + heap1->size - 1;
-	unsigned long heap2_end = heap2->base + heap2->size - 1;
+	ion_phys_addr_t heap1_base = heap1->base;
+	ion_phys_addr_t heap2_base = heap2->base;
+	ion_phys_addr_t heap1_end = heap1->base + heap1->size - 1;
+	ion_phys_addr_t heap2_end = heap2->base + heap2->size - 1;
 
 	if (heap1_base == heap2_base)
 		return 1;
@@ -341,6 +381,7 @@
 	}
 }
 
+#ifdef CONFIG_OF
 static int msm_init_extra_data(struct ion_platform_heap *heap,
 			       const struct ion_heap_desc *heap_desc)
 {
@@ -397,6 +438,7 @@
 	unsigned int i;
 	for (i = 0; i < pdata->nr; ++i)
 		kfree(pdata->heaps[i].extra_data);
+	kfree(pdata->heaps);
 	kfree(pdata);
 }
 
@@ -442,6 +484,7 @@
 {
 	unsigned int val;
 	int ret = 0;
+	u32 out_values[2];
 	const char *memory_name_prop;
 
 	ret = of_property_read_u32(node, "qcom,memory-reservation-size", &val);
@@ -465,12 +508,29 @@
 			ret = -EINVAL;
 		}
 	} else {
-		ret = 0;
+		ret = of_property_read_u32_array(node, "qcom,memory-fixed",
+								out_values, 2);
+		if (!ret)
+			heap->size = out_values[1];
+		else
+			ret = 0;
 	}
 out:
 	return ret;
 }
 
+static void msm_ion_get_heap_base(struct device_node *node,
+				 struct ion_platform_heap *heap)
+{
+	u32 out_values[2];
+	int ret = 0;
+
+	ret = of_property_read_u32_array(node, "qcom,memory-fixed",
+							out_values, 2);
+	if (!ret)
+		heap->base = out_values[0];
+	return;
+}
 
 static void msm_ion_get_heap_adjacent(struct device_node *node,
 				      struct ion_platform_heap *heap)
@@ -504,11 +564,13 @@
 	}
 }
 
-static struct ion_platform_data *msm_ion_parse_dt(
-					const struct device_node *dt_node)
+static struct ion_platform_data *msm_ion_parse_dt(struct platform_device *pdev)
 {
 	struct ion_platform_data *pdata = 0;
+	struct ion_platform_heap *heaps = NULL;
 	struct device_node *node;
+	struct platform_device *new_dev = NULL;
+	const struct device_node *dt_node = pdev->dev.of_node;
 	uint32_t val = 0;
 	int ret = 0;
 	uint32_t num_heaps = 0;
@@ -520,14 +582,27 @@
 	if (!num_heaps)
 		return ERR_PTR(-EINVAL);
 
-	pdata = kzalloc(sizeof(struct ion_platform_data) +
-			num_heaps*sizeof(struct ion_platform_heap), GFP_KERNEL);
+	pdata = kzalloc(sizeof(struct ion_platform_data), GFP_KERNEL);
 	if (!pdata)
 		return ERR_PTR(-ENOMEM);
 
+	heaps = kzalloc(sizeof(struct ion_platform_heap)*num_heaps, GFP_KERNEL);
+	if (!heaps) {
+		kfree(pdata);
+		return ERR_PTR(-ENOMEM);
+	}
+
+	pdata->heaps = heaps;
 	pdata->nr = num_heaps;
 
 	for_each_child_of_node(dt_node, node) {
+		new_dev = of_platform_device_create(node, NULL, &pdev->dev);
+		if (!new_dev) {
+			pr_err("Failed to create device %s\n", node->name);
+			goto free_heaps;
+		}
+
+		pdata->heaps[idx].priv = &new_dev->dev;
 		/**
 		 * TODO: Replace this with of_get_address() when this patch
 		 * gets merged: http://
@@ -544,6 +619,7 @@
 		if (ret)
 			goto free_heaps;
 
+		msm_ion_get_heap_base(node, &pdata->heaps[idx]);
 		msm_ion_get_heap_align(node, &pdata->heaps[idx]);
 
 		ret = msm_ion_get_heap_size(node, &pdata->heaps[idx]);
@@ -560,6 +636,17 @@
 	free_pdata(pdata);
 	return ERR_PTR(ret);
 }
+#else
+static struct ion_platform_data *msm_ion_parse_dt(struct platform_device *pdev)
+{
+	return NULL;
+}
+
+static void free_pdata(const struct ion_platform_data *pdata)
+{
+
+}
+#endif
 
 static int check_vaddr_bounds(unsigned long start, unsigned long end)
 {
@@ -570,22 +657,36 @@
 	if (end < start)
 		goto out;
 
-	down_read(&mm->mmap_sem);
 	vma = find_vma(mm, start);
 	if (vma && vma->vm_start < end) {
 		if (start < vma->vm_start)
-			goto out_up;
+			goto out;
 		if (end > vma->vm_end)
-			goto out_up;
+			goto out;
 		ret = 0;
 	}
 
-out_up:
-	up_read(&mm->mmap_sem);
 out:
 	return ret;
 }
 
+int ion_heap_allow_secure_allocation(enum ion_heap_type type)
+{
+	return type == ((enum ion_heap_type) ION_HEAP_TYPE_CP) ||
+		type == ((enum ion_heap_type) ION_HEAP_TYPE_SECURE_DMA);
+}
+
+int ion_heap_allow_handle_secure(enum ion_heap_type type)
+{
+	return type == ((enum ion_heap_type) ION_HEAP_TYPE_CP) ||
+		type == ((enum ion_heap_type) ION_HEAP_TYPE_SECURE_DMA);
+}
+
+int ion_heap_allow_heap_secure(enum ion_heap_type type)
+{
+	return type == ((enum ion_heap_type) ION_HEAP_TYPE_CP);
+}
+
 static long msm_ion_custom_ioctl(struct ion_client *client,
 				unsigned int cmd,
 				unsigned long arg)
@@ -599,20 +700,12 @@
 		unsigned long start, end;
 		struct ion_handle *handle = NULL;
 		int ret;
+		struct mm_struct *mm = current->active_mm;
 
 		if (copy_from_user(&data, (void __user *)arg,
 					sizeof(struct ion_flush_data)))
 			return -EFAULT;
 
-		start = (unsigned long) data.vaddr;
-		end = (unsigned long) data.vaddr + data.length;
-
-		if (check_vaddr_bounds(start, end)) {
-			pr_err("%s: virtual address %p is out of bounds\n",
-				__func__, data.vaddr);
-			return -EINVAL;
-		}
-
 		if (!data.handle) {
 			handle = ion_import_dma_buf(client, data.fd);
 			if (IS_ERR(handle)) {
@@ -622,11 +715,27 @@
 			}
 		}
 
+		down_read(&mm->mmap_sem);
+
+		start = (unsigned long) data.vaddr;
+		end = (unsigned long) data.vaddr + data.length;
+
+		if (start && check_vaddr_bounds(start, end)) {
+			up_read(&mm->mmap_sem);
+			pr_err("%s: virtual address %p is out of bounds\n",
+				__func__, data.vaddr);
+			if (!data.handle)
+				ion_free(client, handle);
+			return -EINVAL;
+		}
+
 		ret = ion_do_cache_op(client,
 				data.handle ? data.handle : handle,
 				data.vaddr, data.offset, data.length,
 				cmd);
 
+		up_read(&mm->mmap_sem);
+
 		if (!data.handle)
 			ion_free(client, handle);
 
@@ -635,28 +744,74 @@
 		break;
 
 	}
-	case ION_IOC_GET_FLAGS:
-	{
-		struct ion_flag_data data;
-		int ret;
-		if (copy_from_user(&data, (void __user *)arg,
-					sizeof(struct ion_flag_data)))
-			return -EFAULT;
-
-		ret = ion_handle_get_flags(client, data.handle, &data.flags);
-		if (ret < 0)
-			return ret;
-		if (copy_to_user((void __user *)arg, &data,
-					sizeof(struct ion_flag_data)))
-			return -EFAULT;
-		break;
-	}
 	default:
 		return -ENOTTY;
 	}
 	return 0;
 }
 
+static struct ion_heap *msm_ion_heap_create(struct ion_platform_heap *heap_data)
+{
+	struct ion_heap *heap = NULL;
+
+	switch ((int)heap_data->type) {
+	case ION_HEAP_TYPE_IOMMU:
+		heap = ion_iommu_heap_create(heap_data);
+		break;
+	case ION_HEAP_TYPE_CP:
+		heap = ion_cp_heap_create(heap_data);
+		break;
+#ifdef CONFIG_CMA
+	case ION_HEAP_TYPE_DMA:
+		heap = ion_cma_heap_create(heap_data);
+		break;
+
+	case ION_HEAP_TYPE_SECURE_DMA:
+		heap = ion_secure_cma_heap_create(heap_data);
+		break;
+#endif
+	default:
+		heap = ion_heap_create(heap_data);
+	}
+
+	if (IS_ERR_OR_NULL(heap)) {
+		pr_err("%s: error creating heap %s type %d base %pa size %u\n",
+		       __func__, heap_data->name, heap_data->type,
+		       &heap_data->base, heap_data->size);
+		return ERR_PTR(-EINVAL);
+	}
+
+	heap->name = heap_data->name;
+	heap->id = heap_data->id;
+	heap->priv = heap_data->priv;
+	return heap;
+}
+
+static void msm_ion_heap_destroy(struct ion_heap *heap)
+{
+	if (!heap)
+		return;
+
+	switch ((int)heap->type) {
+	case ION_HEAP_TYPE_IOMMU:
+		ion_iommu_heap_destroy(heap);
+		break;
+	case ION_HEAP_TYPE_CP:
+		ion_cp_heap_destroy(heap);
+		break;
+#ifdef CONFIG_CMA
+	case ION_HEAP_TYPE_DMA:
+		ion_cma_heap_destroy(heap);
+		break;
+	case ION_HEAP_TYPE_SECURE_DMA:
+		ion_secure_cma_heap_destroy(heap);
+		break;
+#endif
+	default:
+		ion_heap_destroy(heap);
+	}
+}
+
 static int msm_ion_probe(struct platform_device *pdev)
 {
 	struct ion_platform_data *pdata;
@@ -664,7 +819,7 @@
 	int err = -1;
 	int i;
 	if (pdev->dev.of_node) {
-		pdata = msm_ion_parse_dt(pdev->dev.of_node);
+		pdata = msm_ion_parse_dt(pdev);
 		if (IS_ERR(pdata)) {
 			err = PTR_ERR(pdata);
 			goto out;
@@ -698,15 +853,15 @@
 		msm_ion_allocate(heap_data);
 
 		heap_data->has_outer_cache = pdata->has_outer_cache;
-		heaps[i] = ion_heap_create(heap_data);
+		heaps[i] = msm_ion_heap_create(heap_data);
 		if (IS_ERR_OR_NULL(heaps[i])) {
 			heaps[i] = 0;
 			continue;
 		} else {
 			if (heap_data->size)
-				pr_info("ION heap %s created at %lx "
+				pr_info("ION heap %s created at %pa "
 					"with size %x\n", heap_data->name,
-							  heap_data->base,
+							  &heap_data->base,
 							  heap_data->size);
 			else
 				pr_info("ION heap %s created\n",
@@ -715,10 +870,10 @@
 
 		ion_device_add_heap(idev, heaps[i]);
 	}
+	check_for_heap_overlap(pdata->heaps, num_heaps);
 	if (pdata_needs_to_be_freed)
 		free_pdata(pdata);
 
-	check_for_heap_overlap(pdata->heaps, num_heaps);
 	platform_set_drvdata(pdev, idev);
 	return 0;
 
@@ -736,7 +891,7 @@
 	int i;
 
 	for (i = 0; i < num_heaps; i++)
-		ion_heap_destroy(heaps[i]);
+		msm_ion_heap_destroy(heaps[i]);
 
 	ion_device_destroy(idev);
 	kfree(heaps);
diff --git a/drivers/gpu/msm/Makefile b/drivers/gpu/msm/Makefile
index fec5363..3441afa 100644
--- a/drivers/gpu/msm/Makefile
+++ b/drivers/gpu/msm/Makefile
@@ -1,4 +1,4 @@
-ccflags-y := -Iinclude/drm -Idrivers/gpu/msm
+ccflags-y := -Iinclude/uapi/drm -Iinclude/drm -Idrivers/gpu/msm
 
 msm_kgsl_core-y = \
 	kgsl.o \
diff --git a/drivers/gpu/msm/a3xx_reg.h b/drivers/gpu/msm/a3xx_reg.h
index be9f3ac..a2f0e60 100644
--- a/drivers/gpu/msm/a3xx_reg.h
+++ b/drivers/gpu/msm/a3xx_reg.h
@@ -66,15 +66,103 @@
 #define A3XX_RBBM_INT_0_MASK 0x063
 #define A3XX_RBBM_INT_0_STATUS 0x064
 #define A3XX_RBBM_PERFCTR_CTL 0x80
+#define A3XX_RBBM_PERFCTR_LOAD_CMD0 0x81
+#define A3XX_RBBM_PERFCTR_LOAD_CMD1 0x82
+#define A3XX_RBBM_PERFCTR_LOAD_VALUE_LO 0x84
+#define A3XX_RBBM_PERFCTR_LOAD_VALUE_HI 0x85
+#define A3XX_RBBM_PERFCOUNTER0_SELECT 0x86
+#define A3XX_RBBM_PERFCOUNTER1_SELECT 0x87
 #define A3XX_RBBM_GPU_BUSY_MASKED 0x88
+#define A3XX_RBBM_PERFCTR_CP_0_LO 0x90
+#define A3XX_RBBM_PERFCTR_CP_0_HI 0x91
+#define A3XX_RBBM_PERFCTR_RBBM_0_LO 0x92
+#define A3XX_RBBM_PERFCTR_RBBM_0_HI 0x93
+#define A3XX_RBBM_PERFCTR_RBBM_1_LO 0x94
+#define A3XX_RBBM_PERFCTR_RBBM_1_HI 0x95
+#define A3XX_RBBM_PERFCTR_PC_0_LO 0x96
+#define A3XX_RBBM_PERFCTR_PC_0_HI 0x97
+#define A3XX_RBBM_PERFCTR_PC_1_LO 0x98
+#define A3XX_RBBM_PERFCTR_PC_1_HI 0x99
+#define A3XX_RBBM_PERFCTR_PC_2_LO 0x9A
+#define A3XX_RBBM_PERFCTR_PC_2_HI 0x9B
+#define A3XX_RBBM_PERFCTR_PC_3_LO 0x9C
+#define A3XX_RBBM_PERFCTR_PC_3_HI 0x9D
+#define A3XX_RBBM_PERFCTR_VFD_0_LO 0x9E
+#define A3XX_RBBM_PERFCTR_VFD_0_HI 0x9F
+#define A3XX_RBBM_PERFCTR_VFD_1_LO 0xA0
+#define A3XX_RBBM_PERFCTR_VFD_1_HI 0xA1
+#define A3XX_RBBM_PERFCTR_HLSQ_0_LO 0xA2
+#define A3XX_RBBM_PERFCTR_HLSQ_0_HI 0xA3
+#define A3XX_RBBM_PERFCTR_HLSQ_1_LO 0xA4
+#define A3XX_RBBM_PERFCTR_HLSQ_1_HI 0xA5
+#define A3XX_RBBM_PERFCTR_HLSQ_2_LO 0xA6
+#define A3XX_RBBM_PERFCTR_HLSQ_2_HI 0xA7
+#define A3XX_RBBM_PERFCTR_HLSQ_3_LO 0xA8
+#define A3XX_RBBM_PERFCTR_HLSQ_3_HI 0xA9
+#define A3XX_RBBM_PERFCTR_HLSQ_4_LO 0xAA
+#define A3XX_RBBM_PERFCTR_HLSQ_4_HI 0xAB
+#define A3XX_RBBM_PERFCTR_HLSQ_5_LO 0xAC
+#define A3XX_RBBM_PERFCTR_HLSQ_5_HI 0xAD
+#define A3XX_RBBM_PERFCTR_VPC_0_LO 0xAE
+#define A3XX_RBBM_PERFCTR_VPC_0_HI 0xAF
+#define A3XX_RBBM_PERFCTR_VPC_1_LO 0xB0
+#define A3XX_RBBM_PERFCTR_VPC_1_HI 0xB1
+#define A3XX_RBBM_PERFCTR_TSE_0_LO 0xB2
+#define A3XX_RBBM_PERFCTR_TSE_0_HI 0xB3
+#define A3XX_RBBM_PERFCTR_TSE_1_LO 0xB4
+#define A3XX_RBBM_PERFCTR_TSE_1_HI 0xB5
+#define A3XX_RBBM_PERFCTR_RAS_0_LO 0xB6
+#define A3XX_RBBM_PERFCTR_RAS_0_HI 0xB7
+#define A3XX_RBBM_PERFCTR_RAS_1_LO 0xB8
+#define A3XX_RBBM_PERFCTR_RAS_1_HI 0xB9
+#define A3XX_RBBM_PERFCTR_UCHE_0_LO 0xBA
+#define A3XX_RBBM_PERFCTR_UCHE_0_HI 0xBB
+#define A3XX_RBBM_PERFCTR_UCHE_1_LO 0xBC
+#define A3XX_RBBM_PERFCTR_UCHE_1_HI 0xBD
+#define A3XX_RBBM_PERFCTR_UCHE_2_LO 0xBE
+#define A3XX_RBBM_PERFCTR_UCHE_2_HI 0xBF
+#define A3XX_RBBM_PERFCTR_UCHE_3_LO 0xC0
+#define A3XX_RBBM_PERFCTR_UCHE_3_HI 0xC1
+#define A3XX_RBBM_PERFCTR_UCHE_4_LO 0xC2
+#define A3XX_RBBM_PERFCTR_UCHE_4_HI 0xC3
+#define A3XX_RBBM_PERFCTR_UCHE_5_LO 0xC4
+#define A3XX_RBBM_PERFCTR_UCHE_5_HI 0xC5
+#define A3XX_RBBM_PERFCTR_TP_0_LO 0xC6
+#define A3XX_RBBM_PERFCTR_TP_0_HI 0xC7
+#define A3XX_RBBM_PERFCTR_TP_1_LO 0xC8
+#define A3XX_RBBM_PERFCTR_TP_1_HI 0xC9
+#define A3XX_RBBM_PERFCTR_TP_2_LO 0xCA
+#define A3XX_RBBM_PERFCTR_TP_2_HI 0xCB
+#define A3XX_RBBM_PERFCTR_TP_3_LO 0xCC
+#define A3XX_RBBM_PERFCTR_TP_3_HI 0xCD
+#define A3XX_RBBM_PERFCTR_TP_4_LO 0xCE
+#define A3XX_RBBM_PERFCTR_TP_4_HI 0xCF
+#define A3XX_RBBM_PERFCTR_TP_5_LO 0xD0
+#define A3XX_RBBM_PERFCTR_TP_5_HI 0xD1
+#define A3XX_RBBM_PERFCTR_SP_0_LO 0xD2
+#define A3XX_RBBM_PERFCTR_SP_0_HI 0xD3
+#define A3XX_RBBM_PERFCTR_SP_1_LO 0xD4
+#define A3XX_RBBM_PERFCTR_SP_1_HI 0xD5
+#define A3XX_RBBM_PERFCTR_SP_2_LO 0xD6
+#define A3XX_RBBM_PERFCTR_SP_2_HI 0xD7
+#define A3XX_RBBM_PERFCTR_SP_3_LO 0xD8
+#define A3XX_RBBM_PERFCTR_SP_3_HI 0xD9
+#define A3XX_RBBM_PERFCTR_SP_4_LO 0xDA
+#define A3XX_RBBM_PERFCTR_SP_4_HI 0xDB
 #define A3XX_RBBM_PERFCTR_SP_5_LO 0xDC
 #define A3XX_RBBM_PERFCTR_SP_5_HI 0xDD
 #define A3XX_RBBM_PERFCTR_SP_6_LO 0xDE
 #define A3XX_RBBM_PERFCTR_SP_6_HI 0xDF
 #define A3XX_RBBM_PERFCTR_SP_7_LO 0xE0
 #define A3XX_RBBM_PERFCTR_SP_7_HI 0xE1
+#define A3XX_RBBM_PERFCTR_RB_0_LO 0xE2
+#define A3XX_RBBM_PERFCTR_RB_0_HI 0xE3
+#define A3XX_RBBM_PERFCTR_RB_1_LO 0xE4
+#define A3XX_RBBM_PERFCTR_RB_1_HI 0xE5
+
 #define A3XX_RBBM_RBBM_CTL 0x100
-#define A3XX_RBBM_RBBM_CTL 0x100
+#define A3XX_RBBM_PERFCTR_PWR_0_LO 0x0EA
+#define A3XX_RBBM_PERFCTR_PWR_0_HI 0x0EB
 #define A3XX_RBBM_PERFCTR_PWR_1_LO 0x0EC
 #define A3XX_RBBM_PERFCTR_PWR_1_HI 0x0ED
 #define A3XX_RBBM_DEBUG_BUS_CTL             0x111
@@ -90,6 +178,7 @@
 #define A3XX_CP_MERCIU_DATA2 0x1D3
 #define A3XX_CP_MEQ_ADDR 0x1DA
 #define A3XX_CP_MEQ_DATA 0x1DB
+#define A3XX_CP_PERFCOUNTER_SELECT 0x445
 #define A3XX_CP_HW_FAULT  0x45C
 #define A3XX_CP_AHB_FAULT 0x54D
 #define A3XX_CP_PROTECT_CTRL 0x45E
@@ -138,6 +227,14 @@
 #define A3XX_VSC_PIPE_CONFIG_7 0xC1B
 #define A3XX_VSC_PIPE_DATA_ADDRESS_7 0xC1C
 #define A3XX_VSC_PIPE_DATA_LENGTH_7 0xC1D
+#define A3XX_PC_PERFCOUNTER0_SELECT 0xC48
+#define A3XX_PC_PERFCOUNTER1_SELECT 0xC49
+#define A3XX_PC_PERFCOUNTER2_SELECT 0xC4A
+#define A3XX_PC_PERFCOUNTER3_SELECT 0xC4B
+#define A3XX_GRAS_PERFCOUNTER0_SELECT 0xC88
+#define A3XX_GRAS_PERFCOUNTER1_SELECT 0xC89
+#define A3XX_GRAS_PERFCOUNTER2_SELECT 0xC8A
+#define A3XX_GRAS_PERFCOUNTER3_SELECT 0xC8B
 #define A3XX_GRAS_CL_USER_PLANE_X0 0xCA0
 #define A3XX_GRAS_CL_USER_PLANE_Y0 0xCA1
 #define A3XX_GRAS_CL_USER_PLANE_Z0 0xCA2
@@ -163,14 +260,42 @@
 #define A3XX_GRAS_CL_USER_PLANE_Z5 0xCB6
 #define A3XX_GRAS_CL_USER_PLANE_W5 0xCB7
 #define A3XX_RB_GMEM_BASE_ADDR 0xCC0
+#define A3XX_RB_PERFCOUNTER0_SELECT   0xCC6
+#define A3XX_RB_PERFCOUNTER1_SELECT   0xCC7
+#define A3XX_HLSQ_PERFCOUNTER0_SELECT 0xE00
+#define A3XX_HLSQ_PERFCOUNTER1_SELECT 0xE01
+#define A3XX_HLSQ_PERFCOUNTER2_SELECT 0xE02
+#define A3XX_HLSQ_PERFCOUNTER3_SELECT 0xE03
+#define A3XX_HLSQ_PERFCOUNTER4_SELECT 0xE04
+#define A3XX_HLSQ_PERFCOUNTER5_SELECT 0xE05
 #define A3XX_VFD_PERFCOUNTER0_SELECT 0xE44
+#define A3XX_VFD_PERFCOUNTER1_SELECT 0xE45
 #define A3XX_VPC_VPC_DEBUG_RAM_SEL 0xE61
 #define A3XX_VPC_VPC_DEBUG_RAM_READ 0xE62
+#define A3XX_VPC_PERFCOUNTER0_SELECT 0xE64
+#define A3XX_VPC_PERFCOUNTER1_SELECT 0xE65
 #define A3XX_UCHE_CACHE_MODE_CONTROL_REG 0xE82
+#define A3XX_UCHE_PERFCOUNTER0_SELECT 0xE84
+#define A3XX_UCHE_PERFCOUNTER1_SELECT 0xE85
+#define A3XX_UCHE_PERFCOUNTER2_SELECT 0xE86
+#define A3XX_UCHE_PERFCOUNTER3_SELECT 0xE87
+#define A3XX_UCHE_PERFCOUNTER4_SELECT 0xE88
+#define A3XX_UCHE_PERFCOUNTER5_SELECT 0xE89
 #define A3XX_UCHE_CACHE_INVALIDATE0_REG 0xEA0
+#define A3XX_SP_PERFCOUNTER0_SELECT 0xEC4
+#define A3XX_SP_PERFCOUNTER1_SELECT 0xEC5
+#define A3XX_SP_PERFCOUNTER2_SELECT 0xEC6
+#define A3XX_SP_PERFCOUNTER3_SELECT 0xEC7
+#define A3XX_SP_PERFCOUNTER4_SELECT 0xEC8
 #define A3XX_SP_PERFCOUNTER5_SELECT 0xEC9
 #define A3XX_SP_PERFCOUNTER6_SELECT 0xECA
 #define A3XX_SP_PERFCOUNTER7_SELECT 0xECB
+#define A3XX_TP_PERFCOUNTER0_SELECT 0xF04
+#define A3XX_TP_PERFCOUNTER1_SELECT 0xF05
+#define A3XX_TP_PERFCOUNTER2_SELECT 0xF06
+#define A3XX_TP_PERFCOUNTER3_SELECT 0xF07
+#define A3XX_TP_PERFCOUNTER4_SELECT 0xF08
+#define A3XX_TP_PERFCOUNTER5_SELECT 0xF09
 #define A3XX_GRAS_CL_CLIP_CNTL 0x2040
 #define A3XX_GRAS_CL_GB_CLIP_ADJ 0x2044
 #define A3XX_GRAS_CL_VPORT_XOFFSET 0x2048
@@ -232,12 +357,14 @@
 #define A3XX_SP_VS_OUT_REG_7 0x22CE
 #define A3XX_SP_VS_VPC_DST_REG_0 0x22D0
 #define A3XX_SP_VS_OBJ_OFFSET_REG 0x22D4
+#define A3XX_SP_VS_OBJ_START_REG 0x22D5
 #define A3XX_SP_VS_PVT_MEM_ADDR_REG 0x22D7
 #define A3XX_SP_VS_PVT_MEM_SIZE_REG 0x22D8
 #define A3XX_SP_VS_LENGTH_REG 0x22DF
 #define A3XX_SP_FS_CTRL_REG0 0x22E0
 #define A3XX_SP_FS_CTRL_REG1 0x22E1
 #define A3XX_SP_FS_OBJ_OFFSET_REG 0x22E2
+#define A3XX_SP_FS_OBJ_START_REG 0x22E3
 #define A3XX_SP_FS_PVT_MEM_ADDR_REG 0x22E5
 #define A3XX_SP_FS_PVT_MEM_SIZE_REG 0x22E6
 #define A3XX_SP_FS_FLAT_SHAD_MODE_REG_0 0x22E8
@@ -271,8 +398,10 @@
 #define A3XX_VBIF_OUT_AXI_AOOO 0x305F
 
 /* Bit flags for RBBM_CTL */
-#define RBBM_RBBM_CTL_RESET_PWR_CTR1  (1 << 1)
-#define RBBM_RBBM_CTL_ENABLE_PWR_CTR1  (1 << 17)
+#define RBBM_RBBM_CTL_RESET_PWR_CTR0  BIT(0)
+#define RBBM_RBBM_CTL_RESET_PWR_CTR1  BIT(1)
+#define RBBM_RBBM_CTL_ENABLE_PWR_CTR0  BIT(16)
+#define RBBM_RBBM_CTL_ENABLE_PWR_CTR1  BIT(17)
 
 /* Various flags used by the context switch code */
 
@@ -537,7 +666,15 @@
 #define RBBM_BLOCK_ID_MARB_3           0x2b
 
 /* RBBM_CLOCK_CTL default value */
-#define A3XX_RBBM_CLOCK_CTL_DEFAULT 0xBFFFFFFF
+#define A305_RBBM_CLOCK_CTL_DEFAULT   0xAAAAAAAA
+#define A305C_RBBM_CLOCK_CTL_DEFAULT  0xAAAAAAAA
+#define A320_RBBM_CLOCK_CTL_DEFAULT   0xBFFFFFFF
+#define A330_RBBM_CLOCK_CTL_DEFAULT   0xBFFCFFFF
+#define A330v2_RBBM_CLOCK_CTL_DEFAULT 0xBFFCFFFF
+#define A305B_RBBM_CLOCK_CTL_DEFAULT  0xAAAAAAAA
+
+#define A330_RBBM_GPR0_CTL_DEFAULT    0x00000000
+#define A330v2_RBBM_GPR0_CTL_DEFAULT  0x00000000
 
 /* COUNTABLE FOR SP PERFCOUNTER */
 #define SP_FS_FULL_ALU_INSTRUCTIONS    0x0E
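
The block above adds a LO/HI register pair for every A3XX perfcounter. As a rough illustration (not part of this patch), a 64-bit counter value could be assembled from such a pair using the driver's existing register read helper; the function name below is hypothetical:

/*
 * Illustrative sketch only, not part of this patch: combine one of the
 * LO/HI perfcounter register pairs defined above into a single 64-bit
 * value.  The helper name is hypothetical; adreno_regread() is the
 * existing register read routine in this driver.
 */
static uint64_t example_read_counter_pair(struct kgsl_device *device,
		unsigned int lo_offset, unsigned int hi_offset)
{
	unsigned int lo = 0, hi = 0;

	/* e.g. lo_offset = A3XX_RBBM_PERFCTR_CP_0_LO, hi_offset = ..._HI */
	adreno_regread(device, lo_offset, &lo);
	adreno_regread(device, hi_offset, &hi);

	/*
	 * A production reader would guard against the LO word rolling
	 * over between the two reads; this sketch ignores that race.
	 */
	return ((uint64_t)hi << 32) | lo;
}
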
diff --git a/drivers/gpu/msm/adreno.c b/drivers/gpu/msm/adreno.c
index 1886e04..62b6a71 100644
--- a/drivers/gpu/msm/adreno.c
+++ b/drivers/gpu/msm/adreno.c
@@ -17,7 +17,7 @@
 #include <linux/sched.h>
 #include <linux/of.h>
 #include <linux/of_device.h>
-#include <linux/msm_kgsl.h>
+#include <linux/delay.h>
 
 #include <mach/socinfo.h>
 #include <mach/msm_bus_board.h>
@@ -30,6 +30,7 @@
 #include "kgsl_cffdump.h"
 #include "kgsl_sharedmem.h"
 #include "kgsl_iommu.h"
+#include "kgsl_trace.h"
 
 #include "adreno.h"
 #include "adreno_pm4types.h"
@@ -99,6 +100,7 @@
 			.irq_name = KGSL_3D0_IRQ,
 		},
 		.iomemname = KGSL_3D0_REG_MEMORY,
+		.shadermemname = KGSL_3D0_SHADER_MEMORY,
 		.ftbl = &adreno_functable,
 #ifdef CONFIG_HAS_EARLYSUSPEND
 		.display_off = {
@@ -166,10 +168,10 @@
 	/* size of gmem for gpu*/
 	unsigned int gmem_size;
 	/* version of pm4 microcode that supports sync_lock
-	   between CPU and GPU for SMMU-v1 programming */
+	   between CPU and GPU for IOMMU-v0 programming */
 	unsigned int sync_lock_pm4_ver;
 	/* version of pfp microcode that supports sync_lock
-	   between CPU and GPU for SMMU-v1 programming */
+	   between CPU and GPU for IOMMU-v0 programming */
 	unsigned int sync_lock_pfp_ver;
 } adreno_gpulist[] = {
 	{ ADRENO_REV_A200, 0, 2, ANY_ID, ANY_ID,
@@ -198,18 +200,331 @@
 		"a225_pm4.fw", "a225_pfp.fw", &adreno_a2xx_gpudev,
 		1536, 768, 3, SZ_512K, 0x225011, 0x225002 },
 	/* A3XX doesn't use the pix_shader_start */
-	{ ADRENO_REV_A305, 3, 0, 5, ANY_ID,
+	{ ADRENO_REV_A305, 3, 0, 5, 0,
 		"a300_pm4.fw", "a300_pfp.fw", &adreno_a3xx_gpudev,
 		512, 0, 2, SZ_256K, 0x3FF037, 0x3FF016 },
 	/* A3XX doesn't use the pix_shader_start */
 	{ ADRENO_REV_A320, 3, 2, ANY_ID, ANY_ID,
 		"a300_pm4.fw", "a300_pfp.fw", &adreno_a3xx_gpudev,
 		512, 0, 2, SZ_512K, 0x3FF037, 0x3FF016 },
-	{ ADRENO_REV_A330, 3, 3, 0, 0,
+	{ ADRENO_REV_A330, 3, 3, 0, ANY_ID,
 		"a330_pm4.fw", "a330_pfp.fw", &adreno_a3xx_gpudev,
 		512, 0, 2, SZ_1M, NO_VER, NO_VER },
+	{ ADRENO_REV_A305B, 3, 0, 5, 0x10,
+		"a330_pm4.fw", "a330_pfp.fw", &adreno_a3xx_gpudev,
+		512, 0, 2, SZ_128K, NO_VER, NO_VER },
+	{ ADRENO_REV_A305C, 3, 0, 5, 0x20,
+		"a300_pm4.fw", "a300_pfp.fw", &adreno_a3xx_gpudev,
+		512, 0, 2, SZ_128K, 0x3FF037, 0x3FF016 },
 };
 
+/**
+ * adreno_perfcounter_init: Reserve kernel performance counters
+ * @device: device to configure
+ *
+ * The kernel reserves a group of performance counters for its own
+ * activities.  Reserve these counters at init time so that they are
+ * always available to the kernel.  Userspace can still obtain the
+ * counters used by the kernel, but those counters remain active for
+ * as long as the device is alive.
+ */
+
+static void adreno_perfcounter_init(struct kgsl_device *device)
+{
+	struct adreno_device *adreno_dev = ADRENO_DEVICE(device);
+
+	if (adreno_dev->gpudev->perfcounter_init)
+		adreno_dev->gpudev->perfcounter_init(adreno_dev);
+};
+
+/**
+ * adreno_perfcounter_start: Enable performance counters
+ * @adreno_dev: Adreno device to configure
+ *
+ * Ensure that all allocated performance counters are enabled.  Since
+ * the device was most likely stopped, the counter state can't be
+ * trusted, so reprogram every allocated counter.
+ */
+
+static void adreno_perfcounter_start(struct adreno_device *adreno_dev)
+{
+	struct adreno_perfcounters *counters = adreno_dev->gpudev->perfcounters;
+	struct adreno_perfcount_group *group;
+	unsigned int i, j;
+
+	/* group id iter */
+	for (i = 0; i < counters->group_count; i++) {
+		group = &(counters->groups[i]);
+
+		/* countable iter */
+		for (j = 0; j < group->reg_count; j++) {
+			if (group->regs[j].countable ==
+					KGSL_PERFCOUNTER_NOT_USED)
+				continue;
+
+			if (adreno_dev->gpudev->perfcounter_enable)
+				adreno_dev->gpudev->perfcounter_enable(
+					adreno_dev, i, j,
+					group->regs[j].countable);
+		}
+	}
+}
+
+/**
+ * adreno_perfcounter_read_group: Read the values of a list of perfcounters
+ * @adreno_dev: Adreno device to configure
+ * @reads: List of kgsl_perfcounter_read_groups
+ * @count: Length of list
+ *
+ * Read the performance counters for the groupid/countable pairs and return
+ * the 64 bit result for each pair
+ */
+
+int adreno_perfcounter_read_group(struct adreno_device *adreno_dev,
+	struct kgsl_perfcounter_read_group *reads, unsigned int count)
+{
+	struct adreno_perfcounters *counters = adreno_dev->gpudev->perfcounters;
+	struct adreno_perfcount_group *group;
+	struct kgsl_perfcounter_read_group *list = NULL;
+	unsigned int i, j;
+	int ret = 0;
+
+	/* perfcounter get/put/query/read not allowed on a2xx */
+	if (adreno_is_a2xx(adreno_dev))
+		return -EINVAL;
+
+	/* sanity check for later */
+	if (!adreno_dev->gpudev->perfcounter_read)
+		return -EINVAL;
+
+	/* sanity check params passed in */
+	if (reads == NULL || count == 0 || count > 100)
+		return -EINVAL;
+
+	/* verify valid inputs group ids and countables */
+	for (i = 0; i < count; i++) {
+		if (reads[i].groupid >= counters->group_count)
+			return -EINVAL;
+	}
+
+	list = kmalloc(sizeof(struct kgsl_perfcounter_read_group) * count,
+			GFP_KERNEL);
+	if (!list)
+		return -ENOMEM;
+
+	if (copy_from_user(list, reads,
+			sizeof(struct kgsl_perfcounter_read_group) * count)) {
+		ret = -EFAULT;
+		goto done;
+	}
+
+	/* list iterator */
+	for (j = 0; j < count; j++) {
+		list[j].value = 0;
+
+		group = &(counters->groups[list[j].groupid]);
+
+		/* group/counter iterator */
+		for (i = 0; i < group->reg_count; i++) {
+			if (group->regs[i].countable == list[j].countable) {
+				list[j].value =
+					adreno_dev->gpudev->perfcounter_read(
+					adreno_dev, list[j].groupid,
+					i, group->regs[i].offset);
+				break;
+			}
+		}
+	}
+
+	/* write the data */
+	if (copy_to_user(reads, list,
+			sizeof(struct kgsl_perfcounter_read_group) *
+			count) != 0)
+		ret = -EFAULT;
+
+done:
+	kfree(list);
+	return ret;
+}
+
+/**
+ * adreno_perfcounter_query_group: Determine which countables are in counters
+ * @adreno_dev: Adreno device to configure
+ * @groupid: Desired performance counter group
+ * @countables: Return list of all countables in the group's counters
+ * @count: Max length of the array
+ * @max_counters: max counters for the groupid
+ *
+ * Query the current state of counters for the group.
+ */
+
+int adreno_perfcounter_query_group(struct adreno_device *adreno_dev,
+	unsigned int groupid, unsigned int *countables, unsigned int count,
+	unsigned int *max_counters)
+{
+	struct adreno_perfcounters *counters = adreno_dev->gpudev->perfcounters;
+	struct adreno_perfcount_group *group;
+	unsigned int i;
+
+	*max_counters = 0;
+
+	/* perfcounter get/put/query not allowed on a2xx */
+	if (adreno_is_a2xx(adreno_dev))
+		return -EINVAL;
+
+	if (groupid >= counters->group_count)
+		return -EINVAL;
+
+	group = &(counters->groups[groupid]);
+	*max_counters = group->reg_count;
+
+	/*
+	 * If countables is NULL or count is zero, just return the max
+	 * reg_count in *max_counters and report success
+	 */
+	if (countables == NULL || count == 0)
+		return 0;
+
+	/*
+	 * Go through all available counters.  Write up to count countable
+	 * values.
+	 */
+	for (i = 0; i < group->reg_count && i < count; i++) {
+		if (copy_to_user(&countables[i], &(group->regs[i].countable),
+				sizeof(unsigned int)) != 0)
+			return -EFAULT;
+	}
+
+	return 0;
+}
+
+/**
+ * adreno_perfcounter_get: Try to put a countable in an available counter
+ * @adreno_dev: Adreno device to configure
+ * @groupid: Desired performance counter group
+ * @countable: Countable desired to be in a counter
+ * @offset: Return offset of the countable
+ * @flags: Used to setup kernel perf counters
+ *
+ * Try to place a countable in an available counter.  If the countable is
+ * already in a counter, reference count the counter/countable pair resource
+ * and return success
+ */
+
+int adreno_perfcounter_get(struct adreno_device *adreno_dev,
+	unsigned int groupid, unsigned int countable, unsigned int *offset,
+	unsigned int flags)
+{
+	struct adreno_perfcounters *counters = adreno_dev->gpudev->perfcounters;
+	struct adreno_perfcount_group *group;
+	unsigned int i, empty = -1;
+
+	/* always clear return variables */
+	if (offset)
+		*offset = 0;
+
+	/* perfcounter get/put/query not allowed on a2xx */
+	if (adreno_is_a2xx(adreno_dev))
+		return -EINVAL;
+
+	if (groupid >= counters->group_count)
+		return -EINVAL;
+
+	group = &(counters->groups[groupid]);
+
+	/*
+	 * Check if the countable is already associated with a counter.
+	 * Refcount and return the offset, otherwise, try and find an empty
+	 * counter and assign the countable to it.
+	 */
+	for (i = 0; i < group->reg_count; i++) {
+		if (group->regs[i].countable == countable) {
+			/* Countable already associated with counter */
+			group->regs[i].refcount++;
+			group->regs[i].flags |= flags;
+			if (offset)
+				*offset = group->regs[i].offset;
+			return 0;
+		} else if (group->regs[i].countable ==
+			KGSL_PERFCOUNTER_NOT_USED) {
+			/* keep track of unused counter */
+			empty = i;
+		}
+	}
+
+	/* no available counters, so do nothing else */
+	if (empty == -1)
+		return -EBUSY;
+
+	/* initialize the new counter */
+	group->regs[empty].countable = countable;
+	group->regs[empty].refcount = 1;
+
+	/* enable the new counter */
+	adreno_dev->gpudev->perfcounter_enable(adreno_dev, groupid, empty,
+		countable);
+
+	group->regs[empty].flags = flags;
+
+	if (offset)
+		*offset = group->regs[empty].offset;
+
+	return 0;
+}
+
+
+/**
+ * adreno_perfcounter_put: Release a countable from counter resource
+ * @adreno_dev: Adreno device to configure
+ * @groupid: Desired performance counter group
+ * @countable: Countable to be freed from a counter
+ *
+ * Put a performance counter/countable pair that was previously received.  If
+ * no one else is using the countable, free up the counter for others.
+ */
+int adreno_perfcounter_put(struct adreno_device *adreno_dev,
+	unsigned int groupid, unsigned int countable)
+{
+	struct adreno_perfcounters *counters = adreno_dev->gpudev->perfcounters;
+	struct adreno_perfcount_group *group;
+
+	unsigned int i;
+
+	/* perfcounter get/put/query not allowed on a2xx */
+	if (adreno_is_a2xx(adreno_dev))
+		return -EINVAL;
+
+	if (groupid >= counters->group_count)
+		return -EINVAL;
+
+	group = &(counters->groups[groupid]);
+
+	for (i = 0; i < group->reg_count; i++) {
+		if (group->regs[i].countable == countable) {
+			if (group->regs[i].refcount > 0) {
+				group->regs[i].refcount--;
+
+				/*
+				 * bookkeeping to ensure we never free a
+				 * perf counter used by the kernel
+				 */
+				if (group->regs[i].flags &&
+					group->regs[i].refcount == 0)
+					group->regs[i].refcount++;
+
+				/* make available if not used */
+				if (group->regs[i].refcount == 0)
+					group->regs[i].countable =
+						KGSL_PERFCOUNTER_NOT_USED;
+			}
+
+			return 0;
+		}
+	}
+
+	return -EINVAL;
+}
+
 static irqreturn_t adreno_irq_handler(struct kgsl_device *device)
 {
 	irqreturn_t result;
@@ -254,26 +569,29 @@
 	struct adreno_device *adreno_dev = ADRENO_DEVICE(device);
 	struct adreno_ringbuffer *rb = &adreno_dev->ringbuffer;
 
-	result = kgsl_mmu_map_global(pagetable, &rb->buffer_desc,
-				     GSL_PT_PAGE_RV);
+	result = kgsl_mmu_map_global(pagetable, &rb->buffer_desc);
 	if (result)
 		goto error;
 
-	result = kgsl_mmu_map_global(pagetable, &rb->memptrs_desc,
-				     GSL_PT_PAGE_RV | GSL_PT_PAGE_WV);
+	result = kgsl_mmu_map_global(pagetable, &rb->memptrs_desc);
 	if (result)
 		goto unmap_buffer_desc;
 
-	result = kgsl_mmu_map_global(pagetable, &device->memstore,
-				     GSL_PT_PAGE_RV | GSL_PT_PAGE_WV);
+	result = kgsl_mmu_map_global(pagetable, &device->memstore);
 	if (result)
 		goto unmap_memptrs_desc;
 
-	result = kgsl_mmu_map_global(pagetable, &device->mmu.setstate_memory,
-				     GSL_PT_PAGE_RV | GSL_PT_PAGE_WV);
+	result = kgsl_mmu_map_global(pagetable, &device->mmu.setstate_memory);
 	if (result)
 		goto unmap_memstore_desc;
 
+	/*
+	 * Set the mpu end to the last "normal" global memory we use.
+	 * For the IOMMU, this will be used to restrict access to the
+	 * mapped registers.
+	 */
+	device->mh.mpu_range = device->mmu.setstate_memory.gpuaddr +
+				device->mmu.setstate_memory.size;
 	return result;
 
 unmap_memstore_desc:
@@ -294,7 +612,7 @@
 					uint32_t flags)
 {
 	unsigned int pt_val, reg_pt_val;
-	unsigned int link[250];
+	unsigned int link[230];
 	unsigned int *cmds = &link[0];
 	int sizedwords = 0;
 	struct adreno_device *adreno_dev = ADRENO_DEVICE(device);
@@ -302,8 +620,14 @@
 	struct kgsl_context *context;
 	struct adreno_context *adreno_ctx = NULL;
 
-	if (!adreno_dev->drawctxt_active)
+	/*
+	 * If we're idle and we don't need to use the GPU to save context
+	 * state, use the CPU instead of the GPU to reprogram the
+	 * iommu for simplicity's sake.
+	 */
+	if (!adreno_dev->drawctxt_active || device->ftbl->isidle(device))
 		return kgsl_mmu_device_setstate(&device->mmu, flags);
+
 	num_iommu_units = kgsl_mmu_get_num_iommu_units(&device->mmu);
 
 	context = idr_find(&device->context_idr, context_id);
@@ -427,7 +751,7 @@
 		adreno_dev->ringbuffer.timestamp[KGSL_MEMSTORE_GLOBAL], true);
 	}
 
-	if (sizedwords > (sizeof(link)/sizeof(unsigned int))) {
+	if (sizedwords > (ARRAY_SIZE(link))) {
 		KGSL_DRV_ERR(device, "Temp command buffer overflow\n");
 		BUG();
 	}
@@ -605,7 +929,7 @@
 	/* 8x25 returns 0 for minor id, but it should be 1 */
 	if (cpu_is_qsd8x50())
 		patchid = 1;
-	else if (cpu_is_msm8625() && minorid == 0)
+	else if ((cpu_is_msm8625() || cpu_is_msm8625q()) && minorid == 0)
 		minorid = 1;
 
 	chipid |= (minorid << 8) | patchid;
@@ -669,7 +993,6 @@
 	adreno_dev->instruction_size = adreno_gpulist[i].instruction_size;
 	adreno_dev->gmem_size = adreno_gpulist[i].gmem_size;
 	adreno_dev->gpulist_index = i;
-
 }
 
 static struct platform_device_id adreno_id_table[] = {
@@ -756,6 +1079,10 @@
 		&pdata->init_level))
 		pdata->init_level = 1;
 
+	if (adreno_of_read_property(parent, "qcom,step-pwrlevel",
+		&pdata->step_mul))
+		pdata->step_mul = 1;
+
 	if (pdata->init_level < 0 || pdata->init_level > pdata->num_levels) {
 		KGSL_CORE_ERR("Initial power level out of range\n");
 		pdata->init_level = 1;
@@ -981,9 +1308,17 @@
 			goto err;
 		}
 
-		if (adreno_of_read_property(child, "qcom,iommu-ctx-sids",
-			&ctxs[ctx_index].ctx_id))
+		ret = of_property_read_u32_array(child, "reg", reg_val, 2);
+		if (ret) {
+			KGSL_CORE_ERR("Unable to read KGSL IOMMU 'reg'\n");
 			goto err;
+		}
+		if (msm_soc_version_supports_iommu_v0())
+			ctxs[ctx_index].ctx_id = (reg_val[0] -
+				data->physstart) >> KGSL_IOMMU_CTX_SHIFT;
+		else
+			ctxs[ctx_index].ctx_id = ((reg_val[0] -
+				data->physstart) >> KGSL_IOMMU_CTX_SHIFT) - 8;
 
 		ctx_index++;
 	}
@@ -1038,15 +1373,17 @@
 	if (ret)
 		goto err;
 
-	/* Default value is 83, if not found in DT */
 	if (adreno_of_read_property(pdev->dev.of_node, "qcom,idle-timeout",
 		&pdata->idle_timeout))
-		pdata->idle_timeout = 83;
+		pdata->idle_timeout = HZ/12;
 
 	if (adreno_of_read_property(pdev->dev.of_node, "qcom,nap-allowed",
 		&pdata->nap_allowed))
 		pdata->nap_allowed = 1;
 
+	pdata->strtstp_sleepwake = of_property_read_bool(pdev->dev.of_node,
+						"qcom,strtstp-sleepwake");
+
 	if (adreno_of_read_property(pdev->dev.of_node, "qcom,clk-map",
 		&pdata->clk_map))
 		goto err;
@@ -1098,7 +1435,8 @@
 static int
 adreno_ocmem_gmem_malloc(struct adreno_device *adreno_dev)
 {
-	if (!adreno_is_a330(adreno_dev))
+	if (!(adreno_is_a330(adreno_dev) ||
+		adreno_is_a305b(adreno_dev)))
 		return 0;
 
 	/* OCMEM is only needed once, do not support consective allocation */
@@ -1119,7 +1457,8 @@
 static void
 adreno_ocmem_gmem_free(struct adreno_device *adreno_dev)
 {
-	if (!adreno_is_a330(adreno_dev))
+	if (!(adreno_is_a330(adreno_dev) ||
+		adreno_is_a305b(adreno_dev)))
 		return;
 
 	if (adreno_dev->ocmem_hdl == NULL)
@@ -1202,10 +1541,10 @@
 	return 0;
 }
 
-static int adreno_start(struct kgsl_device *device, unsigned int init_ram)
+static int adreno_init(struct kgsl_device *device)
 {
-	int status = -EINVAL;
 	struct adreno_device *adreno_dev = ADRENO_DEVICE(device);
+	struct adreno_ringbuffer *rb = &adreno_dev->ringbuffer;
 
 	if (KGSL_STATE_DUMP_AND_FT != device->state)
 		kgsl_pwrctrl_set_state(device, KGSL_STATE_INIT);
@@ -1231,10 +1570,9 @@
 	if (adreno_dev->gpurev == ADRENO_REV_UNKNOWN) {
 		KGSL_DRV_ERR(device, "Unknown chip ID %x\n",
 			adreno_dev->chip_id);
-		goto error_clk_off;
+		BUG_ON(1);
 	}
 
-
 	/*
 	 * Check if firmware supports the sync lock PM4 packets needed
 	 * for IOMMUv1
@@ -1246,7 +1584,34 @@
 		adreno_gpulist[adreno_dev->gpulist_index].sync_lock_pfp_ver))
 		device->mmu.flags |= KGSL_MMU_FLAGS_IOMMU_SYNC;
 
-	/* Set up the MMU */
+	rb->timestamp[KGSL_MEMSTORE_GLOBAL] = 0;
+
+	/* Assign correct RBBM status register to hang detect regs
+	 */
+	ft_detect_regs[0] = adreno_dev->gpudev->reg_rbbm_status;
+
+	adreno_perfcounter_init(device);
+
+	/* Power down the device */
+	kgsl_pwrctrl_disable(device);
+
+	return 0;
+}
+
+static int adreno_start(struct kgsl_device *device)
+{
+	int status = -EINVAL;
+	struct adreno_device *adreno_dev = ADRENO_DEVICE(device);
+
+	kgsl_cffdump_open(device);
+
+	if (KGSL_STATE_DUMP_AND_FT != device->state)
+		kgsl_pwrctrl_set_state(device, KGSL_STATE_INIT);
+
+	/* Power up the device */
+	kgsl_pwrctrl_enable(device);
+
+	/* Set up a2xx special case */
 	if (adreno_is_a2xx(adreno_dev)) {
 		/*
 		 * the MH_CLNT_INTF_CTRL_CONFIG registers aren't present
@@ -1260,20 +1625,6 @@
 		kgsl_mh_start(device);
 	}
 
-	/* Assign correct RBBM status register to hang detect regs
-	 */
-	ft_detect_regs[0] = adreno_dev->gpudev->reg_rbbm_status;
-
-	/* Add A3XX specific registers for hang detection */
-	if (adreno_is_a3xx(adreno_dev)) {
-		ft_detect_regs[6] = A3XX_RBBM_PERFCTR_SP_7_LO;
-		ft_detect_regs[7] = A3XX_RBBM_PERFCTR_SP_7_HI;
-		ft_detect_regs[8] = A3XX_RBBM_PERFCTR_SP_6_LO;
-		ft_detect_regs[9] = A3XX_RBBM_PERFCTR_SP_6_HI;
-		ft_detect_regs[10] = A3XX_RBBM_PERFCTR_SP_5_LO;
-		ft_detect_regs[11] = A3XX_RBBM_PERFCTR_SP_5_HI;
-	}
-
 	status = kgsl_mmu_start(device);
 	if (status)
 		goto error_clk_off;
@@ -1290,22 +1641,30 @@
 	kgsl_pwrctrl_irq(device, KGSL_PWRFLAGS_ON);
 	device->ftbl->irqctrl(device, 1);
 
-	status = adreno_ringbuffer_start(&adreno_dev->ringbuffer, init_ram);
-	if (status == 0) {
-		/* While fault tolerance is on we do not want timer to
-		 * fire and attempt to change any device state */
-		if (KGSL_STATE_DUMP_AND_FT != device->state)
-			mod_timer(&device->idle_timer, jiffies + FIRST_TIMEOUT);
-		return 0;
-	}
+	status = adreno_ringbuffer_start(&adreno_dev->ringbuffer);
+	if (status)
+		goto error_irq_off;
 
+	/* While fault tolerance is on we do not want timer to
+	 * fire and attempt to change any device state */
+	if (KGSL_STATE_DUMP_AND_FT != device->state)
+		mod_timer(&device->idle_timer, jiffies + FIRST_TIMEOUT);
+
+	adreno_perfcounter_start(adreno_dev);
+
+	device->reset_counter++;
+
+	return 0;
+
+error_irq_off:
 	kgsl_pwrctrl_irq(device, KGSL_PWRFLAGS_OFF);
 
 error_mmu_off:
 	kgsl_mmu_stop(&device->mmu);
 
 error_clk_off:
-	kgsl_pwrctrl_disable(device);
+	if (KGSL_STATE_DUMP_AND_FT != device->state)
+		kgsl_pwrctrl_disable(device);
 
 	return status;
 }
@@ -1329,6 +1688,8 @@
 	/* Power down the device */
 	kgsl_pwrctrl_disable(device);
 
+	kgsl_cffdump_close(device->id);
+
 	return 0;
 }
 
@@ -1411,6 +1772,8 @@
 			start_ptr = adreno_ringbuffer_dec_wrapped(start_ptr,
 									size);
 		kgsl_sharedmem_readl(&rb->buffer_desc, &val1, start_ptr);
+		/* Ensure above read is finished before next read */
+		rmb();
 		if (KGSL_CMD_IDENTIFIER == val1) {
 			if ((start_ptr / sizeof(unsigned int)) != rb->wptr)
 				start_ptr = adreno_ringbuffer_dec_wrapped(
@@ -1448,6 +1811,8 @@
 					temp_rb_rptr, size);
 		kgsl_sharedmem_readl(&rb->buffer_desc, &val[i],
 					temp_rb_rptr);
+		/* Ensure above read is finished before next read */
+		rmb();
 
 		if (check && ((inc && val[i] == global_eop) ||
 			(!inc && (val[i] ==
@@ -1512,6 +1877,8 @@
 
 	while (temp_rb_rptr / sizeof(unsigned int) != rb->wptr) {
 		kgsl_sharedmem_readl(&rb->buffer_desc, &val[i], temp_rb_rptr);
+		/* Ensure above read is finished before next read */
+		rmb();
 
 		if (check && val[i] == ib1) {
 			/* decrement i, i.e i = (i - 1 + 2) % 2 */
@@ -1553,7 +1920,7 @@
 	return status;
 }
 
-static int adreno_setup_ft_data(struct kgsl_device *device,
+static void adreno_setup_ft_data(struct kgsl_device *device,
 					struct adreno_ft_data *ft_data)
 {
 	int ret = 0;
@@ -1578,30 +1945,30 @@
 			KGSL_MEMSTORE_OFFSET(KGSL_MEMSTORE_GLOBAL,
 			eoptimestamp));
 
+	/* Ensure context id and global eop ts read complete */
+	rmb();
+
 	ft_data->rb_buffer = vmalloc(rb->buffer_desc.size);
 	if (!ft_data->rb_buffer) {
 		KGSL_MEM_ERR(device, "vmalloc(%d) failed\n",
 				rb->buffer_desc.size);
-		return -ENOMEM;
+		return;
 	}
 
 	ft_data->bad_rb_buffer = vmalloc(rb->buffer_desc.size);
 	if (!ft_data->bad_rb_buffer) {
 		KGSL_MEM_ERR(device, "vmalloc(%d) failed\n",
 				rb->buffer_desc.size);
-		ret = -ENOMEM;
-		goto done;
+		return;
 	}
 
 	ft_data->good_rb_buffer = vmalloc(rb->buffer_desc.size);
 	if (!ft_data->good_rb_buffer) {
 		KGSL_MEM_ERR(device, "vmalloc(%d) failed\n",
 				rb->buffer_desc.size);
-		ret = -ENOMEM;
-		goto done;
+		return;
 	}
-
-	ft_data->status =  0;
+	ft_data->status = 0;
 
 	/* find the start of bad command sequence in rb */
 	context = idr_find(&device->context_idr, ft_data->context_id);
@@ -1612,20 +1979,23 @@
 		 * If there is no context then fault tolerance does not need to
 		 * replay anything, just reset GPU and thats it
 		 */
-		goto done;
+		return;
 	}
-	ret = _find_cmd_seq_after_eop_ts(rb, &rb_rptr,
-					ft_data->global_eop + 1, false);
-	if (ret)
-		goto done;
-
-	ft_data->start_of_replay_cmds = rb_rptr;
-
-	if (!adreno_dev->ft_policy)
-		adreno_dev->ft_policy = KGSL_FT_DEFAULT_POLICY;
 
 	ft_data->ft_policy = adreno_dev->ft_policy;
 
+	if (!ft_data->ft_policy)
+		ft_data->ft_policy = KGSL_FT_DEFAULT_POLICY;
+
+	ret = _find_cmd_seq_after_eop_ts(rb, &rb_rptr,
+					ft_data->global_eop + 1, false);
+	if (ret) {
+		ft_data->ft_policy |= KGSL_FT_TEMP_DISABLE;
+		return;
+	} else
+		ft_data->ft_policy &= ~KGSL_FT_TEMP_DISABLE;
+
+	ft_data->start_of_replay_cmds = rb_rptr;
 
 	adreno_context = context->devctxt;
 	if (adreno_context->flags & CTXT_FLAGS_PREAMBLE) {
@@ -1636,20 +2006,12 @@
 				KGSL_FT_ERR(device,
 				"Start not found for replay IB sequence\n");
 				ret = 0;
-				goto done;
+				return;
 			}
 			ft_data->start_of_replay_cmds = rb_rptr;
 			ft_data->replay_for_snapshot = rb_rptr;
 		}
 	}
-
-done:
-	if (ret) {
-		vfree(ft_data->rb_buffer);
-		vfree(ft_data->bad_rb_buffer);
-		vfree(ft_data->good_rb_buffer);
-	}
-	return ret;
 }
 
 static int
@@ -1663,6 +2025,8 @@
 			&curr_global_ts,
 			KGSL_MEMSTORE_OFFSET(KGSL_MEMSTORE_GLOBAL,
 			eoptimestamp));
+	/* Ensure above read is finished before long ib check */
+	rmb();
 
 	/* Mark long ib as handled */
 	adreno_dev->long_ib = 0;
@@ -1680,8 +2044,7 @@
 
 static int
 _adreno_ft_restart_device(struct kgsl_device *device,
-		   struct kgsl_context *context,
-		   struct adreno_ft_data *ft_data)
+			   struct kgsl_context *context)
 {
 
 	struct adreno_context *adreno_context = context->devctxt;
@@ -1692,7 +2055,12 @@
 		return 1;
 	}
 
-	if (adreno_start(device, true)) {
+	if (adreno_init(device)) {
+		KGSL_FT_ERR(device, "Device init failed\n");
+		return 1;
+	}
+
+	if (adreno_start(device)) {
 		KGSL_FT_ERR(device, "Device start failed\n");
 		return 1;
 	}
@@ -1751,11 +2119,30 @@
 			unsigned int *buff, unsigned int size)
 {
 	unsigned int ret = 0;
+	unsigned int retry_num = 0;
 
 	_adreno_debug_ft_info(device, ft_data);
 
-	if (_adreno_ft_restart_device(device, context, ft_data))
-		return 1;
+	do {
+		ret = _adreno_ft_restart_device(device, context);
+		if (ret == 0)
+			break;
+		/*
+		 * If the device restart fails, sleep for 20ms before
+		 * attempting another restart.  This allows the GPU HW to
+		 * settle and improves the chances that the next restart
+		 * succeeds.
+		 */
+		msleep(20);
+		KGSL_FT_ERR(device, "Retry device restart %d\n", retry_num);
+		retry_num++;
+	} while (retry_num < 4);
+
+	if (ret) {
+		KGSL_FT_ERR(device, "Device restart failed\n");
+		BUG_ON(1);
+		goto done;
+	}
 
 	if (size) {
 
@@ -1765,6 +2152,7 @@
 		ret = adreno_idle(device);
 	}
 
+done:
 	return ret;
 }
 
@@ -1779,11 +2167,13 @@
 	struct kgsl_context *context;
 	struct adreno_context *adreno_context = NULL;
 	struct adreno_context *last_active_ctx = adreno_dev->drawctxt_active;
+	unsigned int long_ib = 0;
 
 	context = idr_find(&device->context_idr, ft_data->context_id);
 	if (context == NULL) {
-		KGSL_FT_CRIT(device, "Last context unknown id:%d\n",
+		KGSL_FT_ERR(device, "Last context unknown id:%d\n",
 			ft_data->context_id);
+		goto play_good_cmds;
 	} else {
 		adreno_context = context->devctxt;
 		adreno_context->flags |= CTXT_FLAGS_GPU_HANG;
@@ -1793,11 +2183,17 @@
 		 */
 		context->wait_on_invalid_ts = false;
 
+		if (!(adreno_context->flags & CTXT_FLAGS_PER_CONTEXT_TS)) {
+			KGSL_FT_ERR(device, "Fault tolerance not supported\n");
+			goto play_good_cmds;
+		}
+
 		/*
 		 *  This flag will be set by userspace for contexts
 		 *  that do not want to be fault tolerant (ex: OPENCL)
 		 */
 		if (adreno_context->flags & CTXT_FLAGS_NO_FAULT_TOLERANCE) {
+			ft_data->status = 1;
 			KGSL_FT_ERR(device,
 			"No FT set for this context play good cmds\n");
 			goto play_good_cmds;
@@ -1805,51 +2201,57 @@
 
 	}
 
+	/* Verify any detected long running IB; return if it was a false alarm */
+	if (adreno_dev->long_ib) {
+		long_ib = _adreno_check_long_ib(device);
+		if (!long_ib) {
+			adreno_context->flags &= ~CTXT_FLAGS_GPU_HANG;
+			return 0;
+		}
+	}
+
 	/*
 	 * Extract valid contents from rb which can still be executed after
 	 * hang
 	 */
 	adreno_ringbuffer_extract(rb, ft_data);
 
-	/* Check if we detected a long running IB,
-	 * if true do not attempt replay of bad cmds */
-	if (adreno_dev->long_ib) {
-		if (_adreno_check_long_ib(device)) {
-			ft_data->status = 1;
-			_adreno_debug_ft_info(device, ft_data);
-			goto play_good_cmds;
-		} else {
-			adreno_context->flags &= ~CTXT_FLAGS_GPU_HANG;
-			return 0;
-		}
-	}
-
-	/* Do not try the bad commands if  hang is due to a fault */
-	if (device->mmu.fault) {
-		KGSL_FT_ERR(device, "MMU fault skipping bad cmds\n");
-		device->mmu.fault = 0;
+	/* If long IB detected do not attempt replay of bad cmds */
+	if (long_ib) {
+		_adreno_debug_ft_info(device, ft_data);
 		goto play_good_cmds;
 	}
 
-	if (ft_data->ft_policy & KGSL_FT_DISABLE) {
+	if ((ft_data->ft_policy & KGSL_FT_DISABLE) ||
+		(ft_data->ft_policy & KGSL_FT_TEMP_DISABLE)) {
 		KGSL_FT_ERR(device, "NO FT policy play only good cmds\n");
+		ft_data->status = 1;
 		goto play_good_cmds;
 	}
 
+	/* Do not try the replay if the hang is due to a pagefault */
+	if (adreno_context->pagefault) {
+		if ((ft_data->context_id == adreno_context->id) &&
+			(ft_data->global_eop == adreno_context->pagefault_ts)) {
+			ft_data->ft_policy &= ~KGSL_FT_REPLAY;
+			KGSL_FT_ERR(device, "MMU fault skipping replay\n");
+		}
+
+		adreno_context->pagefault = 0;
+	}
+
 	if (ft_data->ft_policy & KGSL_FT_REPLAY) {
-
 		ret = _adreno_ft_resubmit_rb(device, rb, context, ft_data,
 				ft_data->bad_rb_buffer, ft_data->bad_rb_size);
 
 		if (ret) {
-			KGSL_FT_ERR(device, "Replay unsuccessful\n");
+			KGSL_FT_ERR(device, "Replay status: 1\n");
 			ft_data->status = 1;
 		} else
 			goto play_good_cmds;
 	}
 
 	if (ft_data->ft_policy & KGSL_FT_SKIPIB) {
-
 		for (i = 0; i < ft_data->bad_rb_size; i++) {
 			if ((ft_data->bad_rb_buffer[i] ==
 					CP_HDR_INDIRECT_BUFFER_PFD) &&
@@ -1874,7 +2276,7 @@
 				ft_data->bad_rb_buffer, ft_data->bad_rb_size);
 
 		if (ret) {
-			KGSL_FT_ERR(device, "NOP faulty IB unsuccessful\n");
+			KGSL_FT_ERR(device, "NOP faulty IB status: 1\n");
 			ft_data->status = 1;
 		} else {
 			ft_data->status = 0;
@@ -1883,7 +2285,6 @@
 	}
 
 	if (ft_data->ft_policy & KGSL_FT_SKIPFRAME) {
-
 		for (i = 0; i < ft_data->bad_rb_size; i++) {
 			if (ft_data->bad_rb_buffer[i] ==
 					KGSL_END_OF_FRAME_IDENTIFIER) {
@@ -1905,7 +2306,7 @@
 				ft_data->bad_rb_buffer, ft_data->bad_rb_size);
 
 		if (ret) {
-			KGSL_FT_ERR(device, "Skip EOF unsuccessful\n");
+			KGSL_FT_ERR(device, "Skip EOF status: 1\n");
 			ft_data->status = 1;
 		} else {
 			ft_data->status = 0;
@@ -1983,9 +2384,7 @@
 			/* setup new fault tolerance parameters and retry, this
 			 * means more than 1 contexts are causing hang */
 			adreno_destroy_ft_data(ft_data);
-			ret = adreno_setup_ft_data(device, ft_data);
-			if (ret)
-				goto done;
+			adreno_setup_ft_data(device, ft_data);
 			KGSL_FT_INFO(device,
 			"Retry. Parameters: "
 			"IB1: 0x%X, Bad context_id: %u, global_eop: 0x%x\n",
@@ -2050,7 +2449,12 @@
 		kgsl_pwrctrl_pwrlevel_change(device, pwr->max_pwrlevel);
 
 		/* Get the fault tolerance data as soon as hang is detected */
-		result = adreno_setup_ft_data(device, &ft_data);
+		adreno_setup_ft_data(device, &ft_data);
+		/*
+		 * Trigger an automatic dump of the state to
+		 * the console
+		 */
+		kgsl_postmortem_dump(device, 0);
 
 		/*
 		 * If long ib is detected, do not attempt postmortem or
@@ -2072,10 +2476,8 @@
 			kgsl_device_snapshot(device, 1);
 		}
 
-		if (!result) {
-			result = adreno_ft(device, &ft_data);
-			adreno_destroy_ft_data(&ft_data);
-		}
+		result = adreno_ft(device, &ft_data);
+		adreno_destroy_ft_data(&ft_data);
 
 		/* restore power level */
 		kgsl_pwrctrl_pwrlevel_change(device, curr_pwrlevel);
@@ -2228,39 +2630,6 @@
 			status = 0;
 		}
 		break;
-	case KGSL_PROP_FAULT_TOLERANCE: {
-			struct kgsl_ft_config ftd;
-
-			if (adreno_dev->ft_user_control == 0)
-				break;
-
-			if (sizebytes != sizeof(ftd))
-				break;
-
-			if (copy_from_user(&ftd, (void __user *) value,
-							   sizeof(ftd))) {
-				status = -EFAULT;
-				break;
-			}
-
-			if (ftd.ft_policy)
-				adreno_dev->ft_policy = ftd.ft_policy;
-			else
-				adreno_dev->ft_policy = KGSL_FT_DEFAULT_POLICY;
-
-			if (ftd.ft_pf_policy)
-				adreno_dev->ft_pf_policy = ftd.ft_policy;
-			else
-				adreno_dev->ft_pf_policy =
-					KGSL_FT_PAGEFAULT_DEFAULT_POLICY;
-
-			if (ftd.ft_pm_dump)
-				device->pm_dump_enable = 1;
-			else
-				device->pm_dump_enable = 0;
-
-		}
-		break;
 	default:
 		break;
 	}
@@ -2512,12 +2881,23 @@
 	return memdesc ? kgsl_gpuaddr_to_vaddr(memdesc, gpuaddr) : NULL;
 }
 
-void adreno_regread(struct kgsl_device *device, unsigned int offsetwords,
-				unsigned int *value)
+/**
+ * adreno_read - General read function to read adreno device memory
+ * @device - Pointer to the GPU device struct (for adreno device)
+ * @base - Base address (kernel virtual) where the device memory is mapped
+ * @offsetwords - Offset in words from the base address, of the memory that
+ * is to be read
+ * @value - Value read from the device memory
+ * @mem_len - Length of the device memory mapped to the kernel
+ */
+static void adreno_read(struct kgsl_device *device, void *base,
+		unsigned int offsetwords, unsigned int *value,
+		unsigned int mem_len)
 {
+
 	unsigned int *reg;
-	BUG_ON(offsetwords*sizeof(uint32_t) >= device->reg_len);
-	reg = (unsigned int *)(device->reg_virt + (offsetwords << 2));
+	BUG_ON(offsetwords*sizeof(uint32_t) >= mem_len);
+	reg = (unsigned int *)(base + (offsetwords << 2));
 
 	if (!in_interrupt())
 		kgsl_pre_hwaccess(device);
@@ -2528,6 +2908,31 @@
 	rmb();
 }
 
+/**
+ * adreno_regread - Used to read adreno device registers
+ * @offsetwords - Word (4 Bytes) offset to the register to be read
+ * @value - Value read from device register
+ */
+void adreno_regread(struct kgsl_device *device, unsigned int offsetwords,
+	unsigned int *value)
+{
+	adreno_read(device, device->reg_virt, offsetwords, value,
+						device->reg_len);
+}
+
+/**
+ * adreno_shadermem_regread - Used to read GPU (adreno) shader memory
+ * @device - GPU device whose shader memory is to be read
+ * @offsetwords - Offset in words, of the shader memory address to be read
+ * @value - Pointer to where the read shader mem value is to be stored
+ */
+void adreno_shadermem_regread(struct kgsl_device *device,
+	unsigned int offsetwords, unsigned int *value)
+{
+	adreno_read(device, device->shader_mem_virt, offsetwords, value,
+					device->shader_mem_len);
+}
+
 void adreno_regwrite(struct kgsl_device *device, unsigned int offsetwords,
 				unsigned int value)
 {
@@ -2538,6 +2943,8 @@
 	if (!in_interrupt())
 		kgsl_pre_hwaccess(device);
 
+	trace_kgsl_regwrite(device, offsetwords, value);
+
 	kgsl_cffdump_regwrite(device->id, offsetwords << 2, value);
 	reg = (unsigned int *)(device->reg_virt + (offsetwords << 2));
 
@@ -2612,6 +3019,7 @@
 		kgsl_sharedmem_writel(&device->memstore,
 				KGSL_MEMSTORE_OFFSET(context_id,
 					ts_cmp_enable), enableflag);
+
 		/* Make sure the memstore write gets posted */
 		wmb();
 
@@ -2621,9 +3029,10 @@
 		 * get an interrupt
 		 */
 
-		if (context && device->state != KGSL_STATE_SLUMBER)
+		if (context && device->state != KGSL_STATE_SLUMBER) {
 			adreno_ringbuffer_issuecmds(device, context->devctxt,
 					KGSL_CMD_FLAGS_NONE, NULL, 0);
+		}
 	}
 
 	return 0;
@@ -2687,10 +3096,13 @@
 	if (!adreno_dev->long_ib_detect)
 		long_ib_detected = 0;
 
+	if (!(adreno_dev->ringbuffer.flags & KGSL_FLAGS_STARTED))
+		return 0;
+
 	if (is_adreno_rbbm_status_idle(device)) {
 
 		/*
-		 * On A20X if the RPTR != WPTR and the device is idle, then
+		 * On A2XX if the RPTR != WPTR and the device is idle, then
 		 * the last write to WPTR probably failed to latch so write it
 		 * again
 		 */
@@ -2731,7 +3143,7 @@
 			&curr_global_ts,
 			KGSL_MEMSTORE_OFFSET(KGSL_MEMSTORE_GLOBAL,
 			eoptimestamp));
-
+	/* Make sure the memstore read has posted */
 	mb();
 
 	if (curr_global_ts == prev_global_ts) {
@@ -2742,6 +3154,8 @@
 				&curr_context_id,
 				KGSL_MEMSTORE_OFFSET(KGSL_MEMSTORE_GLOBAL,
 				current_context));
+			/* Make sure the memstore read has posted */
+			mb();
 			context = idr_find(&device->context_idr,
 				curr_context_id);
 			if (context != NULL) {
@@ -2753,8 +3167,6 @@
 			}
 		}
 
-		mb();
-
 		if (curr_context != NULL) {
 
 			curr_context->ib_gpu_time_used += KGSL_TIMEOUT_PART;
@@ -2863,6 +3275,8 @@
 	ts_issued = adreno_dev->ringbuffer.timestamp[context_id];
 
 	adreno_regread(device, REG_CP_RB_RPTR, &rptr);
+
+	/* Make sure timestamp check finished before triggering a hang */
 	mb();
 
 	KGSL_DRV_WARN(device,
@@ -3090,7 +3504,8 @@
 		break;
 	}
 	case KGSL_TIMESTAMP_CONSUMED:
-		adreno_regread(device, REG_CP_TIMESTAMP, &timestamp);
+		kgsl_sharedmem_readl(&device->memstore, &timestamp,
+			KGSL_MEMSTORE_OFFSET(context_id, soptimestamp));
 		break;
 	case KGSL_TIMESTAMP_RETIRED:
 		kgsl_sharedmem_readl(&device->memstore, &timestamp,
@@ -3106,27 +3521,55 @@
 static long adreno_ioctl(struct kgsl_device_private *dev_priv,
 			      unsigned int cmd, void *data)
 {
+	struct kgsl_device *device = dev_priv->device;
+	struct adreno_device *adreno_dev = ADRENO_DEVICE(device);
 	int result = 0;
-	struct kgsl_drawctxt_set_bin_base_offset *binbase;
-	struct kgsl_context *context;
 
 	switch (cmd) {
-	case IOCTL_KGSL_DRAWCTXT_SET_BIN_BASE_OFFSET:
+	case IOCTL_KGSL_DRAWCTXT_SET_BIN_BASE_OFFSET: {
+		struct kgsl_drawctxt_set_bin_base_offset *binbase = data;
+		struct kgsl_context *context;
+
 		binbase = data;
 
 		context = kgsl_find_context(dev_priv, binbase->drawctxt_id);
 		if (context) {
 			adreno_drawctxt_set_bin_base_offset(
-				dev_priv->device, context, binbase->offset);
+				device, context, binbase->offset);
 		} else {
 			result = -EINVAL;
-			KGSL_DRV_ERR(dev_priv->device,
+			KGSL_DRV_ERR(device,
 				"invalid drawctxt drawctxt_id %d "
 				"device_id=%d\n",
-				binbase->drawctxt_id, dev_priv->device->id);
+				binbase->drawctxt_id, device->id);
 		}
 		break;
-
+	}
+	case IOCTL_KGSL_PERFCOUNTER_GET: {
+		struct kgsl_perfcounter_get *get = data;
+		result = adreno_perfcounter_get(adreno_dev, get->groupid,
+			get->countable, &get->offset, PERFCOUNTER_FLAG_NONE);
+		break;
+	}
+	case IOCTL_KGSL_PERFCOUNTER_PUT: {
+		struct kgsl_perfcounter_put *put = data;
+		result = adreno_perfcounter_put(adreno_dev, put->groupid,
+			put->countable);
+		break;
+	}
+	case IOCTL_KGSL_PERFCOUNTER_QUERY: {
+		struct kgsl_perfcounter_query *query = data;
+		result = adreno_perfcounter_query_group(adreno_dev,
+			query->groupid, query->countables,
+			query->count, &query->max_counters);
+		break;
+	}
+	case IOCTL_KGSL_PERFCOUNTER_READ: {
+		struct kgsl_perfcounter_read *read = data;
+		result = adreno_perfcounter_read_group(adreno_dev,
+			read->reads, read->count);
+		break;
+	}
 	default:
 		KGSL_DRV_INFO(dev_priv->device,
 			"invalid ioctl code %08x\n", cmd);
@@ -3203,6 +3646,7 @@
 	.idle = adreno_idle,
 	.isidle = adreno_isidle,
 	.suspend_context = adreno_suspend_context,
+	.init = adreno_init,
 	.start = adreno_start,
 	.stop = adreno_stop,
 	.getproperty = adreno_getproperty,
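
The new IOCTL_KGSL_PERFCOUNTER_* cases added above expose the perfcounter interface to userspace. A minimal userspace sketch of claiming a counter might look like the following; the header path, device node name, and exact struct layout are assumptions for illustration only and should be checked against the matching msm_kgsl UAPI:

/*
 * Userspace sketch only: exercising the perfcounter ioctl added above.
 * The header location and device node name below are assumptions.
 */
#include <fcntl.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <unistd.h>
#include <linux/msm_kgsl.h>	/* assumed location of the KGSL UAPI header */

int example_get_counter(unsigned int groupid, unsigned int countable)
{
	struct kgsl_perfcounter_get get = {
		.groupid = groupid,
		.countable = countable,
	};
	int fd = open("/dev/kgsl-3d0", O_RDWR);	/* assumed device node */

	if (fd < 0)
		return -1;

	/* on success the driver fills in the counter register offset */
	if (ioctl(fd, IOCTL_KGSL_PERFCOUNTER_GET, &get) == 0)
		printf("countable %u mapped at register offset 0x%x\n",
		       countable, get.offset);

	close(fd);
	return 0;
}
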
diff --git a/drivers/gpu/msm/adreno.h b/drivers/gpu/msm/adreno.h
index d319c98..90d6027 100644
--- a/drivers/gpu/msm/adreno.h
+++ b/drivers/gpu/msm/adreno.h
@@ -1,4 +1,4 @@
-/* Copyright (c) 2008-2012, The Linux Foundation. All rights reserved.
+/* Copyright (c) 2008-2013, The Linux Foundation. All rights reserved.
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License version 2 and
@@ -39,6 +39,7 @@
 /* Command identifiers */
 #define KGSL_CONTEXT_TO_MEM_IDENTIFIER	0x2EADBEEF
 #define KGSL_CMD_IDENTIFIER		0x2EEDFACE
+#define KGSL_CMD_INTERNAL_IDENTIFIER	0x2EEDD00D
 #define KGSL_START_OF_IB_IDENTIFIER	0x2EADEABE
 #define KGSL_END_OF_IB_IDENTIFIER	0x2ABEDEAD
 #define KGSL_END_OF_FRAME_IDENTIFIER	0x2E0F2E0F
@@ -72,8 +73,10 @@
 	ADRENO_REV_A220 = 220,
 	ADRENO_REV_A225 = 225,
 	ADRENO_REV_A305 = 305,
+	ADRENO_REV_A305C = 306,
 	ADRENO_REV_A320 = 320,
 	ADRENO_REV_A330 = 330,
+	ADRENO_REV_A305B = 335,
 };
 
 struct adreno_gpudev;
@@ -103,7 +106,6 @@
 	unsigned int ib_check_level;
 	unsigned int fast_hang_detect;
 	unsigned int ft_policy;
-	unsigned int ft_user_control;
 	unsigned int long_ib_detect;
 	unsigned int long_ib;
 	unsigned int long_ib_ts;
@@ -111,6 +113,45 @@
 	unsigned int gpulist_index;
 	struct ocmem_buf *ocmem_hdl;
 	unsigned int ocmem_base;
+	unsigned int gpu_cycles;
+};
+
+#define PERFCOUNTER_FLAG_NONE 0x0
+#define PERFCOUNTER_FLAG_KERNEL 0x1
+
+/* Structs to maintain the list of active performance counters */
+
+/**
+ * struct adreno_perfcount_register: register state
+ * @countable: countable the register holds
+ * @refcount: number of users of the register
+ * @offset: register hardware offset
+ */
+struct adreno_perfcount_register {
+	unsigned int countable;
+	unsigned int refcount;
+	unsigned int offset;
+	unsigned int flags;
+};
+
+/**
+ * struct adreno_perfcount_group: registers for a hardware group
+ * @regs: available registers for this group
+ * @reg_count: total registers for this group
+ */
+struct adreno_perfcount_group {
+	struct adreno_perfcount_register *regs;
+	unsigned int reg_count;
+};
+
+/**
+ * struct adreno_perfcounters: all available perfcounter groups
+ * @groups: available groups for this device
+ * @group_count: total groups for this device
+ */
+struct adreno_perfcounters {
+	struct adreno_perfcount_group *groups;
+	unsigned int group_count;
 };
 
 struct adreno_gpudev {
@@ -124,6 +165,8 @@
 	/* keeps track of when we need to execute the draw workaround code */
 	int ctx_switches_since_last_draw;
 
+	struct adreno_perfcounters *perfcounters;
+
 	/* GPU specific function hooks */
 	int (*ctxt_create)(struct adreno_device *, struct adreno_context *);
 	void (*ctxt_save)(struct adreno_device *, struct adreno_context *);
@@ -134,9 +177,15 @@
 	void (*irq_control)(struct adreno_device *, int);
 	unsigned int (*irq_pending)(struct adreno_device *);
 	void * (*snapshot)(struct adreno_device *, void *, int *, int);
-	void (*rb_init)(struct adreno_device *, struct adreno_ringbuffer *);
+	int (*rb_init)(struct adreno_device *, struct adreno_ringbuffer *);
+	void (*perfcounter_init)(struct adreno_device *);
 	void (*start)(struct adreno_device *);
 	unsigned int (*busy_cycles)(struct adreno_device *);
+	void (*perfcounter_enable)(struct adreno_device *, unsigned int group,
+		unsigned int counter, unsigned int countable);
+	uint64_t (*perfcounter_read)(struct adreno_device *adreno_dev,
+		unsigned int group, unsigned int counter,
+		unsigned int offset);
 };
 
 /*
@@ -179,6 +228,22 @@
 	unsigned int replay_for_snapshot;
 };
 
+/* Fault Tolerance policy flags */
+#define  KGSL_FT_DISABLE                  BIT(0)
+#define  KGSL_FT_REPLAY                   BIT(1)
+#define  KGSL_FT_SKIPIB                   BIT(2)
+#define  KGSL_FT_SKIPFRAME                BIT(3)
+#define  KGSL_FT_TEMP_DISABLE             BIT(4)
+#define  KGSL_FT_DEFAULT_POLICY           (KGSL_FT_REPLAY + KGSL_FT_SKIPIB)
+
+/* Pagefault policy flags */
+#define KGSL_FT_PAGEFAULT_INT_ENABLE         0x00000001
+#define KGSL_FT_PAGEFAULT_GPUHALT_ENABLE     0x00000002
+#define KGSL_FT_PAGEFAULT_LOG_ONE_PER_PAGE   0x00000004
+#define KGSL_FT_PAGEFAULT_LOG_ONE_PER_INT    0x00000008
+#define KGSL_FT_PAGEFAULT_DEFAULT_POLICY     (KGSL_FT_PAGEFAULT_INT_ENABLE + \
+					KGSL_FT_PAGEFAULT_GPUHALT_ENABLE)
+
 extern struct adreno_gpudev adreno_a2xx_gpudev;
 extern struct adreno_gpudev adreno_a3xx_gpudev;
 
@@ -210,7 +275,13 @@
 void adreno_regwrite(struct kgsl_device *device, unsigned int offsetwords,
 				unsigned int value);
 
+void adreno_shadermem_regread(struct kgsl_device *device,
+						unsigned int offsetwords,
+						unsigned int *value);
+
 int adreno_dump(struct kgsl_device *device, int manual);
+unsigned int adreno_a3xx_rbbm_clock_ctl_default(struct adreno_device
+							*adreno_dev);
 
 struct kgsl_memdesc *adreno_find_region(struct kgsl_device *device,
 						unsigned int pt_base,
@@ -234,6 +305,13 @@
 unsigned int adreno_ft_detect(struct kgsl_device *device,
 						unsigned int *prev_reg_val);
 
+int adreno_perfcounter_get(struct adreno_device *adreno_dev,
+	unsigned int groupid, unsigned int countable, unsigned int *offset,
+	unsigned int flags);
+
+int adreno_perfcounter_put(struct adreno_device *adreno_dev,
+	unsigned int groupid, unsigned int countable);
+
 static inline int adreno_is_a200(struct adreno_device *adreno_dev)
 {
 	return (adreno_dev->gpurev == ADRENO_REV_A200);
@@ -285,6 +363,16 @@
 	return (adreno_dev->gpurev == ADRENO_REV_A305);
 }
 
+static inline int adreno_is_a305b(struct adreno_device *adreno_dev)
+{
+	return (adreno_dev->gpurev == ADRENO_REV_A305B);
+}
+
+static inline int adreno_is_a305c(struct adreno_device *adreno_dev)
+{
+	return (adreno_dev->gpurev == ADRENO_REV_A305C);
+}
+
 static inline int adreno_is_a320(struct adreno_device *adreno_dev)
 {
 	return (adreno_dev->gpurev == ADRENO_REV_A320);
@@ -295,6 +383,12 @@
 	return (adreno_dev->gpurev == ADRENO_REV_A330);
 }
 
+static inline int adreno_is_a330v2(struct adreno_device *adreno_dev)
+{
+	return ((adreno_dev->gpurev == ADRENO_REV_A330) &&
+		(ADRENO_CHIPID_PATCH(adreno_dev->chip_id) > 0));
+}
+
 static inline int adreno_rb_ctxtswitch(unsigned int *cmd)
 {
 	return (cmd[0] == cp_nop_packet(1) &&
@@ -400,12 +494,13 @@
 	unsigned int *start = cmds;
 
 	*cmds++ = cp_type3_packet(CP_WAIT_FOR_IDLE, 1);
-	*cmds++ = 0x00000000;
+	*cmds++ = 0;
 
 	if ((adreno_dev->gpurev == ADRENO_REV_A305) ||
+		(adreno_dev->gpurev == ADRENO_REV_A305C) ||
 		(adreno_dev->gpurev == ADRENO_REV_A320)) {
 		*cmds++ = cp_type3_packet(CP_WAIT_FOR_ME, 1);
-		*cmds++ = 0x00000000;
+		*cmds++ = 0;
 	}
 
 	return cmds - start;
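
The fault-tolerance policy macros added above are one-bit flags, so the default policy built with '+' is equivalent to OR-ing the bits together. A small sketch of testing a policy word, mirroring the checks adreno.c performs (the function name is hypothetical):

/*
 * Sketch only: the flags are distinct bits, so
 * (KGSL_FT_REPLAY + KGSL_FT_SKIPIB) == (KGSL_FT_REPLAY | KGSL_FT_SKIPIB).
 */
static int example_ft_policy_allows_replay(unsigned int policy)
{
	if ((policy & KGSL_FT_DISABLE) || (policy & KGSL_FT_TEMP_DISABLE))
		return 0;	/* only the known-good commands are played */

	return (policy & KGSL_FT_REPLAY) != 0;
}
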
diff --git a/drivers/gpu/msm/adreno_a2xx.c b/drivers/gpu/msm/adreno_a2xx.c
index ba4e507..dd9bdc3 100644
--- a/drivers/gpu/msm/adreno_a2xx.c
+++ b/drivers/gpu/msm/adreno_a2xx.c
@@ -1515,18 +1515,26 @@
 			"Current active context has caused gpu hang\n");
 
 	if (!(context->flags & CTXT_FLAGS_PREAMBLE)) {
-
+		kgsl_cffdump_syncmem(NULL, &context->gpustate,
+			context->reg_save[1],
+			context->reg_save[2] << 2, true);
 		/* save registers and constants. */
 		adreno_ringbuffer_issuecmds(device, context,
 			KGSL_CMD_FLAGS_NONE,
 			context->reg_save, 3);
 
 		if (context->flags & CTXT_FLAGS_SHADER_SAVE) {
+			kgsl_cffdump_syncmem(NULL, &context->gpustate,
+				context->shader_save[1],
+				context->shader_save[2] << 2, true);
 			/* save shader partitioning and instructions. */
 			adreno_ringbuffer_issuecmds(device, context,
 				KGSL_CMD_FLAGS_PMODE,
 				context->shader_save, 3);
 
+			kgsl_cffdump_syncmem(NULL, &context->gpustate,
+				context->shader_fixup[1],
+				context->shader_fixup[2] << 2, true);
 			/*
 			 * fixup shader partitioning parameter for
 			 *  SET_SHADER_BASES.
@@ -1541,6 +1549,9 @@
 
 	if ((context->flags & CTXT_FLAGS_GMEM_SAVE) &&
 	    (context->flags & CTXT_FLAGS_GMEM_SHADOW)) {
+		kgsl_cffdump_syncmem(NULL, &context->gpustate,
+			context->context_gmem_shadow.gmem_save[1],
+			context->context_gmem_shadow.gmem_save[2] << 2, true);
 		/* save gmem.
 		 * (note: changes shader. shader must already be saved.)
 		 */
@@ -1548,6 +1559,10 @@
 			KGSL_CMD_FLAGS_PMODE,
 			context->context_gmem_shadow.gmem_save, 3);
 
+		kgsl_cffdump_syncmem(NULL, &context->gpustate,
+			context->chicken_restore[1],
+			context->chicken_restore[2] << 2, true);
+
 		/* Restore TP0_CHICKEN */
 		if (!(context->flags & CTXT_FLAGS_PREAMBLE)) {
 			adreno_ringbuffer_issuecmds(device, context,
@@ -1574,8 +1589,6 @@
 		return;
 	}
 
-	KGSL_CTXT_INFO(device, "context flags %08x\n", context->flags);
-
 	cmds[0] = cp_nop_packet(1);
 	cmds[1] = KGSL_CONTEXT_TO_MEM_IDENTIFIER;
 	cmds[2] = cp_type3_packet(CP_MEM_WRITE, 2);
@@ -1586,21 +1599,24 @@
 					cmds, 5);
 	kgsl_mmu_setstate(&device->mmu, context->pagetable, context->id);
 
-#ifndef CONFIG_MSM_KGSL_CFF_DUMP_NO_CONTEXT_MEM_DUMP
-	kgsl_cffdump_syncmem(NULL, &context->gpustate,
-		context->gpustate.gpuaddr, LCC_SHADOW_SIZE +
-		REG_SHADOW_SIZE + CMD_BUFFER_SIZE + TEX_SHADOW_SIZE, false);
-#endif
-
 	/* restore gmem.
 	 *  (note: changes shader. shader must not already be restored.)
 	 */
 	if (context->flags & CTXT_FLAGS_GMEM_RESTORE) {
+		kgsl_cffdump_syncmem(NULL, &context->gpustate,
+			context->context_gmem_shadow.gmem_restore[1],
+			context->context_gmem_shadow.gmem_restore[2] << 2,
+			true);
+
 		adreno_ringbuffer_issuecmds(device, context,
 			KGSL_CMD_FLAGS_PMODE,
 			context->context_gmem_shadow.gmem_restore, 3);
 
 		if (!(context->flags & CTXT_FLAGS_PREAMBLE)) {
+			kgsl_cffdump_syncmem(NULL, &context->gpustate,
+				context->chicken_restore[1],
+				context->chicken_restore[2] << 2, true);
+
 			/* Restore TP0_CHICKEN */
 			adreno_ringbuffer_issuecmds(device, context,
 				KGSL_CMD_FLAGS_NONE,
@@ -1611,6 +1627,9 @@
 	}
 
 	if (!(context->flags & CTXT_FLAGS_PREAMBLE)) {
+		kgsl_cffdump_syncmem(NULL, &context->gpustate,
+			context->reg_restore[1],
+			context->reg_restore[2] << 2, true);
 
 		/* restore registers and constants. */
 		adreno_ringbuffer_issuecmds(device, context,
@@ -1618,6 +1637,10 @@
 
 		/* restore shader instructions & partitioning. */
 		if (context->flags & CTXT_FLAGS_SHADER_RESTORE) {
+			kgsl_cffdump_syncmem(NULL, &context->gpustate,
+				context->shader_restore[1],
+				context->shader_restore[2] << 2, true);
+
 			adreno_ringbuffer_issuecmds(device, context,
 				KGSL_CMD_FLAGS_NONE,
 				context->shader_restore, 3);
@@ -1727,12 +1750,8 @@
 	adreno_regwrite(device, REG_CP_INT_ACK, status);
 
 	if (status & (CP_INT_CNTL__IB1_INT_MASK | CP_INT_CNTL__RB_INT_MASK)) {
-		KGSL_CMD_WARN(rb->device, "ringbuffer ib1/rb interrupt\n");
 		queue_work(device->work_queue, &device->ts_expired_ws);
 		wake_up_interruptible_all(&device->wait_queue);
-		atomic_notifier_call_chain(&(device->ts_notifier_list),
-					   device->id,
-					   NULL);
 	}
 }
 
@@ -1821,23 +1840,26 @@
 static unsigned int a2xx_irq_pending(struct adreno_device *adreno_dev)
 {
 	struct kgsl_device *device = &adreno_dev->dev;
-	unsigned int rbbm, cp, mh;
+	unsigned int status;
 
-	adreno_regread(device, REG_RBBM_INT_CNTL, &rbbm);
-	adreno_regread(device, REG_CP_INT_CNTL, &cp);
-	adreno_regread(device, MH_INTERRUPT_MASK, &mh);
+	adreno_regread(device, REG_MASTER_INT_SIGNAL, &status);
 
-	return ((rbbm & RBBM_INT_MASK) || (cp & CP_INT_MASK) ||
-		(mh & kgsl_mmu_get_int_mask())) ? 1 : 0;
+	return (status &
+		(MASTER_INT_SIGNAL__MH_INT_STAT |
+		 MASTER_INT_SIGNAL__CP_INT_STAT |
+		 MASTER_INT_SIGNAL__RBBM_INT_STAT)) ? 1 : 0;
 }
 
-static void a2xx_rb_init(struct adreno_device *adreno_dev,
+static int a2xx_rb_init(struct adreno_device *adreno_dev,
 			struct adreno_ringbuffer *rb)
 {
 	unsigned int *cmds, cmds_gpu;
 
 	/* ME_INIT */
 	cmds = adreno_ringbuffer_allocspace(rb, NULL, 19);
+	if (cmds == NULL)
+		return -ENOMEM;
+
 	cmds_gpu = rb->buffer_desc.gpuaddr + sizeof(uint)*(rb->wptr-19);
 
 	GSL_RB_WRITE(cmds, cmds_gpu, cp_type3_packet(CP_ME_INIT, 18));
@@ -1890,6 +1912,8 @@
 	GSL_RB_WRITE(cmds, cmds_gpu, 0x00000000);
 
 	adreno_ringbuffer_submit(rb);
+
+	return 0;
 }
 
 static unsigned int a2xx_busy_cycles(struct adreno_device *adreno_dev)
diff --git a/drivers/gpu/msm/adreno_a2xx_snapshot.c b/drivers/gpu/msm/adreno_a2xx_snapshot.c
index 75795b1..2c86f82 100644
--- a/drivers/gpu/msm/adreno_a2xx_snapshot.c
+++ b/drivers/gpu/msm/adreno_a2xx_snapshot.c
@@ -224,6 +224,31 @@
 	return DEBUG_SECTION_SZ(MIUDEBUG_COUNT);
 }
 
+/* Snapshot the istore memory */
+static int a2xx_snapshot_istore(struct kgsl_device *device, void *snapshot,
+	int remain, void *priv)
+{
+	struct kgsl_snapshot_istore *header = snapshot;
+	unsigned int *data = snapshot + sizeof(*header);
+	struct adreno_device *adreno_dev = ADRENO_DEVICE(device);
+	int count, i;
+
+	count = adreno_dev->istore_size * adreno_dev->instruction_size;
+
+	if (remain < (count * 4) + sizeof(*header)) {
+		KGSL_DRV_ERR(device,
+			"snapshot: Not enough memory for the istore section");
+		return 0;
+	}
+
+	header->count = adreno_dev->istore_size;
+
+	for (i = 0; i < count; i++)
+		kgsl_regread(device, ADRENO_ISTORE_START + i, &data[i]);
+
+	return (count * 4) + sizeof(*header);
+}
+
 /* A2XX GPU snapshot function - this is where all of the A2XX specific
  * bits and pieces are grabbed into the snapshot memory
  */
@@ -338,6 +363,18 @@
 		}
 	}
 
+	/*
+	 * Only dump the istore on a hang - reading it on a running system
+	 * has a non-zero chance of hanging the GPU.
+	 */
+
+	if (adreno_is_a2xx(adreno_dev) && hang) {
+		snapshot = kgsl_snapshot_add_section(device,
+			KGSL_SNAPSHOT_SECTION_ISTORE, snapshot, remain,
+			a2xx_snapshot_istore, NULL);
+	}
+
 	/* Reset the clock gating */
 	adreno_regwrite(device, REG_RBBM_PM_OVERRIDE2, pmoverride);
 
diff --git a/drivers/gpu/msm/adreno_a3xx.c b/drivers/gpu/msm/adreno_a3xx.c
index 3d9ec6d..13c723a 100644
--- a/drivers/gpu/msm/adreno_a3xx.c
+++ b/drivers/gpu/msm/adreno_a3xx.c
@@ -52,8 +52,8 @@
 	0x2240, 0x227e,
 	0x2280, 0x228b, 0x22c0, 0x22c0, 0x22c4, 0x22ce, 0x22d0, 0x22d8,
 	0x22df, 0x22e6, 0x22e8, 0x22e9, 0x22ec, 0x22ec, 0x22f0, 0x22f7,
-	0x22ff, 0x22ff, 0x2340, 0x2343, 0x2348, 0x2349, 0x2350, 0x2356,
-	0x2360, 0x2360, 0x2440, 0x2440, 0x2444, 0x2444, 0x2448, 0x244d,
+	0x22ff, 0x22ff, 0x2340, 0x2343,
+	0x2440, 0x2440, 0x2444, 0x2444, 0x2448, 0x244d,
 	0x2468, 0x2469, 0x246c, 0x246d, 0x2470, 0x2470, 0x2472, 0x2472,
 	0x2474, 0x2475, 0x2479, 0x247a, 0x24c0, 0x24d3, 0x24e4, 0x24ef,
 	0x2500, 0x2509, 0x250c, 0x250c, 0x250e, 0x250e, 0x2510, 0x2511,
@@ -61,8 +61,8 @@
 	0x25f0, 0x25f0,
 	0x2640, 0x267e, 0x2680, 0x268b, 0x26c0, 0x26c0, 0x26c4, 0x26ce,
 	0x26d0, 0x26d8, 0x26df, 0x26e6, 0x26e8, 0x26e9, 0x26ec, 0x26ec,
-	0x26f0, 0x26f7, 0x26ff, 0x26ff, 0x2740, 0x2743, 0x2748, 0x2749,
-	0x2750, 0x2756, 0x2760, 0x2760, 0x300C, 0x300E, 0x301C, 0x301D,
+	0x26f0, 0x26f7, 0x26ff, 0x26ff, 0x2740, 0x2743,
+	0x300C, 0x300E, 0x301C, 0x301D,
 	0x302A, 0x302A, 0x302C, 0x302D, 0x3030, 0x3031, 0x3034, 0x3036,
 	0x303C, 0x303C, 0x305E, 0x305F,
 };
@@ -445,6 +445,25 @@
 	tmp_ctx.cmd = cmd;
 }
 
+unsigned int adreno_a3xx_rbbm_clock_ctl_default(struct adreno_device
+							*adreno_dev)
+{
+	if (adreno_is_a305(adreno_dev))
+		return A305_RBBM_CLOCK_CTL_DEFAULT;
+	else if (adreno_is_a305c(adreno_dev))
+		return A305C_RBBM_CLOCK_CTL_DEFAULT;
+	else if (adreno_is_a320(adreno_dev))
+		return A320_RBBM_CLOCK_CTL_DEFAULT;
+	else if (adreno_is_a330v2(adreno_dev))
+		return A330v2_RBBM_CLOCK_CTL_DEFAULT;
+	else if (adreno_is_a330(adreno_dev))
+		return A330_RBBM_CLOCK_CTL_DEFAULT;
+	else if (adreno_is_a305b(adreno_dev))
+		return A305B_RBBM_CLOCK_CTL_DEFAULT;
+
+	BUG_ON(1);
+}
+
 /* Copy GMEM contents to system memory shadow. */
 static unsigned int *build_gmem2sys_cmds(struct adreno_device *adreno_dev,
 					 struct adreno_context *drawctxt,
@@ -454,7 +473,7 @@
 	unsigned int *start = cmds;
 
 	*cmds++ = cp_type0_packet(A3XX_RBBM_CLOCK_CTL, 1);
-	*cmds++ = A3XX_RBBM_CLOCK_CTL_DEFAULT;
+	*cmds++ = adreno_a3xx_rbbm_clock_ctl_default(adreno_dev);
 
 	*cmds++ = cp_type3_packet(CP_SET_CONSTANT, 3);
 	*cmds++ = CP_REG(A3XX_RB_MODE_CONTROL);
@@ -1250,7 +1269,7 @@
 	unsigned int *start = cmds;
 
 	*cmds++ = cp_type0_packet(A3XX_RBBM_CLOCK_CTL, 1);
-	*cmds++ = A3XX_RBBM_CLOCK_CTL_DEFAULT;
+	*cmds++ = adreno_a3xx_rbbm_clock_ctl_default(adreno_dev);
 
 	*cmds++ = cp_type3_packet(CP_SET_CONSTANT, 5);
 	*cmds++ = CP_REG(A3XX_HLSQ_CONTROL_0_REG);
@@ -2400,6 +2419,11 @@
 		 * already be saved.)
 		 */
 
+		kgsl_cffdump_syncmem(NULL,
+			&context->gpustate,
+			context->context_gmem_shadow.gmem_save[1],
+			context->context_gmem_shadow.gmem_save[2] << 2, true);
+
 		adreno_ringbuffer_issuecmds(device, context,
 					KGSL_CMD_FLAGS_PMODE,
 					    context->context_gmem_shadow.
@@ -2421,8 +2445,6 @@
 		return;
 	}
 
-	KGSL_CTXT_INFO(device, "context flags %08x\n", context->flags);
-
 	cmds[0] = cp_nop_packet(1);
 	cmds[1] = KGSL_CONTEXT_TO_MEM_IDENTIFIER;
 	cmds[2] = cp_type3_packet(CP_MEM_WRITE, 2);
@@ -2439,6 +2461,12 @@
 	 */
 
 	if (context->flags & CTXT_FLAGS_GMEM_RESTORE) {
+		kgsl_cffdump_syncmem(NULL,
+			&context->gpustate,
+			context->context_gmem_shadow.gmem_restore[1],
+			context->context_gmem_shadow.gmem_restore[2] << 2,
+			true);
+
 		adreno_ringbuffer_issuecmds(device, context,
 					KGSL_CMD_FLAGS_PMODE,
 					    context->context_gmem_shadow.
@@ -2471,11 +2499,14 @@
 	}
 }
 
-static void a3xx_rb_init(struct adreno_device *adreno_dev,
+static int a3xx_rb_init(struct adreno_device *adreno_dev,
 			 struct adreno_ringbuffer *rb)
 {
 	unsigned int *cmds, cmds_gpu;
 	cmds = adreno_ringbuffer_allocspace(rb, NULL, 18);
+	if (cmds == NULL)
+		return -ENOMEM;
+
 	cmds_gpu = rb->buffer_desc.gpuaddr + sizeof(uint) * (rb->wptr - 18);
 
 	GSL_RB_WRITE(cmds, cmds_gpu, cp_type3_packet(CP_ME_INIT, 17));
@@ -2499,6 +2530,8 @@
 	GSL_RB_WRITE(cmds, cmds_gpu, 0x00000000);
 
 	adreno_ringbuffer_submit(rb);
+
+	return 0;
 }
 
 static void a3xx_err_callback(struct adreno_device *adreno_dev, int bit)
@@ -2581,9 +2614,213 @@
 
 	/* Schedule work to free mem and issue ibs */
 	queue_work(device->work_queue, &device->ts_expired_ws);
+}
 
-	atomic_notifier_call_chain(&device->ts_notifier_list,
-				   device->id, NULL);
+/**
+ * struct a3xx_perfcounter_register - Define a performance counter register
+ * @load_bit: the bit to set in RBBM_LOAD_CMD0/RBBM_LOAD_CMD1 to force the RBBM
+ * to load the reset value into the appropriate counter
+ * @select: The dword offset of the register to write the selected
+ * countable into
+ */
+
+struct a3xx_perfcounter_register {
+	unsigned int load_bit;
+	unsigned int select;
+};
+
+static struct a3xx_perfcounter_register a3xx_perfcounter_reg_cp[] = {
+	{ 0, A3XX_CP_PERFCOUNTER_SELECT },
+};
+
+static struct a3xx_perfcounter_register a3xx_perfcounter_reg_rbbm[] = {
+	{ 1, A3XX_RBBM_PERFCOUNTER0_SELECT },
+	{ 2, A3XX_RBBM_PERFCOUNTER1_SELECT },
+};
+
+static struct a3xx_perfcounter_register a3xx_perfcounter_reg_pc[] = {
+	{ 3, A3XX_PC_PERFCOUNTER0_SELECT },
+	{ 4, A3XX_PC_PERFCOUNTER1_SELECT },
+	{ 5, A3XX_PC_PERFCOUNTER2_SELECT },
+	{ 6, A3XX_PC_PERFCOUNTER3_SELECT },
+};
+
+static struct a3xx_perfcounter_register a3xx_perfcounter_reg_vfd[] = {
+	{ 7, A3XX_VFD_PERFCOUNTER0_SELECT },
+	{ 8, A3XX_VFD_PERFCOUNTER1_SELECT },
+};
+
+static struct a3xx_perfcounter_register a3xx_perfcounter_reg_hlsq[] = {
+	{ 9, A3XX_HLSQ_PERFCOUNTER0_SELECT },
+	{ 10, A3XX_HLSQ_PERFCOUNTER1_SELECT },
+	{ 11, A3XX_HLSQ_PERFCOUNTER2_SELECT },
+	{ 12, A3XX_HLSQ_PERFCOUNTER3_SELECT },
+	{ 13, A3XX_HLSQ_PERFCOUNTER4_SELECT },
+	{ 14, A3XX_HLSQ_PERFCOUNTER5_SELECT },
+};
+
+static struct a3xx_perfcounter_register a3xx_perfcounter_reg_vpc[] = {
+	{ 15, A3XX_VPC_PERFCOUNTER0_SELECT },
+	{ 16, A3XX_VPC_PERFCOUNTER1_SELECT },
+};
+
+static struct a3xx_perfcounter_register a3xx_perfcounter_reg_tse[] = {
+	{ 17, A3XX_GRAS_PERFCOUNTER0_SELECT },
+	{ 18, A3XX_GRAS_PERFCOUNTER1_SELECT },
+};
+
+static struct a3xx_perfcounter_register a3xx_perfcounter_reg_ras[] = {
+	{ 19, A3XX_GRAS_PERFCOUNTER2_SELECT },
+	{ 20, A3XX_GRAS_PERFCOUNTER3_SELECT },
+};
+
+static struct a3xx_perfcounter_register a3xx_perfcounter_reg_uche[] = {
+	{ 21, A3XX_UCHE_PERFCOUNTER0_SELECT },
+	{ 22, A3XX_UCHE_PERFCOUNTER1_SELECT },
+	{ 23, A3XX_UCHE_PERFCOUNTER2_SELECT },
+	{ 24, A3XX_UCHE_PERFCOUNTER3_SELECT },
+	{ 25, A3XX_UCHE_PERFCOUNTER4_SELECT },
+	{ 26, A3XX_UCHE_PERFCOUNTER5_SELECT },
+};
+
+static struct a3xx_perfcounter_register a3xx_perfcounter_reg_tp[] = {
+	{ 27, A3XX_TP_PERFCOUNTER0_SELECT },
+	{ 28, A3XX_TP_PERFCOUNTER1_SELECT },
+	{ 29, A3XX_TP_PERFCOUNTER2_SELECT },
+	{ 30, A3XX_TP_PERFCOUNTER3_SELECT },
+	{ 31, A3XX_TP_PERFCOUNTER4_SELECT },
+	{ 32, A3XX_TP_PERFCOUNTER5_SELECT },
+};
+
+static struct a3xx_perfcounter_register a3xx_perfcounter_reg_sp[] = {
+	{ 33, A3XX_SP_PERFCOUNTER0_SELECT },
+	{ 34, A3XX_SP_PERFCOUNTER1_SELECT },
+	{ 35, A3XX_SP_PERFCOUNTER2_SELECT },
+	{ 36, A3XX_SP_PERFCOUNTER3_SELECT },
+	{ 37, A3XX_SP_PERFCOUNTER4_SELECT },
+	{ 38, A3XX_SP_PERFCOUNTER5_SELECT },
+	{ 39, A3XX_SP_PERFCOUNTER6_SELECT },
+	{ 40, A3XX_SP_PERFCOUNTER7_SELECT },
+};
+
+static struct a3xx_perfcounter_register a3xx_perfcounter_reg_rb[] = {
+	{ 41, A3XX_RB_PERFCOUNTER0_SELECT },
+	{ 42, A3XX_RB_PERFCOUNTER1_SELECT },
+};
+
+#define REGCOUNTER_GROUP(_x) { (_x), ARRAY_SIZE((_x)) }
+
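+/* Indexed by KGSL_PERFCOUNTER_GROUP_*; the PWR group is handled separately */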
+static struct {
+	struct a3xx_perfcounter_register *regs;
+	int count;
+} a3xx_perfcounter_reglist[] = {
+	REGCOUNTER_GROUP(a3xx_perfcounter_reg_cp),
+	REGCOUNTER_GROUP(a3xx_perfcounter_reg_rbbm),
+	REGCOUNTER_GROUP(a3xx_perfcounter_reg_pc),
+	REGCOUNTER_GROUP(a3xx_perfcounter_reg_vfd),
+	REGCOUNTER_GROUP(a3xx_perfcounter_reg_hlsq),
+	REGCOUNTER_GROUP(a3xx_perfcounter_reg_vpc),
+	REGCOUNTER_GROUP(a3xx_perfcounter_reg_tse),
+	REGCOUNTER_GROUP(a3xx_perfcounter_reg_ras),
+	REGCOUNTER_GROUP(a3xx_perfcounter_reg_uche),
+	REGCOUNTER_GROUP(a3xx_perfcounter_reg_tp),
+	REGCOUNTER_GROUP(a3xx_perfcounter_reg_sp),
+	REGCOUNTER_GROUP(a3xx_perfcounter_reg_rb),
+};
+
+static void a3xx_perfcounter_enable_pwr(struct kgsl_device *device,
+	unsigned int countable)
+{
+	unsigned int in, out;
+
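+	/* Reset the selected power counter, then enable it */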
+	adreno_regread(device, A3XX_RBBM_RBBM_CTL, &in);
+
+	if (countable == 0)
+		out = in | RBBM_RBBM_CTL_RESET_PWR_CTR0;
+	else
+		out = in | RBBM_RBBM_CTL_RESET_PWR_CTR1;
+
+	adreno_regwrite(device, A3XX_RBBM_RBBM_CTL, out);
+
+	if (countable == 0)
+		out = in | RBBM_RBBM_CTL_ENABLE_PWR_CTR0;
+	else
+		out = in | RBBM_RBBM_CTL_ENABLE_PWR_CTR1;
+
+	adreno_regwrite(device, A3XX_RBBM_RBBM_CTL, out);
+}
+
+/*
+ * a3xx_perfcounter_enable - Configure a performance counter for a countable
+ * @adreno_dev - Adreno device to configure
+ * @group - Desired performance counter group
+ * @counter - Desired performance counter in the group
+ * @countable - Desired countable
+ *
+ * Physically set up a counter within a group with the desired countable
+ */
+
+static void a3xx_perfcounter_enable(struct adreno_device *adreno_dev,
+	unsigned int group, unsigned int counter, unsigned int countable)
+{
+	struct kgsl_device *device = &adreno_dev->dev;
+	unsigned int val = 0;
+	struct a3xx_perfcounter_register *reg;
+
+	/* Special case - power; handled before indexing the register list */
+	if (group == KGSL_PERFCOUNTER_GROUP_PWR)
+		return a3xx_perfcounter_enable_pwr(device, countable);
+
+	if (group >= ARRAY_SIZE(a3xx_perfcounter_reglist))
+		return;
+
+	if (counter >= a3xx_perfcounter_reglist[group].count)
+		return;
+
+	reg = &(a3xx_perfcounter_reglist[group].regs[counter]);
+
+	/* Select the desired perfcounter */
+	adreno_regwrite(device, reg->select, countable);
+
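+	/* Load the counter's reset value; load_bit picks LOAD_CMD0 vs LOAD_CMD1 */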
+	if (reg->load_bit < 32) {
+		val = 1 << reg->load_bit;
+		adreno_regwrite(device, A3XX_RBBM_PERFCTR_LOAD_CMD0, val);
+	} else {
+		val  = 1 << (reg->load_bit - 32);
+		adreno_regwrite(device, A3XX_RBBM_PERFCTR_LOAD_CMD1, val);
+	}
+}
+
+static uint64_t a3xx_perfcounter_read(struct adreno_device *adreno_dev,
+	unsigned int group, unsigned int counter,
+	unsigned int offset)
+{
+	struct kgsl_device *device = &adreno_dev->dev;
+	struct a3xx_perfcounter_register *reg = NULL;
+	unsigned int lo = 0, hi = 0;
+	unsigned int val;
+
+	if (group > ARRAY_SIZE(a3xx_perfcounter_reglist))
+		return 0;
+
+	reg = &(a3xx_perfcounter_reglist[group].regs[counter]);
+
+	/* Freeze the counter */
+	adreno_regread(device, A3XX_RBBM_PERFCTR_CTL, &val);
+	val &= ~reg->load_bit;
+	adreno_regwrite(device, A3XX_RBBM_PERFCTR_CTL, val);
+
+	/* Read the values */
+	adreno_regread(device, offset, &lo);
+	adreno_regread(device, offset + 1, &hi);
+
+	/* Re-Enable the counter */
+	val |= reg->load_bit;
+	adreno_regwrite(device, A3XX_RBBM_PERFCTR_CTL, val);
+
+	return (((uint64_t) hi) << 32) | lo;
 }
 
 #define A3XX_IRQ_CALLBACK(_c) { .func = _c }
@@ -2687,26 +2924,22 @@
 static unsigned int a3xx_busy_cycles(struct adreno_device *adreno_dev)
 {
 	struct kgsl_device *device = &adreno_dev->dev;
-	unsigned int reg, val;
-
-	/* Freeze the counter */
-	adreno_regread(device, A3XX_RBBM_RBBM_CTL, &reg);
-	reg &= ~RBBM_RBBM_CTL_ENABLE_PWR_CTR1;
-	adreno_regwrite(device, A3XX_RBBM_RBBM_CTL, reg);
+	unsigned int val;
+	unsigned int ret = 0;
 
 	/* Read the value */
 	adreno_regread(device, A3XX_RBBM_PERFCTR_PWR_1_LO, &val);
 
-	/* Reset the counter */
-	reg |= RBBM_RBBM_CTL_RESET_PWR_CTR1;
-	adreno_regwrite(device, A3XX_RBBM_RBBM_CTL, reg);
+	/* Return 0 for the first read */
+	if (adreno_dev->gpu_cycles != 0) {
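+		/* Return the delta since the last read, handling 32-bit wraparound */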
+		if (val < adreno_dev->gpu_cycles)
+			ret = (0xFFFFFFFF - adreno_dev->gpu_cycles) + val;
+		else
+			ret = val - adreno_dev->gpu_cycles;
+	}
 
-	/* Re-enable the counter */
-	reg &= ~RBBM_RBBM_CTL_RESET_PWR_CTR1;
-	reg |= RBBM_RBBM_CTL_ENABLE_PWR_CTR1;
-	adreno_regwrite(device, A3XX_RBBM_RBBM_CTL, reg);
-
-	return val;
+	adreno_dev->gpu_cycles = val;
+	return ret;
 }
 
 struct a3xx_vbif_data {
@@ -2734,6 +2967,29 @@
 	{0, 0},
 };
 
+static struct a3xx_vbif_data a305b_vbif[] = {
+	{ A3XX_VBIF_IN_RD_LIM_CONF0, 0x00181818 },
+	{ A3XX_VBIF_IN_WR_LIM_CONF0, 0x00181818 },
+	{ A3XX_VBIF_OUT_RD_LIM_CONF0, 0x00000018 },
+	{ A3XX_VBIF_OUT_WR_LIM_CONF0, 0x00000018 },
+	{ A3XX_VBIF_DDR_OUT_MAX_BURST, 0x00000303 },
+	{ A3XX_VBIF_ROUND_ROBIN_QOS_ARB, 0x0003 },
+	{0, 0},
+};
+
+static struct a3xx_vbif_data a305c_vbif[] = {
+	{ A3XX_VBIF_IN_RD_LIM_CONF0, 0x00101010 },
+	{ A3XX_VBIF_IN_WR_LIM_CONF0, 0x00101010 },
+	{ A3XX_VBIF_OUT_RD_LIM_CONF0, 0x00000010 },
+	{ A3XX_VBIF_OUT_WR_LIM_CONF0, 0x00000010 },
+	{ A3XX_VBIF_DDR_OUT_MAX_BURST, 0x00000101 },
+	{ A3XX_VBIF_ARB_CTL, 0x00000010 },
+	/* Set up AOOO */
+	{ A3XX_VBIF_OUT_AXI_AOOO_EN, 0x00000007 },
+	{ A3XX_VBIF_OUT_AXI_AOOO, 0x00070007 },
+	{0, 0},
+};
+
 static struct a3xx_vbif_data a320_vbif[] = {
 	/* Set up 16 deep read/write request queues */
 	{ A3XX_VBIF_IN_RD_LIM_CONF0, 0x10101010 },
@@ -2784,17 +3040,81 @@
 	{0, 0},
 };
 
+/*
+ * Most of the VBIF registers on 8974v2 have the correct values at power on, so
+ * we won't modify those if we don't need to
+ */
+static struct a3xx_vbif_data a330v2_vbif[] = {
+	/* Enable 1k sort */
+	{ A3XX_VBIF_ABIT_SORT, 0x0001003F },
+	{ A3XX_VBIF_ABIT_SORT_CONF, 0x000000A4 },
+	/* Enable WR-REQ */
+	{ A3XX_VBIF_GATE_OFF_WRREQ_EN, 0x00003F },
+	{ A3XX_VBIF_DDR_OUT_MAX_BURST, 0x0000303 },
+	/* Set up VBIF_ROUND_ROBIN_QOS_ARB */
+	{ A3XX_VBIF_ROUND_ROBIN_QOS_ARB, 0x0003 },
+	{0, 0},
+};
+
+static struct {
+	int (*devfunc)(struct adreno_device *);
+	struct a3xx_vbif_data *vbif;
+} a3xx_vbif_platforms[] = {
+	{ adreno_is_a305, a305_vbif },
+	{ adreno_is_a305c, a305c_vbif },
+	{ adreno_is_a320, a320_vbif },
+	/* A330v2 needs to be ahead of A330 so the right device matches */
+	{ adreno_is_a330v2, a330v2_vbif },
+	{ adreno_is_a330, a330_vbif },
+	{ adreno_is_a305b, a305b_vbif },
+};
+
+static void a3xx_perfcounter_init(struct adreno_device *adreno_dev)
+{
+	/*
+	 * Set SP to count SP_ALU_ACTIVE_CYCLES; it includes
+	 * all ALU instruction execution regardless of precision or shader ID.
+	 * Set SP to count SP0_ICL1_MISSES; it counts
+	 * USP L1 instruction miss requests.
+	 * Set SP to count SP_FS_CFLOW_INSTRUCTIONS; it counts
+	 * USP flow control instruction execution.
+	 * We will use these to augment our hang detection.
+	 */
+	if (adreno_dev->fast_hang_detect) {
+		adreno_perfcounter_get(adreno_dev, KGSL_PERFCOUNTER_GROUP_SP,
+			SP_ALU_ACTIVE_CYCLES, &ft_detect_regs[6],
+			PERFCOUNTER_FLAG_KERNEL);
+		ft_detect_regs[7] = ft_detect_regs[6] + 1;
+		adreno_perfcounter_get(adreno_dev, KGSL_PERFCOUNTER_GROUP_SP,
+			SP0_ICL1_MISSES, &ft_detect_regs[8],
+			PERFCOUNTER_FLAG_KERNEL);
+		ft_detect_regs[9] = ft_detect_regs[8] + 1;
+		adreno_perfcounter_get(adreno_dev, KGSL_PERFCOUNTER_GROUP_SP,
+			SP_FS_CFLOW_INSTRUCTIONS, &ft_detect_regs[10],
+			PERFCOUNTER_FLAG_KERNEL);
+		ft_detect_regs[11] = ft_detect_regs[10] + 1;
+	}
+
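+	/* Reserve SP_FS_FULL_ALU_INSTRUCTIONS for kernel use */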
+	adreno_perfcounter_get(adreno_dev, KGSL_PERFCOUNTER_GROUP_SP,
+		SP_FS_FULL_ALU_INSTRUCTIONS, NULL, PERFCOUNTER_FLAG_KERNEL);
+
+	/* Reserve and start countable 1 in the PWR perfcounter group */
+	adreno_perfcounter_get(adreno_dev, KGSL_PERFCOUNTER_GROUP_PWR, 1,
+			NULL, PERFCOUNTER_FLAG_KERNEL);
+}
+
 static void a3xx_start(struct adreno_device *adreno_dev)
 {
 	struct kgsl_device *device = &adreno_dev->dev;
 	struct a3xx_vbif_data *vbif = NULL;
+	int i;
 
-	if (adreno_is_a305(adreno_dev))
-		vbif = a305_vbif;
-	else if (adreno_is_a320(adreno_dev))
-		vbif = a320_vbif;
-	else if (adreno_is_a330(adreno_dev))
-		vbif = a330_vbif;
+	for (i = 0; i < ARRAY_SIZE(a3xx_vbif_platforms); i++) {
+		if (a3xx_vbif_platforms[i].devfunc(adreno_dev)) {
+			vbif = a3xx_vbif_platforms[i].vbif;
+			break;
+		}
+	}
 
 	BUG_ON(vbif == NULL);
 
@@ -2832,10 +3152,18 @@
 
 	/* Enable Clock gating */
 	adreno_regwrite(device, A3XX_RBBM_CLOCK_CTL,
-			A3XX_RBBM_CLOCK_CTL_DEFAULT);
+		adreno_a3xx_rbbm_clock_ctl_default(adreno_dev));
+
+	if (adreno_is_a330v2(adreno_dev))
+		adreno_regwrite(device, A3XX_RBBM_GPR0_CTL,
+			A330v2_RBBM_GPR0_CTL_DEFAULT);
+	else if (adreno_is_a330(adreno_dev))
+		adreno_regwrite(device, A3XX_RBBM_GPR0_CTL,
+			A330_RBBM_GPR0_CTL_DEFAULT);
 
-	/* Set the OCMEM base address for A330 */
-	if (adreno_is_a330(adreno_dev)) {
+	/* Set the OCMEM base address for A330 and A305B */
+	if (adreno_is_a330(adreno_dev) ||
+		adreno_is_a305b(adreno_dev)) {
 		adreno_regwrite(device, A3XX_RB_GMEM_BASE_ADDR,
 			(unsigned int)(adreno_dev->ocmem_base >> 14));
 	}
@@ -2843,25 +3171,121 @@
 	/* Turn on performance counters */
 	adreno_regwrite(device, A3XX_RBBM_PERFCTR_CTL, 0x01);
 
-	/*
-	 * Set SP perfcounter 5 to count SP_ALU_ACTIVE_CYCLES, it includes
-	 * all ALU instruction execution regardless precision or shader ID.
-	 * Set SP perfcounter 6 to count SP0_ICL1_MISSES, It counts
-	 * USP L1 instruction miss request.
-	 * Set SP perfcounter 7 to count SP_FS_FULL_ALU_INSTRUCTIONS, it
-	 * counts USP flow control instruction execution.
-	 * we will use this to augment our hang detection
-	 */
-	if (adreno_dev->fast_hang_detect) {
-		adreno_regwrite(device, A3XX_SP_PERFCOUNTER5_SELECT,
-			SP_ALU_ACTIVE_CYCLES);
-		adreno_regwrite(device, A3XX_SP_PERFCOUNTER6_SELECT,
-			SP0_ICL1_MISSES);
-		adreno_regwrite(device, A3XX_SP_PERFCOUNTER7_SELECT,
-			SP_FS_CFLOW_INSTRUCTIONS);
-	}
+	/* The GPU busy (PWR) counter runs free; reset the cached cycle count */
+
+	adreno_dev->gpu_cycles = 0;
 }
 
+/*
+ * Define the available perfcounter groups - these get used by
+ * adreno_perfcounter_get and adreno_perfcounter_put
+ */
+
+static struct adreno_perfcount_register a3xx_perfcounters_cp[] = {
+	{ KGSL_PERFCOUNTER_NOT_USED, 0, A3XX_RBBM_PERFCTR_CP_0_LO, 0 },
+};
+
+static struct adreno_perfcount_register a3xx_perfcounters_rbbm[] = {
+	{ KGSL_PERFCOUNTER_NOT_USED, 0, A3XX_RBBM_PERFCTR_RBBM_0_LO, 0 },
+	{ KGSL_PERFCOUNTER_NOT_USED, 0, A3XX_RBBM_PERFCTR_RBBM_1_LO, 0 },
+};
+
+static struct adreno_perfcount_register a3xx_perfcounters_pc[] = {
+	{ KGSL_PERFCOUNTER_NOT_USED, 0, A3XX_RBBM_PERFCTR_PC_0_LO, 0 },
+	{ KGSL_PERFCOUNTER_NOT_USED, 0, A3XX_RBBM_PERFCTR_PC_1_LO, 0 },
+	{ KGSL_PERFCOUNTER_NOT_USED, 0, A3XX_RBBM_PERFCTR_PC_2_LO, 0 },
+	{ KGSL_PERFCOUNTER_NOT_USED, 0, A3XX_RBBM_PERFCTR_PC_3_LO, 0 },
+};
+
+static struct adreno_perfcount_register a3xx_perfcounters_vfd[] = {
+	{ KGSL_PERFCOUNTER_NOT_USED, 0, A3XX_RBBM_PERFCTR_VFD_0_LO, 0 },
+	{ KGSL_PERFCOUNTER_NOT_USED, 0, A3XX_RBBM_PERFCTR_VFD_1_LO, 0 },
+};
+
+static struct adreno_perfcount_register a3xx_perfcounters_hlsq[] = {
+	{ KGSL_PERFCOUNTER_NOT_USED, 0, A3XX_RBBM_PERFCTR_HLSQ_0_LO, 0 },
+	{ KGSL_PERFCOUNTER_NOT_USED, 0, A3XX_RBBM_PERFCTR_HLSQ_1_LO, 0 },
+	{ KGSL_PERFCOUNTER_NOT_USED, 0, A3XX_RBBM_PERFCTR_HLSQ_2_LO, 0 },
+	{ KGSL_PERFCOUNTER_NOT_USED, 0, A3XX_RBBM_PERFCTR_HLSQ_3_LO, 0 },
+	{ KGSL_PERFCOUNTER_NOT_USED, 0, A3XX_RBBM_PERFCTR_HLSQ_4_LO, 0 },
+	{ KGSL_PERFCOUNTER_NOT_USED, 0, A3XX_RBBM_PERFCTR_HLSQ_5_LO, 0 },
+};
+
+static struct adreno_perfcount_register a3xx_perfcounters_vpc[] = {
+	{ KGSL_PERFCOUNTER_NOT_USED, 0, A3XX_RBBM_PERFCTR_VPC_0_LO, 0 },
+	{ KGSL_PERFCOUNTER_NOT_USED, 0, A3XX_RBBM_PERFCTR_VPC_1_LO, 0 },
+};
+
+static struct adreno_perfcount_register a3xx_perfcounters_tse[] = {
+	{ KGSL_PERFCOUNTER_NOT_USED, 0, A3XX_RBBM_PERFCTR_TSE_0_LO, 0 },
+	{ KGSL_PERFCOUNTER_NOT_USED, 0, A3XX_RBBM_PERFCTR_TSE_1_LO, 0 },
+};
+
+static struct adreno_perfcount_register a3xx_perfcounters_ras[] = {
+	{ KGSL_PERFCOUNTER_NOT_USED, 0, A3XX_RBBM_PERFCTR_RAS_0_LO, 0 },
+	{ KGSL_PERFCOUNTER_NOT_USED, 0, A3XX_RBBM_PERFCTR_RAS_1_LO, 0 },
+};
+
+static struct adreno_perfcount_register a3xx_perfcounters_uche[] = {
+	{ KGSL_PERFCOUNTER_NOT_USED, 0, A3XX_RBBM_PERFCTR_UCHE_0_LO, 0 },
+	{ KGSL_PERFCOUNTER_NOT_USED, 0, A3XX_RBBM_PERFCTR_UCHE_1_LO, 0 },
+	{ KGSL_PERFCOUNTER_NOT_USED, 0, A3XX_RBBM_PERFCTR_UCHE_2_LO, 0 },
+	{ KGSL_PERFCOUNTER_NOT_USED, 0, A3XX_RBBM_PERFCTR_UCHE_3_LO, 0 },
+	{ KGSL_PERFCOUNTER_NOT_USED, 0, A3XX_RBBM_PERFCTR_UCHE_4_LO, 0 },
+	{ KGSL_PERFCOUNTER_NOT_USED, 0, A3XX_RBBM_PERFCTR_UCHE_5_LO, 0 },
+};
+
+static struct adreno_perfcount_register a3xx_perfcounters_tp[] = {
+	{ KGSL_PERFCOUNTER_NOT_USED, 0, A3XX_RBBM_PERFCTR_TP_0_LO, 0 },
+	{ KGSL_PERFCOUNTER_NOT_USED, 0, A3XX_RBBM_PERFCTR_TP_1_LO, 0 },
+	{ KGSL_PERFCOUNTER_NOT_USED, 0, A3XX_RBBM_PERFCTR_TP_2_LO, 0 },
+	{ KGSL_PERFCOUNTER_NOT_USED, 0, A3XX_RBBM_PERFCTR_TP_3_LO, 0 },
+	{ KGSL_PERFCOUNTER_NOT_USED, 0, A3XX_RBBM_PERFCTR_TP_4_LO, 0 },
+	{ KGSL_PERFCOUNTER_NOT_USED, 0, A3XX_RBBM_PERFCTR_TP_5_LO, 0 },
+};
+
+static struct adreno_perfcount_register a3xx_perfcounters_sp[] = {
+	{ KGSL_PERFCOUNTER_NOT_USED, 0, A3XX_RBBM_PERFCTR_SP_0_LO, 0 },
+	{ KGSL_PERFCOUNTER_NOT_USED, 0, A3XX_RBBM_PERFCTR_SP_1_LO, 0 },
+	{ KGSL_PERFCOUNTER_NOT_USED, 0, A3XX_RBBM_PERFCTR_SP_2_LO, 0 },
+	{ KGSL_PERFCOUNTER_NOT_USED, 0, A3XX_RBBM_PERFCTR_SP_3_LO, 0 },
+	{ KGSL_PERFCOUNTER_NOT_USED, 0, A3XX_RBBM_PERFCTR_SP_4_LO, 0 },
+	{ KGSL_PERFCOUNTER_NOT_USED, 0, A3XX_RBBM_PERFCTR_SP_5_LO, 0 },
+	{ KGSL_PERFCOUNTER_NOT_USED, 0, A3XX_RBBM_PERFCTR_SP_6_LO, 0 },
+	{ KGSL_PERFCOUNTER_NOT_USED, 0, A3XX_RBBM_PERFCTR_SP_7_LO, 0 },
+};
+
+static struct adreno_perfcount_register a3xx_perfcounters_rb[] = {
+	{ KGSL_PERFCOUNTER_NOT_USED, 0, A3XX_RBBM_PERFCTR_RB_0_LO, 0 },
+	{ KGSL_PERFCOUNTER_NOT_USED, 0, A3XX_RBBM_PERFCTR_RB_1_LO, 0 },
+};
+
+static struct adreno_perfcount_register a3xx_perfcounters_pwr[] = {
+	{ KGSL_PERFCOUNTER_NOT_USED, 0, A3XX_RBBM_PERFCTR_PWR_0_LO, 0 },
+	{ KGSL_PERFCOUNTER_NOT_USED, 0, A3XX_RBBM_PERFCTR_PWR_1_LO, 0 },
+};
+
+static struct adreno_perfcount_group a3xx_perfcounter_groups[] = {
+	{ a3xx_perfcounters_cp, ARRAY_SIZE(a3xx_perfcounters_cp) },
+	{ a3xx_perfcounters_rbbm, ARRAY_SIZE(a3xx_perfcounters_rbbm) },
+	{ a3xx_perfcounters_pc, ARRAY_SIZE(a3xx_perfcounters_pc) },
+	{ a3xx_perfcounters_vfd, ARRAY_SIZE(a3xx_perfcounters_vfd) },
+	{ a3xx_perfcounters_hlsq, ARRAY_SIZE(a3xx_perfcounters_hlsq) },
+	{ a3xx_perfcounters_vpc, ARRAY_SIZE(a3xx_perfcounters_vpc) },
+	{ a3xx_perfcounters_tse, ARRAY_SIZE(a3xx_perfcounters_tse) },
+	{ a3xx_perfcounters_ras, ARRAY_SIZE(a3xx_perfcounters_ras) },
+	{ a3xx_perfcounters_uche, ARRAY_SIZE(a3xx_perfcounters_uche) },
+	{ a3xx_perfcounters_tp, ARRAY_SIZE(a3xx_perfcounters_tp) },
+	{ a3xx_perfcounters_sp, ARRAY_SIZE(a3xx_perfcounters_sp) },
+	{ a3xx_perfcounters_rb, ARRAY_SIZE(a3xx_perfcounters_rb) },
+	{ a3xx_perfcounters_pwr, ARRAY_SIZE(a3xx_perfcounters_pwr) },
+};
+
+static struct adreno_perfcounters a3xx_perfcounters = {
+	a3xx_perfcounter_groups,
+	ARRAY_SIZE(a3xx_perfcounter_groups),
+};
+
 /* Defined in adreno_a3xx_snapshot.c */
 void *a3xx_snapshot(struct adreno_device *adreno_dev, void *snapshot,
 	int *remain, int hang);
@@ -2870,16 +3294,20 @@
 	.reg_rbbm_status = A3XX_RBBM_STATUS,
 	.reg_cp_pfp_ucode_addr = A3XX_CP_PFP_UCODE_ADDR,
 	.reg_cp_pfp_ucode_data = A3XX_CP_PFP_UCODE_DATA,
+	.perfcounters = &a3xx_perfcounters,
 
 	.ctxt_create = a3xx_drawctxt_create,
 	.ctxt_save = a3xx_drawctxt_save,
 	.ctxt_restore = a3xx_drawctxt_restore,
 	.ctxt_draw_workaround = NULL,
 	.rb_init = a3xx_rb_init,
+	.perfcounter_init = a3xx_perfcounter_init,
 	.irq_control = a3xx_irq_control,
 	.irq_handler = a3xx_irq_handler,
 	.irq_pending = a3xx_irq_pending,
 	.busy_cycles = a3xx_busy_cycles,
 	.start = a3xx_start,
 	.snapshot = a3xx_snapshot,
+	.perfcounter_enable = a3xx_perfcounter_enable,
+	.perfcounter_read = a3xx_perfcounter_read,
 };
diff --git a/drivers/gpu/msm/adreno_a3xx_snapshot.c b/drivers/gpu/msm/adreno_a3xx_snapshot.c
index d9d5ec8..58e3126 100644
--- a/drivers/gpu/msm/adreno_a3xx_snapshot.c
+++ b/drivers/gpu/msm/adreno_a3xx_snapshot.c
@@ -1,4 +1,4 @@
-/* Copyright (c) 2012, The Linux Foundation. All rights reserved.
+/* Copyright (c) 2012-2013, The Linux Foundation. All rights reserved.
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License version 2 and
@@ -11,6 +11,7 @@
  *
  */
 
+#include <linux/io.h>
 #include "kgsl.h"
 #include "adreno.h"
 #include "kgsl_snapshot.h"
@@ -19,14 +20,43 @@
 #define DEBUG_SECTION_SZ(_dwords) (((_dwords) * sizeof(unsigned int)) \
 		+ sizeof(struct kgsl_snapshot_debug))
 
+/* Shader memory size in words */
 #define SHADER_MEMORY_SIZE 0x4000
 
+/**
+ * _rbbm_debug_bus_read - Helper function to read data from the RBBM
+ * debug bus.
+ * @device - GPU device to read/write registers
+ * @block_id - Debug bus block to read from
+ * @index - Index in the debug bus block to read
+ * @val - Value read from the debug bus
+ */
+static void _rbbm_debug_bus_read(struct kgsl_device *device,
+	unsigned int block_id, unsigned int index, unsigned int *val)
+{
+	unsigned int block = (block_id << 8) | (1 << 16);
+
+	adreno_regwrite(device, A3XX_RBBM_DEBUG_BUS_CTL, block | index);
+	adreno_regread(device, A3XX_RBBM_DEBUG_BUS_DATA_STATUS, val);
+}
+
+/**
+ * a3xx_snapshot_shader_memory - Helper function to dump the GPU shader
+ * memory to the snapshot buffer.
+ * @device - GPU device whose shader memory is to be dumped
+ * @snapshot - Pointer to binary snapshot data blob being made
+ * @remain - Number of remaining bytes in the snapshot blob
+ * @priv - Unused parameter
+ */
 static int a3xx_snapshot_shader_memory(struct kgsl_device *device,
 	void *snapshot, int remain, void *priv)
 {
 	struct kgsl_snapshot_debug *header = snapshot;
+	unsigned int i;
 	unsigned int *data = snapshot + sizeof(*header);
-	int i;
+	unsigned int shader_read_len = SHADER_MEMORY_SIZE;
+
+	if (SHADER_MEMORY_SIZE > (device->shader_mem_len >> 2))
+		shader_read_len = (device->shader_mem_len >> 2);
 
 	if (remain < DEBUG_SECTION_SZ(SHADER_MEMORY_SIZE)) {
 		SNAPSHOT_ERR_NOMEM(device, "SHADER MEMORY");
@@ -36,8 +66,22 @@
 	header->type = SNAPSHOT_DEBUG_SHADER_MEMORY;
 	header->size = SHADER_MEMORY_SIZE;
 
-	for (i = 0; i < SHADER_MEMORY_SIZE; i++)
-		adreno_regread(device, 0x4000 + i, &data[i]);
+	/* Map shader memory to kernel, for dumping */
+	if (device->shader_mem_virt == NULL)
+		device->shader_mem_virt = devm_ioremap(device->dev,
+					device->shader_mem_phys,
+					device->shader_mem_len);
+
+	if (device->shader_mem_virt == NULL) {
+		KGSL_DRV_ERR(device,
+			"Unable to map shader memory region\n");
+		return 0;
+	}
+
+	/* Now, dump shader memory to snapshot */
+	for (i = 0; i < shader_read_len; i++)
+		adreno_shadermem_regread(device, i, &data[i]);
 
 	return DEBUG_SECTION_SZ(SHADER_MEMORY_SIZE);
 }
@@ -170,7 +214,8 @@
 	int i, size;
 
 	/* The size of the ROQ buffer is core dependent */
-	size = adreno_is_a330(adreno_dev) ?
+	size = (adreno_is_a330(adreno_dev) ||
+		adreno_is_a305b(adreno_dev)) ?
 		A330_CP_ROQ_SIZE : A320_CP_ROQ_SIZE;
 
 	if (remain < DEBUG_SECTION_SZ(size)) {
@@ -220,66 +265,77 @@
 	return DEBUG_SECTION_SZ(size);
 }
 
-#define DEBUGFS_BLOCK_SIZE 0x40
+struct debugbus_block {
+	unsigned int block_id;
+	unsigned int dwords;
+};
 
 static int a3xx_snapshot_debugbus_block(struct kgsl_device *device,
 	void *snapshot, int remain, void *priv)
 {
+	struct adreno_device *adreno_dev = ADRENO_DEVICE(device);
+
 	struct kgsl_snapshot_debugbus *header = snapshot;
-	unsigned int id = (unsigned int) priv;
-	unsigned int val;
+	struct debugbus_block *block = priv;
 	int i;
 	unsigned int *data = snapshot + sizeof(*header);
-	int size =
-		(DEBUGFS_BLOCK_SIZE * sizeof(unsigned int)) + sizeof(*header);
+	unsigned int dwords;
+	int size;
+
+	/*
+	 * For A305 and A320 all debug bus regions are the same size (0x40).
+	 * For A330 and A305B they can be different sizes - most are still
+	 * 0x40, but some, like CP, are larger.
+	 */
+
+	dwords = (adreno_is_a330(adreno_dev) ||
+		adreno_is_a305b(adreno_dev)) ?
+		block->dwords : 0x40;
+
+	size = (dwords * sizeof(unsigned int)) + sizeof(*header);
 
 	if (remain < size) {
 		SNAPSHOT_ERR_NOMEM(device, "DEBUGBUS");
 		return 0;
 	}
 
-	val = (id << 8) | (1 << 16);
+	header->id = block->block_id;
+	header->count = dwords;
 
-	header->id = id;
-	header->count = DEBUGFS_BLOCK_SIZE;
-
-	for (i = 0; i < DEBUGFS_BLOCK_SIZE; i++) {
-		adreno_regwrite(device, A3XX_RBBM_DEBUG_BUS_CTL, val | i);
-		adreno_regread(device, A3XX_RBBM_DEBUG_BUS_DATA_STATUS,
-			&data[i]);
-	}
+	for (i = 0; i < dwords; i++)
+		_rbbm_debug_bus_read(device, block->block_id, i, &data[i]);
 
 	return size;
 }
 
-static unsigned int debugbus_blocks[] = {
-	RBBM_BLOCK_ID_CP,
-	RBBM_BLOCK_ID_RBBM,
-	RBBM_BLOCK_ID_VBIF,
-	RBBM_BLOCK_ID_HLSQ,
-	RBBM_BLOCK_ID_UCHE,
-	RBBM_BLOCK_ID_PC,
-	RBBM_BLOCK_ID_VFD,
-	RBBM_BLOCK_ID_VPC,
-	RBBM_BLOCK_ID_TSE,
-	RBBM_BLOCK_ID_RAS,
-	RBBM_BLOCK_ID_VSC,
-	RBBM_BLOCK_ID_SP_0,
-	RBBM_BLOCK_ID_SP_1,
-	RBBM_BLOCK_ID_SP_2,
-	RBBM_BLOCK_ID_SP_3,
-	RBBM_BLOCK_ID_TPL1_0,
-	RBBM_BLOCK_ID_TPL1_1,
-	RBBM_BLOCK_ID_TPL1_2,
-	RBBM_BLOCK_ID_TPL1_3,
-	RBBM_BLOCK_ID_RB_0,
-	RBBM_BLOCK_ID_RB_1,
-	RBBM_BLOCK_ID_RB_2,
-	RBBM_BLOCK_ID_RB_3,
-	RBBM_BLOCK_ID_MARB_0,
-	RBBM_BLOCK_ID_MARB_1,
-	RBBM_BLOCK_ID_MARB_2,
-	RBBM_BLOCK_ID_MARB_3,
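+/* Sizes in dwords; only honored on A330/A305B - other cores use 0x40 */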
+static struct debugbus_block debugbus_blocks[] = {
+	{ RBBM_BLOCK_ID_CP, 0x52, },
+	{ RBBM_BLOCK_ID_RBBM, 0x40, },
+	{ RBBM_BLOCK_ID_VBIF, 0x40, },
+	{ RBBM_BLOCK_ID_HLSQ, 0x40, },
+	{ RBBM_BLOCK_ID_UCHE, 0x40, },
+	{ RBBM_BLOCK_ID_PC, 0x40, },
+	{ RBBM_BLOCK_ID_VFD, 0x40, },
+	{ RBBM_BLOCK_ID_VPC, 0x40, },
+	{ RBBM_BLOCK_ID_TSE, 0x40, },
+	{ RBBM_BLOCK_ID_RAS, 0x40, },
+	{ RBBM_BLOCK_ID_VSC, 0x40, },
+	{ RBBM_BLOCK_ID_SP_0, 0x40, },
+	{ RBBM_BLOCK_ID_SP_1, 0x40, },
+	{ RBBM_BLOCK_ID_SP_2, 0x40, },
+	{ RBBM_BLOCK_ID_SP_3, 0x40, },
+	{ RBBM_BLOCK_ID_TPL1_0, 0x40, },
+	{ RBBM_BLOCK_ID_TPL1_1, 0x40, },
+	{ RBBM_BLOCK_ID_TPL1_2, 0x40, },
+	{ RBBM_BLOCK_ID_TPL1_3, 0x40, },
+	{ RBBM_BLOCK_ID_RB_0, 0x40, },
+	{ RBBM_BLOCK_ID_RB_1, 0x40, },
+	{ RBBM_BLOCK_ID_RB_2, 0x40, },
+	{ RBBM_BLOCK_ID_RB_3, 0x40, },
+	{ RBBM_BLOCK_ID_MARB_0, 0x40, },
+	{ RBBM_BLOCK_ID_MARB_1, 0x40, },
+	{ RBBM_BLOCK_ID_MARB_2, 0x40, },
+	{ RBBM_BLOCK_ID_MARB_3, 0x40, },
 };
 
 static void *a3xx_snapshot_debugbus(struct kgsl_device *device,
@@ -291,7 +347,7 @@
 		snapshot = kgsl_snapshot_add_section(device,
 			KGSL_SNAPSHOT_SECTION_DEBUGBUS, snapshot, remain,
 			a3xx_snapshot_debugbus_block,
-			(void *) debugbus_blocks[i]);
+			(void *) &debugbus_blocks[i]);
 	}
 
 	return snapshot;
@@ -309,18 +365,58 @@
 	struct kgsl_snapshot_registers_list *list,
 	struct adreno_device *adreno_dev)
 {
-	/* HLSQ specific registers */
+	struct kgsl_device *device = &adreno_dev->dev;
+
 	/*
-	 * Don't dump any a3xx HLSQ registers just yet.  Reading the HLSQ
-	 * registers can cause the device to hang if the HLSQ block is
-	 * busy.  Add specific checks for each a3xx core as the requirements
-	 * are discovered.  Disable by default for now.
+	 * Trying to read HLSQ registers when the HLSQ block is busy will
+	 * cause the device to hang.  The RBBM_DEBUG_BUS reports whether the
+	 * HLSQ block is busy (the exact check is hardware dependent), so
+	 * read it first and only dump the HLSQ registers if the block is
+	 * not busy.
 	 */
-	if (!adreno_is_a3xx(adreno_dev)) {
-		regs[list->count].regs = (unsigned int *) a3xx_hlsq_registers;
-		regs[list->count].count = a3xx_hlsq_registers_count;
-		list->count++;
+
+	if (adreno_is_a330(adreno_dev)) {
+		/*
+		 * stall_ctxt_full status bit: RBBM_BLOCK_ID_HLSQ index 49 [27]
+		 *
+		 * if (!stall_context_full)
+		 * then dump HLSQ registers
+		 */
+		unsigned int stall_context_full = 0;
+
+		_rbbm_debug_bus_read(device, RBBM_BLOCK_ID_HLSQ, 49,
+				&stall_context_full);
+		stall_context_full &= 0x08000000;
+
+		if (stall_context_full)
+			return;
+	} else {
+		/*
+		 * tpif status bits: RBBM_BLOCK_ID_HLSQ index 4 [4:0]
+		 * spif status bits: RBBM_BLOCK_ID_HLSQ index 7 [5:0]
+		 *
+		 * if ((tpif == 0, 1, 28) && (spif == 0, 1, 10))
+		 * then dump HLSQ registers
+		 */
+		unsigned int next_pif = 0;
+
+		/* check tpif */
+		_rbbm_debug_bus_read(device, RBBM_BLOCK_ID_HLSQ, 4, &next_pif);
+		next_pif &= 0x1f;
+		if (next_pif != 0 && next_pif != 1 && next_pif != 28)
+			return;
+
+		/* check spif */
+		_rbbm_debug_bus_read(device, RBBM_BLOCK_ID_HLSQ, 7, &next_pif);
+		next_pif &= 0x3f;
+		if (next_pif != 0 && next_pif != 1 && next_pif != 10)
+			return;
 	}
+
+	regs[list->count].regs = (unsigned int *) a3xx_hlsq_registers;
+	regs[list->count].count = a3xx_hlsq_registers_count;
+	list->count++;
 }
 
 static void _snapshot_a330_regs(struct kgsl_snapshot_registers *regs,
@@ -342,6 +438,7 @@
 	struct kgsl_device *device = &adreno_dev->dev;
 	struct kgsl_snapshot_registers_list list;
 	struct kgsl_snapshot_registers regs[5];
+	int size;
 
 	list.registers = regs;
 	list.count = 0;
@@ -352,7 +449,7 @@
 	/* Store relevant registers in list to snapshot */
 	_snapshot_a3xx_regs(regs, &list);
 	_snapshot_hlsq_regs(regs, &list, adreno_dev);
-	if (adreno_is_a330(adreno_dev))
+	if (adreno_is_a330(adreno_dev) || adreno_is_a305b(adreno_dev))
 		_snapshot_a330_regs(regs, &list);
 
 	/* Master set of (non debug) registers */
@@ -360,10 +457,15 @@
 		KGSL_SNAPSHOT_SECTION_REGS, snapshot, remain,
 		kgsl_snapshot_dump_regs, &list);
 
-	/* CP_STATE_DEBUG indexed registers */
+	/*
+	 * CP_STATE_DEBUG indexed registers - 20 (0x14) on A305/A320 and
+	 * 46 (0x2E) on A330/A305B
+	 */
+	size = (adreno_is_a330(adreno_dev) ||
+		adreno_is_a305b(adreno_dev)) ? 0x2E : 0x14;
+
 	snapshot = kgsl_snapshot_indexed_registers(device, snapshot,
 			remain, REG_CP_STATE_DEBUG_INDEX,
-			REG_CP_STATE_DEBUG_DATA, 0x0, 0x14);
+			REG_CP_STATE_DEBUG_DATA, 0x0, size);
 
 	/* CP_ME indexed registers */
 	snapshot = kgsl_snapshot_indexed_registers(device, snapshot,
@@ -404,7 +506,8 @@
 			KGSL_SNAPSHOT_SECTION_DEBUG, snapshot, remain,
 			a3xx_snapshot_cp_roq, NULL);
 
-	if (adreno_is_a330(adreno_dev)) {
+	if (adreno_is_a330(adreno_dev) ||
+		adreno_is_a305b(adreno_dev)) {
 		snapshot = kgsl_snapshot_add_section(device,
 			KGSL_SNAPSHOT_SECTION_DEBUG, snapshot, remain,
 			a330_snapshot_cp_merciu, NULL);
@@ -414,7 +517,7 @@
 
 	/* Enable Clock gating */
 	adreno_regwrite(device, A3XX_RBBM_CLOCK_CTL,
-			A3XX_RBBM_CLOCK_CTL_DEFAULT);
+		adreno_a3xx_rbbm_clock_ctl_default(adreno_dev));
 
 	return snapshot;
 }
diff --git a/drivers/gpu/msm/adreno_debugfs.c b/drivers/gpu/msm/adreno_debugfs.c
index 890c8a1..ef599e9 100644
--- a/drivers/gpu/msm/adreno_debugfs.c
+++ b/drivers/gpu/msm/adreno_debugfs.c
@@ -1,4 +1,4 @@
-/* Copyright (c) 2002,2008-2012, The Linux Foundation. All rights reserved.
+/* Copyright (c) 2002,2008-2013, The Linux Foundation. All rights reserved.
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License version 2 and
@@ -64,11 +64,6 @@
 	adreno_dev->fast_hang_detect = 1;
 	debugfs_create_u32("fast_hang_detect", 0644, device->d_debugfs,
 			   &adreno_dev->fast_hang_detect);
-
-	/* Top level switch to enable/disable userspace FT control */
-	adreno_dev->ft_user_control = 0;
-	debugfs_create_u32("ft_user_control", 0644, device->d_debugfs,
-			   &adreno_dev->ft_user_control);
 	/*
 	 * FT policy can be set to any of the options below.
 	 * KGSL_FT_DISABLE -> BIT(0) Set to disable FT
@@ -80,7 +75,6 @@
 	adreno_dev->ft_policy = KGSL_FT_DEFAULT_POLICY;
 	debugfs_create_u32("ft_policy", 0644, device->d_debugfs,
 			   &adreno_dev->ft_policy);
-
 	/* By default enable long IB detection */
 	adreno_dev->long_ib_detect = 1;
 	debugfs_create_u32("long_ib_detect", 0644, device->d_debugfs,
@@ -96,7 +90,7 @@
 	 * KGSL_FT_PAGEFAULT_LOG_ONE_PER_INT -> BIT(3) Set to log only one
 	 * pagefault per INT.
 	 */
-	adreno_dev->ft_pf_policy = KGSL_FT_PAGEFAULT_DEFAULT_POLICY;
-	debugfs_create_u32("ft_pagefault_policy", 0644, device->d_debugfs,
-			   &adreno_dev->ft_pf_policy);
+	adreno_dev->ft_pf_policy = KGSL_FT_PAGEFAULT_DEFAULT_POLICY;
+	debugfs_create_u32("ft_pagefault_policy", 0644, device->d_debugfs,
+			&adreno_dev->ft_pf_policy);
 }
diff --git a/drivers/gpu/msm/adreno_debugfs.h b/drivers/gpu/msm/adreno_debugfs.h
deleted file mode 100644
index 1c97ebb..0000000
--- a/drivers/gpu/msm/adreno_debugfs.h
+++ /dev/null
@@ -1,46 +0,0 @@
-/* Copyright (c) 2002,2008-2012, Code Aurora Forum. All rights reserved.
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 and
- * only version 2 as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- */
-#ifndef __ADRENO_DEBUGFS_H
-#define __ADRENO_DEBUGFS_H
-
-#ifdef CONFIG_DEBUG_FS
-
-int adreno_debugfs_init(struct kgsl_device *device);
-
-extern int adreno_pm_regs_enabled;
-extern int adreno_pm_ib_enabled;
-
-static inline int is_adreno_pm_regs_enabled(void)
-{
-	return adreno_pm_regs_enabled;
-}
-
-static inline int is_adreno_pm_ib_enabled(void)
-{
-	return adreno_pm_ib_enabled;
-}
-
-#else
-static inline int adreno_debugfs_init(struct kgsl_device *device)
-{
-	return 0;
-}
-
-static inline int kgsl_pmregs_enabled(void)
-{
-	
-	return 1;
-}
-#endif
-
-#endif 
diff --git a/drivers/gpu/msm/adreno_drawctxt.c b/drivers/gpu/msm/adreno_drawctxt.c
index 6fbcdee..176717d 100644
--- a/drivers/gpu/msm/adreno_drawctxt.c
+++ b/drivers/gpu/msm/adreno_drawctxt.c
@@ -1,4 +1,4 @@
-/* Copyright (c) 2002,2007-2012, The Linux Foundation. All rights reserved.
+/* Copyright (c) 2002,2007-2013, The Linux Foundation. All rights reserved.
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License version 2 and
@@ -12,6 +12,7 @@
  */
 
 #include <linux/slab.h>
+#include <linux/msm_kgsl.h>
 
 #include "kgsl.h"
 #include "kgsl_sharedmem.h"
@@ -143,7 +144,7 @@
  */
 int adreno_drawctxt_create(struct kgsl_device *device,
 			struct kgsl_pagetable *pagetable,
-			struct kgsl_context *context, uint32_t flags)
+			struct kgsl_context *context, uint32_t *flags)
 {
 	struct adreno_context *drawctxt;
 	struct adreno_device *adreno_dev = ADRENO_DEVICE(device);
@@ -162,26 +163,36 @@
 	drawctxt->id = context->id;
 	rb->timestamp[context->id] = 0;
 
-	if (flags & KGSL_CONTEXT_PREAMBLE)
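+	/* Only keep the context flags the driver supports */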
+	*flags &= (KGSL_CONTEXT_PREAMBLE |
+		KGSL_CONTEXT_NO_GMEM_ALLOC |
+		KGSL_CONTEXT_PER_CONTEXT_TS |
+		KGSL_CONTEXT_USER_GENERATED_TS |
+		KGSL_CONTEXT_NO_FAULT_TOLERANCE |
+		KGSL_CONTEXT_TYPE_MASK);
+
+	if (*flags & KGSL_CONTEXT_PREAMBLE)
 		drawctxt->flags |= CTXT_FLAGS_PREAMBLE;
 
-	if (flags & KGSL_CONTEXT_NO_GMEM_ALLOC)
+	if (*flags & KGSL_CONTEXT_NO_GMEM_ALLOC)
 		drawctxt->flags |= CTXT_FLAGS_NOGMEMALLOC;
 
-	if (flags & KGSL_CONTEXT_PER_CONTEXT_TS)
+	if (*flags & KGSL_CONTEXT_PER_CONTEXT_TS)
 		drawctxt->flags |= CTXT_FLAGS_PER_CONTEXT_TS;
 
-	if (flags & KGSL_CONTEXT_USER_GENERATED_TS) {
-		if (!(flags & KGSL_CONTEXT_PER_CONTEXT_TS)) {
+	if (*flags & KGSL_CONTEXT_USER_GENERATED_TS) {
+		if (!(*flags & KGSL_CONTEXT_PER_CONTEXT_TS)) {
 			ret = -EINVAL;
 			goto err;
 		}
 		drawctxt->flags |= CTXT_FLAGS_USER_GENERATED_TS;
 	}
 
-	if (flags & KGSL_CONTEXT_NO_FAULT_TOLERANCE)
+	if (*flags & KGSL_CONTEXT_NO_FAULT_TOLERANCE)
 		drawctxt->flags |= CTXT_FLAGS_NO_FAULT_TOLERANCE;
 
+	drawctxt->type =
+		(*flags & KGSL_CONTEXT_TYPE_MASK) >> KGSL_CONTEXT_TYPE_SHIFT;
+
 	ret = adreno_dev->gpudev->ctxt_create(adreno_dev, drawctxt);
 	if (ret)
 		goto err;
@@ -242,10 +253,6 @@
 	if (device->state != KGSL_STATE_HUNG)
 		adreno_idle(device);
 
-	if (adreno_is_a20x(adreno_dev) && adreno_dev->drawctxt_active)
-		kgsl_setstate(&device->mmu, adreno_dev->drawctxt_active->id,
-			KGSL_MMUFLAGS_PTUPDATE);
-
 	kgsl_sharedmem_free(&drawctxt->gpustate);
 	kgsl_sharedmem_free(&drawctxt->context_gmem_shadow.gmemshadow);
 
@@ -306,8 +313,10 @@
 		return;
 	}
 
-	KGSL_CTXT_INFO(device, "from %p to %p flags %d\n",
-			adreno_dev->drawctxt_active, drawctxt, flags);
+	KGSL_CTXT_INFO(device, "from %d to %d flags %d\n",
+		adreno_dev->drawctxt_active ?
+		adreno_dev->drawctxt_active->id : 0,
+		drawctxt ? drawctxt->id : 0, flags);
 
 	/* Save the old context */
 	adreno_dev->gpudev->ctxt_save(adreno_dev, adreno_dev->drawctxt_active);
diff --git a/drivers/gpu/msm/adreno_drawctxt.h b/drivers/gpu/msm/adreno_drawctxt.h
index fd60688..f0f3b6b 100644
--- a/drivers/gpu/msm/adreno_drawctxt.h
+++ b/drivers/gpu/msm/adreno_drawctxt.h
@@ -1,4 +1,4 @@
-/* Copyright (c) 2002,2007-2012, The Linux Foundation. All rights reserved.
+/* Copyright (c) 2002,2007-2013, The Linux Foundation. All rights reserved.
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License version 2 and
@@ -57,6 +57,14 @@
 /* Context no fault tolerance */
 #define CTXT_FLAGS_NO_FAULT_TOLERANCE  BIT(16)
 
+/* Symbolic table for the adreno draw context type */
+#define ADRENO_DRAWCTXT_TYPES \
+	{ KGSL_CONTEXT_TYPE_ANY, "any" }, \
+	{ KGSL_CONTEXT_TYPE_GL, "GL" }, \
+	{ KGSL_CONTEXT_TYPE_CL, "CL" }, \
+	{ KGSL_CONTEXT_TYPE_C2D, "C2D" }, \
+	{ KGSL_CONTEXT_TYPE_RS, "RS" }
+
 struct kgsl_device;
 struct adreno_device;
 struct kgsl_device_private;
@@ -93,6 +101,9 @@
 	unsigned int id;
 	unsigned int ib_gpu_time_used;
 	uint32_t flags;
+	uint32_t pagefault;
+	unsigned long pagefault_ts;
+	unsigned int type;
 	struct kgsl_pagetable *pagetable;
 	struct kgsl_memdesc gpustate;
 	unsigned int reg_restore[3];
@@ -125,7 +136,7 @@
 int adreno_drawctxt_create(struct kgsl_device *device,
 			struct kgsl_pagetable *pagetable,
 			struct kgsl_context *context,
-			uint32_t flags);
+			uint32_t *flags);
 
 void adreno_drawctxt_destroy(struct kgsl_device *device,
 			  struct kgsl_context *context);
diff --git a/drivers/gpu/msm/adreno_pm4types.h b/drivers/gpu/msm/adreno_pm4types.h
index a3fa312..f449870 100644
--- a/drivers/gpu/msm/adreno_pm4types.h
+++ b/drivers/gpu/msm/adreno_pm4types.h
@@ -143,10 +143,10 @@
 #define CP_IM_STORE            0x2c
 
 /* test 2 memory locations to dword values specified */
-#define CP_TEST_TWO_MEMS    0x71
+#define CP_TEST_TWO_MEMS	0x71
 
 /* PFP waits until the FIFO between the PFP and the ME is empty */
-#define CP_WAIT_FOR_ME      0x13
+#define CP_WAIT_FOR_ME		0x13
 
 /*
  * for a20x
diff --git a/drivers/gpu/msm/adreno_postmortem.c b/drivers/gpu/msm/adreno_postmortem.c
index cf1cf90..5b52fd8 100644
--- a/drivers/gpu/msm/adreno_postmortem.c
+++ b/drivers/gpu/msm/adreno_postmortem.c
@@ -1,4 +1,4 @@
-/* Copyright (c) 2010-2012, The Linux Foundation. All rights reserved.
+/* Copyright (c) 2010-2013, The Linux Foundation. All rights reserved.
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License version 2 and
@@ -12,6 +12,7 @@
  */
 
 #include <linux/vmalloc.h>
+#include <mach/board.h>
 
 #include "kgsl.h"
 #include "kgsl_sharedmem.h"
@@ -50,6 +51,7 @@
 	{CP_DRAW_INDX,			"DRW_NDX_"},
 	{CP_DRAW_INDX_BIN,		"DRW_NDXB"},
 	{CP_EVENT_WRITE,		"EVENT_WT"},
+	{CP_MEM_WRITE,			"MEM_WRIT"},
 	{CP_IM_LOAD,			"IN__LOAD"},
 	{CP_IM_LOAD_IMMEDIATE,		"IM_LOADI"},
 	{CP_IM_STORE,			"IM_STORE"},
@@ -69,6 +71,14 @@
 	{CP_WAIT_FOR_IDLE,		"WAIT4IDL"},
 };
 
+static const struct pm_id_name pm3_nop_values[] = {
+	{KGSL_CONTEXT_TO_MEM_IDENTIFIER,	"CTX_SWCH"},
+	{KGSL_CMD_IDENTIFIER,			"CMD__EXT"},
+	{KGSL_CMD_INTERNAL_IDENTIFIER,		"CMD__INT"},
+	{KGSL_START_OF_IB_IDENTIFIER,		"IB_START"},
+	{KGSL_END_OF_IB_IDENTIFIER,		"IB___END"},
+};
+
 static uint32_t adreno_is_pm4_len(uint32_t word)
 {
 	if (word == INVALID_RB_CMD)
@@ -128,6 +138,28 @@
 	return "????????";
 }
 
+static bool adreno_is_pm3_nop_value(uint32_t word)
+{
+	int i;
+
+	for (i = 0; i < ARRAY_SIZE(pm3_nop_values); ++i) {
+		if (word == pm3_nop_values[i].id)
+			return 1;
+	}
+	return 0;
+}
+
+static const char *adreno_pm3_nop_name(uint32_t word)
+{
+	int i;
+
+	for (i = 0; i < ARRAY_SIZE(pm3_nop_values); ++i) {
+		if (word == pm3_nop_values[i].id)
+			return pm3_nop_values[i].name;
+	}
+	return "????????";
+}
+
 static void adreno_dump_regs(struct kgsl_device *device,
 			   const int *registers, int size)
 {
@@ -244,8 +276,13 @@
 				"%s", adreno_pm4_name(ptr4[j]));
 			*argp = -(adreno_is_pm4_len(ptr4[j])+1);
 		} else {
-			lx += scnprintf(linebuf + lx, linebuflen - lx,
-				"%8.8X", ptr4[j]);
+			if (adreno_is_pm3_nop_value(ptr4[j]))
+				lx += scnprintf(linebuf + lx, linebuflen - lx,
+					"%s", adreno_pm3_nop_name(ptr4[j]));
+			else
+				lx += scnprintf(linebuf + lx, linebuflen - lx,
+					"%8.8X", ptr4[j]);
+
 			if (*argp > 1)
 				--*argp;
 			else if (*argp == 1) {
@@ -665,7 +702,7 @@
 		" %08X\n", r1, r2, r3);
 
 	KGSL_LOG_DUMP(device, "PAGETABLE SIZE: %08X ",
-		kgsl_mmu_get_ptsize());
+		kgsl_mmu_get_ptsize(&device->mmu));
 
 	kgsl_regread(device, MH_MMU_TRAN_ERROR, &r1);
 	KGSL_LOG_DUMP(device, "        TRAN_ERROR = %08X\n", r1);
@@ -703,6 +740,7 @@
 	mb();
 
 	if (device->pm_dump_enable) {
+
 		if (adreno_is_a2xx(adreno_dev))
 			adreno_dump_a2xx(device);
 		else if (adreno_is_a3xx(adreno_dev))
@@ -728,7 +766,7 @@
 	if (!device->pm_dump_enable) {
 
 		KGSL_LOG_DUMP(device,
-			"RBBM STATUS %08X | IB1:%08X/%08X | IB2: %08X/%08X"
+			"STATUS %08X | IB1:%08X/%08X | IB2: %08X/%08X"
 			" | RPTR: %04X | WPTR: %04X\n",
 			rbbm_status,  cp_ib1_base, cp_ib1_bufsz, cp_ib2_base,
 			cp_ib2_bufsz, cp_rb_rptr, cp_rb_wptr);
@@ -890,7 +928,8 @@
 			adreno_dump_regs(device, a3xx_registers,
 					a3xx_registers_count);
 
-			if (adreno_is_a330(adreno_dev))
+			if (adreno_is_a330(adreno_dev) ||
+				adreno_is_a305b(adreno_dev))
 				adreno_dump_regs(device, a330_registers,
 					a330_registers_count);
 		}
diff --git a/drivers/gpu/msm/adreno_postmortem.h b/drivers/gpu/msm/adreno_postmortem.h
deleted file mode 100644
index 7706037..0000000
--- a/drivers/gpu/msm/adreno_postmortem.h
+++ /dev/null
@@ -1,21 +0,0 @@
-/* Copyright (c) 2010-2011, Code Aurora Forum. All rights reserved.
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 and
- * only version 2 as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- */
-
-#ifndef __ADRENO_POSTMORTEM_H
-#define __ADRENO_POSTMORTEM_H
-
-struct kgsl_device;
-
-int adreno_postmortem_dump(struct kgsl_device *device, int manual);
-
-#endif 
diff --git a/drivers/gpu/msm/adreno_ringbuffer.c b/drivers/gpu/msm/adreno_ringbuffer.c
index 179027c..a4bb4fa 100644
--- a/drivers/gpu/msm/adreno_ringbuffer.c
+++ b/drivers/gpu/msm/adreno_ringbuffer.c
@@ -18,6 +18,7 @@
 #include "kgsl.h"
 #include "kgsl_sharedmem.h"
 #include "kgsl_cffdump.h"
+#include "kgsl_trace.h"
 
 #include "adreno.h"
 #include "adreno_pm4types.h"
@@ -319,7 +320,7 @@
 	return 0;
 }
 
-int adreno_ringbuffer_start(struct adreno_ringbuffer *rb, unsigned int init_ram)
+int adreno_ringbuffer_start(struct adreno_ringbuffer *rb)
 {
 	int status;
 	/*cp_rb_cntl_u cp_rb_cntl; */
@@ -331,9 +332,6 @@
 	if (rb->flags & KGSL_FLAGS_STARTED)
 		return 0;
 
-	if (init_ram)
-		rb->timestamp[KGSL_MEMSTORE_GLOBAL] = 0;
-
 	kgsl_sharedmem_set(&rb->memptrs_desc, 0, 0,
 			   sizeof(struct kgsl_rbmemptrs));
 
@@ -433,8 +431,11 @@
 		return status;
 
 	/* CP ROQ queue sizes (bytes) - RB:16, ST:16, IB1:32, IB2:64 */
-	if (adreno_is_a305(adreno_dev) || adreno_is_a320(adreno_dev))
+	if (adreno_is_a305(adreno_dev) || adreno_is_a305c(adreno_dev) ||
+		adreno_is_a320(adreno_dev))
 		adreno_regwrite(device, REG_CP_QUEUE_THRESHOLDS, 0x000E0602);
+	else if (adreno_is_a330(adreno_dev) || adreno_is_a305b(adreno_dev))
+		adreno_regwrite(device, REG_CP_QUEUE_THRESHOLDS, 0x003E2008);
 
 	rb->rptr = 0;
 	rb->wptr = 0;
@@ -443,7 +444,9 @@
 	adreno_regwrite(device, REG_CP_ME_CNTL, 0);
 
 	/* ME init is GPU specific, so jump into the sub-function */
-	adreno_dev->gpudev->rb_init(adreno_dev, rb);
+	status = adreno_dev->gpudev->rb_init(adreno_dev, rb);
+	if (status)
+		return status;
 
 	/* idle device to validate ME INIT */
 	status = adreno_idle(device);
@@ -481,6 +484,7 @@
 	 */
 	rb->sizedwords = KGSL_RB_SIZE >> 2;
 
+	rb->buffer_desc.flags = KGSL_MEMFLAGS_GPUREADONLY;
 	/* allocate memory for ringbuffer */
 	status = kgsl_allocate_contiguous(&rb->buffer_desc,
 		(rb->sizedwords << 2));
@@ -564,6 +568,8 @@
 	total_sizedwords += flags & KGSL_CMD_FLAGS_PMODE ? 4 : 0;
 	/* 2 dwords to store the start of command sequence */
 	total_sizedwords += 2;
+	/* internal ib command identifier for the ringbuffer */
+	total_sizedwords += (flags & KGSL_CMD_FLAGS_INTERNAL_ISSUE) ? 2 : 0;
 
 	/* Add CP_COND_EXEC commands to generate CP_INTERRUPT */
 	total_sizedwords += context ? 13 : 0;
@@ -571,17 +577,25 @@
 	if (adreno_is_a3xx(adreno_dev))
 		total_sizedwords += 7;
 
+	if (adreno_is_a2xx(adreno_dev))
+		total_sizedwords += 2; /* CP_WAIT_FOR_IDLE */
+
 	total_sizedwords += 2; /* scratchpad ts for fault tolerance */
+	total_sizedwords += 3; /* sop timestamp */
+	total_sizedwords += 4; /* eop timestamp */
+
 	if (context && context->flags & CTXT_FLAGS_PER_CONTEXT_TS &&
 			!(flags & KGSL_CMD_FLAGS_INTERNAL_ISSUE)) {
-		total_sizedwords += 3; /* sop timestamp */
-		total_sizedwords += 4; /* eop timestamp */
 		total_sizedwords += 3; /* global timestamp without cache
 					* flush for non-zero context */
-	} else {
-		total_sizedwords += 4; /* global timestamp for fault tolerance*/
 	}
 
+	if (adreno_is_a20x(adreno_dev))
+		total_sizedwords += 2; /* CACHE_FLUSH */
+
+	if (flags & KGSL_CMD_FLAGS_EOF)
+		total_sizedwords += 2;
+
 	ringcmds = adreno_ringbuffer_allocspace(rb, context, total_sizedwords);
 	if (!ringcmds) {
 		/*
@@ -597,23 +611,9 @@
 	GSL_RB_WRITE(ringcmds, rcmd_gpu, cp_nop_packet(1));
 	GSL_RB_WRITE(ringcmds, rcmd_gpu, KGSL_CMD_IDENTIFIER);
 
-	if (flags & KGSL_CMD_FLAGS_PMODE) {
-		/* disable protected mode error checking */
-		GSL_RB_WRITE(ringcmds, rcmd_gpu,
-			cp_type3_packet(CP_SET_PROTECTED_MODE, 1));
-		GSL_RB_WRITE(ringcmds, rcmd_gpu, 0);
-	}
-
-	for (i = 0; i < sizedwords; i++) {
-		GSL_RB_WRITE(ringcmds, rcmd_gpu, *cmds);
-		cmds++;
-	}
-
-	if (flags & KGSL_CMD_FLAGS_PMODE) {
-		/* re-enable protected mode error checking */
-		GSL_RB_WRITE(ringcmds, rcmd_gpu,
-			cp_type3_packet(CP_SET_PROTECTED_MODE, 1));
-		GSL_RB_WRITE(ringcmds, rcmd_gpu, 1);
+	if (flags & KGSL_CMD_FLAGS_INTERNAL_ISSUE) {
+		GSL_RB_WRITE(ringcmds, rcmd_gpu, cp_nop_packet(1));
+		GSL_RB_WRITE(ringcmds, rcmd_gpu, KGSL_CMD_INTERNAL_IDENTIFIER);
 	}
 
 	/* always increment the global timestamp. once. */
@@ -635,10 +635,45 @@
 	GSL_RB_WRITE(ringcmds, rcmd_gpu, cp_type0_packet(REG_CP_TIMESTAMP, 1));
 	GSL_RB_WRITE(ringcmds, rcmd_gpu, rb->timestamp[KGSL_MEMSTORE_GLOBAL]);
 
+	/* start-of-pipeline timestamp */
+	GSL_RB_WRITE(ringcmds, rcmd_gpu, cp_type3_packet(CP_MEM_WRITE, 2));
+	GSL_RB_WRITE(ringcmds, rcmd_gpu, (gpuaddr +
+		KGSL_MEMSTORE_OFFSET(context_id, soptimestamp)));
+	GSL_RB_WRITE(ringcmds, rcmd_gpu, timestamp);
+
+	if (flags & KGSL_CMD_FLAGS_PMODE) {
+		/* disable protected mode error checking */
+		GSL_RB_WRITE(ringcmds, rcmd_gpu,
+			cp_type3_packet(CP_SET_PROTECTED_MODE, 1));
+		GSL_RB_WRITE(ringcmds, rcmd_gpu, 0);
+	}
+
+	for (i = 0; i < sizedwords; i++) {
+		GSL_RB_WRITE(ringcmds, rcmd_gpu, *cmds);
+		cmds++;
+	}
+
+	if (flags & KGSL_CMD_FLAGS_PMODE) {
+		/* re-enable protected mode error checking */
+		GSL_RB_WRITE(ringcmds, rcmd_gpu,
+			cp_type3_packet(CP_SET_PROTECTED_MODE, 1));
+		GSL_RB_WRITE(ringcmds, rcmd_gpu, 1);
+	}
+
+	/*
+	 * HW workaround for MMU page faults caused by memory being
+	 * freed before the GPU has finished accessing it.
+	 */
+	if (adreno_is_a2xx(adreno_dev)) {
+		GSL_RB_WRITE(ringcmds, rcmd_gpu,
+			cp_type3_packet(CP_WAIT_FOR_IDLE, 1));
+		GSL_RB_WRITE(ringcmds, rcmd_gpu, 0x00);
+	}
+
 	if (adreno_is_a3xx(adreno_dev)) {
 		/*
-		 * FLush HLSQ lazy updates to make sure there are no
-		 * rsources pending for indirect loads after the timestamp
+		 * Flush HLSQ lazy updates to make sure there are no
+		 * resources pending for indirect loads after the timestamp
 		 */
 
 		GSL_RB_WRITE(ringcmds, rcmd_gpu,
@@ -649,22 +684,19 @@
 		GSL_RB_WRITE(ringcmds, rcmd_gpu, 0x00);
 	}
 
+	/*
+	 * End-of-pipeline timestamp.  If per-context timestamps are not
+	 * enabled, context_id will be KGSL_MEMSTORE_GLOBAL, so the eop
+	 * timestamp still lands in the global memstore slot.
+	 */
+	GSL_RB_WRITE(ringcmds, rcmd_gpu, cp_type3_packet(CP_EVENT_WRITE, 3));
+	GSL_RB_WRITE(ringcmds, rcmd_gpu, CACHE_FLUSH_TS);
+	GSL_RB_WRITE(ringcmds, rcmd_gpu, (gpuaddr +
+		KGSL_MEMSTORE_OFFSET(context_id, eoptimestamp)));
+	GSL_RB_WRITE(ringcmds, rcmd_gpu, timestamp);
+
 	if (context && context->flags & CTXT_FLAGS_PER_CONTEXT_TS
 			&& !(flags & KGSL_CMD_FLAGS_INTERNAL_ISSUE)) {
-		/* start-of-pipeline timestamp */
-		GSL_RB_WRITE(ringcmds, rcmd_gpu,
-			cp_type3_packet(CP_MEM_WRITE, 2));
-		GSL_RB_WRITE(ringcmds, rcmd_gpu, (gpuaddr +
-			KGSL_MEMSTORE_OFFSET(context_id, soptimestamp)));
-		GSL_RB_WRITE(ringcmds, rcmd_gpu, timestamp);
-
-		/* end-of-pipeline timestamp */
-		GSL_RB_WRITE(ringcmds, rcmd_gpu,
-			cp_type3_packet(CP_EVENT_WRITE, 3));
-		GSL_RB_WRITE(ringcmds, rcmd_gpu, CACHE_FLUSH_TS);
-		GSL_RB_WRITE(ringcmds, rcmd_gpu, (gpuaddr +
-			KGSL_MEMSTORE_OFFSET(context_id, eoptimestamp)));
-		GSL_RB_WRITE(ringcmds, rcmd_gpu, timestamp);
 
 		GSL_RB_WRITE(ringcmds, rcmd_gpu,
 			cp_type3_packet(CP_MEM_WRITE, 2));
@@ -673,16 +705,14 @@
 				eoptimestamp)));
 		GSL_RB_WRITE(ringcmds, rcmd_gpu,
 			rb->timestamp[KGSL_MEMSTORE_GLOBAL]);
-	} else {
-		GSL_RB_WRITE(ringcmds, rcmd_gpu,
-			cp_type3_packet(CP_EVENT_WRITE, 3));
-		GSL_RB_WRITE(ringcmds, rcmd_gpu, CACHE_FLUSH_TS);
-		GSL_RB_WRITE(ringcmds, rcmd_gpu, (gpuaddr +
-			KGSL_MEMSTORE_OFFSET(KGSL_MEMSTORE_GLOBAL,
-						eoptimestamp)));
-		GSL_RB_WRITE(ringcmds, rcmd_gpu,
-				rb->timestamp[KGSL_MEMSTORE_GLOBAL]);
 	}
+
+	if (adreno_is_a20x(adreno_dev)) {
+		GSL_RB_WRITE(ringcmds, rcmd_gpu,
+			cp_type3_packet(CP_EVENT_WRITE, 1));
+		GSL_RB_WRITE(ringcmds, rcmd_gpu, CACHE_FLUSH);
+	}
+
 	if (context) {
 		/* Conditional execution based on memory values */
 		GSL_RB_WRITE(ringcmds, rcmd_gpu,
@@ -960,43 +990,31 @@
 {
 	struct kgsl_device *device = dev_priv->device;
 	struct adreno_device *adreno_dev = ADRENO_DEVICE(device);
-	unsigned int *link;
+	unsigned int *link = NULL;
 	unsigned int *cmds;
 	unsigned int i;
-	struct adreno_context *drawctxt;
+	struct adreno_context *drawctxt = NULL;
 	unsigned int start_index = 0;
+	int ret;
 
-	if (device->state & KGSL_STATE_HUNG)
-		return -EBUSY;
+	if (device->state & KGSL_STATE_HUNG) {
+		ret = -EBUSY;
+		goto done;
+	}
+
 	if (!(adreno_dev->ringbuffer.flags & KGSL_FLAGS_STARTED) ||
-	      context == NULL || ibdesc == 0 || numibs == 0)
-		return -EINVAL;
-
+	      context == NULL || ibdesc == 0 || numibs == 0) {
+		ret = -EINVAL;
+		goto done;
+	}
 	drawctxt = context->devctxt;
 
 	if (drawctxt->flags & CTXT_FLAGS_GPU_HANG) {
 		KGSL_CTXT_ERR(device, "proc %s failed fault tolerance"
 			" will not accept commands for context %d\n",
 			drawctxt->pid_name, drawctxt->id);
-		return -EDEADLK;
-	}
-
-	if (drawctxt->flags & CTXT_FLAGS_SKIP_EOF) {
-		KGSL_CTXT_ERR(device,
-			"proc %s triggered fault tolerance"
-			" skipping commands for context till EOF %d\n",
-			drawctxt->pid_name, drawctxt->id);
-		if (flags & KGSL_CMD_FLAGS_EOF)
-			drawctxt->flags &= ~CTXT_FLAGS_SKIP_EOF;
-		numibs = 0;
-	}
-
-	cmds = link = kzalloc(sizeof(unsigned int) * (numibs * 3 + 4),
-				GFP_KERNEL);
-	if (!link) {
-		KGSL_CORE_ERR("kzalloc(%d) failed\n",
-			sizeof(unsigned int) * (numibs * 3 + 4));
-		return -ENOMEM;
+		ret = -EDEADLK;
+		goto done;
 	}
 
 	/*When preamble is enabled, the preamble buffer with state restoration
@@ -1007,6 +1025,26 @@
 		adreno_dev->drawctxt_active == drawctxt)
 		start_index = 1;
 
+	if (drawctxt->flags & CTXT_FLAGS_SKIP_EOF) {
+		KGSL_CTXT_ERR(device,
+			"proc %s triggered fault tolerance"
+			" skipping commands for context till EOF %d\n",
+			drawctxt->pid_name, drawctxt->id);
+		if (flags & KGSL_CMD_FLAGS_EOF)
+			drawctxt->flags &= ~CTXT_FLAGS_SKIP_EOF;
+		if (start_index)
+			numibs = 1;
+		else
+			numibs = 0;
+	}
+
+	cmds = link = kzalloc(sizeof(unsigned int) * (numibs * 3 + 4),
+				GFP_KERNEL);
+	if (!link) {
+		ret = -ENOMEM;
+		goto done;
+	}
+
 	if (!start_index) {
 		*cmds++ = cp_nop_packet(1);
 		*cmds++ = KGSL_START_OF_IB_IDENTIFIER;
@@ -1021,9 +1059,15 @@
 		if (unlikely(adreno_dev->ib_check_level >= 1 &&
 		    !_parse_ibs(dev_priv, ibdesc[i].gpuaddr,
 				ibdesc[i].sizedwords))) {
-			kfree(link);
-			return -EINVAL;
+			ret = -EINVAL;
+			goto done;
 		}
+
+		if (ibdesc[i].sizedwords == 0) {
+			ret = -EINVAL;
+			goto done;
+		}
+
 		*cmds++ = CP_HDR_INDIRECT_BUFFER_PFD;
 		*cmds++ = ibdesc[i].gpuaddr;
 		*cmds++ = ibdesc[i].sizedwords;
@@ -1043,11 +1087,6 @@
 					(flags & KGSL_CMD_FLAGS_EOF),
 					&link[0], (cmds - link), *timestamp);
 
-	KGSL_CMD_INFO(device, "ctxt %d g %08x numibs %d ts %d\n",
-		context->id, (unsigned int)ibdesc, numibs, *timestamp);
-
-	kfree(link);
-
 #ifdef CONFIG_MSM_KGSL_CFF_DUMP
 	/*
 	 * insert wait for idle after every IB1
@@ -1063,9 +1102,16 @@
 	 */
 	if (drawctxt->flags & CTXT_FLAGS_GPU_HANG_FT) {
 		drawctxt->flags &= ~CTXT_FLAGS_GPU_HANG_FT;
-		return -EPROTO;
+		ret = -EPROTO;
 	} else
-		return 0;
+		ret = 0;
+
+done:
+	trace_kgsl_issueibcmds(device, context->id, ibdesc, numibs,
+		*timestamp, flags, ret, drawctxt->type);
+
+	kfree(link);
+	return ret;
 }
 
 static void _turn_preamble_on_for_ib_seq(struct adreno_ringbuffer *rb,
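A note for readers following the adreno_ringbuffer_issueibcmds() rework above: the scattered early returns are replaced by a single done: label that fires the trace event and frees the command list exactly once, and that list is sized as three dwords per IB (packet header, gpuaddr, sizedwords) plus four marker dwords. Below is a minimal, self-contained sketch of that pattern; the struct, the ib_header argument and userspace calloc/free are stand-ins for the driver's kgsl_ibdesc, CP_HDR_INDIRECT_BUFFER_PFD and kzalloc/kfree, so it illustrates the shape of the change rather than the driver code itself.

#include <errno.h>
#include <stdlib.h>

/* Hypothetical stand-in for the driver's struct kgsl_ibdesc. */
struct ibdesc_sketch {
	unsigned int gpuaddr;
	unsigned int sizedwords;
};

static int submit_ib_list(const struct ibdesc_sketch *ibdesc,
			  unsigned int numibs, unsigned int ib_header)
{
	unsigned int *link = NULL;
	unsigned int *cmds;
	unsigned int i;
	int ret = 0;

	/* 3 dwords per IB plus 4 reserved for start/end-of-IB markers */
	cmds = link = calloc(numibs * 3 + 4, sizeof(unsigned int));
	if (!link) {
		ret = -ENOMEM;
		goto done;
	}

	for (i = 0; i < numibs; i++) {
		if (ibdesc[i].sizedwords == 0) {
			/* the patch now rejects zero-sized IBs up front */
			ret = -EINVAL;
			goto done;
		}
		*cmds++ = ib_header;
		*cmds++ = ibdesc[i].gpuaddr;
		*cmds++ = ibdesc[i].sizedwords;
	}

	/*
	 * At this point the real driver copies link[0..cmds-link) into the
	 * ringbuffer and records a trace event before falling through.
	 */

done:
	free(link);	/* free(NULL) is a no-op, so one cleanup path suffices */
	return ret;
}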
diff --git a/drivers/gpu/msm/adreno_ringbuffer.h b/drivers/gpu/msm/adreno_ringbuffer.h
index fa03c05..e563ec7 100644
--- a/drivers/gpu/msm/adreno_ringbuffer.h
+++ b/drivers/gpu/msm/adreno_ringbuffer.h
@@ -97,8 +97,7 @@
 
 int adreno_ringbuffer_init(struct kgsl_device *device);
 
-int adreno_ringbuffer_start(struct adreno_ringbuffer *rb,
-				unsigned int init_ram);
+int adreno_ringbuffer_start(struct adreno_ringbuffer *rb);
 
 void adreno_ringbuffer_stop(struct adreno_ringbuffer *rb);
 
diff --git a/drivers/gpu/msm/adreno_snapshot.c b/drivers/gpu/msm/adreno_snapshot.c
index f23586e..a76ed87 100644
--- a/drivers/gpu/msm/adreno_snapshot.c
+++ b/drivers/gpu/msm/adreno_snapshot.c
@@ -1,4 +1,4 @@
-/* Copyright (c) 2012, The Linux Foundation. All rights reserved.
+/* Copyright (c) 2012-2013, The Linux Foundation. All rights reserved.
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License version 2 and
@@ -162,6 +162,12 @@
 static unsigned int sp_fs_pvt_mem_addr;
 
 /*
+ * Cached values of SP_VS_OBJ_START_REG and SP_FS_OBJ_START_REG.
+ */
+static unsigned int sp_vs_obj_start_reg;
+static unsigned int sp_fs_obj_start_reg;
+
+/*
  * Each load state block has two possible types.  Each type has a different
  * number of dwords per unit.  Use this handy lookup table to make sure
  * we dump the right amount of data from the indirect buffer
@@ -373,6 +379,26 @@
 		sp_fs_pvt_mem_addr = 0;
 	}
 
+	if (sp_vs_obj_start_reg) {
+		ret = kgsl_snapshot_get_object(device, ptbase,
+			sp_vs_obj_start_reg & 0xFFFFFFE0, 0,
+			SNAPSHOT_GPU_OBJECT_GENERIC);
+		if (ret < 0)
+			return -EINVAL;
+		snapshot_frozen_objsize += ret;
+		sp_vs_obj_start_reg = 0;
+	}
+
+	if (sp_fs_obj_start_reg) {
+		ret = kgsl_snapshot_get_object(device, ptbase,
+			sp_fs_obj_start_reg & 0xFFFFFFE0, 0,
+			SNAPSHOT_GPU_OBJECT_GENERIC);
+		if (ret < 0)
+			return -EINVAL;
+		snapshot_frozen_objsize += ret;
+		sp_fs_obj_start_reg = 0;
+	}
+
 	/* Finally: VBOs */
 
 	/* The number of active VBOs is stored in VFD_CONTROL_O[31:27] */
@@ -444,7 +470,7 @@
 	int offset = type0_pkt_offset(*ptr);
 	int i;
 
-	for (i = 0; i < size; i++, offset++) {
+	for (i = 0; i < size - 1; i++, offset++) {
 
 		/* Visibility stream buffer */
 
@@ -505,11 +531,20 @@
 			case A3XX_SP_FS_PVT_MEM_ADDR_REG:
 				sp_fs_pvt_mem_addr = ptr[i + 1];
 				break;
+			case A3XX_SP_VS_OBJ_START_REG:
+				sp_vs_obj_start_reg = ptr[i + 1];
+				break;
+			case A3XX_SP_FS_OBJ_START_REG:
+				sp_fs_obj_start_reg = ptr[i + 1];
+				break;
 			}
 		}
 	}
 }
 
+static inline int parse_ib(struct kgsl_device *device, unsigned int ptbase,
+		unsigned int gpuaddr, unsigned int dwords);
+
 /* Add an IB as a GPU object, but first, parse it to find more goodies within */
 
 static int ib_add_gpu_object(struct kgsl_device *device, unsigned int ptbase,
@@ -549,32 +584,12 @@
 			if (adreno_cmd_is_ib(src[i])) {
 				unsigned int gpuaddr = src[i + 1];
 				unsigned int size = src[i + 2];
-				unsigned int ibbase;
 
-				/* Address of the last processed IB2 */
-				kgsl_regread(device, REG_CP_IB2_BASE, &ibbase);
+				ret = parse_ib(device, ptbase, gpuaddr, size);
 
-				/*
-				 * If this is the last IB2 that was executed,
-				 * then push it to make sure it goes into the
-				 * static space
-				 */
-
-				if (ibbase == gpuaddr)
-					push_object(device,
-						SNAPSHOT_OBJ_TYPE_IB, ptbase,
-						gpuaddr, size);
-				else {
-					ret = ib_add_gpu_object(device,
-						ptbase, gpuaddr, size);
-
-					/*
-					 * If adding the IB failed then stop
-					 * parsing
-					 */
-					if (ret < 0)
-						goto done;
-				}
+				/* If adding the IB failed then stop parsing */
+				if (ret < 0)
+					goto done;
 			} else {
 				ret = ib_parse_type3(device, &src[i], ptbase);
 				/*
@@ -604,29 +619,34 @@
 	return ret;
 }
 
-/* Snapshot the istore memory */
-static int snapshot_istore(struct kgsl_device *device, void *snapshot,
-	int remain, void *priv)
+/*
+ * We want to store the last executed IB1 and IB2 in the static region to ensure
+ * that we get at least some information out of the snapshot even if we can't
+ * access the dynamic data from the sysfs file.  Push all other IBs on the
+ * dynamic list
+ */
+static inline int parse_ib(struct kgsl_device *device, unsigned int ptbase,
+		unsigned int gpuaddr, unsigned int dwords)
 {
-	struct kgsl_snapshot_istore *header = snapshot;
-	unsigned int *data = snapshot + sizeof(*header);
-	struct adreno_device *adreno_dev = ADRENO_DEVICE(device);
-	int count, i;
+	unsigned int ib1base, ib2base;
+	int ret = 0;
 
-	count = adreno_dev->istore_size * adreno_dev->instruction_size;
+	/*
+	 * Check the IB address - if it is either the last executed IB1 or the
+	 * last executed IB2 then push it into the static blob otherwise put
+	 * it in the dynamic list
+	 */
 
-	if (remain < (count * 4) + sizeof(*header)) {
-		KGSL_DRV_ERR(device,
-			"snapshot: Not enough memory for the istore section");
-		return 0;
-	}
+	kgsl_regread(device, REG_CP_IB1_BASE, &ib1base);
+	kgsl_regread(device, REG_CP_IB2_BASE, &ib2base);
 
-	header->count = adreno_dev->istore_size;
+	if (gpuaddr == ib1base || gpuaddr == ib2base)
+		push_object(device, SNAPSHOT_OBJ_TYPE_IB, ptbase,
+			gpuaddr, dwords);
+	else
+		ret = ib_add_gpu_object(device, ptbase, gpuaddr, dwords);
 
-	for (i = 0; i < count; i++)
-		kgsl_regread(device, ADRENO_ISTORE_START + i, &data[i]);
-
-	return (count * 4) + sizeof(*header);
+	return ret;
 }
 
 /* Snapshot the ringbuffer memory */
@@ -779,12 +799,11 @@
 			 * others get marked as GPU objects
 			 */
 
-			if (ibaddr == ibbase || memdesc != NULL)
+			if (memdesc != NULL)
 				push_object(device, SNAPSHOT_OBJ_TYPE_IB,
 					ptbase, ibaddr, ibsize);
 			else
-				ib_add_gpu_object(device, ptbase, ibaddr,
-					ibsize);
+				parse_ib(device, ptbase, ibaddr, ibsize);
 		}
 
 		index = index + 1;
@@ -799,6 +818,64 @@
 	return size + sizeof(*header);
 }
 
+static int snapshot_capture_mem_list(struct kgsl_device *device, void *snapshot,
+			int remain, void *priv)
+{
+	struct kgsl_snapshot_replay_mem_list *header = snapshot;
+	struct kgsl_process_private *private;
+	unsigned int ptbase;
+	struct rb_node *node;
+	struct kgsl_mem_entry *entry = NULL;
+	int num_mem;
+	unsigned int *data = snapshot + sizeof(*header);
+
+	ptbase = kgsl_mmu_get_current_ptbase(&device->mmu);
+	mutex_lock(&kgsl_driver.process_mutex);
+	list_for_each_entry(private, &kgsl_driver.process_list, list) {
+		if (kgsl_mmu_pt_equal(&device->mmu, private->pagetable,
+			ptbase))
+			break;
+	}
+	mutex_unlock(&kgsl_driver.process_mutex);
+	if (!private) {
+		KGSL_DRV_ERR(device,
+		"Failed to get pointer to process private structure\n");
+		return 0;
+	}
+	/* We need to know the number of memory objects that the process has */
+	spin_lock(&private->mem_lock);
+	for (node = rb_first(&private->mem_rb), num_mem = 0; node; ) {
+		entry = rb_entry(node, struct kgsl_mem_entry, node);
+		node = rb_next(&entry->node);
+		num_mem++;
+	}
+
+	if (remain < ((num_mem * 3 * sizeof(unsigned int)) +
+			sizeof(*header))) {
+		KGSL_DRV_ERR(device,
+			"snapshot: Not enough memory for the mem list section");
+		spin_unlock(&private->mem_lock);
+		return 0;
+	}
+	header->num_entries = num_mem;
+	header->ptbase = ptbase;
+	/*
+	 * Walk through the memory list and store the
+	 * tuples(gpuaddr, size, memtype) in snapshot
+	 */
+	for (node = rb_first(&private->mem_rb); node; ) {
+		entry = rb_entry(node, struct kgsl_mem_entry, node);
+		node = rb_next(&entry->node);
+
+		*data++ = entry->memdesc.gpuaddr;
+		*data++ = entry->memdesc.size;
+		*data++ = (entry->memdesc.priv & KGSL_MEMTYPE_MASK) >>
+							KGSL_MEMTYPE_SHIFT;
+	}
+	spin_unlock(&private->mem_lock);
+	return sizeof(*header) + (num_mem * 3 * sizeof(unsigned int));
+}
+
 /* Snapshot the memory for an indirect buffer */
 static int snapshot_ib(struct kgsl_device *device, void *snapshot,
 	int remain, void *priv)
@@ -829,15 +906,14 @@
 				continue;
 
 			if (adreno_cmd_is_ib(*src))
-				push_object(device, SNAPSHOT_OBJ_TYPE_IB,
-					obj->ptbase, src[1], src[2]);
-			else {
+				ret = parse_ib(device, obj->ptbase, src[1],
+					src[2]);
+			else
 				ret = ib_parse_type3(device, src, obj->ptbase);
 
-				/* Stop parsing if the type3 decode fails */
-				if (ret < 0)
-					break;
-			}
+			/* Stop parsing if the type3 decode fails */
+			if (ret < 0)
+				break;
 		}
 	}
 
@@ -903,6 +979,13 @@
 		snapshot, remain, snapshot_rb, NULL);
 
 	/*
+	 * Add a section that lists (gpuaddr, size, memtype) tuples of the
+	 * hanging process
+	 */
+	snapshot = kgsl_snapshot_add_section(device,
+			KGSL_SNAPSHOT_SECTION_MEMLIST, snapshot, remain,
+			snapshot_capture_mem_list, NULL);
+	/*
 	 * Make sure that the last IB1 that was being executed is dumped.
 	 * Since this was the last IB1 that was processed, we should have
 	 * already added it to the list during the ringbuffer parse but we
@@ -950,17 +1033,6 @@
 	for (i = 0; i < objbufptr; i++)
 		snapshot = dump_object(device, i, snapshot, remain);
 
-	/*
-	 * Only dump the istore on a hang - reading it on a running system
-	 * has a non 0 chance of hanging the GPU
-	 */
-
-	if (hang) {
-		snapshot = kgsl_snapshot_add_section(device,
-			KGSL_SNAPSHOT_SECTION_ISTORE, snapshot, remain,
-			snapshot_istore, NULL);
-	}
-
 	/* Add GPU specific sections - registers mainly, but other stuff too */
 	if (adreno_dev->gpudev->snapshot)
 		snapshot = adreno_dev->gpudev->snapshot(adreno_dev, snapshot,
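On the new memory-list section in the snapshot code above: snapshot_capture_mem_list() writes one (gpuaddr, size, memtype) triple of unsigned ints per allocation after the section header, and skips the section when the remaining snapshot space cannot hold the header plus the triples. A small sketch of that bookkeeping, with a hypothetical header struct standing in for kgsl_snapshot_replay_mem_list:

#include <stddef.h>

/* Hypothetical stand-in for the snapshot section header. */
struct memlist_header_sketch {
	unsigned int num_entries;
	unsigned int ptbase;
};

/* Space needed: the header plus three unsigned ints per tracked allocation. */
static size_t memlist_section_size(unsigned int num_mem)
{
	return sizeof(struct memlist_header_sketch) +
		(size_t)num_mem * 3 * sizeof(unsigned int);
}

/* Refuse to emit the section when 'remain' bytes cannot hold it. */
static int memlist_fits(size_t remain, unsigned int num_mem)
{
	return remain >= memlist_section_size(num_mem);
}

/* Append one (gpuaddr, size, memtype) triple and return the advanced cursor. */
static unsigned int *memlist_emit(unsigned int *data, unsigned int gpuaddr,
				  unsigned int size, unsigned int memtype)
{
	*data++ = gpuaddr;
	*data++ = size;
	*data++ = memtype;
	return data;
}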
diff --git a/drivers/gpu/msm/kgsl.c b/drivers/gpu/msm/kgsl.c
index 3582a41..53ef392 100644
--- a/drivers/gpu/msm/kgsl.c
+++ b/drivers/gpu/msm/kgsl.c
@@ -1,4 +1,4 @@
-/* Copyright (c) 2008-2012, The Linux Foundation. All rights reserved.
+/* Copyright (c) 2008-2013, The Linux Foundation. All rights reserved.
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License version 2 and
@@ -18,7 +18,7 @@
 #include <linux/uaccess.h>
 #include <linux/interrupt.h>
 #include <linux/workqueue.h>
-#include <linux/android_pmem.h>
+
 #include <linux/vmalloc.h>
 #include <linux/pm_runtime.h>
 #include <linux/genlock.h>
@@ -28,6 +28,7 @@
 #include <linux/msm_ion.h>
 #include <linux/io.h>
 #include <mach/socinfo.h>
+#include <linux/mman.h>
 
 #include "kgsl.h"
 #include "kgsl_debugfs.h"
@@ -52,6 +53,52 @@
 
 static struct ion_client *kgsl_ion_client;
 
+int kgsl_memfree_hist_init(void)
+{
+	void *base;
+
+	base = kzalloc(KGSL_MEMFREE_HIST_SIZE, GFP_KERNEL);
+	kgsl_driver.memfree_hist.base_hist_rb = base;
+	if (base == NULL)
+		return -ENOMEM;
+	kgsl_driver.memfree_hist.size = KGSL_MEMFREE_HIST_SIZE;
+	kgsl_driver.memfree_hist.wptr = base;
+	return 0;
+}
+
+void kgsl_memfree_hist_exit(void)
+{
+	kfree(kgsl_driver.memfree_hist.base_hist_rb);
+	kgsl_driver.memfree_hist.base_hist_rb = NULL;
+}
+
+void kgsl_memfree_hist_set_event(unsigned int pid, unsigned int gpuaddr,
+			unsigned int size, int flags)
+{
+	struct kgsl_memfree_hist_elem *p;
+
+	void *base = kgsl_driver.memfree_hist.base_hist_rb;
+	int rbsize = kgsl_driver.memfree_hist.size;
+
+	if (base == NULL)
+		return;
+
+	mutex_lock(&kgsl_driver.memfree_hist_mutex);
+	p = kgsl_driver.memfree_hist.wptr;
+	p->pid = pid;
+	p->gpuaddr = gpuaddr;
+	p->size = size;
+	p->flags = flags;
+
+	kgsl_driver.memfree_hist.wptr++;
+	if ((void *)kgsl_driver.memfree_hist.wptr >= base+rbsize) {
+		kgsl_driver.memfree_hist.wptr =
+			(struct kgsl_memfree_hist_elem *)base;
+	}
+	mutex_unlock(&kgsl_driver.memfree_hist_mutex);
+}
+
+
 /* kgsl_get_mem_entry - get the mem_entry structure for the specified object
  * @device - Pointer to the device structure
  * @ptbase - the pagetable base of the object
@@ -99,13 +146,6 @@
 	return entry;
 }
 
-unsigned int kgsl_get_alloc_size(int detailed)
-{
-	unsigned int ret = 0;
-
-	return ret;
-}
-
 void
 kgsl_mem_entry_destroy(struct kref *kref)
 {
@@ -143,9 +183,19 @@
 }
 EXPORT_SYMBOL(kgsl_mem_entry_destroy);
 
-static
-void kgsl_mem_entry_attach_process(struct kgsl_mem_entry *entry,
-				   struct kgsl_process_private *process)
+/**
+ * kgsl_mem_entry_track_gpuaddr - Insert a mem_entry in the address tree
+ * @process: the process that owns the memory
+ * @entry: the memory entry
+ *
+ * Insert a kgsl_mem_entry into the rb_tree for searching by GPU address.
+ * Not all mem_entries will have gpu addresses when first created, so this
+ * function may be called after creation when the GPU address is finally
+ * assigned.
+ */
+static void
+kgsl_mem_entry_track_gpuaddr(struct kgsl_process_private *process,
+				struct kgsl_mem_entry *entry)
 {
 	struct rb_node **node;
 	struct rb_node *parent = NULL;
@@ -170,8 +220,48 @@
 	rb_insert_color(&entry->node, &process->mem_rb);
 
 	spin_unlock(&process->mem_lock);
+}
 
+/**
+ * kgsl_mem_entry_attach_process - Attach a mem_entry to its owner process
+ * @entry: the memory entry
+ * @process: the owner process
+ *
+ * Attach a newly created mem_entry to its owner process so that
+ * it can be found later. The mem_entry will be added to mem_idr and have
+ * its 'id' field assigned. If the GPU address has been set, the entry
+ * will also be added to the mem_rb tree.
+ *
+ * @returns - 0 on success or error code on failure.
+ */
+static int
+kgsl_mem_entry_attach_process(struct kgsl_mem_entry *entry,
+				   struct kgsl_process_private *process)
+{
+	int ret;
+
+	while (1) {
+		if (idr_pre_get(&process->mem_idr, GFP_KERNEL) == 0) {
+			ret = -ENOMEM;
+			goto err;
+		}
+
+		spin_lock(&process->mem_lock);
+		ret = idr_get_new_above(&process->mem_idr, entry, 1,
+					&entry->id);
+		spin_unlock(&process->mem_lock);
+
+		if (ret == 0)
+			break;
+		else if (ret != -EAGAIN)
+			goto err;
+	}
 	entry->priv = process;
+
+	if (entry->memdesc.gpuaddr != 0)
+		kgsl_mem_entry_track_gpuaddr(process, entry);
+err:
+	return ret;
 }
 
 /* Detach a memory entry from a process and unmap it from the MMU */
@@ -181,6 +271,17 @@
 	if (entry == NULL)
 		return;
 
+	spin_lock(&entry->priv->mem_lock);
+
+	if (entry->id != 0)
+		idr_remove(&entry->priv->mem_idr, entry->id);
+	entry->id = 0;
+
+	if (entry->memdesc.gpuaddr != 0)
+		rb_erase(&entry->node, &entry->priv->mem_rb);
+
+	spin_unlock(&entry->priv->mem_lock);
+
 	entry->priv->stats[entry->memtype].cur -= entry->memdesc.size;
 	entry->priv = NULL;
 
@@ -199,14 +300,19 @@
 
 	context = kzalloc(sizeof(*context), GFP_KERNEL);
 
-	if (context == NULL)
-		return NULL;
+	if (context == NULL) {
+		KGSL_DRV_INFO(dev_priv->device, "kzalloc(%d) failed\n",
+				sizeof(*context));
+		return ERR_PTR(-ENOMEM);
+	}
 
 	while (1) {
 		if (idr_pre_get(&dev_priv->device->context_idr,
 				GFP_KERNEL) == 0) {
-			kfree(context);
-			return NULL;
+			KGSL_DRV_INFO(dev_priv->device,
+					"idr_pre_get: ENOMEM\n");
+			ret = -ENOMEM;
+			goto func_end;
 		}
 
 		ret = idr_get_new_above(&dev_priv->device->context_idr,
@@ -216,26 +322,25 @@
 			break;
 	}
 
-	if (ret) {
-		kfree(context);
-		return NULL;
-	}
+	if (ret)
+		goto func_end;
 
 	/* MAX - 1, there is one memdesc in memstore for device info */
 	if (id >= KGSL_MEMSTORE_MAX) {
-		KGSL_DRV_ERR(dev_priv->device, "cannot have more than %d "
+		KGSL_DRV_INFO(dev_priv->device, "cannot have more than %d "
 				"ctxts due to memstore limitation\n",
 				KGSL_MEMSTORE_MAX);
 		idr_remove(&dev_priv->device->context_idr, id);
-		kfree(context);
-		return NULL;
+		ret = -ENOSPC;
+		goto func_end;
 	}
 
 	kref_init(&context->refcount);
 	context->id = id;
 	context->dev_priv = dev_priv;
 
-	if (kgsl_sync_timeline_create(context)) {
+	ret = kgsl_sync_timeline_create(context);
+	if (ret) {
 		idr_remove(&dev_priv->device->context_idr, id);
 		goto func_end;
 	}
@@ -257,7 +362,7 @@
 func_end:
 	if (ret) {
 		kfree(context);
-		return NULL;
+		return ERR_PTR(ret);
 	}
 
 	return context;
@@ -362,24 +467,6 @@
 	return ret;
 }
 
-int kgsl_register_ts_notifier(struct kgsl_device *device,
-			      struct notifier_block *nb)
-{
-	BUG_ON(device == NULL);
-	return atomic_notifier_chain_register(&device->ts_notifier_list,
-					      nb);
-}
-EXPORT_SYMBOL(kgsl_register_ts_notifier);
-
-int kgsl_unregister_ts_notifier(struct kgsl_device *device,
-				struct notifier_block *nb)
-{
-	BUG_ON(device == NULL);
-	return atomic_notifier_chain_unregister(&device->ts_notifier_list,
-						nb);
-}
-EXPORT_SYMBOL(kgsl_unregister_ts_notifier);
-
 int kgsl_check_timestamp(struct kgsl_device *device,
 	struct kgsl_context *context, unsigned int timestamp)
 {
@@ -430,7 +517,7 @@
 			INIT_COMPLETION(device->hwaccess_gate);
 			device->ftbl->suspend_context(device);
 			device->ftbl->stop(device);
-			pm_qos_update_request(&device->pm_qos_req_dma,
+			pm_qos_update_request(&device->pwrctrl.pm_qos_req_dma,
 						PM_QOS_DEFAULT_VALUE);
 			kgsl_pwrctrl_set_state(device, KGSL_STATE_SUSPEND);
 			break;
@@ -579,12 +666,15 @@
 	private->pid = task_tgid_nr(current);
 	private->mem_rb = RB_ROOT;
 
+	idr_init(&private->mem_idr);
+
 	if (kgsl_mmu_enabled())
 	{
 		unsigned long pt_name;
+		struct kgsl_mmu *mmu = &cur_dev_priv->device->mmu;
 
 		pt_name = task_tgid_nr(current);
-		private->pagetable = kgsl_mmu_getpagetable(pt_name);
+		private->pagetable = kgsl_mmu_getpagetable(mmu, pt_name);
 		if (private->pagetable == NULL) {
 			kfree(private);
 			private = NULL;
@@ -607,7 +697,7 @@
 			 struct kgsl_process_private *private)
 {
 	struct kgsl_mem_entry *entry = NULL;
-	struct rb_node *node;
+	int next = 0;
 
 	if (!private)
 		return;
@@ -622,14 +712,22 @@
 
 	list_del(&private->list);
 
-	for (node = rb_first(&private->mem_rb); node; ) {
-		entry = rb_entry(node, struct kgsl_mem_entry, node);
-		node = rb_next(&entry->node);
-
-		rb_erase(&entry->node, &private->mem_rb);
+	while (1) {
+		rcu_read_lock();
+		entry = idr_get_next(&private->mem_idr, &next);
+		rcu_read_unlock();
+		if (entry == NULL)
+			break;
 		kgsl_mem_entry_detach_process(entry);
+		/*
+		 * Always start back at the beginning, to
+		 * ensure all entries are removed,
+		 * like list_for_each_entry_safe.
+		 */
+		next = 0;
 	}
 	kgsl_mmu_putpagetable(private->pagetable);
+	idr_destroy(&private->mem_idr);
 	kfree(private);
 unlock:
 	mutex_unlock(&kgsl_driver.process_mutex);
@@ -717,13 +815,6 @@
 	dev_priv->device = device;
 	filep->private_data = dev_priv;
 
-	/* Get file (per process) private struct */
-	dev_priv->process_priv = kgsl_get_process_private(dev_priv);
-	if (dev_priv->process_priv ==  NULL) {
-		result = -ENOMEM;
-		goto err_freedevpriv;
-	}
-
 	mutex_lock(&device->mutex);
 	kgsl_check_suspended(device);
 
@@ -731,26 +822,45 @@
 		kgsl_sharedmem_set(&device->memstore, 0, 0,
 				device->memstore.size);
 
-		result = device->ftbl->start(device, true);
+		result = device->ftbl->init(device);
+		if (result)
+			goto err_freedevpriv;
 
-		if (result) {
-			mutex_unlock(&device->mutex);
-			goto err_putprocess;
-		}
+		result = device->ftbl->start(device);
+		if (result)
+			goto err_freedevpriv;
+
 		kgsl_pwrctrl_set_state(device, KGSL_STATE_ACTIVE);
 	}
 	device->open_count++;
 	mutex_unlock(&device->mutex);
 
+	/*
+	 * Get file (per process) private struct. This must be done
+	 * after the first start so that the global pagetable mappings
+	 * are set up before we create the per-process pagetable.
+	 */
+	dev_priv->process_priv = kgsl_get_process_private(dev_priv);
+	if (dev_priv->process_priv ==  NULL) {
+		result = -ENOMEM;
+		goto err_stop;
+	}
+
 	KGSL_DRV_INFO(device, "Initialized %s: mmu=%s pagetable_count=%d\n",
 		device->name, kgsl_mmu_enabled() ? "on" : "off",
 		kgsl_pagetable_count);
 
 	return result;
 
-err_putprocess:
-	kgsl_put_process_private(device, dev_priv->process_priv);
+err_stop:
+	mutex_lock(&device->mutex);
+	device->open_count--;
+	if (device->open_count == 0) {
+		result = device->ftbl->stop(device);
+		kgsl_pwrctrl_set_state(device, KGSL_STATE_INIT);
+	}
 err_freedevpriv:
+	mutex_unlock(&device->mutex);
 	filep->private_data = NULL;
 	kfree(dev_priv);
 err_pmruntime:
@@ -765,7 +875,7 @@
 {
 	struct rb_node *node = private->mem_rb.rb_node;
 
-	if (!kgsl_mmu_gpuaddr_in_range(gpuaddr))
+	if (!kgsl_mmu_gpuaddr_in_range(private->pagetable, gpuaddr))
 		return NULL;
 
 	while (node != NULL) {
@@ -798,6 +908,77 @@
 	return kgsl_sharedmem_find_region(private, gpuaddr, 1);
 }
 
+/**
+ * kgsl_sharedmem_region_empty - Check if an address region is empty
+ *
+ * @private: private data for the process to check.
+ * @gpuaddr: start address of the region
+ * @size: length of the region.
+ *
+ * Checks that there are no existing allocations within an address
+ * region. Note that unlike other kgsl_sharedmem* search functions,
+ * this one manages locking on its own.
+ */
+int
+kgsl_sharedmem_region_empty(struct kgsl_process_private *private,
+	unsigned int gpuaddr, size_t size)
+{
+	int result = 1;
+	unsigned int gpuaddr_end = gpuaddr + size;
+
+	struct rb_node *node = private->mem_rb.rb_node;
+
+	if (!kgsl_mmu_gpuaddr_in_range(private->pagetable, gpuaddr))
+		return 0;
+
+	/* don't overflow */
+	if (gpuaddr_end < gpuaddr)
+		return 0;
+
+	spin_lock(&private->mem_lock);
+	node = private->mem_rb.rb_node;
+	while (node != NULL) {
+		struct kgsl_mem_entry *entry;
+		unsigned int memdesc_start, memdesc_end;
+
+		entry = rb_entry(node, struct kgsl_mem_entry, node);
+
+		memdesc_start = entry->memdesc.gpuaddr;
+		memdesc_end = memdesc_start
+				+ kgsl_memdesc_mmapsize(&entry->memdesc);
+
+		if (gpuaddr_end <= memdesc_start)
+			node = node->rb_left;
+		else if (memdesc_end <= gpuaddr)
+			node = node->rb_right;
+		else {
+			result = 0;
+			break;
+		}
+	}
+	spin_unlock(&private->mem_lock);
+	return result;
+}
+
+/**
+ * kgsl_sharedmem_find_id - find a memory entry by id
+ * @process: the owning process
+ * @id: id to find
+ *
+ * @returns - the mem_entry or NULL
+ */
+static inline struct kgsl_mem_entry *
+kgsl_sharedmem_find_id(struct kgsl_process_private *process, unsigned int id)
+{
+	struct kgsl_mem_entry *entry;
+
+	rcu_read_lock();
+	entry = idr_find(&process->mem_idr, id);
+	rcu_read_unlock();
+
+	return entry;
+}
+
 /*call all ioctl sub functions with driver locked*/
 static long kgsl_ioctl_device_getproperty(struct kgsl_device_private *dev_priv,
 					  unsigned int cmd, void *data)
@@ -934,11 +1115,8 @@
 	int result;
 
 	context = kgsl_find_context(dev_priv, param->context_id);
-	if (context == NULL) {
-		KGSL_DRV_ERR(dev_priv->device, "invalid context_id %d\n",
-			param->context_id);
+	if (context == NULL)
 		return -EINVAL;
-	}
 	/*
 	 * A reference count is needed here, because waittimestamp may
 	 * block with the device mutex unlocked and userspace could
@@ -955,6 +1133,7 @@
 				      unsigned int cmd, void *data)
 {
 	int result = 0;
+	int i = 0;
 	struct kgsl_ringbuffer_issueibcmds *param = data;
 	struct kgsl_ibdesc *ibdesc;
 	struct kgsl_context *context;
@@ -962,20 +1141,11 @@
 	context = kgsl_find_context(dev_priv, param->drawctxt_id);
 	if (context == NULL) {
 		result = -EINVAL;
-		KGSL_DRV_ERR(dev_priv->device,
-			"invalid context_id %d\n",
-			param->drawctxt_id);
 		goto done;
 	}
 
 	if (param->flags & KGSL_CONTEXT_SUBMIT_IB_LIST) {
-		KGSL_DRV_INFO(dev_priv->device,
-			"Using IB list mode for ib submission, numibs: %d\n",
-			param->numibs);
 		if (!param->numibs) {
-			KGSL_DRV_ERR(dev_priv->device,
-				"Invalid numibs as parameter: %d\n",
-				 param->numibs);
 			result = -EINVAL;
 			goto done;
 		}
@@ -986,9 +1156,6 @@
 		 */
 
 		if (param->numibs > 10000) {
-			KGSL_DRV_ERR(dev_priv->device,
-				"Too many IBs submitted. count: %d max 10000\n",
-				param->numibs);
 			result = -EINVAL;
 			goto done;
 		}
@@ -1028,6 +1195,18 @@
 		param->numibs = 1;
 	}
 
+	for (i = 0; i < param->numibs; i++) {
+		struct kgsl_pagetable *pt = dev_priv->process_priv->pagetable;
+
+		if (!kgsl_mmu_gpuaddr_in_range(pt, ibdesc[i].gpuaddr)) {
+			result = -ERANGE;
+			KGSL_DRV_ERR(dev_priv->device,
+				     "invalid ib base GPU virtual addr %x\n",
+				     ibdesc[i].gpuaddr);
+			goto free_ibdesc;
+		}
+	}
+
 	result = dev_priv->device->ftbl->issueibcmds(dev_priv,
 					     context,
 					     ibdesc,
@@ -1035,8 +1214,6 @@
 					     &param->timestamp,
 					     param->flags);
 
-	trace_kgsl_issueibcmds(dev_priv->device, param, ibdesc, result);
-
 free_ibdesc:
 	kfree(ibdesc);
 done:
@@ -1075,11 +1252,8 @@
 	struct kgsl_context *context;
 
 	context = kgsl_find_context(dev_priv, param->context_id);
-	if (context == NULL) {
-		KGSL_DRV_ERR(dev_priv->device, "invalid context_id %d\n",
-			param->context_id);
+	if (context == NULL)
 		return -EINVAL;
-	}
 
 	return _cmdstream_readtimestamp(dev_priv, context,
 			param->type, &param->timestamp);
@@ -1089,9 +1263,6 @@
 	void *priv, u32 id, u32 timestamp)
 {
 	struct kgsl_mem_entry *entry = priv;
-	spin_lock(&entry->priv->mem_lock);
-	rb_erase(&entry->node, &entry->priv->mem_rb);
-	spin_unlock(&entry->priv->mem_lock);
 	trace_kgsl_mem_timestamp_free(device, entry, id, timestamp, 0);
 	kgsl_mem_entry_detach_process(entry);
 }
@@ -1144,11 +1315,8 @@
 	struct kgsl_context *context;
 
 	context = kgsl_find_context(dev_priv, param->context_id);
-	if (context == NULL) {
-		KGSL_DRV_ERR(dev_priv->device,
-			"invalid drawctxt context_id %d\n", param->context_id);
+	if (context == NULL)
 		return -EINVAL;
-	}
 
 	return _cmdstream_freememontimestamp(dev_priv, param->gpuaddr,
 			context, param->timestamp, param->type);
@@ -1163,22 +1331,22 @@
 
 	context = kgsl_create_context(dev_priv);
 
-	if (context == NULL) {
-		result = -ENOMEM;
+	if (IS_ERR(context)) {
+		result = PTR_ERR(context);
 		goto done;
 	}
 
 	if (dev_priv->device->ftbl->drawctxt_create) {
 		result = dev_priv->device->ftbl->drawctxt_create(
 			dev_priv->device, dev_priv->process_priv->pagetable,
-			context, param->flags);
+			context, &param->flags);
 		if (result)
 			goto done;
 	}
 	trace_kgsl_context_create(dev_priv->device, context, param->flags);
 	param->drawctxt_id = context->id;
 done:
-	if (result && context)
+	if (result && !IS_ERR(context))
 		kgsl_context_detach(context);
 
 	return result;
@@ -1206,27 +1374,52 @@
 static long kgsl_ioctl_sharedmem_free(struct kgsl_device_private *dev_priv,
 					unsigned int cmd, void *data)
 {
-	int result = 0;
 	struct kgsl_sharedmem_free *param = data;
 	struct kgsl_process_private *private = dev_priv->process_priv;
 	struct kgsl_mem_entry *entry = NULL;
 
 	spin_lock(&private->mem_lock);
 	entry = kgsl_sharedmem_find(private, param->gpuaddr);
-	if (entry)
-		rb_erase(&entry->node, &private->mem_rb);
-
 	spin_unlock(&private->mem_lock);
 
-	if (entry) {
-		trace_kgsl_mem_free(entry);
-		kgsl_mem_entry_detach_process(entry);
-	} else {
-		KGSL_CORE_ERR("invalid gpuaddr %08x\n", param->gpuaddr);
-		result = -EINVAL;
+	if (!entry) {
+		KGSL_MEM_INFO(dev_priv->device, "invalid gpuaddr %08x\n",
+				param->gpuaddr);
+		return -EINVAL;
 	}
+	trace_kgsl_mem_free(entry);
 
-	return result;
+	kgsl_memfree_hist_set_event(entry->priv->pid,
+				    entry->memdesc.gpuaddr,
+				    entry->memdesc.size,
+				    entry->memdesc.flags);
+
+	kgsl_mem_entry_detach_process(entry);
+	return 0;
+}
+
+static long kgsl_ioctl_gpumem_free_id(struct kgsl_device_private *dev_priv,
+					unsigned int cmd, void *data)
+{
+	struct kgsl_gpumem_free_id *param = data;
+	struct kgsl_process_private *private = dev_priv->process_priv;
+	struct kgsl_mem_entry *entry = NULL;
+
+	entry = kgsl_sharedmem_find_id(private, param->id);
+
+	if (!entry) {
+		KGSL_MEM_INFO(dev_priv->device, "invalid id %d\n", param->id);
+		return -EINVAL;
+	}
+	trace_kgsl_mem_free(entry);
+
+	kgsl_memfree_hist_set_event(entry->priv->pid,
+				    entry->memdesc.gpuaddr,
+				    entry->memdesc.size,
+				    entry->memdesc.flags);
+
+	kgsl_mem_entry_detach_process(entry);
+	return 0;
 }
 
 static struct vm_area_struct *kgsl_get_vma_from_start_addr(unsigned int addr)
@@ -1257,11 +1450,10 @@
 	dev_t rdev;
 	struct fb_info *info;
 
+	*start = 0;
+	*vstart = 0;
+	*len = 0;
 	*filep = NULL;
-#ifdef CONFIG_ANDROID_PMEM
-	if (!get_pmem_file(fd, start, vstart, len, filep))
-		return 0;
-#endif
 
 	fbfile = fget(fd);
 	if (fbfile == NULL) {
@@ -1302,10 +1494,8 @@
 
 	ret = -ERANGE;
 
-	if (phys == 0) {
-		KGSL_CORE_ERR("kgsl_get_phys_file returned phys=0\n");
+	if (phys == 0)
 		goto err;
-	}
 
 	/* Make sure the length of the region, the offset and the desired
 	 * size are all page aligned or bail
@@ -1313,19 +1503,13 @@
 	if ((len & ~PAGE_MASK) ||
 		(offset & ~PAGE_MASK) ||
 		(size & ~PAGE_MASK)) {
-		KGSL_CORE_ERR("length %lu, offset %u or size %u "
-				"is not page aligned\n",
-				len, offset, size);
+		KGSL_CORE_ERR("length, offset or size is not page aligned\n");
 		goto err;
 	}
 
 	/* The size or offset can never be greater than the PMEM length */
-	if (offset >= len || size > len) {
-		KGSL_CORE_ERR("offset %u or size %u "
-				"exceeds pmem length %lu\n",
-				offset, size, len);
+	if (offset >= len || size > len)
 		goto err;
-	}
 
 	/* If size is 0, then adjust it to default to the size of the region
 	 * minus the offset.  If size isn't zero, then make sure that it will
@@ -1343,6 +1527,8 @@
 	entry->memdesc.size = size;
 	entry->memdesc.physaddr = phys + offset;
 	entry->memdesc.hostptr = (void *) (virt + offset);
+	/* USE_CPU_MAP is not implemented for PMEM. */
+	entry->memdesc.flags &= ~KGSL_MEMFLAGS_USE_CPU_MAP;
 
 	ret = memdesc_sg_phys(&entry->memdesc, phys + offset, size);
 	if (ret)
@@ -1350,18 +1536,14 @@
 
 	return 0;
 err:
-#ifdef CONFIG_ANDROID_PMEM
-	put_pmem_file(filep);
-#endif
 	return ret;
 }
 
 static int memdesc_sg_virt(struct kgsl_memdesc *memdesc,
-	void *addr, int size)
+	unsigned long paddr, int size)
 {
 	int i;
 	int sglen = PAGE_ALIGN(size) / PAGE_SIZE;
-	unsigned long paddr = (unsigned long) addr;
 
 	memdesc->sg = kgsl_sg_alloc(sglen);
 
@@ -1412,34 +1594,33 @@
 	return -EINVAL;
 }
 
-static int kgsl_setup_hostptr(struct kgsl_mem_entry *entry,
+static int kgsl_setup_useraddr(struct kgsl_mem_entry *entry,
 			      struct kgsl_pagetable *pagetable,
-			      void *hostptr, unsigned int offset,
+			      unsigned long useraddr, unsigned int offset,
 			      size_t size)
 {
 	struct vm_area_struct *vma;
 	unsigned int len;
 
 	down_read(&current->mm->mmap_sem);
-	vma = find_vma(current->mm, (unsigned int) hostptr);
+	vma = find_vma(current->mm, useraddr);
 	up_read(&current->mm->mmap_sem);
 
 	if (!vma) {
-		KGSL_CORE_ERR("find_vma(%p) failed\n", hostptr);
+		KGSL_CORE_ERR("find_vma(%lx) failed\n", useraddr);
 		return -EINVAL;
 	}
 
 	/* We don't necessarily start at vma->vm_start */
-	len = vma->vm_end - (unsigned long) hostptr;
+	len = vma->vm_end - useraddr;
 
 	if (offset >= len)
 		return -EINVAL;
 
-	if (!KGSL_IS_PAGE_ALIGNED((unsigned long) hostptr) ||
+	if (!KGSL_IS_PAGE_ALIGNED(useraddr) ||
 	    !KGSL_IS_PAGE_ALIGNED(len)) {
-		KGSL_CORE_ERR("user address len(%u)"
-			      "and start(%p) must be page"
-			      "aligned\n", len, hostptr);
+		KGSL_CORE_ERR("bad alignment: start(%lx) len(%u)\n",
+			      useraddr, len);
 		return -EINVAL;
 	}
 
@@ -1460,28 +1641,29 @@
 
 	entry->memdesc.pagetable = pagetable;
 	entry->memdesc.size = size;
-	entry->memdesc.hostptr = hostptr + (offset & PAGE_MASK);
+	entry->memdesc.useraddr = useraddr + (offset & PAGE_MASK);
+	if (kgsl_memdesc_use_cpu_map(&entry->memdesc))
+		entry->memdesc.gpuaddr = entry->memdesc.useraddr;
 
-	return memdesc_sg_virt(&entry->memdesc,
-		hostptr + (offset & PAGE_MASK), size);
+	return memdesc_sg_virt(&entry->memdesc, entry->memdesc.useraddr,
+				size);
 }
 
 #ifdef CONFIG_ASHMEM
 static int kgsl_setup_ashmem(struct kgsl_mem_entry *entry,
 			     struct kgsl_pagetable *pagetable,
-			     int fd, void *hostptr, size_t size)
+			     int fd, unsigned long useraddr, size_t size)
 {
 	int ret;
 	struct vm_area_struct *vma;
 	struct file *filep, *vmfile;
 	unsigned long len;
-	unsigned int hostaddr = (unsigned int) hostptr;
 
-	vma = kgsl_get_vma_from_start_addr(hostaddr);
+	vma = kgsl_get_vma_from_start_addr(useraddr);
 	if (vma == NULL)
 		return -EINVAL;
 
-	if (vma->vm_pgoff || vma->vm_start != hostaddr) {
+	if (vma->vm_pgoff || vma->vm_start != useraddr) {
 		KGSL_CORE_ERR("Invalid vma region\n");
 		return -EINVAL;
 	}
@@ -1492,8 +1674,8 @@
 		size = len;
 
 	if (size != len) {
-		KGSL_CORE_ERR("Invalid size %d for vma region %p\n",
-			      size, hostptr);
+		KGSL_CORE_ERR("Invalid size %d for vma region %lx\n",
+			      size, useraddr);
 		return -EINVAL;
 	}
 
@@ -1513,9 +1695,11 @@
 	entry->priv_data = filep;
 	entry->memdesc.pagetable = pagetable;
 	entry->memdesc.size = ALIGN(size, PAGE_SIZE);
-	entry->memdesc.hostptr = hostptr;
+	entry->memdesc.useraddr = useraddr;
+	if (kgsl_memdesc_use_cpu_map(&entry->memdesc))
+		entry->memdesc.gpuaddr = entry->memdesc.useraddr;
 
-	ret = memdesc_sg_virt(&entry->memdesc, hostptr, size);
+	ret = memdesc_sg_virt(&entry->memdesc, useraddr, size);
 	if (ret)
 		goto err;
 
@@ -1528,18 +1712,23 @@
 #else
 static int kgsl_setup_ashmem(struct kgsl_mem_entry *entry,
 			     struct kgsl_pagetable *pagetable,
-			     int fd, void *hostptr, size_t size)
+			     int fd, unsigned long useraddr, size_t size)
 {
 	return -EINVAL;
 }
 #endif
 
 static int kgsl_setup_ion(struct kgsl_mem_entry *entry,
-		struct kgsl_pagetable *pagetable, int fd)
+		struct kgsl_pagetable *pagetable, void *data)
 {
 	struct ion_handle *handle;
 	struct scatterlist *s;
 	struct sg_table *sg_table;
+	struct kgsl_map_user_mem *param = data;
+	int fd = param->fd;
+
+	if (!param->len)
+		return -EINVAL;
 
 	if (IS_ERR_OR_NULL(kgsl_ion_client))
 		return -ENODEV;
@@ -1554,6 +1743,8 @@
 	entry->priv_data = handle;
 	entry->memdesc.pagetable = pagetable;
 	entry->memdesc.size = 0;
+	/* USE_CPU_MAP is not implemented for ION. */
+	entry->memdesc.flags &= ~KGSL_MEMFLAGS_USE_CPU_MAP;
 
 	sg_table = ion_sg_table(kgsl_ion_client, handle);
 
@@ -1571,6 +1762,8 @@
 		entry->memdesc.sglen++;
 	}
 
+	entry->memdesc.size = PAGE_ALIGN(entry->memdesc.size);
+
 	return 0;
 err:
 	ion_free(kgsl_ion_client, handle);
@@ -1596,7 +1789,20 @@
 	else
 		memtype = param->memtype;
 
+	/*
+	 * Mask off unknown flags from userspace. This way the caller can
+	 * check if a flag is supported by looking at the returned flags.
+	 * Note: CACHEMODE is ignored for this call. Caching should be
+	 * determined by type of allocation being mapped.
+	 */
+	param->flags &= KGSL_MEMFLAGS_GPUREADONLY
+			| KGSL_MEMTYPE_MASK
+			| KGSL_MEMALIGN_MASK
+			| KGSL_MEMFLAGS_USE_CPU_MAP;
+
 	entry->memdesc.flags = param->flags;
+	if (!kgsl_mmu_use_cpu_map(private->pagetable->mmu))
+		entry->memdesc.flags &= ~KGSL_MEMFLAGS_USE_CPU_MAP;
 
 	switch (memtype) {
 	case KGSL_USER_MEM_TYPE_PMEM:
@@ -1622,8 +1828,8 @@
 		if (param->hostptr == 0)
 			break;
 
-		result = kgsl_setup_hostptr(entry, private->pagetable,
-					    (void *) param->hostptr,
+		result = kgsl_setup_useraddr(entry, private->pagetable,
+					    param->hostptr,
 					    param->offset, param->len);
 		entry->memtype = KGSL_MEM_ENTRY_USER;
 		break;
@@ -1640,14 +1846,13 @@
 			break;
 
 		result = kgsl_setup_ashmem(entry, private->pagetable,
-					   param->fd, (void *) param->hostptr,
+					   param->fd, param->hostptr,
 					   param->len);
 
 		entry->memtype = KGSL_MEM_ENTRY_ASHMEM;
 		break;
 	case KGSL_USER_MEM_TYPE_ION:
-		result = kgsl_setup_ion(entry, private->pagetable,
-			param->fd);
+		result = kgsl_setup_ion(entry, private->pagetable, data);
 		break;
 	default:
 		KGSL_CORE_ERR("Invalid memory type: %x\n", memtype);
@@ -1662,27 +1867,31 @@
 	else if (entry->memdesc.size >= SZ_64K)
 		kgsl_memdesc_set_align(&entry->memdesc, ilog2(SZ_64));
 
-	result = kgsl_mmu_map(private->pagetable,
-			      &entry->memdesc,
-			      GSL_PT_PAGE_RV | GSL_PT_PAGE_WV);
-
+	result = kgsl_mmu_map(private->pagetable, &entry->memdesc);
 	if (result)
 		goto error_put_file_ptr;
 
 	/* Adjust the returned value for a non 4k aligned offset */
 	param->gpuaddr = entry->memdesc.gpuaddr + (param->offset & ~PAGE_MASK);
+	/* echo back flags */
+	param->flags = entry->memdesc.flags;
+
+	result = kgsl_mem_entry_attach_process(entry, private);
+	if (result)
+		goto error_unmap;
 
 	KGSL_STATS_ADD(param->len, kgsl_driver.stats.mapped,
 		kgsl_driver.stats.mapped_max);
 
 	kgsl_process_add_stats(private, entry->memtype, param->len);
 
-	kgsl_mem_entry_attach_process(entry, private);
 	trace_kgsl_mem_map(entry, param->fd);
 
 	kgsl_check_idle(dev_priv->device);
 	return result;
 
+error_unmap:
+	kgsl_mmu_unmap(private->pagetable, &entry->memdesc);
 error_put_file_ptr:
 	switch (entry->memtype) {
 	case KGSL_MEM_ENTRY_PMEM:
@@ -1702,33 +1911,136 @@
 	return result;
 }
 
-/*This function flushes a graphics memory allocation from CPU cache
- *when caching is enabled with MMU*/
+static int _kgsl_gpumem_sync_cache(struct kgsl_mem_entry *entry, int op)
+{
+	int ret = 0;
+	int cacheop;
+	int mode;
+
+	/*
+	 * Flush is defined as (clean | invalidate).  If both bits are set, then
+	 * do a flush, otherwise check for the individual bits and clean or inv
+	 * as requested
+	 */
+
+	if ((op & KGSL_GPUMEM_CACHE_FLUSH) == KGSL_GPUMEM_CACHE_FLUSH)
+		cacheop = KGSL_CACHE_OP_FLUSH;
+	else if (op & KGSL_GPUMEM_CACHE_CLEAN)
+		cacheop = KGSL_CACHE_OP_CLEAN;
+	else if (op & KGSL_GPUMEM_CACHE_INV)
+		cacheop = KGSL_CACHE_OP_INV;
+	else {
+		ret = -EINVAL;
+		goto done;
+	}
+
+	mode = kgsl_memdesc_get_cachemode(&entry->memdesc);
+	if (mode != KGSL_CACHEMODE_UNCACHED
+		&& mode != KGSL_CACHEMODE_WRITECOMBINE) {
+		trace_kgsl_mem_sync_cache(entry, op);
+		kgsl_cache_range_op(&entry->memdesc, cacheop);
+	}
+
+done:
+	return ret;
+}
+
+/* New cache sync function - supports both directions (clean and invalidate) */
+
+static long
+kgsl_ioctl_gpumem_sync_cache(struct kgsl_device_private *dev_priv,
+	unsigned int cmd, void *data)
+{
+	struct kgsl_gpumem_sync_cache *param = data;
+	struct kgsl_process_private *private = dev_priv->process_priv;
+	struct kgsl_mem_entry *entry = NULL;
+
+	if (param->id != 0) {
+		entry = kgsl_sharedmem_find_id(private, param->id);
+		if (entry == NULL) {
+			KGSL_MEM_INFO(dev_priv->device, "can't find id %d\n",
+					param->id);
+			return -EINVAL;
+		}
+	} else if (param->gpuaddr != 0) {
+		spin_lock(&private->mem_lock);
+		entry = kgsl_sharedmem_find(private, param->gpuaddr);
+		spin_unlock(&private->mem_lock);
+		if (entry == NULL) {
+			KGSL_MEM_INFO(dev_priv->device,
+					"can't find gpuaddr %x\n",
+					param->gpuaddr);
+			return -EINVAL;
+		}
+	} else {
+		return -EINVAL;
+	}
+
+	return _kgsl_gpumem_sync_cache(entry, param->op);
+}
+
+/* Legacy cache function, does a flush (clean + invalidate) */
+
 static long
 kgsl_ioctl_sharedmem_flush_cache(struct kgsl_device_private *dev_priv,
 				 unsigned int cmd, void *data)
 {
-	int result = 0;
-	struct kgsl_mem_entry *entry;
 	struct kgsl_sharedmem_free *param = data;
 	struct kgsl_process_private *private = dev_priv->process_priv;
+	struct kgsl_mem_entry *entry = NULL;
 
 	spin_lock(&private->mem_lock);
 	entry = kgsl_sharedmem_find(private, param->gpuaddr);
-	if (!entry) {
-		KGSL_CORE_ERR("invalid gpuaddr %08x\n", param->gpuaddr);
-		result = -EINVAL;
-		goto done;
-	}
-	if (!entry->memdesc.hostptr) {
-		KGSL_CORE_ERR("invalid hostptr with gpuaddr %08x\n",
-			param->gpuaddr);
-			goto done;
+	spin_unlock(&private->mem_lock);
+	if (entry == NULL) {
+		KGSL_MEM_INFO(dev_priv->device,
+				"can't find gpuaddr %x\n",
+				param->gpuaddr);
+		return -EINVAL;
 	}
 
-	kgsl_cache_range_op(&entry->memdesc, KGSL_CACHE_OP_CLEAN);
-done:
-	spin_unlock(&private->mem_lock);
+	return _kgsl_gpumem_sync_cache(entry, KGSL_GPUMEM_CACHE_FLUSH);
+}
+
+/*
+ * The common parts of kgsl_ioctl_gpumem_alloc and kgsl_ioctl_gpumem_alloc_id.
+ */
+int
+_gpumem_alloc(struct kgsl_device_private *dev_priv,
+		struct kgsl_mem_entry **ret_entry,
+		unsigned int size, unsigned int flags)
+{
+	int result;
+	struct kgsl_process_private *private = dev_priv->process_priv;
+	struct kgsl_mem_entry *entry;
+
+	/*
+	 * Mask off unknown flags from userspace. This way the caller can
+	 * check if a flag is supported by looking at the returned flags.
+	 */
+	flags &= KGSL_MEMFLAGS_GPUREADONLY
+		| KGSL_CACHEMODE_MASK
+		| KGSL_MEMTYPE_MASK
+		| KGSL_MEMALIGN_MASK
+		| KGSL_MEMFLAGS_USE_CPU_MAP;
+
+	entry = kgsl_mem_entry_create();
+	if (entry == NULL)
+		return -ENOMEM;
+
+	result = kgsl_allocate_user(&entry->memdesc, private->pagetable, size,
+				    flags);
+	if (result != 0)
+		goto err;
+
+	entry->memtype = KGSL_MEM_ENTRY_KERNEL;
+
+	kgsl_check_idle(dev_priv->device);
+	*ret_entry = entry;
+	return result;
+err:
+	kfree(entry);
+	*ret_entry = NULL;
 	return result;
 }
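The cache helper added above treats a flush as clean plus invalidate: when both bits are set it flushes, otherwise it performs whichever single operation was requested, and buffers that are uncached or write-combined skip CPU cache maintenance altogether. The decode step in isolation; the constants below are hypothetical mirrors of the KGSL_GPUMEM_CACHE_* and KGSL_CACHE_OP_* names, not the real uapi values:

#include <errno.h>

#define SKETCH_CACHE_CLEAN	(1U << 0)
#define SKETCH_CACHE_INV	(1U << 1)
#define SKETCH_CACHE_FLUSH	(SKETCH_CACHE_CLEAN | SKETCH_CACHE_INV)

enum sketch_cacheop { SKETCH_OP_CLEAN, SKETCH_OP_INV, SKETCH_OP_FLUSH };

/* Map the ioctl op bits onto a single cache maintenance operation. */
static int decode_cache_op(unsigned int op, enum sketch_cacheop *out)
{
	if ((op & SKETCH_CACHE_FLUSH) == SKETCH_CACHE_FLUSH)
		*out = SKETCH_OP_FLUSH;		/* clean + invalidate */
	else if (op & SKETCH_CACHE_CLEAN)
		*out = SKETCH_OP_CLEAN;
	else if (op & SKETCH_CACHE_INV)
		*out = SKETCH_OP_INV;
	else
		return -EINVAL;			/* no recognised bits set */
	return 0;
}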
 
@@ -1738,29 +2050,115 @@
 {
 	struct kgsl_process_private *private = dev_priv->process_priv;
 	struct kgsl_gpumem_alloc *param = data;
-	struct kgsl_mem_entry *entry;
+	struct kgsl_mem_entry *entry = NULL;
 	int result;
 
-	entry = kgsl_mem_entry_create();
-	if (entry == NULL)
-		return -ENOMEM;
+	param->flags &= ~KGSL_MEMFLAGS_USE_CPU_MAP;
+	result = _gpumem_alloc(dev_priv, &entry, param->size, param->flags);
+	if (result)
+		return result;
 
-	result = kgsl_allocate_user(&entry->memdesc, private->pagetable,
-		param->size, param->flags);
+	result = kgsl_mmu_map(private->pagetable, &entry->memdesc);
+	if (result)
+		goto err;
 
-	if (result == 0) {
-		entry->memtype = KGSL_MEM_ENTRY_KERNEL;
-		kgsl_mem_entry_attach_process(entry, private);
-		param->gpuaddr = entry->memdesc.gpuaddr;
+	result = kgsl_mem_entry_attach_process(entry, private);
+	if (result != 0)
+		goto err;
 
-		kgsl_process_add_stats(private, entry->memtype, param->size);
-		trace_kgsl_mem_alloc(entry);
-	} else
-		kfree(entry);
+	kgsl_process_add_stats(private, entry->memtype, param->size);
+	trace_kgsl_mem_alloc(entry);
 
-	kgsl_check_idle(dev_priv->device);
+	param->gpuaddr = entry->memdesc.gpuaddr;
+	param->size = entry->memdesc.size;
+	param->flags = entry->memdesc.flags;
+	return result;
+err:
+	kgsl_sharedmem_free(&entry->memdesc);
+	kfree(entry);
 	return result;
 }
+
+static long
+kgsl_ioctl_gpumem_alloc_id(struct kgsl_device_private *dev_priv,
+			unsigned int cmd, void *data)
+{
+	struct kgsl_process_private *private = dev_priv->process_priv;
+	struct kgsl_gpumem_alloc_id *param = data;
+	struct kgsl_mem_entry *entry = NULL;
+	int result;
+
+	if (!kgsl_mmu_use_cpu_map(private->pagetable->mmu))
+		param->flags &= ~KGSL_MEMFLAGS_USE_CPU_MAP;
+
+	result = _gpumem_alloc(dev_priv, &entry, param->size, param->flags);
+	if (result != 0)
+		goto err;
+
+	if (!kgsl_memdesc_use_cpu_map(&entry->memdesc)) {
+		result = kgsl_mmu_map(private->pagetable, &entry->memdesc);
+		if (result)
+			goto err;
+	}
+
+	result = kgsl_mem_entry_attach_process(entry, private);
+	if (result != 0)
+		goto err;
+
+	kgsl_process_add_stats(private, entry->memtype, param->size);
+	trace_kgsl_mem_alloc(entry);
+
+	param->id = entry->id;
+	param->flags = entry->memdesc.flags;
+	param->size = entry->memdesc.size;
+	param->mmapsize = kgsl_memdesc_mmapsize(&entry->memdesc);
+	param->gpuaddr = entry->memdesc.gpuaddr;
+	return result;
+err:
+	if (entry)
+		kgsl_sharedmem_free(&entry->memdesc);
+	kfree(entry);
+	return result;
+}
+
+static long
+kgsl_ioctl_gpumem_get_info(struct kgsl_device_private *dev_priv,
+			unsigned int cmd, void *data)
+{
+	struct kgsl_process_private *private = dev_priv->process_priv;
+	struct kgsl_gpumem_get_info *param = data;
+	struct kgsl_mem_entry *entry = NULL;
+	int result = 0;
+
+	if (param->id != 0) {
+		entry = kgsl_sharedmem_find_id(private, param->id);
+		if (entry == NULL) {
+			KGSL_MEM_INFO(dev_priv->device, "can't find id %d\n",
+					param->id);
+			return -EINVAL;
+		}
+	} else if (param->gpuaddr != 0) {
+		spin_lock(&private->mem_lock);
+		entry = kgsl_sharedmem_find(private, param->gpuaddr);
+		spin_unlock(&private->mem_lock);
+		if (entry == NULL) {
+			KGSL_MEM_INFO(dev_priv->device,
+					"can't find gpuaddr %lx\n",
+					param->gpuaddr);
+			return -EINVAL;
+		}
+	} else {
+		return -EINVAL;
+	}
+	param->gpuaddr = entry->memdesc.gpuaddr;
+	param->id = entry->id;
+	param->flags = entry->memdesc.flags;
+	param->size = entry->memdesc.size;
+	param->mmapsize = kgsl_memdesc_mmapsize(&entry->memdesc);
+	param->useraddr = entry->memdesc.useraddr;
+	return result;
+}
+
 static long kgsl_ioctl_cff_syncmem(struct kgsl_device_private *dev_priv,
 					unsigned int cmd, void *data)
 {
@@ -1924,7 +2322,7 @@
 static const struct {
 	unsigned int cmd;
 	kgsl_ioctl_func_t func;
-	int flags;
+	unsigned int flags;
 } kgsl_ioctl_funcs[] = {
 	KGSL_IOCTL_FUNC(IOCTL_KGSL_DEVICE_GETPROPERTY,
 			kgsl_ioctl_device_getproperty,
@@ -1975,7 +2373,15 @@
 			KGSL_IOCTL_LOCK),
 	KGSL_IOCTL_FUNC(IOCTL_KGSL_SETPROPERTY,
 			kgsl_ioctl_device_setproperty,
-			KGSL_IOCTL_LOCK | KGSL_IOCTL_WAKE)
+			KGSL_IOCTL_LOCK | KGSL_IOCTL_WAKE),
+	KGSL_IOCTL_FUNC(IOCTL_KGSL_GPUMEM_ALLOC_ID,
+			kgsl_ioctl_gpumem_alloc_id, 0),
+	KGSL_IOCTL_FUNC(IOCTL_KGSL_GPUMEM_FREE_ID,
+			kgsl_ioctl_gpumem_free_id, 0),
+	KGSL_IOCTL_FUNC(IOCTL_KGSL_GPUMEM_GET_INFO,
+			kgsl_ioctl_gpumem_get_info, 0),
+	KGSL_IOCTL_FUNC(IOCTL_KGSL_GPUMEM_SYNC_CACHE,
+			kgsl_ioctl_gpumem_sync_cache, 0),
 };
 
 static long kgsl_ioctl(struct file *filep, unsigned int cmd, unsigned long arg)
@@ -2067,7 +2473,11 @@
 		mutex_unlock(&dev_priv->device->mutex);
 	}
 
-	if (ret == 0 && (cmd & IOC_OUT)) {
+	/*
+	 * Still copy back on failure, but assume the function took
+	 * all necessary precautions when sanitizing the return values.
+	 */
+	if (cmd & IOC_OUT) {
 		if (copy_to_user((void __user *) arg, uptr, _IOC_SIZE(cmd)))
 			ret = -EFAULT;
 	}
@@ -2135,6 +2545,8 @@
 kgsl_gpumem_vm_close(struct vm_area_struct *vma)
 {
 	struct kgsl_mem_entry *entry  = vma->vm_private_data;
+
+	entry->memdesc.useraddr = 0;
 	kgsl_mem_entry_put(entry);
 }
 
@@ -2144,8 +2556,145 @@
 	.close = kgsl_gpumem_vm_close,
 };
 
+static int
+get_mmap_entry(struct kgsl_process_private *private,
+		struct kgsl_mem_entry **out_entry, unsigned long pgoff,
+		unsigned long len)
+{
+	int ret = -EINVAL;
+	struct kgsl_mem_entry *entry;
+
+	entry = kgsl_sharedmem_find_id(private, pgoff);
+	if (entry == NULL) {
+		spin_lock(&private->mem_lock);
+		entry = kgsl_sharedmem_find(private, pgoff << PAGE_SHIFT);
+		spin_unlock(&private->mem_lock);
+	}
+
+	if (!entry)
+		return -EINVAL;
+
+	kgsl_mem_entry_get(entry);
+
+	if (!entry->memdesc.ops ||
+		!entry->memdesc.ops->vmflags ||
+		!entry->memdesc.ops->vmfault) {
+		ret = -EINVAL;
+		goto err_put;
+	}
+
+	if (entry->memdesc.useraddr != 0) {
+		ret = -EBUSY;
+		goto err_put;
+	}
+
+	if (len != kgsl_memdesc_mmapsize(&entry->memdesc)) {
+		ret = -ERANGE;
+		goto err_put;
+	}
+
+	*out_entry = entry;
+	return 0;
+err_put:
+	kgsl_mem_entry_put(entry);
+	return ret;
+}
+
+static unsigned long
+kgsl_get_unmapped_area(struct file *file, unsigned long addr,
+			unsigned long len, unsigned long pgoff,
+			unsigned long flags)
+{
+	unsigned long ret = 0;
+	unsigned long vma_offset = pgoff << PAGE_SHIFT;
+	struct kgsl_device_private *dev_priv = file->private_data;
+	struct kgsl_process_private *private = dev_priv->process_priv;
+	struct kgsl_device *device = dev_priv->device;
+	struct kgsl_mem_entry *entry = NULL;
+	unsigned int align;
+	unsigned int retry = 0;
+
+	if (vma_offset == device->memstore.gpuaddr)
+		return get_unmapped_area(NULL, addr, len, pgoff, flags);
+
+	ret = get_mmap_entry(private, &entry, pgoff, len);
+	if (ret)
+		return ret;
+
+	if (!kgsl_memdesc_use_cpu_map(&entry->memdesc) || (flags & MAP_FIXED)) {
+		/*
+		 * If we're not going to use the same mapping on the gpu,
+		 * any address is fine.
+		 * For MAP_FIXED, hopefully the caller knows what they're doing,
+		 * but we may fail in mmap() if there is already something
+		 * at the virtual address chosen.
+		 */
+		ret = get_unmapped_area(NULL, addr, len, pgoff, flags);
+		goto put;
+	}
+	if (entry->memdesc.gpuaddr != 0) {
+		KGSL_MEM_INFO(device,
+				"pgoff %lx already mapped to gpuaddr %x\n",
+				pgoff, entry->memdesc.gpuaddr);
+		ret = -EBUSY;
+		goto put;
+	}
+
+	align = kgsl_memdesc_get_align(&entry->memdesc);
+	if (align >= ilog2(SZ_1M))
+		align = ilog2(SZ_1M);
+	else if (align >= ilog2(SZ_64K))
+		align = ilog2(SZ_64K);
+	else if (align <= PAGE_SHIFT)
+		align = 0;
+
+	if (align)
+		len += 1 << align;
+	do {
+		ret = get_unmapped_area(NULL, addr, len, pgoff, flags);
+		if (IS_ERR_VALUE(ret))
+			break;
+		if (align)
+			ret = ALIGN(ret, (1 << align));
+
+		/* make sure there isn't a GPU-only mapping at this address */
+		if (kgsl_sharedmem_region_empty(private, ret, len))
+			break;
+
+		trace_kgsl_mem_unmapped_area_collision(entry, addr, len, ret);
+
+		/*
+		 * If we collided, bump the hint address so that
+		 * get_unmapped_area knows to look somewhere else.
+		 */
+		addr = (addr == 0) ? ret + len : addr + len;
+
+		/*
+		 * The addr hint can be set by userspace to be near
+		 * the end of the address space. Make sure we search
+		 * the whole address space at least once by wrapping
+		 * back around once.
+		 */
+		if (!retry && (addr + len >= TASK_SIZE)) {
+			addr = 0;
+			retry = 1;
+		} else {
+			ret = -EBUSY;
+		}
+	} while (addr + len < TASK_SIZE);
+
+	if (IS_ERR_VALUE(ret))
+		KGSL_MEM_INFO(device,
+				"pid %d pgoff %lx len %ld failed error %ld\n",
+				private->pid, pgoff, len, ret);
+put:
+	kgsl_mem_entry_put(entry);
+	return ret;
+}
+
 static int kgsl_mmap(struct file *file, struct vm_area_struct *vma)
 {
+	unsigned int ret, cache;
 	unsigned long vma_offset = vma->vm_pgoff << PAGE_SHIFT;
 	struct kgsl_device_private *dev_priv = file->private_data;
 	struct kgsl_process_private *private = dev_priv->process_priv;
@@ -2157,31 +2706,76 @@
 	if (vma_offset == device->memstore.gpuaddr)
 		return kgsl_mmap_memstore(device, vma);
 
-	/* Find a chunk of GPU memory */
+	ret = get_mmap_entry(private, &entry, vma->vm_pgoff,
+				vma->vm_end - vma->vm_start);
+	if (ret)
+		return ret;
 
-	spin_lock(&private->mem_lock);
-	entry = kgsl_sharedmem_find(private, vma_offset);
+	if (kgsl_memdesc_use_cpu_map(&entry->memdesc)) {
+		entry->memdesc.gpuaddr = vma->vm_start;
 
-	if (entry)
-		kgsl_mem_entry_get(entry);
-
-	spin_unlock(&private->mem_lock);
-
-	if (entry == NULL)
-		return -EINVAL;
-
-	if (!entry->memdesc.ops ||
-		!entry->memdesc.ops->vmflags ||
-		!entry->memdesc.ops->vmfault)
-		return -EINVAL;
+		ret = kgsl_mmu_map(private->pagetable, &entry->memdesc);
+		if (ret) {
+			kgsl_mem_entry_put(entry);
+			return ret;
+		}
+		kgsl_mem_entry_track_gpuaddr(private, entry);
+	}
 
 	vma->vm_flags |= entry->memdesc.ops->vmflags(&entry->memdesc);
 
 	vma->vm_private_data = entry;
-	vma->vm_page_prot = pgprot_writecombine(vma->vm_page_prot);
+
+	/* Determine user-side caching policy */
+
+	cache = kgsl_memdesc_get_cachemode(&entry->memdesc);
+
+	switch (cache) {
+	case KGSL_CACHEMODE_UNCACHED:
+		vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
+		break;
+	case KGSL_CACHEMODE_WRITETHROUGH:
+		vma->vm_page_prot = pgprot_writethroughcache(vma->vm_page_prot);
+		break;
+	case KGSL_CACHEMODE_WRITEBACK:
+		vma->vm_page_prot = pgprot_writebackcache(vma->vm_page_prot);
+		break;
+	case KGSL_CACHEMODE_WRITECOMBINE:
+	default:
+		vma->vm_page_prot = pgprot_writecombine(vma->vm_page_prot);
+		break;
+	}
+
 	vma->vm_ops = &kgsl_gpumem_vm_ops;
+
+	if (cache == KGSL_CACHEMODE_WRITEBACK
+		|| cache == KGSL_CACHEMODE_WRITETHROUGH) {
+		struct scatterlist *s;
+		int i;
+		int sglen = entry->memdesc.sglen;
+		unsigned long addr = vma->vm_start;
+
+		/* don't map in the guard page, it should always fault */
+		if (kgsl_memdesc_has_guard_page(&entry->memdesc))
+			sglen--;
+
+		for_each_sg(entry->memdesc.sg, s, sglen, i) {
+			int j;
+			for (j = 0; j < (sg_dma_len(s) >> PAGE_SHIFT); j++) {
+				struct page *page = sg_page(s);
+				page = nth_page(page, j);
+				vm_insert_page(vma, addr, page);
+				addr += PAGE_SIZE;
+			}
+		}
+	}
+
 	vma->vm_file = file;
 
+	entry->memdesc.useraddr = vma->vm_start;
+
+	trace_kgsl_mem_mmap(entry);
+
 	return 0;
 }
 
@@ -2198,6 +2792,7 @@
 	.release = kgsl_release,
 	.open = kgsl_open,
 	.mmap = kgsl_mmap,
+	.get_unmapped_area = kgsl_get_unmapped_area,
 	.unlocked_ioctl = kgsl_ioctl,
 };
 
@@ -2205,6 +2800,8 @@
 	.process_mutex = __MUTEX_INITIALIZER(kgsl_driver.process_mutex),
 	.ptlock = __SPIN_LOCK_UNLOCKED(kgsl_driver.ptlock),
 	.devlock = __MUTEX_INITIALIZER(kgsl_driver.devlock),
+	.memfree_hist_mutex =
+		__MUTEX_INITIALIZER(kgsl_driver.memfree_hist_mutex),
 };
 EXPORT_SYMBOL(kgsl_driver);
 
@@ -2287,6 +2884,7 @@
 
 	kgsl_ion_client = msm_ion_client_create(UINT_MAX, KGSL_NAME);
 
+	/* Get starting physical address of device registers */
 	res = platform_get_resource_byname(pdev, IORESOURCE_MEM,
 					   device->iomemname);
 	if (res == NULL) {
@@ -2304,6 +2902,33 @@
 	device->reg_phys = res->start;
 	device->reg_len = resource_size(res);
 
+	/*
+	 * If a shadermemname is defined, get the shader memory details:
+	 * its starting physical address and length.
+	 */
+	if (device->shadermemname != NULL) {
+		res = platform_get_resource_byname(pdev, IORESOURCE_MEM,
+						device->shadermemname);
+
+		if (res == NULL) {
+			KGSL_DRV_ERR(device,
+			"Shader memory: platform_get_resource_byname failed\n");
+		} else {
+			device->shader_mem_phys = res->start;
+			device->shader_mem_len = resource_size(res);
+		}
+
+		if (!devm_request_mem_region(device->dev,
+					device->shader_mem_phys,
+					device->shader_mem_len,
+						device->name)) {
+			KGSL_DRV_ERR(device, "request_mem_region_failed\n");
+		}
+	}
+
 	if (!devm_request_mem_region(device->dev, device->reg_phys,
 				device->reg_len, device->name)) {
 		KGSL_DRV_ERR(device, "request_mem_region failed\n");
@@ -2349,7 +2974,6 @@
 	if (result)
 		goto error_pwrctrl_close;
 
-	kgsl_cffdump_open(device->id);
 
 	setup_timer(&device->idle_timer, kgsl_timer, (unsigned long) device);
 	status = kgsl_create_device_workqueue(device);
@@ -2371,7 +2995,8 @@
 		goto error_close_mmu;
 	}
 
-	pm_qos_add_request(&device->pm_qos_req_dma, PM_QOS_CPU_DMA_LATENCY,
+	pm_qos_add_request(&device->pwrctrl.pm_qos_req_dma,
+				PM_QOS_CPU_DMA_LATENCY,
 				PM_QOS_DEFAULT_VALUE);
 
 	/* Initialize the snapshot engine */
@@ -2421,11 +3046,15 @@
 	if (device->pm_dump_enable) {
 
 		KGSL_LOG_DUMP(device,
-				"POWER: FLAGS = %08lX | ACTIVE POWERLEVEL = %08X",
-				pwr->power_flags, pwr->active_pwrlevel);
+			"POWER: NAP ALLOWED = %d | START_STOP_SLEEP_WAKE = %d\n"
+			, pwr->nap_allowed, pwr->strtstp_sleepwake);
+
+		KGSL_LOG_DUMP(device,
+			"POWER: FLAGS = %08lX | ACTIVE POWERLEVEL = %08X",
+			pwr->power_flags, pwr->active_pwrlevel);
 
 		KGSL_LOG_DUMP(device, "POWER: INTERVAL TIMEOUT = %08X ",
-				pwr->interval_timeout);
+			pwr->interval_timeout);
 
 	}
 
@@ -2473,10 +3102,9 @@
 {
 	kgsl_device_snapshot_close(device);
 
-	kgsl_cffdump_close(device->id);
 	kgsl_pwrctrl_uninit_sysfs(device);
 
-	pm_qos_remove_request(&device->pm_qos_req_dma);
+	pm_qos_remove_request(&device->pwrctrl.pm_qos_req_dma);
 
 	idr_destroy(&device->context_idr);
 
@@ -2529,6 +3157,7 @@
 		kgsl_driver.class = NULL;
 	}
 
+	kgsl_memfree_hist_exit();
 	unregister_chrdev_region(kgsl_driver.major, KGSL_DEVICE_MAX);
 }
 
@@ -2600,6 +3229,9 @@
 			goto err;
 	}
 
+	if (kgsl_memfree_hist_init())
+		KGSL_CORE_ERR("failed to init memfree_hist");
+
 	return 0;
 
 err:
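On the kgsl_get_unmapped_area() logic introduced above: the requested alignment is clamped (at most 1MB, mid-range requests snapped to 64KB, page-size-or-smaller ignored), the search length is padded by one alignment unit, and the returned hint is rounded up; the search then retries, wrapping back to address zero once, whenever the aligned address would collide with an existing GPU-only mapping. The clamp-and-round part sketched on its own, with hypothetical shift constants standing in for ilog2(SZ_1M), ilog2(SZ_64K) and PAGE_SHIFT:

#include <stdint.h>

#define SKETCH_PAGE_SHIFT	12
#define SKETCH_SHIFT_64K	16
#define SKETCH_SHIFT_1M		20

/* Clamp a power-of-two alignment shift the way the patch does. */
static unsigned int clamp_align_shift(unsigned int align)
{
	if (align >= SKETCH_SHIFT_1M)
		return SKETCH_SHIFT_1M;
	if (align >= SKETCH_SHIFT_64K)
		return SKETCH_SHIFT_64K;
	if (align <= SKETCH_PAGE_SHIFT)
		return 0;
	return align;
}

/*
 * Round an address hint up to the alignment. The caller is expected to have
 * padded the searched length by (1 << align) beforehand, so the rounded
 * address still leaves room for the full mapping inside the found area.
 */
static uintptr_t round_up_to_shift(uintptr_t addr, unsigned int align)
{
	uintptr_t mask = ((uintptr_t)1 << align) - 1;

	return (addr + mask) & ~mask;
}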
diff --git a/drivers/gpu/msm/kgsl.h b/drivers/gpu/msm/kgsl.h
index 3935164..c568db5 100644
--- a/drivers/gpu/msm/kgsl.h
+++ b/drivers/gpu/msm/kgsl.h
@@ -71,6 +71,23 @@
 #define KGSL_STATS_ADD(_size, _stat, _max) \
 	do { _stat += (_size); if (_stat > _max) _max = _stat; } while (0)
 
+
+#define KGSL_MEMFREE_HIST_SIZE	((int)(PAGE_SIZE * 2))
+
+struct kgsl_memfree_hist_elem {
+	unsigned int pid;
+	unsigned int gpuaddr;
+	unsigned int size;
+	unsigned int flags;
+};
+
+struct kgsl_memfree_hist {
+	void *base_hist_rb;
+	unsigned int size;
+	struct kgsl_memfree_hist_elem *wptr;
+};
+
+
 struct kgsl_device;
 struct kgsl_context;
 
@@ -99,6 +116,9 @@
 
 	void *ptpool;
 
+	struct mutex memfree_hist_mutex;
+	struct kgsl_memfree_hist memfree_hist;
+
 	struct {
 		unsigned int vmalloc;
 		unsigned int vmalloc_max;
@@ -129,13 +149,16 @@
 #define KGSL_MEMDESC_GUARD_PAGE BIT(0)
 /* Set if the memdesc is mapped into all pagetables */
 #define KGSL_MEMDESC_GLOBAL BIT(1)
+/* The memdesc is frozen during a snapshot */
+#define KGSL_MEMDESC_FROZEN BIT(2)
 
 /* shared memory allocation */
 struct kgsl_memdesc {
 	struct kgsl_pagetable *pagetable;
-	void *hostptr;
+	void *hostptr; /* kernel virtual address */
+	unsigned long useraddr; /* userspace address */
 	unsigned int gpuaddr;
-	unsigned int physaddr;
+	phys_addr_t physaddr;
 	unsigned int size;
 	unsigned int priv; /* Internal flags and settings */
 	struct scatterlist *sg;
@@ -154,17 +177,13 @@
 #define KGSL_MEM_ENTRY_ION    4
 #define KGSL_MEM_ENTRY_MAX    5
 
-/* List of flags */
-
-#define KGSL_MEM_ENTRY_FROZEN (1 << 0)
-
 struct kgsl_mem_entry {
 	struct kref refcount;
 	struct kgsl_memdesc memdesc;
 	int memtype;
-	int flags;
 	void *priv_data;
 	struct rb_node node;
+	unsigned int id;
 	unsigned int context_id;
 	/* back pointer to private structure under whose context this
 	* allocation is made */
@@ -224,6 +243,10 @@
 static inline int kgsl_gpuaddr_in_memdesc(const struct kgsl_memdesc *memdesc,
 				unsigned int gpuaddr, unsigned int size)
 {
+	/* don't overflow */
+	if ((gpuaddr + size) < gpuaddr)
+		return 0;
+
 	if (gpuaddr >= memdesc->gpuaddr &&
 	    ((gpuaddr + size) <= (memdesc->gpuaddr + memdesc->size))) {
 		return 1;
diff --git a/drivers/gpu/msm/kgsl_cffdump.c b/drivers/gpu/msm/kgsl_cffdump.c
index e06c94d..6dc2ccc 100644
--- a/drivers/gpu/msm/kgsl_cffdump.c
+++ b/drivers/gpu/msm/kgsl_cffdump.c
@@ -28,6 +28,7 @@
 #include "kgsl_log.h"
 #include "kgsl_sharedmem.h"
 #include "adreno_pm4types.h"
+#include "adreno.h"
 
 static struct rchan	*chan;
 static struct dentry	*dir;
@@ -334,7 +335,7 @@
 		return;
 	}
 
-	kgsl_cff_dump_enable = 1;
+	kgsl_cff_dump_enable = 0;
 
 	spin_lock_init(&cffdump_lock);
 
@@ -356,10 +357,21 @@
 		debugfs_remove(dir);
 }
 
-void kgsl_cffdump_open(enum kgsl_deviceid device_id)
+void kgsl_cffdump_open(struct kgsl_device *device)
 {
-	kgsl_cffdump_memory_base(device_id, KGSL_PAGETABLE_BASE,
-			kgsl_mmu_get_ptsize(), SZ_256K);
+	struct adreno_device *adreno_dev = ADRENO_DEVICE(device);
+
+	if (KGSL_MMU_TYPE_IOMMU == kgsl_mmu_get_mmutype()) {
+		kgsl_cffdump_memory_base(device->id,
+			kgsl_mmu_get_base_addr(&device->mmu),
+			kgsl_mmu_get_ptsize(&device->mmu) +
+			KGSL_IOMMU_GLOBAL_MEM_SIZE, adreno_dev->gmem_size);
+	} else {
+		kgsl_cffdump_memory_base(device->id,
+			kgsl_mmu_get_base_addr(&device->mmu),
+			kgsl_mmu_get_ptsize(&device->mmu),
+			adreno_dev->gmem_size);
+	}
 }
 
 void kgsl_cffdump_memory_base(enum kgsl_deviceid device_id, unsigned int base,
@@ -387,7 +399,7 @@
 }
 
 void kgsl_cffdump_syncmem(struct kgsl_device_private *dev_priv,
-	const struct kgsl_memdesc *memdesc, uint gpuaddr, uint sizebytes,
+	struct kgsl_memdesc *memdesc, uint gpuaddr, uint sizebytes,
 	bool clean_cache)
 {
 	const void *src;
@@ -522,7 +534,7 @@
 }
 
 static struct dentry *create_buf_file_handler(const char *filename,
-	struct dentry *parent, int mode, struct rchan_buf *buf,
+	struct dentry *parent, unsigned short mode, struct rchan_buf *buf,
 	int *is_global)
 {
 	return debugfs_create_file(filename, mode, parent, buf,
diff --git a/drivers/gpu/msm/kgsl_cffdump.h b/drivers/gpu/msm/kgsl_cffdump.h
index 2733cc3..d5656f8 100644
--- a/drivers/gpu/msm/kgsl_cffdump.h
+++ b/drivers/gpu/msm/kgsl_cffdump.h
@@ -22,10 +22,10 @@
 
 void kgsl_cffdump_init(void);
 void kgsl_cffdump_destroy(void);
-void kgsl_cffdump_open(enum kgsl_deviceid device_id);
+void kgsl_cffdump_open(struct kgsl_device *device);
 void kgsl_cffdump_close(enum kgsl_deviceid device_id);
 void kgsl_cffdump_syncmem(struct kgsl_device_private *dev_priv,
-	const struct kgsl_memdesc *memdesc, uint physaddr, uint sizebytes,
+	struct kgsl_memdesc *memdesc, uint physaddr, uint sizebytes,
 	bool clean_cache);
 void kgsl_cffdump_setmem(uint addr, uint value, uint sizebytes);
 void kgsl_cffdump_regwrite(enum kgsl_deviceid device_id, uint addr,
@@ -49,7 +49,7 @@
 
 #define kgsl_cffdump_init()					(void)0
 #define kgsl_cffdump_destroy()					(void)0
-#define kgsl_cffdump_open(device_id)				(void)0
+#define kgsl_cffdump_open(device)				(void)0
 #define kgsl_cffdump_close(device_id)				(void)0
 #define kgsl_cffdump_syncmem(dev_priv, memdesc, addr, sizebytes, clean_cache) \
 	(void) 0
diff --git a/drivers/gpu/msm/kgsl_debugfs.c b/drivers/gpu/msm/kgsl_debugfs.c
index b41bd6b..9dfda32 100644
--- a/drivers/gpu/msm/kgsl_debugfs.c
+++ b/drivers/gpu/msm/kgsl_debugfs.c
@@ -1,4 +1,4 @@
-/* Copyright (c) 2002,2008-2012, The Linux Foundation. All rights reserved.
+/* Copyright (c) 2002,2008-2013, The Linux Foundation. All rights reserved.
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License version 2 and
@@ -125,6 +125,52 @@
 KGSL_DEBUGFS_LOG(pwr_log);
 KGSL_DEBUGFS_LOG(ft_log);
 
+static int memfree_hist_print(struct seq_file *s, void *unused)
+{
+	void *base = kgsl_driver.memfree_hist.base_hist_rb;
+
+	struct kgsl_memfree_hist_elem *wptr = kgsl_driver.memfree_hist.wptr;
+	struct kgsl_memfree_hist_elem *p;
+	char str[16];
+
+	seq_printf(s, "%8s %8s %8s %11s\n",
+			"pid", "gpuaddr", "size", "flags");
+
+	mutex_lock(&kgsl_driver.memfree_hist_mutex);
+	p = wptr;
+	for (;;) {
+		kgsl_get_memory_usage(str, sizeof(str), p->flags);
+		/*
+		 * If the ring buffer is not filled up yet,
+		 * all of its empty elements have size == 0;
+		 * just skip them.
+		 */
+		if (p->size)
+			seq_printf(s, "%8d %08x %8d %11s\n",
+				p->pid, p->gpuaddr, p->size, str);
+		p++;
+		if ((void *)p >= base + kgsl_driver.memfree_hist.size)
+			p = (struct kgsl_memfree_hist_elem *) base;
+
+		if (p == kgsl_driver.memfree_hist.wptr)
+			break;
+	}
+	mutex_unlock(&kgsl_driver.memfree_hist_mutex);
+	return 0;
+}
+
+static int memfree_hist_open(struct inode *inode, struct file *file)
+{
+	return single_open(file, memfree_hist_print, inode->i_private);
+}
+
+static const struct file_operations memfree_hist_fops = {
+	.open = memfree_hist_open,
+	.read = seq_read,
+	.llseek = seq_lseek,
+	.release = single_release,
+};
+
 void kgsl_device_debugfs_init(struct kgsl_device *device)
 {
 	if (kgsl_debugfs_dir && !IS_ERR(kgsl_debugfs_dir))
@@ -151,6 +197,8 @@
 				&mem_log_fops);
 	debugfs_create_file("log_level_pwr", 0644, device->d_debugfs, device,
 				&pwr_log_fops);
+	debugfs_create_file("memfree_history", 0444, device->d_debugfs, device,
+				&memfree_hist_fops);
 	debugfs_create_file("log_level_ft", 0644, device->d_debugfs, device,
 				&ft_log_fops);
 
@@ -198,35 +246,71 @@
 	return '-';
 }
 
+static char get_cacheflag(const struct kgsl_memdesc *m)
+{
+	static const char table[] = {
+		[KGSL_CACHEMODE_WRITECOMBINE] = '-',
+		[KGSL_CACHEMODE_UNCACHED] = 'u',
+		[KGSL_CACHEMODE_WRITEBACK] = 'b',
+		[KGSL_CACHEMODE_WRITETHROUGH] = 't',
+	};
+	return table[kgsl_memdesc_get_cachemode(m)];
+}
+
+static void print_mem_entry(struct seq_file *s, struct kgsl_mem_entry *entry)
+{
+	char flags[6];
+	char usage[16];
+	struct kgsl_memdesc *m = &entry->memdesc;
+
+	flags[0] = kgsl_memdesc_is_global(m) ?  'g' : '-';
+	flags[1] = m->flags & KGSL_MEMFLAGS_GPUREADONLY ? 'r' : '-';
+	flags[2] = get_alignflag(m);
+	flags[3] = get_cacheflag(m);
+	flags[4] = kgsl_memdesc_use_cpu_map(m) ? 'p' : '-';
+	flags[5] = '\0';
+
+	kgsl_get_memory_usage(usage, sizeof(usage), m->flags);
+
+	seq_printf(s, "%08x %08lx %8d %5d %5s %10s %16s %5d\n",
+			m->gpuaddr, m->useraddr, m->size, entry->id, flags,
+			memtype_str(entry->memtype), usage, m->sglen);
+}
+
 static int process_mem_print(struct seq_file *s, void *unused)
 {
 	struct kgsl_mem_entry *entry;
 	struct rb_node *node;
 	struct kgsl_process_private *private = s->private;
-	char flags[4];
-	char usage[16];
+	int next = 0;
 
+	seq_printf(s, "%8s %8s %8s %5s %5s %10s %16s %5s\n",
+		   "gpuaddr", "useraddr", "size", "id", "flags", "type",
+		   "usage", "sglen");
+
+	/* print all entries with a GPU address */
 	spin_lock(&private->mem_lock);
-	seq_printf(s, "%8s %8s %5s %10s %16s %5s\n",
-		   "gpuaddr", "size", "flags", "type", "usage", "sglen");
+
 	for (node = rb_first(&private->mem_rb); node; node = rb_next(node)) {
-		struct kgsl_memdesc *m;
-
 		entry = rb_entry(node, struct kgsl_mem_entry, node);
-		m = &entry->memdesc;
-
-		flags[0] = m->priv & KGSL_MEMDESC_GLOBAL ?  'g' : '-';
-		flags[1] = m->flags & KGSL_MEMFLAGS_GPUREADONLY ? 'r' : '-';
-		flags[2] = get_alignflag(m);
-		flags[3] = '\0';
-
-		kgsl_get_memory_usage(usage, sizeof(usage), m->flags);
-
-		seq_printf(s, "%08x %8d %5s %10s %16s %5d\n",
-			   m->gpuaddr, m->size, flags,
-			   memtype_str(entry->memtype), usage, m->sglen);
+		print_mem_entry(s, entry);
 	}
+
 	spin_unlock(&private->mem_lock);
+
+	/* now print all the unbound entries */
+	while (1) {
+		rcu_read_lock();
+		entry = idr_get_next(&private->mem_idr, &next);
+		rcu_read_unlock();
+
+		if (entry == NULL)
+			break;
+		if (entry->memdesc.gpuaddr == 0)
+			print_mem_entry(s, entry);
+		next++;
+	}
+
 	return 0;
 }
 
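(Aside: a standalone sketch, not part of the patch.) memfree_hist_print() above walks the ring buffer starting at the write pointer — the oldest slot once the buffer has wrapped — skips slots that were never written (size == 0), wraps at the end of the buffer and stops when it comes back to the write pointer. A minimal userspace model of that walk, with names local to this example:

#include <stdio.h>

struct elem { unsigned int pid, gpuaddr, size, flags; };

#define N 4
static struct elem hist[N] = {
	{ .pid = 10, .gpuaddr = 0x1000, .size = 4096 },
	{ .pid = 11, .gpuaddr = 0x3000, .size = 8192 },
	/* the last two slots were never written: size == 0 */
};
static struct elem *wptr = &hist[2];	/* next slot to be overwritten */

int main(void)
{
	struct elem *p = wptr;

	do {
		if (p->size)		/* skip empty (never written) slots */
			printf("pid %u freed %u bytes at %#x\n",
			       p->pid, p->size, p->gpuaddr);
		if (++p == hist + N)
			p = hist;	/* wrap at the end of the buffer */
	} while (p != wptr);		/* one full pass, oldest first */

	return 0;
}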
diff --git a/drivers/gpu/msm/kgsl_device.h b/drivers/gpu/msm/kgsl_device.h
index b215d8c..0d11660 100644
--- a/drivers/gpu/msm/kgsl_device.h
+++ b/drivers/gpu/msm/kgsl_device.h
@@ -1,4 +1,4 @@
-/* Copyright (c) 2002,2007-2012, The Linux Foundation. All rights reserved.
+/* Copyright (c) 2002,2007-2013, The Linux Foundation. All rights reserved.
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License version 2 and
@@ -73,7 +73,8 @@
 	int (*idle) (struct kgsl_device *device);
 	unsigned int (*isidle) (struct kgsl_device *device);
 	int (*suspend_context) (struct kgsl_device *device);
-	int (*start) (struct kgsl_device *device, unsigned int init_ram);
+	int (*init) (struct kgsl_device *device);
+	int (*start) (struct kgsl_device *device);
 	int (*stop) (struct kgsl_device *device);
 	int (*getproperty) (struct kgsl_device *device,
 		enum kgsl_property_type type, void *value,
@@ -105,7 +106,7 @@
 			uint32_t flags);
 	int (*drawctxt_create) (struct kgsl_device *device,
 		struct kgsl_pagetable *pagetable, struct kgsl_context *context,
-		uint32_t flags);
+		uint32_t *flags);
 	void (*drawctxt_destroy) (struct kgsl_device *device,
 		struct kgsl_context *context);
 	long (*ioctl) (struct kgsl_device_private *dev_priv,
@@ -145,11 +146,27 @@
 	unsigned int ver_minor;
 	uint32_t flags;
 	enum kgsl_deviceid id;
+
+	/* Starting physical address for GPU registers */
 	unsigned long reg_phys;
+
+	/* Starting Kernel virtual address for GPU registers */
 	void *reg_virt;
+
+	/* Total memory size for all GPU registers */
 	unsigned int reg_len;
+
+	/* Kernel virtual address for GPU shader memory */
+	void *shader_mem_virt;
+
+	/* Starting physical address for GPU shader memory */
+	unsigned long shader_mem_phys;
+
+	/* GPU shader memory size */
+	unsigned int shader_mem_len;
 	struct kgsl_memdesc memstore;
 	const char *iomemname;
+	const char *shadermemname;
 
 	struct kgsl_mh mh;
 	struct kgsl_mmu mmu;
@@ -160,7 +177,6 @@
 	struct kgsl_pwrctrl pwrctrl;
 	int open_count;
 
-	struct atomic_notifier_head ts_notifier_list;
 	struct mutex mutex;
 	uint32_t state;
 	uint32_t requested_state;
@@ -201,7 +217,6 @@
 	int pm_dump_enable;
 	struct kgsl_pwrscale pwrscale;
 	struct kobject pwrscale_kobj;
-	struct pm_qos_request pm_qos_req_dma;
 	struct work_struct ts_expired_ws;
 	struct list_head events;
 	struct list_head events_pending_list;
@@ -210,16 +225,16 @@
 	/* Postmortem Control switches */
 	int pm_regs_enabled;
 	int pm_ib_enabled;
+
+	int reset_counter; /* Track how many GPU core resets have occurred */
 };
 
 void kgsl_process_events(struct work_struct *work);
-void kgsl_check_fences(struct work_struct *work);
 
 #define KGSL_DEVICE_COMMON_INIT(_dev) \
 	.hwaccess_gate = COMPLETION_INITIALIZER((_dev).hwaccess_gate),\
 	.suspend_gate = COMPLETION_INITIALIZER((_dev).suspend_gate),\
 	.ft_gate = COMPLETION_INITIALIZER((_dev).ft_gate),\
-	.ts_notifier_list = ATOMIC_NOTIFIER_INIT((_dev).ts_notifier_list),\
 	.idle_check_ws = __WORK_INITIALIZER((_dev).idle_check_ws,\
 			kgsl_idle_check),\
 	.ts_expired_ws  = __WORK_INITIALIZER((_dev).ts_expired_ws,\
@@ -266,6 +281,7 @@
 	pid_t pid;
 	spinlock_t mem_lock;
 	struct rb_root mem_rb;
+	struct idr mem_idr;
 	struct kgsl_pagetable *pagetable;
 	struct list_head list;
 	struct kobject kobj;
@@ -391,12 +407,6 @@
 int kgsl_check_timestamp(struct kgsl_device *device,
 		struct kgsl_context *context, unsigned int timestamp);
 
-int kgsl_register_ts_notifier(struct kgsl_device *device,
-			      struct notifier_block *nb);
-
-int kgsl_unregister_ts_notifier(struct kgsl_device *device,
-				struct notifier_block *nb);
-
 int kgsl_device_platform_probe(struct kgsl_device *device);
 
 void kgsl_device_platform_remove(struct kgsl_device *device);
diff --git a/drivers/gpu/msm/kgsl_drm.c b/drivers/gpu/msm/kgsl_drm.c
index 2a5a5fa..11d6ffa 100644
--- a/drivers/gpu/msm/kgsl_drm.c
+++ b/drivers/gpu/msm/kgsl_drm.c
@@ -1,4 +1,4 @@
-/* Copyright (c) 2009-2012, The Linux Foundation. All rights reserved.
+/* Copyright (c) 2009-2013, The Linux Foundation. All rights reserved.
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License version 2 and
@@ -16,7 +16,8 @@
  */
 #include "drmP.h"
 #include "drm.h"
-#include <linux/android_pmem.h>
+
+#include <linux/msm_ion.h>
 
 #include "kgsl.h"
 #include "kgsl_device.h"
@@ -27,7 +28,7 @@
 #define DRIVER_AUTHOR           "Qualcomm"
 #define DRIVER_NAME             "kgsl"
 #define DRIVER_DESC             "KGSL DRM"
-#define DRIVER_DATE             "20100127"
+#define DRIVER_DATE             "20121107"
 
 #define DRIVER_MAJOR            2
 #define DRIVER_MINOR            1
@@ -106,6 +107,7 @@
 	uint32_t type;
 	struct kgsl_memdesc memdesc;
 	struct kgsl_pagetable *pagetable;
+	struct ion_handle *ion_handle;
 	uint64_t mmap_offset;
 	int bufcount;
 	int flags;
@@ -129,86 +131,18 @@
 	struct list_head wait_list;
 };
 
+static struct ion_client *kgsl_drm_ion_client;
+
 static int kgsl_drm_inited = DRM_KGSL_NOT_INITED;
 
 /* This is a global list of all the memory currently mapped in the MMU */
 static struct list_head kgsl_mem_list;
 
-static void kgsl_gem_mem_flush(struct kgsl_memdesc *memdesc, int type, int op)
-{
-	int cacheop = 0;
-
-	switch (op) {
-	case DRM_KGSL_GEM_CACHE_OP_TO_DEV:
-		if (type & (DRM_KGSL_GEM_CACHE_WBACK |
-			    DRM_KGSL_GEM_CACHE_WBACKWA))
-			cacheop = KGSL_CACHE_OP_CLEAN;
-
-		break;
-
-	case DRM_KGSL_GEM_CACHE_OP_FROM_DEV:
-		if (type & (DRM_KGSL_GEM_CACHE_WBACK |
-			    DRM_KGSL_GEM_CACHE_WBACKWA |
-			    DRM_KGSL_GEM_CACHE_WTHROUGH))
-			cacheop = KGSL_CACHE_OP_INV;
-	}
-
-	kgsl_cache_range_op(memdesc, cacheop);
-}
-
-/* TODO:
- * Add vsync wait */
-
-static int kgsl_drm_load(struct drm_device *dev, unsigned long flags)
-{
-	return 0;
-}
-
-static int kgsl_drm_unload(struct drm_device *dev)
-{
-	return 0;
-}
-
 struct kgsl_drm_device_priv {
 	struct kgsl_device *device[KGSL_DEVICE_MAX];
 	struct kgsl_device_private *devpriv[KGSL_DEVICE_MAX];
 };
 
-void kgsl_drm_preclose(struct drm_device *dev, struct drm_file *file_priv)
-{
-}
-
-static int kgsl_drm_suspend(struct drm_device *dev, pm_message_t state)
-{
-	return 0;
-}
-
-static int kgsl_drm_resume(struct drm_device *dev)
-{
-	return 0;
-}
-
-static void
-kgsl_gem_free_mmap_offset(struct drm_gem_object *obj)
-{
-	struct drm_device *dev = obj->dev;
-	struct drm_gem_mm *mm = dev->mm_private;
-	struct drm_kgsl_gem_object *priv = obj->driver_private;
-	struct drm_map_list *list;
-
-	list = &obj->map_list;
-	drm_ht_remove_item(&mm->offset_hash, &list->hash);
-	if (list->file_offset_node) {
-		drm_mm_put_block(list->file_offset_node);
-		list->file_offset_node = NULL;
-	}
-
-	kfree(list->map);
-	list->map = NULL;
-
-	priv->mmap_offset = 0;
-}
-
 static int
 kgsl_gem_memory_allocated(struct drm_gem_object *obj)
 {
@@ -220,6 +154,8 @@
 kgsl_gem_alloc_memory(struct drm_gem_object *obj)
 {
 	struct drm_kgsl_gem_object *priv = obj->driver_private;
+	struct sg_table *sg_table;
+	struct scatterlist *s;
 	int index;
 	int result = 0;
 
@@ -237,21 +173,52 @@
 		}
 	}
 
-	/* Set the flags for the memdesc (probably 0, unless it is cached) */
-	priv->memdesc.priv = 0;
-
 	if (TYPE_IS_PMEM(priv->type)) {
 		if (priv->type == DRM_KGSL_GEM_TYPE_EBI ||
 		    priv->type & DRM_KGSL_GEM_PMEM_EBI) {
-				result = kgsl_sharedmem_ebimem_user(
-						&priv->memdesc,
-						priv->pagetable,
-						obj->size * priv->bufcount);
-				if (result) {
-					DRM_ERROR(
-					"Unable to allocate PMEM memory\n");
-					return result;
-				}
+			priv->ion_handle = ion_alloc(kgsl_drm_ion_client,
+				obj->size * priv->bufcount, PAGE_SIZE,
+				ION_HEAP(ION_SF_HEAP_ID), 0);
+			if (IS_ERR_OR_NULL(priv->ion_handle)) {
+				DRM_ERROR(
+				"Unable to allocate ION Phys memory handle\n");
+				return -ENOMEM;
+			}
+
+			priv->memdesc.pagetable = priv->pagetable;
+
+			result = ion_phys(kgsl_drm_ion_client,
+				priv->ion_handle, (ion_phys_addr_t *)
+				&priv->memdesc.physaddr, &priv->memdesc.size);
+			if (result) {
+				DRM_ERROR(
+				"Unable to get ION Physical memory address\n");
+				ion_free(kgsl_drm_ion_client,
+					priv->ion_handle);
+				priv->ion_handle = NULL;
+				return result;
+			}
+
+			result = memdesc_sg_phys(&priv->memdesc,
+				priv->memdesc.physaddr, priv->memdesc.size);
+			if (result) {
+				DRM_ERROR(
+				"Unable to get sg list\n");
+				ion_free(kgsl_drm_ion_client,
+					priv->ion_handle);
+				priv->ion_handle = NULL;
+				return result;
+			}
+
+			result = kgsl_mmu_map(priv->pagetable, &priv->memdesc);
+			if (result) {
+				DRM_ERROR(
+				"kgsl_mmu_map failed.  result = %d\n", result);
+				ion_free(kgsl_drm_ion_client,
+					priv->ion_handle);
+				priv->ion_handle = NULL;
+				return result;
+			}
 		}
 		else
 			return -EINVAL;
@@ -262,15 +229,44 @@
 			priv->type & DRM_KGSL_GEM_CACHE_MASK)
 				list_add(&priv->list, &kgsl_mem_list);
 
-		result = kgsl_sharedmem_page_alloc_user(&priv->memdesc,
-					priv->pagetable,
-					obj->size * priv->bufcount);
+		priv->memdesc.pagetable = priv->pagetable;
 
-		if (result != 0) {
-				DRM_ERROR(
-				"Unable to allocate Vmalloc user memory\n");
-				return result;
+		priv->ion_handle = ion_alloc(kgsl_drm_ion_client,
+				obj->size * priv->bufcount, PAGE_SIZE,
+				ION_HEAP(ION_IOMMU_HEAP_ID), 0);
+		if (IS_ERR_OR_NULL(priv->ion_handle)) {
+			DRM_ERROR(
+				"Unable to allocate ION IOMMU memory handle\n");
+			return -ENOMEM;
 		}
+
+		sg_table = ion_sg_table(kgsl_drm_ion_client,
+				priv->ion_handle);
+		if (IS_ERR_OR_NULL(sg_table)) {
+			DRM_ERROR(
+			"Unable to get ION sg table\n");
+			goto memerr;
+		}
+
+		priv->memdesc.sg = sg_table->sgl;
+
+		/* Calculate the size of the memdesc from the sglist */
+
+		priv->memdesc.sglen = 0;
+
+		for (s = priv->memdesc.sg; s != NULL; s = sg_next(s)) {
+			priv->memdesc.size += s->length;
+			priv->memdesc.sglen++;
+		}
+
+		result = kgsl_mmu_map(priv->pagetable, &priv->memdesc,
+				GSL_PT_PAGE_RV | GSL_PT_PAGE_WV);
+		if (result) {
+			DRM_ERROR(
+			"kgsl_mmu_map failed.  result = %d\n", result);
+			goto memerr;
+		}
+
 	} else
 		return -EINVAL;
 
@@ -282,7 +278,15 @@
 	}
 	priv->flags |= DRM_KGSL_GEM_FLAG_MAPPED;
 
+
 	return 0;
+
+memerr:
+	ion_free(kgsl_drm_ion_client,
+		priv->ion_handle);
+	priv->ion_handle = NULL;
+	return -ENOMEM;
+
 }
 
 static void
@@ -293,10 +297,19 @@
 	if (!kgsl_gem_memory_allocated(obj) || TYPE_IS_FD(priv->type))
 		return;
 
-	kgsl_gem_mem_flush(&priv->memdesc,  priv->type,
-			   DRM_KGSL_GEM_CACHE_OP_FROM_DEV);
+	if (priv->memdesc.gpuaddr)
+		kgsl_mmu_unmap(priv->memdesc.pagetable, &priv->memdesc);
 
-	kgsl_sharedmem_free(&priv->memdesc);
+	/* ION will take care of freeing the sg table. */
+	priv->memdesc.sg = NULL;
+	priv->memdesc.sglen = 0;
+
+	if (priv->ion_handle)
+		ion_free(kgsl_drm_ion_client, priv->ion_handle);
+
+	priv->ion_handle = NULL;
+
+	memset(&priv->memdesc, 0, sizeof(priv->memdesc));
 
 	kgsl_mmu_putpagetable(priv->pagetable);
 	priv->pagetable = NULL;
@@ -329,66 +342,10 @@
 kgsl_gem_free_object(struct drm_gem_object *obj)
 {
 	kgsl_gem_free_memory(obj);
-	kgsl_gem_free_mmap_offset(obj);
 	drm_gem_object_release(obj);
 	kfree(obj->driver_private);
 }
 
-static int
-kgsl_gem_create_mmap_offset(struct drm_gem_object *obj)
-{
-	struct drm_device *dev = obj->dev;
-	struct drm_gem_mm *mm = dev->mm_private;
-	struct drm_kgsl_gem_object *priv = obj->driver_private;
-	struct drm_map_list *list;
-	int msize;
-
-	list = &obj->map_list;
-	list->map = kzalloc(sizeof(struct drm_map_list), GFP_KERNEL);
-	if (list->map == NULL) {
-		DRM_ERROR("Unable to allocate drm_map_list\n");
-		return -ENOMEM;
-	}
-
-	msize = obj->size * priv->bufcount;
-
-	list->map->type = _DRM_GEM;
-	list->map->size = msize;
-	list->map->handle = obj;
-
-	/* Allocate a mmap offset */
-	list->file_offset_node = drm_mm_search_free(&mm->offset_manager,
-						    msize / PAGE_SIZE,
-						    0, 0);
-
-	if (!list->file_offset_node) {
-		DRM_ERROR("Failed to allocate offset for %d\n", obj->name);
-		kfree(list->map);
-		return -ENOMEM;
-	}
-
-	list->file_offset_node = drm_mm_get_block(list->file_offset_node,
-						  msize / PAGE_SIZE, 0);
-
-	if (!list->file_offset_node) {
-		DRM_ERROR("Unable to create the file_offset_node\n");
-		kfree(list->map);
-		return -ENOMEM;
-	}
-
-	list->hash.key = list->file_offset_node->start;
-	if (drm_ht_insert_item(&mm->offset_hash, &list->hash)) {
-		DRM_ERROR("Failed to add to map hash\n");
-		drm_mm_put_block(list->file_offset_node);
-		kfree(list->map);
-		return -ENOMEM;
-	}
-
-	priv->mmap_offset = ((uint64_t) list->hash.key) << PAGE_SHIFT;
-
-	return 0;
-}
-
 int
 kgsl_gem_obj_addr(int drm_fd, int handle, unsigned long *start,
 			unsigned long *len)
@@ -435,9 +392,6 @@
 			priv->bufs[priv->active].offset;
 
 		*len = priv->memdesc.size;
-
-		kgsl_gem_mem_flush(&priv->memdesc,
-				   priv->type, DRM_KGSL_GEM_CACHE_OP_TO_DEV);
 	} else {
 		*start = 0;
 		*len = 0;
@@ -468,10 +422,7 @@
 	priv->active = 0;
 	priv->bound = 0;
 
-	/* To preserve backwards compatability, the default memory source
-	   is EBI */
-
-	priv->type = DRM_KGSL_GEM_TYPE_PMEM | DRM_KGSL_GEM_PMEM_EBI;
+	priv->type = DRM_KGSL_GEM_TYPE_KMEM;
 
 	ret = drm_gem_handle_create(file_priv, obj, handle);
 
@@ -513,8 +464,11 @@
 	}
 
 	ret = kgsl_gem_init_obj(dev, file_priv, obj, &handle);
-	if (ret)
+	if (ret) {
+		drm_gem_object_release(obj);
+		DRM_ERROR("Unable to initialize GEM object ret = %d\n", ret);
 		return ret;
+	}
 
 	create->handle = handle;
 	return 0;
@@ -587,6 +541,149 @@
 }
 
 int
+kgsl_gem_create_from_ion_ioctl(struct drm_device *dev, void *data,
+		      struct drm_file *file_priv)
+{
+	struct drm_kgsl_gem_create_from_ion *args = data;
+	struct drm_gem_object *obj;
+	struct ion_handle *ion_handle;
+	struct drm_kgsl_gem_object *priv;
+	struct sg_table *sg_table;
+	struct scatterlist *s;
+	int ret, handle;
+	unsigned long size;
+
+	ion_handle = ion_import_dma_buf(kgsl_drm_ion_client, args->ion_fd);
+	if (IS_ERR_OR_NULL(ion_handle)) {
+		DRM_ERROR("Unable to import dmabuf.  Error number = %d\n",
+			(int)PTR_ERR(ion_handle));
+		return -EINVAL;
+	}
+
+	ion_handle_get_size(kgsl_drm_ion_client, ion_handle, &size);
+
+	if (size == 0) {
+		ion_free(kgsl_drm_ion_client, ion_handle);
+		DRM_ERROR(
+		"cannot create GEM object from zero size ION buffer\n");
+		return -EINVAL;
+	}
+
+	obj = drm_gem_object_alloc(dev, size);
+
+	if (obj == NULL) {
+		ion_free(kgsl_drm_ion_client, ion_handle);
+		DRM_ERROR("Unable to allocate the GEM object\n");
+		return -ENOMEM;
+	}
+
+	ret = kgsl_gem_init_obj(dev, file_priv, obj, &handle);
+	if (ret) {
+		ion_free(kgsl_drm_ion_client, ion_handle);
+		drm_gem_object_release(obj);
+		DRM_ERROR("Unable to initialize GEM object ret = %d\n", ret);
+		return ret;
+	}
+
+	priv = obj->driver_private;
+	priv->ion_handle = ion_handle;
+
+	priv->type = DRM_KGSL_GEM_TYPE_KMEM;
+	list_add(&priv->list, &kgsl_mem_list);
+
+	priv->pagetable = kgsl_mmu_getpagetable(KGSL_MMU_GLOBAL_PT);
+
+	priv->memdesc.pagetable = priv->pagetable;
+
+	sg_table = ion_sg_table(kgsl_drm_ion_client,
+		priv->ion_handle);
+	if (IS_ERR_OR_NULL(priv->ion_handle)) {
+		DRM_ERROR("Unable to get ION sg table\n");
+		ion_free(kgsl_drm_ion_client,
+			priv->ion_handle);
+		priv->ion_handle = NULL;
+		kgsl_mmu_putpagetable(priv->pagetable);
+		drm_gem_object_release(obj);
+		kfree(priv);
+		return -ENOMEM;
+	}
+
+	priv->memdesc.sg = sg_table->sgl;
+
+	/* Calculate the size of the memdesc from the sglist */
+
+	priv->memdesc.sglen = 0;
+
+	for (s = priv->memdesc.sg; s != NULL; s = sg_next(s)) {
+		priv->memdesc.size += s->length;
+		priv->memdesc.sglen++;
+	}
+
+	ret = kgsl_mmu_map(priv->pagetable, &priv->memdesc,
+		GSL_PT_PAGE_RV | GSL_PT_PAGE_WV);
+	if (ret) {
+		DRM_ERROR("kgsl_mmu_map failed.  ret = %d\n", ret);
+		ion_free(kgsl_drm_ion_client,
+			priv->ion_handle);
+		priv->ion_handle = NULL;
+		kgsl_mmu_putpagetable(priv->pagetable);
+		drm_gem_object_release(obj);
+		kfree(priv);
+		return -ENOMEM;
+	}
+
+	priv->bufs[0].offset = 0;
+	priv->bufs[0].gpuaddr = priv->memdesc.gpuaddr;
+	priv->flags |= DRM_KGSL_GEM_FLAG_MAPPED;
+
+	args->handle = handle;
+	return 0;
+}
+
+int
+kgsl_gem_get_ion_fd_ioctl(struct drm_device *dev, void *data,
+			struct drm_file *file_priv)
+{
+	struct drm_kgsl_gem_get_ion_fd *args = data;
+	struct drm_gem_object *obj;
+	struct drm_kgsl_gem_object *priv;
+	int ret = 0;
+
+	obj = drm_gem_object_lookup(dev, file_priv, args->handle);
+
+	if (obj == NULL) {
+		DRM_ERROR("Invalid GEM handle %x\n", args->handle);
+		return -EBADF;
+	}
+
+	mutex_lock(&dev->struct_mutex);
+	priv = obj->driver_private;
+
+	if (TYPE_IS_FD(priv->type))
+		ret = -EINVAL;
+	else if (TYPE_IS_PMEM(priv->type) || TYPE_IS_MEM(priv->type)) {
+		if (priv->ion_handle) {
+			args->ion_fd = ion_share_dma_buf(
+				kgsl_drm_ion_client, priv->ion_handle);
+			if (args->ion_fd < 0) {
+				DRM_ERROR(
+				"Could not share ion buffer. Error = %d\n",
+					args->ion_fd);
+				ret = -EINVAL;
+			}
+		} else {
+			DRM_ERROR("GEM object has no ion memory allocated.\n");
+			ret = -EINVAL;
+		}
+	}
+
+	drm_gem_object_unreference(obj);
+	mutex_unlock(&dev->struct_mutex);
+
+	return ret;
+}
+
+int
 kgsl_gem_setmemtype_ioctl(struct drm_device *dev, void *data,
 			struct drm_file *file_priv)
 {
@@ -685,13 +782,9 @@
 
 	if (ret) {
 		DRM_ERROR("Unable to allocate object memory\n");
-	} else if (!priv->mmap_offset) {
-		ret = kgsl_gem_create_mmap_offset(obj);
-		if (ret)
-			DRM_ERROR("Unable to create a mmap offset\n");
 	}
 
-	args->offset = priv->mmap_offset;
+	args->offset = 0;
 
 	drm_gem_object_unreference(obj);
 	mutex_unlock(&dev->struct_mutex);
@@ -703,33 +796,7 @@
 kgsl_gem_mmap_ioctl(struct drm_device *dev, void *data,
 			struct drm_file *file_priv)
 {
-	struct drm_kgsl_gem_mmap *args = data;
-	struct drm_gem_object *obj;
-	unsigned long addr;
-
-	obj = drm_gem_object_lookup(dev, file_priv, args->handle);
-
-	if (obj == NULL) {
-		DRM_ERROR("Invalid GEM handle %x\n", args->handle);
-		return -EBADF;
-	}
-
-	down_write(&current->mm->mmap_sem);
-
-	addr = do_mmap(obj->filp, 0, args->size,
-		       PROT_READ | PROT_WRITE, MAP_SHARED,
-		       args->offset);
-
-	up_write(&current->mm->mmap_sem);
-
-	mutex_lock(&dev->struct_mutex);
-	drm_gem_object_unreference(obj);
-	mutex_unlock(&dev->struct_mutex);
-
-	if (IS_ERR((void *) addr))
-		return addr;
-
-	args->hostptr = (uint32_t) addr;
+	/* Ion is used for mmap at this time */
 	return 0;
 }
 
@@ -762,18 +829,6 @@
 		return ret;
 	}
 
-	if (priv->mmap_offset == 0) {
-		ret = kgsl_gem_create_mmap_offset(obj);
-		if (ret) {
-			drm_gem_object_unreference(obj);
-			mutex_unlock(&dev->struct_mutex);
-			return ret;
-		}
-	}
-
-	args->offset = priv->mmap_offset;
-	args->phys = priv->memdesc.physaddr;
-
 	drm_gem_object_unreference(obj);
 	mutex_unlock(&dev->struct_mutex);
 
@@ -957,122 +1012,6 @@
 	}
 }
 
-static struct vm_operations_struct kgsl_gem_kmem_vm_ops = {
-	.fault = kgsl_gem_kmem_fault,
-	.open = drm_gem_vm_open,
-	.close = drm_gem_vm_close,
-};
-
-static struct vm_operations_struct kgsl_gem_phys_vm_ops = {
-	.fault = kgsl_gem_phys_fault,
-	.open = drm_gem_vm_open,
-	.close = drm_gem_vm_close,
-};
-
-/* This is a clone of the standard drm_gem_mmap function modified to allow
-   us to properly map KMEM regions as well as the PMEM regions */
-
-int msm_drm_gem_mmap(struct file *filp, struct vm_area_struct *vma)
-{
-	struct drm_file *priv = filp->private_data;
-	struct drm_device *dev = priv->minor->dev;
-	struct drm_gem_mm *mm = dev->mm_private;
-	struct drm_local_map *map = NULL;
-	struct drm_gem_object *obj;
-	struct drm_hash_item *hash;
-	struct drm_kgsl_gem_object *gpriv;
-	int ret = 0;
-
-	mutex_lock(&dev->struct_mutex);
-
-	if (drm_ht_find_item(&mm->offset_hash, vma->vm_pgoff, &hash)) {
-		mutex_unlock(&dev->struct_mutex);
-		return drm_mmap(filp, vma);
-	}
-
-	map = drm_hash_entry(hash, struct drm_map_list, hash)->map;
-	if (!map ||
-	    ((map->flags & _DRM_RESTRICTED) && !capable(CAP_SYS_ADMIN))) {
-		ret =  -EPERM;
-		goto out_unlock;
-	}
-
-	/* Check for valid size. */
-	if (map->size < vma->vm_end - vma->vm_start) {
-		ret = -EINVAL;
-		goto out_unlock;
-	}
-
-	obj = map->handle;
-
-	gpriv = obj->driver_private;
-
-	/* VM_PFNMAP is only for memory that doesn't use struct page
-	 * in other words, not "normal" memory.  If you try to use it
-	 * with "normal" memory then the mappings don't get flushed. */
-
-	if (TYPE_IS_MEM(gpriv->type)) {
-		vma->vm_flags |= VM_RESERVED | VM_DONTEXPAND;
-		vma->vm_ops = &kgsl_gem_kmem_vm_ops;
-	} else {
-		vma->vm_flags |= VM_RESERVED | VM_IO | VM_PFNMAP |
-			VM_DONTEXPAND;
-		vma->vm_ops = &kgsl_gem_phys_vm_ops;
-	}
-
-	vma->vm_private_data = map->handle;
-
-
-	/* Take care of requested caching policy */
-	if (gpriv->type == DRM_KGSL_GEM_TYPE_KMEM ||
-	    gpriv->type & DRM_KGSL_GEM_CACHE_MASK) {
-		if (gpriv->type & DRM_KGSL_GEM_CACHE_WBACKWA)
-			vma->vm_page_prot =
-			pgprot_writebackwacache(vma->vm_page_prot);
-		else if (gpriv->type & DRM_KGSL_GEM_CACHE_WBACK)
-				vma->vm_page_prot =
-				pgprot_writebackcache(vma->vm_page_prot);
-		else if (gpriv->type & DRM_KGSL_GEM_CACHE_WTHROUGH)
-				vma->vm_page_prot =
-				pgprot_writethroughcache(vma->vm_page_prot);
-		else
-			vma->vm_page_prot =
-			pgprot_writecombine(vma->vm_page_prot);
-	} else {
-		if (gpriv->type == DRM_KGSL_GEM_TYPE_KMEM_NOCACHE)
-			vma->vm_page_prot =
-			pgprot_noncached(vma->vm_page_prot);
-		else
-			/* default pmem is WC */
-			vma->vm_page_prot =
-			pgprot_writecombine(vma->vm_page_prot);
-	}
-
-	/* flush out existing KMEM cached mappings if new ones are
-	 * of uncached type */
-	if (IS_MEM_UNCACHED(gpriv->type))
-		kgsl_cache_range_op(&gpriv->memdesc,
-				    KGSL_CACHE_OP_FLUSH);
-
-	/* Add the other memory types here */
-
-	/* Take a ref for this mapping of the object, so that the fault
-	 * handler can dereference the mmap offset's pointer to the object.
-	 * This reference is cleaned up by the corresponding vm_close
-	 * (which should happen whether the vma was created by this call, or
-	 * by a vm_open due to mremap or partial unmap or whatever).
-	 */
-	drm_gem_object_reference(obj);
-
-	vma->vm_file = filp;	/* Needed for drm_vm_open() */
-	drm_vm_open_locked(vma);
-
-out_unlock:
-	mutex_unlock(&dev->struct_mutex);
-
-	return ret;
-}
-
 void
 cleanup_fence(struct drm_kgsl_gem_object_fence *fence, int check_waiting)
 {
@@ -1434,6 +1373,9 @@
 	DRM_IOCTL_DEF_DRV(KGSL_GEM_ALLOC, kgsl_gem_alloc_ioctl, 0),
 	DRM_IOCTL_DEF_DRV(KGSL_GEM_MMAP, kgsl_gem_mmap_ioctl, 0),
 	DRM_IOCTL_DEF_DRV(KGSL_GEM_GET_BUFINFO, kgsl_gem_get_bufinfo_ioctl, 0),
+	DRM_IOCTL_DEF_DRV(KGSL_GEM_GET_ION_FD, kgsl_gem_get_ion_fd_ioctl, 0),
+	DRM_IOCTL_DEF_DRV(KGSL_GEM_CREATE_FROM_ION,
+		kgsl_gem_create_from_ion_ioctl, 0),
 	DRM_IOCTL_DEF_DRV(KGSL_GEM_SET_BUFCOUNT,
 		      kgsl_gem_set_bufcount_ioctl, 0),
 	DRM_IOCTL_DEF_DRV(KGSL_GEM_SET_ACTIVE, kgsl_gem_set_active_ioctl, 0),
@@ -1447,28 +1389,22 @@
 		      DRM_MASTER),
 };
 
+static const struct file_operations kgsl_drm_driver_fops = {
+	.owner = THIS_MODULE,
+	.open = drm_open,
+	.release = drm_release,
+	.unlocked_ioctl = drm_ioctl,
+	.mmap = drm_gem_mmap,
+	.poll = drm_poll,
+	.fasync = drm_fasync,
+};
+
 static struct drm_driver driver = {
 	.driver_features = DRIVER_GEM,
-	.load = kgsl_drm_load,
-	.unload = kgsl_drm_unload,
-	.preclose = kgsl_drm_preclose,
-	.suspend = kgsl_drm_suspend,
-	.resume = kgsl_drm_resume,
-	.reclaim_buffers = drm_core_reclaim_buffers,
 	.gem_init_object = kgsl_gem_init_object,
 	.gem_free_object = kgsl_gem_free_object,
 	.ioctls = kgsl_drm_ioctls,
-
-	.fops = {
-		 .owner = THIS_MODULE,
-		 .open = drm_open,
-		 .release = drm_release,
-		 .unlocked_ioctl = drm_ioctl,
-		 .mmap = msm_drm_gem_mmap,
-		 .poll = drm_poll,
-		 .fasync = drm_fasync,
-		 },
-
+	.fops = &kgsl_drm_driver_fops,
 	.name = DRIVER_NAME,
 	.desc = DRIVER_DESC,
 	.date = DRIVER_DATE,
@@ -1497,11 +1433,24 @@
 		gem_buf_fence[i].fence_id = ENTRY_EMPTY;
 	}
 
+	/* Create ION Client */
+	kgsl_drm_ion_client = msm_ion_client_create(
+			0xffffffff, "kgsl_drm");
+	if (!kgsl_drm_ion_client) {
+		DRM_ERROR("Unable to create ION client\n");
+		return -ENOMEM;
+	}
+
 	return drm_platform_init(&driver, dev);
 }
 
 void kgsl_drm_exit(void)
 {
 	kgsl_drm_inited = DRM_KGSL_NOT_INITED;
+
+	if (kgsl_drm_ion_client)
+		ion_client_destroy(kgsl_drm_ion_client);
+	kgsl_drm_ion_client = NULL;
+
 	drm_platform_exit(&driver, driver.kdriver.platform_device);
 }
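(Aside: a standalone sketch, not part of the patch.) After ion_sg_table() succeeds, the GEM allocation paths above compute the memdesc size and segment count by walking the scatterlist and summing each segment's length. The model below shows that loop with simplified stand-in types; struct seg here is an assumption for illustration, not the kernel scatterlist:

#include <stdio.h>
#include <stddef.h>

struct seg {
	unsigned int length;
	struct seg *next;	/* stand-in for sg_next() chaining */
};

struct memdesc {
	unsigned int size;
	int sglen;
};

static void memdesc_size_from_sg(struct memdesc *m, struct seg *sg)
{
	m->size = 0;
	m->sglen = 0;
	for (; sg != NULL; sg = sg->next) {
		m->size += sg->length;	/* total buffer size */
		m->sglen++;		/* number of scatter segments */
	}
}

int main(void)
{
	struct seg s2 = { 4096, NULL };
	struct seg s1 = { 8192, &s2 };
	struct memdesc m;

	memdesc_size_from_sg(&m, &s1);
	printf("size = %u bytes in %d segments\n", m.size, m.sglen);
	return 0;
}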
diff --git a/drivers/gpu/msm/kgsl_events.c b/drivers/gpu/msm/kgsl_events.c
index 6798eed..9e9c0da 100644
--- a/drivers/gpu/msm/kgsl_events.c
+++ b/drivers/gpu/msm/kgsl_events.c
@@ -149,6 +149,7 @@
 		 * Send the current timestamp so the event knows how far the
 		 * system got before the event was canceled
 		 */
+		list_del(&event->list);
 
 		trace_kgsl_fire_event(id, cur, jiffies - event->created);
 
@@ -156,7 +157,6 @@
 			event->func(device, event->priv, id, cur);
 
 		kgsl_context_put(context);
-		list_del(&event->list);
 		kfree(event);
 
 		kgsl_active_count_put(device);
@@ -192,6 +192,7 @@
 		 * the callback knows how far the GPU made it before things went
 		 * explosion
 		 */
+		list_del(&event->list);
 
 		trace_kgsl_fire_event(KGSL_MEMSTORE_GLOBAL, cur,
 			jiffies - event->created);
@@ -202,8 +203,6 @@
 
 		if (event->context)
 			kgsl_context_put(event->context);
-
-		list_del(&event->list);
 		kfree(event);
 
 		kgsl_active_count_put(device);
@@ -229,6 +228,7 @@
 		 * confused if they don't bother comparing the current timetamp
 		 * to the timestamp they wanted
 		 */
+		list_del(&event->list);
 
 		trace_kgsl_fire_event(id, event->timestamp,
 			jiffies - event->created);
@@ -238,8 +238,6 @@
 
 		if (event->context)
 			kgsl_context_put(event->context);
-
-		list_del(&event->list);
 		kfree(event);
 
 		kgsl_active_count_put(device);
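(Aside: a standalone sketch, not part of the patch.) The three hunks above move list_del() ahead of the callback, so an event is unlinked before its handler runs; a plausible motivation (not stated in the patch) is that a handler may free the event or queue new ones, and must never see a node that is still on the list. The plain-C sketch below shows the unlink-then-callback pattern with a simple singly linked list, not the kernel list_head API:

#include <stdio.h>
#include <stdlib.h>

struct event {
	int id;
	void (*func)(struct event *ev);
	struct event *next;
};

static struct event *pending;

/* Fire every pending event: unlink first, then run the callback. */
static void fire_all(void)
{
	while (pending) {
		struct event *ev = pending;

		pending = ev->next;	/* detach before the callback runs */
		ev->next = NULL;
		ev->func(ev);		/* callback may safely queue new events */
		free(ev);
	}
}

static void on_fire(struct event *ev)
{
	printf("event %d fired\n", ev->id);
}

int main(void)
{
	int i;

	for (i = 0; i < 3; i++) {
		struct event *ev = malloc(sizeof(*ev));

		if (!ev)
			break;
		ev->id = i;
		ev->func = on_fire;
		ev->next = pending;
		pending = ev;
	}
	fire_all();
	return 0;
}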
diff --git a/drivers/gpu/msm/kgsl_gpummu.c b/drivers/gpu/msm/kgsl_gpummu.c
index 8f28505..5cc0dff 100644
--- a/drivers/gpu/msm/kgsl_gpummu.c
+++ b/drivers/gpu/msm/kgsl_gpummu.c
@@ -1,4 +1,4 @@
-/* Copyright (c) 2011-2012, The Linux Foundation. All rights reserved.
+/* Copyright (c) 2011-2013, The Linux Foundation. All rights reserved.
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License version 2 and
@@ -19,9 +19,11 @@
 
 #include "kgsl.h"
 #include "kgsl_mmu.h"
+#include "kgsl_gpummu.h"
 #include "kgsl_device.h"
 #include "kgsl_sharedmem.h"
 #include "kgsl_trace.h"
+#include "adreno.h"
 
 #define KGSL_PAGETABLE_SIZE \
 	ALIGN(KGSL_PAGETABLE_ENTRIES(CONFIG_MSM_KGSL_PAGE_TABLE_SIZE) * \
@@ -161,7 +163,7 @@
 }
 
 static void *
-_kgsl_ptpool_get_entry(struct kgsl_ptpool *pool, unsigned int *physaddr)
+_kgsl_ptpool_get_entry(struct kgsl_ptpool *pool, phys_addr_t *physaddr)
 {
 	struct kgsl_ptpool_chunk *chunk;
 
@@ -227,7 +229,7 @@
  */
 
 static void *kgsl_ptpool_alloc(struct kgsl_ptpool *pool,
-				unsigned int *physaddr)
+				phys_addr_t *physaddr)
 {
 	void *addr = NULL;
 	int ret;
@@ -363,10 +365,9 @@
 	return gpummu_pt && pt_base && (gpummu_pt->base.gpuaddr == pt_base);
 }
 
-void kgsl_gpummu_destroy_pagetable(void *mmu_specific_pt)
+void kgsl_gpummu_destroy_pagetable(struct kgsl_pagetable *pt)
 {
-	struct kgsl_gpummu_pt *gpummu_pt = (struct kgsl_gpummu_pt *)
-						mmu_specific_pt;
+	struct kgsl_gpummu_pt *gpummu_pt = pt->priv;
 	kgsl_ptpool_free((struct kgsl_ptpool *)kgsl_driver.ptpool,
 				gpummu_pt->base.hostptr);
 
@@ -403,11 +404,22 @@
 {
 	unsigned int reg;
 	unsigned int ptbase;
+	struct kgsl_device *device;
+	struct adreno_device *adreno_dev;
+	unsigned int no_page_fault_log = 0;
 
-	kgsl_regread(mmu->device, MH_MMU_PAGE_FAULT, &reg);
-	kgsl_regread(mmu->device, MH_MMU_PT_BASE, &ptbase);
+	device = mmu->device;
+	adreno_dev = ADRENO_DEVICE(device);
 
-	KGSL_MEM_CRIT(mmu->device,
+	kgsl_regread(device, MH_MMU_PAGE_FAULT, &reg);
+	kgsl_regread(device, MH_MMU_PT_BASE, &ptbase);
+
+
+	if (adreno_dev->ft_pf_policy & KGSL_FT_PAGEFAULT_LOG_ONE_PER_PAGE)
+		no_page_fault_log = kgsl_mmu_log_fault_addr(mmu, ptbase, reg);
+
+	if (!no_page_fault_log)
+		KGSL_MEM_CRIT(mmu->device,
 			"mmu page fault: page=0x%lx pt=%d op=%s axi=%d\n",
 			reg & ~(PAGE_SIZE - 1),
 			kgsl_mmu_get_ptname_from_ptbase(mmu, ptbase),
@@ -516,6 +528,11 @@
 	 */
 	int status = 0;
 
+	mmu->pt_base = KGSL_PAGETABLE_BASE;
+	mmu->pt_size = CONFIG_MSM_KGSL_PAGE_TABLE_SIZE;
+	mmu->pt_per_process = KGSL_MMU_USE_PER_PROCESS_PT;
+	mmu->use_cpu_map = false;
+
 	/* sub-client MMU lookups require address translation */
 	if ((mmu->config & ~0x1) > 0) {
 		/*make sure virtual address range is a multiple of 64Kb */
@@ -572,7 +589,7 @@
 
 	if (mmu->defaultpagetable == NULL)
 		mmu->defaultpagetable =
-			kgsl_mmu_getpagetable(KGSL_MMU_GLOBAL_PT);
+			kgsl_mmu_getpagetable(mmu, KGSL_MMU_GLOBAL_PT);
 
 	/* Return error if the default pagetable doesn't exist */
 	if (mmu->defaultpagetable == NULL)
@@ -592,14 +609,14 @@
 }
 
 static int
-kgsl_gpummu_unmap(void *mmu_specific_pt,
+kgsl_gpummu_unmap(struct kgsl_pagetable *pt,
 		struct kgsl_memdesc *memdesc,
 		unsigned int *tlb_flags)
 {
 	unsigned int numpages;
 	unsigned int pte, ptefirst, ptelast, superpte;
 	unsigned int range = kgsl_sg_size(memdesc->sg, memdesc->sglen);
-	struct kgsl_gpummu_pt *gpummu_pt = mmu_specific_pt;
+	struct kgsl_gpummu_pt *gpummu_pt = pt->priv;
 
 	/* All GPU addresses as assigned are page aligned, but some
 	   functions purturb the gpuaddr with an offset, so apply the
@@ -641,13 +658,13 @@
 GSL_TLBFLUSH_FILTER_ISDIRTY((_p) / GSL_PT_SUPER_PTE))
 
 static int
-kgsl_gpummu_map(void *mmu_specific_pt,
+kgsl_gpummu_map(struct kgsl_pagetable *pt,
 		struct kgsl_memdesc *memdesc,
 		unsigned int protflags,
 		unsigned int *tlb_flags)
 {
 	unsigned int pte;
-	struct kgsl_gpummu_pt *gpummu_pt = mmu_specific_pt;
+	struct kgsl_gpummu_pt *gpummu_pt = pt->priv;
 	struct scatterlist *s;
 	int flushtlb = 0;
 	int i;
diff --git a/drivers/gpu/msm/kgsl_gpummu.h b/drivers/gpu/msm/kgsl_gpummu.h
index 99e7d5f..1753aff 100644
--- a/drivers/gpu/msm/kgsl_gpummu.h
+++ b/drivers/gpu/msm/kgsl_gpummu.h
@@ -1,4 +1,4 @@
-/* Copyright (c) 2011-2012, The Linux Foundation. All rights reserved.
+/* Copyright (c) 2011-2013, The Linux Foundation. All rights reserved.
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License version 2 and
@@ -57,7 +57,7 @@
 	int dynamic;
 
 	void *data;
-	unsigned int phys;
+	phys_addr_t phys;
 
 	unsigned long *bitmap;
 	struct list_head list;
diff --git a/drivers/gpu/msm/kgsl_iommu.c b/drivers/gpu/msm/kgsl_iommu.c
index f2393e4..739fcff 100644
--- a/drivers/gpu/msm/kgsl_iommu.c
+++ b/drivers/gpu/msm/kgsl_iommu.c
@@ -1,4 +1,4 @@
-/* Copyright (c) 2011-2012, The Linux Foundation. All rights reserved.
+/* Copyright (c) 2011-2013, The Linux Foundation. All rights reserved.
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License version 2 and
@@ -11,6 +11,7 @@
  *
  */
 #include <linux/types.h>
+#include <linux/delay.h>
 #include <linux/device.h>
 #include <linux/spinlock.h>
 #include <linux/genalloc.h>
@@ -20,6 +21,7 @@
 #include <mach/socinfo.h>
 #include <mach/msm_iomap.h>
 #include <mach/board.h>
+#include <mach/iommu_domains.h>
 #include <stddef.h>
 
 #include "kgsl.h"
@@ -31,24 +33,33 @@
 #include "adreno.h"
 #include "kgsl_trace.h"
 #include "z180.h"
+#include "kgsl_cffdump.h"
 
 
-static struct kgsl_iommu_register_list kgsl_iommuv1_reg[KGSL_IOMMU_REG_MAX] = {
+static struct kgsl_iommu_register_list kgsl_iommuv0_reg[KGSL_IOMMU_REG_MAX] = {
 	{ 0, 0, 0 },				/* GLOBAL_BASE */
 	{ 0x10, 0x0003FFFF, 14 },		/* TTBR0 */
 	{ 0x14, 0x0003FFFF, 14 },		/* TTBR1 */
 	{ 0x20, 0, 0 },				/* FSR */
 	{ 0x800, 0, 0 },			/* TLBIALL */
 	{ 0x820, 0, 0 },			/* RESUME */
+	{ 0x03C, 0, 0 },			/* TLBLKCR */
+	{ 0x818, 0, 0 },			/* V2PUR */
+	{ 0x2C, 0, 0 },                         /* FSYNR0 */
+	{ 0x2C, 0, 0 },                         /* FSYNR0 */
 };
 
-static struct kgsl_iommu_register_list kgsl_iommuv2_reg[KGSL_IOMMU_REG_MAX] = {
+static struct kgsl_iommu_register_list kgsl_iommuv1_reg[KGSL_IOMMU_REG_MAX] = {
 	{ 0, 0, 0 },				/* GLOBAL_BASE */
 	{ 0x20, 0x00FFFFFF, 14 },		/* TTBR0 */
 	{ 0x28, 0x00FFFFFF, 14 },		/* TTBR1 */
 	{ 0x58, 0, 0 },				/* FSR */
 	{ 0x618, 0, 0 },			/* TLBIALL */
-	{ 0x008, 0, 0 }				/* RESUME */
+	{ 0x008, 0, 0 },			/* RESUME */
+	{ 0, 0, 0 },				/* TLBLKCR */
+	{ 0, 0, 0 },				/* V2PUR */
+	{ 0x68, 0, 0 },				/* FSYNR0 */
+	{ 0x6C, 0, 0 }				/* FSYNR1 */
 };
 
 struct remote_iommu_petersons_spinlock kgsl_iommu_sync_lock_vars;
@@ -100,8 +111,172 @@
 	return NULL;
 }
 
+/*
+ * These functions help find the nearest allocated memory entries on either
+ * side of a faulting address. If we know the nearby allocated memory we can
+ * get a better idea of what should have been located in the faulting region.
+ */
+
+/*
+ * A local structure to make it easy to store the interesting bits for the
+ * memory entries on either side of the faulting address
+ */
+
+struct _mem_entry {
+	unsigned int gpuaddr;
+	unsigned int size;
+	unsigned int flags;
+	unsigned int priv;
+	pid_t pid;
+};
+
+/*
+ * Find the closest allocated memory block with a smaller GPU address than
+ * the given address
+ */
+
+static void _prev_entry(struct kgsl_process_private *priv,
+	unsigned int faultaddr, struct _mem_entry *ret)
+{
+	struct rb_node *node;
+	struct kgsl_mem_entry *entry;
+
+	for (node = rb_first(&priv->mem_rb); node; ) {
+		entry = rb_entry(node, struct kgsl_mem_entry, node);
+
+		if (entry->memdesc.gpuaddr > faultaddr)
+			break;
+
+		/*
+		 * If this is closer to the faulting address, then copy
+		 * the entry
+		 */
+
+		if (entry->memdesc.gpuaddr > ret->gpuaddr) {
+			ret->gpuaddr = entry->memdesc.gpuaddr;
+			ret->size = entry->memdesc.size;
+			ret->flags = entry->memdesc.flags;
+			ret->priv = entry->memdesc.priv;
+			ret->pid = priv->pid;
+		}
+
+		node = rb_next(&entry->node);
+	}
+}
+
+/*
+ * Find the closest allocated memory block with a greater starting GPU
+ * address than the given address
+ */
+
+static void _next_entry(struct kgsl_process_private *priv,
+	unsigned int faultaddr, struct _mem_entry *ret)
+{
+	struct rb_node *node;
+	struct kgsl_mem_entry *entry;
+
+	for (node = rb_last(&priv->mem_rb); node; ) {
+		entry = rb_entry(node, struct kgsl_mem_entry, node);
+
+		if (entry->memdesc.gpuaddr < faultaddr)
+			break;
+
+		/*
+		 * If this is closer to the faulting address, then copy
+		 * the entry
+		 */
+
+		if (entry->memdesc.gpuaddr < ret->gpuaddr) {
+			ret->gpuaddr = entry->memdesc.gpuaddr;
+			ret->size = entry->memdesc.size;
+			ret->flags = entry->memdesc.flags;
+			ret->priv = entry->memdesc.priv;
+			ret->pid = priv->pid;
+		}
+
+		node = rb_prev(&entry->node);
+	}
+}
+
+static void _find_mem_entries(struct kgsl_mmu *mmu, unsigned int faultaddr,
+	unsigned int ptbase, struct _mem_entry *preventry,
+	struct _mem_entry *nextentry)
+{
+	struct kgsl_process_private *private;
+	int id = kgsl_mmu_get_ptname_from_ptbase(mmu, ptbase);
+
+	memset(preventry, 0, sizeof(*preventry));
+	memset(nextentry, 0, sizeof(*nextentry));
+
+	/* Set the maximum possible size as an initial value */
+	nextentry->gpuaddr = 0xFFFFFFFF;
+
+	mutex_lock(&kgsl_driver.process_mutex);
+
+	list_for_each_entry(private, &kgsl_driver.process_list, list) {
+
+		if (private->pagetable->name != id)
+			continue;
+
+		spin_lock(&private->mem_lock);
+		_prev_entry(private, faultaddr, preventry);
+		_next_entry(private, faultaddr, nextentry);
+		spin_unlock(&private->mem_lock);
+	}
+
+	mutex_unlock(&kgsl_driver.process_mutex);
+}
+
+static void _print_entry(struct kgsl_device *device, struct _mem_entry *entry)
+{
+	char name[32];
+	memset(name, 0, sizeof(name));
+
+	kgsl_get_memory_usage(name, sizeof(name) - 1, entry->flags);
+
+	KGSL_LOG_DUMP(device,
+		"[%8.8X - %8.8X] %s (pid = %d) (%s)\n",
+		entry->gpuaddr,
+		entry->gpuaddr + entry->size,
+		entry->priv & KGSL_MEMDESC_GUARD_PAGE ? "(+guard)" : "",
+		entry->pid, name);
+}
+
+static void _check_if_freed(struct kgsl_iommu_device *iommu_dev,
+	unsigned long addr, unsigned int pid)
+{
+	void *base = kgsl_driver.memfree_hist.base_hist_rb;
+	struct kgsl_memfree_hist_elem *wptr;
+	struct kgsl_memfree_hist_elem *p;
+
+	mutex_lock(&kgsl_driver.memfree_hist_mutex);
+	wptr = kgsl_driver.memfree_hist.wptr;
+	p = wptr;
+	for (;;) {
+		if (p->size && p->pid == pid)
+			if (addr >= p->gpuaddr &&
+				addr < (p->gpuaddr + p->size)) {
+
+				KGSL_LOG_DUMP(iommu_dev->kgsldev,
+					"---- premature free ----\n");
+				KGSL_LOG_DUMP(iommu_dev->kgsldev,
+					"[%8.8X-%8.8X] was already freed by pid %d\n",
+					p->gpuaddr,
+					p->gpuaddr + p->size,
+					p->pid);
+			}
+		p++;
+		if ((void *)p >= base + kgsl_driver.memfree_hist.size)
+			p = (struct kgsl_memfree_hist_elem *) base;
+
+		if (p == kgsl_driver.memfree_hist.wptr)
+			break;
+	}
+	mutex_unlock(&kgsl_driver.memfree_hist_mutex);
+}
+
 static int kgsl_iommu_fault_handler(struct iommu_domain *domain,
-	struct device *dev, unsigned long addr, int flags)
+	struct device *dev, unsigned long addr, int flags, void *token)
 {
 	int ret = 0;
 	struct kgsl_mmu *mmu;
@@ -109,9 +284,17 @@
 	struct kgsl_iommu_unit *iommu_unit;
 	struct kgsl_iommu_device *iommu_dev;
 	unsigned int ptbase, fsr;
+	unsigned int pid;
+	struct _mem_entry prev, next;
+	unsigned int fsynr0, fsynr1;
+	int write;
 	struct kgsl_device *device;
 	struct adreno_device *adreno_dev;
 	unsigned int no_page_fault_log = 0;
+	unsigned int curr_context_id = 0;
+	unsigned int curr_global_ts = 0;
+	static struct adreno_context *curr_context;
+	static struct kgsl_context *context;
 
 	ret = get_iommu_unit(dev, &mmu, &iommu_unit);
 	if (ret)
@@ -131,6 +314,25 @@
 
 	fsr = KGSL_IOMMU_GET_CTX_REG(iommu, iommu_unit,
 		iommu_dev->ctx_id, FSR);
+	fsynr0 = KGSL_IOMMU_GET_CTX_REG(iommu, iommu_unit,
+		iommu_dev->ctx_id, FSYNR0);
+	fsynr1 = KGSL_IOMMU_GET_CTX_REG(iommu, iommu_unit,
+		iommu_dev->ctx_id, FSYNR1);
+
+	if (msm_soc_version_supports_iommu_v0())
+		write = ((fsynr1 & (KGSL_IOMMU_FSYNR1_AWRITE_MASK <<
+			KGSL_IOMMU_FSYNR1_AWRITE_SHIFT)) ? 1 : 0);
+	else
+		write = ((fsynr0 & (KGSL_IOMMU_V1_FSYNR0_WNR_MASK <<
+			KGSL_IOMMU_V1_FSYNR0_WNR_SHIFT)) ? 1 : 0);
+
+	pid = kgsl_mmu_get_ptname_from_ptbase(mmu, ptbase);
+	KGSL_MEM_CRIT(iommu_dev->kgsldev,
+		"GPU PAGE FAULT: addr = %lX pid = %d\n", addr, pid);
+	KGSL_MEM_CRIT(iommu_dev->kgsldev,
+		"context = %d FSR = %X FSYNR0 = %X FSYNR1 = %X(%s fault)\n",
+		iommu_dev->ctx_id, fsr, fsynr0, fsynr1,
+		write ? "write" : "read");
 
 	if (adreno_dev->ft_pf_policy & KGSL_FT_PAGEFAULT_LOG_ONE_PER_PAGE)
 		no_page_fault_log = kgsl_mmu_log_fault_addr(mmu, ptbase, addr);
@@ -141,13 +343,49 @@
 			addr, kgsl_mmu_get_ptname_from_ptbase(mmu, ptbase));
 		KGSL_MEM_CRIT(iommu_dev->kgsldev, "context = %d FSR = %X\n",
 			iommu_dev->ctx_id, fsr);
+
+		_check_if_freed(iommu_dev, addr, pid);
+
+		KGSL_LOG_DUMP(iommu_dev->kgsldev, "---- nearby memory ----\n");
+
+		_find_mem_entries(mmu, addr, ptbase, &prev, &next);
+
+		if (prev.gpuaddr)
+			_print_entry(iommu_dev->kgsldev, &prev);
+		else
+			KGSL_LOG_DUMP(iommu_dev->kgsldev, "*EMPTY*\n");
+
+		KGSL_LOG_DUMP(iommu_dev->kgsldev, " <- fault @ %8.8lX\n", addr);
+
+		if (next.gpuaddr != 0xFFFFFFFF)
+			_print_entry(iommu_dev->kgsldev, &next);
+		else
+			KGSL_LOG_DUMP(iommu_dev->kgsldev, "*EMPTY*\n");
+
 	}
 
 	mmu->fault = 1;
 	iommu_dev->fault = 1;
 
+	kgsl_sharedmem_readl(&device->memstore, &curr_context_id,
+		KGSL_MEMSTORE_OFFSET(KGSL_MEMSTORE_GLOBAL, current_context));
+	context = idr_find(&device->context_idr, curr_context_id);
+	if (context != NULL)
+			curr_context = context->devctxt;
+
+	kgsl_sharedmem_readl(&device->memstore, &curr_global_ts,
+		KGSL_MEMSTORE_OFFSET(KGSL_MEMSTORE_GLOBAL, eoptimestamp));
+
+	/*
+	 * Store pagefault's timestamp in adreno context,
+	 * this information will be used in GFT
+	 */
+	curr_context->pagefault = 1;
+	curr_context->pagefault_ts = curr_global_ts;
+
 	trace_kgsl_mmu_pagefault(iommu_dev->kgsldev, addr,
-			kgsl_mmu_get_ptname_from_ptbase(mmu, ptbase), 0);
+			kgsl_mmu_get_ptname_from_ptbase(mmu, ptbase),
+			write ? "write" : "read");
 
 	/*
 	 * We do not want the h/w to resume fetching data from an iommu unit
@@ -367,9 +605,9 @@
  *
  * Return - void
  */
-static void kgsl_iommu_destroy_pagetable(void *mmu_specific_pt)
+static void kgsl_iommu_destroy_pagetable(struct kgsl_pagetable *pt)
 {
-	struct kgsl_iommu_pt *iommu_pt = mmu_specific_pt;
+	struct kgsl_iommu_pt *iommu_pt = pt->priv;
 	if (iommu_pt->domain)
 		iommu_domain_free(iommu_pt->domain);
 	kfree(iommu_pt);
@@ -384,28 +622,39 @@
  */
 void *kgsl_iommu_create_pagetable(void)
 {
+	int domain_num;
 	struct kgsl_iommu_pt *iommu_pt;
 
+	struct msm_iova_partition kgsl_partition = {
+		.start = 0,
+		.size = 0xFFFFFFFF,
+	};
+	struct msm_iova_layout kgsl_layout = {
+		.partitions = &kgsl_partition,
+		.npartitions = 1,
+		.client_name = "kgsl",
+		.domain_flags = 0,
+	};
+
 	iommu_pt = kzalloc(sizeof(struct kgsl_iommu_pt), GFP_KERNEL);
 	if (!iommu_pt) {
 		KGSL_CORE_ERR("kzalloc(%d) failed\n",
 				sizeof(struct kgsl_iommu_pt));
 		return NULL;
 	}
-	/* L2 redirect is not stable on IOMMU v2 */
-	if (msm_soc_version_supports_iommu_v1())
-		iommu_pt->domain = iommu_domain_alloc(&platform_bus_type,
-					MSM_IOMMU_DOMAIN_PT_CACHEABLE);
-	else
-		iommu_pt->domain = iommu_domain_alloc(&platform_bus_type,
-					0);
-	if (!iommu_pt->domain) {
+	/* L2 redirect is not stable on IOMMU v1 */
+	if (msm_soc_version_supports_iommu_v0())
+		kgsl_layout.domain_flags = MSM_IOMMU_DOMAIN_PT_CACHEABLE;
+
+	domain_num = msm_register_domain(&kgsl_layout);
+	if (domain_num >= 0) {
+		iommu_pt->domain = msm_get_iommu_domain(domain_num);
+		iommu_set_fault_handler(iommu_pt->domain,
+			kgsl_iommu_fault_handler, NULL);
+	} else {
 		KGSL_CORE_ERR("Failed to create iommu domain\n");
 		kfree(iommu_pt);
 		return NULL;
-	} else {
-		iommu_set_fault_handler(iommu_pt->domain,
-			kgsl_iommu_fault_handler);
 	}
 
 	return iommu_pt;
@@ -523,17 +772,23 @@
 {
 	struct kgsl_iommu *iommu = mmu->priv;
 	struct kgsl_iommu_unit *iommu_unit = &iommu->iommu_units[unit_id];
-	int i;
+	int i, j;
+	int found_ctx;
 
-	if (data->iommu_ctx_count > KGSL_IOMMU_MAX_DEVS_PER_UNIT) {
-		KGSL_CORE_ERR("Too many iommu devices defined for an "
-				"IOMMU unit\n");
-		return -EINVAL;
-	}
-
-	for (i = 0; i < data->iommu_ctx_count; i++) {
-		if (!data->iommu_ctxs[i].iommu_ctx_name)
-			continue;
+	for (j = 0; j < KGSL_IOMMU_MAX_DEVS_PER_UNIT; j++) {
+		found_ctx = 0;
+		for (i = 0; i < data->iommu_ctx_count; i++) {
+			if (j == data->iommu_ctxs[i].ctx_id) {
+				found_ctx = 1;
+				break;
+			}
+		}
+		if (!found_ctx)
+			break;
+		if (!data->iommu_ctxs[i].iommu_ctx_name) {
+			KGSL_CORE_ERR("Context name invalid\n");
+			return -EINVAL;
+		}
 
 		iommu_unit->dev[iommu_unit->dev_count].dev =
 			msm_iommu_get_ctx(data->iommu_ctxs[i].iommu_ctx_name);
@@ -542,12 +797,6 @@
 			"device %s\n", data->iommu_ctxs[i].iommu_ctx_name);
 			return -EINVAL;
 		}
-		if (KGSL_IOMMU_CONTEXT_USER != data->iommu_ctxs[i].ctx_id &&
-			KGSL_IOMMU_CONTEXT_PRIV != data->iommu_ctxs[i].ctx_id) {
-			KGSL_CORE_ERR("Invalid context ID defined: %d\n",
-					data->iommu_ctxs[i].ctx_id);
-			return -EINVAL;
-		}
 		iommu_unit->dev[iommu_unit->dev_count].ctx_id =
 						data->iommu_ctxs[i].ctx_id;
 		iommu_unit->dev[iommu_unit->dev_count].kgsldev = mmu->device;
@@ -559,6 +808,51 @@
 
 		iommu_unit->dev_count++;
 	}
+	if (!j) {
+		KGSL_CORE_ERR("No ctxts initialized, user ctxt absent\n");
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+/*
+ * kgsl_iommu_start_sync_lock - Initialize some variables during MMU start up
+ * for GPU CPU synchronization
+ * @mmu - Pointer to mmu device
+ *
+ * Return - 0 on success else error code
+ */
+static int kgsl_iommu_start_sync_lock(struct kgsl_mmu *mmu)
+{
+	struct kgsl_iommu *iommu = mmu->priv;
+	uint32_t lock_gpu_addr = 0;
+
+	if (KGSL_DEVICE_3D0 != mmu->device->id ||
+		!msm_soc_version_supports_iommu_v0() ||
+		!kgsl_mmu_is_perprocess(mmu) ||
+		iommu->sync_lock_vars)
+		return 0;
+
+	if (!(mmu->flags & KGSL_MMU_FLAGS_IOMMU_SYNC)) {
+		KGSL_DRV_ERR(mmu->device,
+		"The GPU microcode does not support IOMMUv1 sync opcodes\n");
+		return -ENXIO;
+	}
+	/* Store Lock variables GPU address  */
+	lock_gpu_addr = (iommu->sync_lock_desc.gpuaddr +
+			iommu->sync_lock_offset);
+
+	kgsl_iommu_sync_lock_vars.flag[PROC_APPS] = (lock_gpu_addr +
+		(offsetof(struct remote_iommu_petersons_spinlock,
+			flag[PROC_APPS])));
+	kgsl_iommu_sync_lock_vars.flag[PROC_GPU] = (lock_gpu_addr +
+		(offsetof(struct remote_iommu_petersons_spinlock,
+			flag[PROC_GPU])));
+	kgsl_iommu_sync_lock_vars.turn = (lock_gpu_addr +
+		(offsetof(struct remote_iommu_petersons_spinlock, turn)));
+
+	iommu->sync_lock_vars = &kgsl_iommu_sync_lock_vars;
 
 	return 0;
 }
@@ -571,21 +865,30 @@
  */
 static int kgsl_iommu_init_sync_lock(struct kgsl_mmu *mmu)
 {
-	struct kgsl_iommu *iommu = mmu->device->mmu.priv;
+	struct kgsl_iommu *iommu = mmu->priv;
 	int status = 0;
-	struct kgsl_pagetable *pagetable = NULL;
-	uint32_t lock_gpu_addr = 0;
 	uint32_t lock_phy_addr = 0;
 	uint32_t page_offset = 0;
 
-	iommu->sync_lock_initialized = 0;
+	if (!msm_soc_version_supports_iommu_v0() ||
+		!kgsl_mmu_is_perprocess(mmu))
+		return status;
 
-	if (!(mmu->flags & KGSL_MMU_FLAGS_IOMMU_SYNC)) {
-		KGSL_DRV_ERR(mmu->device,
-		"The GPU microcode does not support IOMMUv1 sync opcodes\n");
-		return -ENXIO;
+	/*
+	 * For 2D devices a CPU-side sync lock is required. For the 3D
+	 * device, since we only have a single 3D core and we always ensure
+	 * that the 3D core is idle while writing to the IOMMU registers
+	 * from the CPU, this lock is not required.
+	 */
+	if (KGSL_DEVICE_2D0 == mmu->device->id ||
+		KGSL_DEVICE_2D1 == mmu->device->id) {
+		return status;
 	}
 
+	/* Return if already initialized */
+	if (iommu->sync_lock_initialized)
+		return status;
+
 	/* Get the physical address of the Lock variables */
 	lock_phy_addr = (msm_iommu_lock_initialize()
 			- MSM_SHARED_RAM_BASE + msm_shared_ram_phys);
@@ -600,6 +903,7 @@
 	page_offset = (lock_phy_addr & (PAGE_SIZE - 1));
 	lock_phy_addr = (lock_phy_addr & ~(PAGE_SIZE - 1));
 	iommu->sync_lock_desc.physaddr = (unsigned int)lock_phy_addr;
+	iommu->sync_lock_offset = page_offset;
 
 	iommu->sync_lock_desc.size =
 				PAGE_ALIGN(sizeof(kgsl_iommu_sync_lock_vars));
@@ -610,35 +914,6 @@
 	if (status)
 		return status;
 
-	/* Map Lock variables to GPU pagetable */
-	iommu->sync_lock_desc.priv |= KGSL_MEMDESC_GLOBAL;
-
-	pagetable = mmu->priv_bank_table ? mmu->priv_bank_table :
-				mmu->defaultpagetable;
-
-	status = kgsl_mmu_map(pagetable, &iommu->sync_lock_desc,
-				     GSL_PT_PAGE_RV | GSL_PT_PAGE_WV);
-
-	if (status) {
-		kgsl_mmu_unmap(pagetable, &iommu->sync_lock_desc);
-		iommu->sync_lock_desc.priv &= ~KGSL_MEMDESC_GLOBAL;
-		return status;
-	}
-
-	/* Store Lock variables GPU address  */
-	lock_gpu_addr = (iommu->sync_lock_desc.gpuaddr + page_offset);
-
-	kgsl_iommu_sync_lock_vars.flag[PROC_APPS] = (lock_gpu_addr +
-		(offsetof(struct remote_iommu_petersons_spinlock,
-			flag[PROC_APPS])));
-	kgsl_iommu_sync_lock_vars.flag[PROC_GPU] = (lock_gpu_addr +
-		(offsetof(struct remote_iommu_petersons_spinlock,
-			flag[PROC_GPU])));
-	kgsl_iommu_sync_lock_vars.turn = (lock_gpu_addr +
-		(offsetof(struct remote_iommu_petersons_spinlock, turn)));
-
-	iommu->sync_lock_vars = &kgsl_iommu_sync_lock_vars;
-
 	/* Flag Sync Lock is Initialized  */
 	iommu->sync_lock_initialized = 1;
 
@@ -897,6 +1172,75 @@
 	}
 }
 
+/*
+ * kgsl_iommu_setup_regs - map iommu registers into a pagetable
+ * @mmu: Pointer to mmu structure
+ * @pt: the pagetable
+ *
+ * To do pagetable switches from the GPU command stream, the IOMMU
+ * registers need to be mapped into the GPU's pagetable. This function
+ * is used differently on different targets. On 8960, the registers
+ * are mapped into every pagetable during kgsl_setup_pt(). On
+ * all other targets, the registers are mapped only into the second
+ * context bank.
+ *
+ * Return - 0 on success else error code
+ */
+static int kgsl_iommu_setup_regs(struct kgsl_mmu *mmu,
+				    struct kgsl_pagetable *pt)
+{
+	int status;
+	int i = 0;
+	struct kgsl_iommu *iommu = mmu->priv;
+
+	if (!msm_soc_version_supports_iommu_v0())
+		return 0;
+
+	for (i = 0; i < iommu->unit_count; i++) {
+		status = kgsl_mmu_map_global(pt,
+				&(iommu->iommu_units[i].reg_map));
+		if (status)
+			goto err;
+	}
+
+	/* Map Lock variables to GPU pagetable */
+	if (iommu->sync_lock_initialized) {
+		status = kgsl_mmu_map_global(pt, &iommu->sync_lock_desc);
+		if (status)
+			goto err;
+	}
+
+	return 0;
+err:
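+	/* Unwind any register mappings made before the failure */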
+	for (i--; i >= 0; i--)
+		kgsl_mmu_unmap(pt,
+				&(iommu->iommu_units[i].reg_map));
+
+	return status;
+}
+
+/*
+ * kgsl_iommu_cleanup_regs - unmap iommu registers from a pagetable
+ * @mmu: Pointer to mmu structure
+ * @pt: the pagetable
+ *
+ * Removes mappings created by kgsl_iommu_setup_regs().
+ *
+ * Return - void
+ */
+static void kgsl_iommu_cleanup_regs(struct kgsl_mmu *mmu,
+					struct kgsl_pagetable *pt)
+{
+	struct kgsl_iommu *iommu = mmu->priv;
+	int i;
+	for (i = 0; i < iommu->unit_count; i++)
+		kgsl_mmu_unmap(pt, &(iommu->iommu_units[i].reg_map));
+
+	if (iommu->sync_lock_desc.gpuaddr)
+		kgsl_mmu_unmap(pt, &iommu->sync_lock_desc);
+}
+
+
 static int kgsl_iommu_init(struct kgsl_mmu *mmu)
 {
 	/*
@@ -921,16 +1265,45 @@
 	status = kgsl_set_register_map(mmu);
 	if (status)
 		goto done;
+	status = kgsl_iommu_init_sync_lock(mmu);
+	if (status)
+		goto done;
 
-	iommu->iommu_reg_list = kgsl_iommuv1_reg;
-	iommu->ctx_offset = KGSL_IOMMU_CTX_OFFSET_V1;
+	/* We presently do not support per-process pagetables for IOMMU-v1 */
+	mmu->pt_per_process = KGSL_MMU_USE_PER_PROCESS_PT &&
+				msm_soc_version_supports_iommu_v0();
 
-	if (msm_soc_version_supports_iommu_v1()) {
+	/*
+	 * For IOMMU per-process pagetables, the allocatable range
+	 * and the kernel global range must both be outside
+	 * the userspace address range. There is a 1Mb gap
+	 * between these address ranges to make overrun
+	 * detection easier.
+	 * For the shared pagetable case use 2GB, since mirroring
+	 * the CPU address space is not possible and we are better
+	 * off with the extra room.
+	 */
+	if (mmu->pt_per_process) {
+		mmu->pt_base = PAGE_OFFSET;
+		mmu->pt_size = KGSL_IOMMU_GLOBAL_MEM_BASE
+				- kgsl_mmu_get_base_addr(mmu) - SZ_1M;
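+		/*
+		 * e.g. with a typical 3G/1G split (PAGE_OFFSET at
+		 * 0xC0000000) this leaves 0xf8000000 - 0xC0000000 - 1MB,
+		 * roughly 895MB, of allocatable GPU VA below the global
+		 * region.
+		 */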
+		mmu->use_cpu_map = true;
+	} else {
+		mmu->pt_base = KGSL_PAGETABLE_BASE;
+		mmu->pt_size = SZ_2G;
+		mmu->use_cpu_map = false;
+	}
+
+	iommu->iommu_reg_list = kgsl_iommuv0_reg;
+	iommu->ctx_offset = KGSL_IOMMU_CTX_OFFSET_V0;
+
+	if (msm_soc_version_supports_iommu_v0()) {
+		iommu->iommu_reg_list = kgsl_iommuv0_reg;
+		iommu->ctx_offset = KGSL_IOMMU_CTX_OFFSET_V0;
+	} else {
 		iommu->iommu_reg_list = kgsl_iommuv1_reg;
 		iommu->ctx_offset = KGSL_IOMMU_CTX_OFFSET_V1;
-	} else {
-		iommu->iommu_reg_list = kgsl_iommuv2_reg;
-		iommu->ctx_offset = KGSL_IOMMU_CTX_OFFSET_V2;
 	}
 
 	/* A nop is required in an indirect buffer when switching
@@ -939,6 +1312,15 @@
 				KGSL_IOMMU_SETSTATE_NOP_OFFSET,
 				cp_nop_packet(1));
 
+	if (cpu_is_msm8960()) {
+		/*
+		 * 8960 doesn't have a second context bank, so the IOMMU
+		 * registers must be mapped into every pagetable.
+		 */
+		iommu_ops.mmu_setup_pt = kgsl_iommu_setup_regs;
+		iommu_ops.mmu_cleanup_pt = kgsl_iommu_cleanup_regs;
+	}
+
 	dev_info(mmu->device->dev, "|%s| MMU type set for device is IOMMU\n",
 			__func__);
 done:
@@ -961,51 +1343,31 @@
 static int kgsl_iommu_setup_defaultpagetable(struct kgsl_mmu *mmu)
 {
 	int status = 0;
-	int i = 0;
-	struct kgsl_iommu *iommu = mmu->priv;
-	struct kgsl_pagetable *pagetable = NULL;
 
 	/* If chip is not 8960 then we use the 2nd context bank for pagetable
 	 * switching on the 3D side for which a separate table is allocated */
-	if (!cpu_is_msm8960() && msm_soc_version_supports_iommu_v1()) {
+	if (!cpu_is_msm8960() && msm_soc_version_supports_iommu_v0()) {
 		mmu->priv_bank_table =
-			kgsl_mmu_getpagetable(KGSL_MMU_PRIV_BANK_TABLE_NAME);
+			kgsl_mmu_getpagetable(mmu,
+					KGSL_MMU_PRIV_BANK_TABLE_NAME);
 		if (mmu->priv_bank_table == NULL) {
 			status = -ENOMEM;
 			goto err;
 		}
+		status = kgsl_iommu_setup_regs(mmu, mmu->priv_bank_table);
+		if (status)
+			goto err;
 	}
-	mmu->defaultpagetable = kgsl_mmu_getpagetable(KGSL_MMU_GLOBAL_PT);
+	mmu->defaultpagetable = kgsl_mmu_getpagetable(mmu, KGSL_MMU_GLOBAL_PT);
 	/* Return error if the default pagetable doesn't exist */
 	if (mmu->defaultpagetable == NULL) {
 		status = -ENOMEM;
 		goto err;
 	}
-	pagetable = mmu->priv_bank_table ? mmu->priv_bank_table :
-				mmu->defaultpagetable;
-	/* Map the IOMMU regsiters to only defaultpagetable */
-	if (msm_soc_version_supports_iommu_v1()) {
-		for (i = 0; i < iommu->unit_count; i++) {
-			iommu->iommu_units[i].reg_map.priv |=
-						KGSL_MEMDESC_GLOBAL;
-			status = kgsl_mmu_map(pagetable,
-				&(iommu->iommu_units[i].reg_map),
-				GSL_PT_PAGE_RV | GSL_PT_PAGE_WV);
-			if (status) {
-				iommu->iommu_units[i].reg_map.priv &=
-							~KGSL_MEMDESC_GLOBAL;
-				goto err;
-			}
-		}
-	}
 	return status;
 err:
-	for (i--; i >= 0; i--) {
-		kgsl_mmu_unmap(pagetable,
-				&(iommu->iommu_units[i].reg_map));
-		iommu->iommu_units[i].reg_map.priv &= ~KGSL_MEMDESC_GLOBAL;
-	}
 	if (mmu->priv_bank_table) {
+		kgsl_iommu_cleanup_regs(mmu, mmu->priv_bank_table);
 		kgsl_mmu_putpagetable(mmu->priv_bank_table);
 		mmu->priv_bank_table = NULL;
 	}
@@ -1016,9 +1378,113 @@
 	return status;
 }
 
-static int kgsl_iommu_start(struct kgsl_mmu *mmu)
+/*
+ * kgsl_iommu_lock_rb_in_tlb - Allocate TLB entries and lock the
+ * ringbuffer's virtual to physical address translations into the
+ * TLB for the 3D device.
+ * @mmu - Pointer to mmu structure
+ *
+ * Return - void
+ */
+static void kgsl_iommu_lock_rb_in_tlb(struct kgsl_mmu *mmu)
 {
 	struct kgsl_device *device = mmu->device;
+	struct adreno_device *adreno_dev = ADRENO_DEVICE(device);
+	struct adreno_ringbuffer *rb;
+	struct kgsl_iommu *iommu = mmu->priv;
+	unsigned int num_tlb_entries;
+	unsigned int tlblkcr = 0;
+	unsigned int v2pxx = 0;
+	unsigned int vaddr = 0;
+	int i, j, k, l;
+
+	if (!iommu->sync_lock_initialized)
+		return;
+
+	rb = &adreno_dev->ringbuffer;
+	num_tlb_entries = rb->buffer_desc.size / PAGE_SIZE;
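+	/* One locked TLB entry is needed per page of the ringbuffer */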
+
+	for (i = 0; i < iommu->unit_count; i++) {
+		struct kgsl_iommu_unit *iommu_unit = &iommu->iommu_units[i];
+		for (j = 0; j < iommu_unit->dev_count; j++) {
+			tlblkcr = 0;
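+			/*
+			 * Size FLOOR to cover all of the ringbuffer
+			 * translations locked below.
+			 */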
+			if (cpu_is_msm8960())
+				tlblkcr |= ((num_tlb_entries &
+					KGSL_IOMMU_TLBLKCR_FLOOR_MASK) <<
+					KGSL_IOMMU_TLBLKCR_FLOOR_SHIFT);
+			else
+				tlblkcr |= (((num_tlb_entries *
+					iommu_unit->dev_count) &
+					KGSL_IOMMU_TLBLKCR_FLOOR_MASK) <<
+					KGSL_IOMMU_TLBLKCR_FLOOR_SHIFT);
+			/* Do not invalidate locked entries on tlbiall flush */
+			tlblkcr	|= ((1 & KGSL_IOMMU_TLBLKCR_TLBIALLCFG_MASK)
+				<< KGSL_IOMMU_TLBLKCR_TLBIALLCFG_SHIFT);
+			tlblkcr	|= ((1 & KGSL_IOMMU_TLBLKCR_TLBIASIDCFG_MASK)
+				<< KGSL_IOMMU_TLBLKCR_TLBIASIDCFG_SHIFT);
+			tlblkcr	|= ((1 & KGSL_IOMMU_TLBLKCR_TLBIVAACFG_MASK)
+				<< KGSL_IOMMU_TLBLKCR_TLBIVAACFG_SHIFT);
+			/* Enable tlb locking */
+			tlblkcr |= ((1 & KGSL_IOMMU_TLBLKCR_LKE_MASK)
+				<< KGSL_IOMMU_TLBLKCR_LKE_SHIFT);
+			KGSL_IOMMU_SET_CTX_REG(iommu, iommu_unit,
+					iommu_unit->dev[j].ctx_id,
+					TLBLKCR, tlblkcr);
+		}
+		for (j = 0; j < iommu_unit->dev_count; j++) {
+			/* skip locking entries for private bank on 8960 */
+			if (cpu_is_msm8960() &&  KGSL_IOMMU_CONTEXT_PRIV == j)
+				continue;
+			/* Lock the ringbuffer virtual address into tlb */
+			vaddr = rb->buffer_desc.gpuaddr;
+			for (k = 0; k < num_tlb_entries; k++) {
+				v2pxx = 0;
+				v2pxx |= (((k + j * num_tlb_entries) &
+					KGSL_IOMMU_V2PXX_INDEX_MASK)
+					<< KGSL_IOMMU_V2PXX_INDEX_SHIFT);
+				v2pxx |= vaddr & (KGSL_IOMMU_V2PXX_VA_MASK <<
+						KGSL_IOMMU_V2PXX_VA_SHIFT);
+
+				KGSL_IOMMU_SET_CTX_REG(iommu, iommu_unit,
+						iommu_unit->dev[j].ctx_id,
+						V2PUR, v2pxx);
+				vaddr += PAGE_SIZE;
+				for (l = 0; l < iommu_unit->dev_count; l++) {
+					tlblkcr = KGSL_IOMMU_GET_CTX_REG(iommu,
+						iommu_unit,
+						iommu_unit->dev[l].ctx_id,
+						TLBLKCR);
+					mb();
+					tlblkcr &=
+					~(KGSL_IOMMU_TLBLKCR_VICTIM_MASK
+					<< KGSL_IOMMU_TLBLKCR_VICTIM_SHIFT);
+					tlblkcr |= (((k + 1 +
+					(j * num_tlb_entries)) &
+					KGSL_IOMMU_TLBLKCR_VICTIM_MASK) <<
+					KGSL_IOMMU_TLBLKCR_VICTIM_SHIFT);
+					KGSL_IOMMU_SET_CTX_REG(iommu,
+						iommu_unit,
+						iommu_unit->dev[l].ctx_id,
+						TLBLKCR, tlblkcr);
+				}
+			}
+		}
+		for (j = 0; j < iommu_unit->dev_count; j++) {
+			tlblkcr = KGSL_IOMMU_GET_CTX_REG(iommu, iommu_unit,
+						iommu_unit->dev[j].ctx_id,
+						TLBLKCR);
+			mb();
+			/* Disable tlb locking */
+			tlblkcr &= ~(KGSL_IOMMU_TLBLKCR_LKE_MASK
+				<< KGSL_IOMMU_TLBLKCR_LKE_SHIFT);
+			KGSL_IOMMU_SET_CTX_REG(iommu, iommu_unit,
+				iommu_unit->dev[j].ctx_id, TLBLKCR, tlblkcr);
+		}
+	}
+}
+
+static int kgsl_iommu_start(struct kgsl_mmu *mmu)
+{
 	int status;
 	struct kgsl_iommu *iommu = mmu->priv;
 	int i, j;
@@ -1030,23 +1496,21 @@
 		status = kgsl_iommu_setup_defaultpagetable(mmu);
 		if (status)
 			return -ENOMEM;
-
-		/* Initialize the sync lock between GPU and CPU */
-		if (msm_soc_version_supports_iommu_v1() &&
-			(device->id == KGSL_DEVICE_3D0))
-				kgsl_iommu_init_sync_lock(mmu);
 	}
+	status = kgsl_iommu_start_sync_lock(mmu);
+	if (status)
+		return status;
 
 	/* We use the GPU MMU to control access to IOMMU registers on 8960 with
 	 * a225, hence we still keep the MMU active on 8960 */
-	if (cpu_is_msm8960()) {
+	if (cpu_is_msm8960() && KGSL_DEVICE_3D0 == mmu->device->id) {
 		struct kgsl_mh *mh = &(mmu->device->mh);
+		BUG_ON(iommu->iommu_units[0].reg_map.gpuaddr != 0 &&
+			mh->mpu_base > iommu->iommu_units[0].reg_map.gpuaddr);
 		kgsl_regwrite(mmu->device, MH_MMU_CONFIG, 0x00000001);
+
 		kgsl_regwrite(mmu->device, MH_MMU_MPU_END,
-			mh->mpu_base +
-			iommu->iommu_units[0].reg_map.gpuaddr);
-	} else {
-		kgsl_regwrite(mmu->device, MH_MMU_CONFIG, 0x00000000);
+			mh->mpu_base + mh->mpu_range);
 	}
 
 	mmu->hwpagetable = mmu->defaultpagetable;
@@ -1071,6 +1535,7 @@
 	 * changing pagetables we can use this lsb value of the pagetable w/o
 	 * having to read it again
 	 */
+	msm_iommu_lock();
 	for (i = 0; i < iommu->unit_count; i++) {
 		struct kgsl_iommu_unit *iommu_unit = &iommu->iommu_units[i];
 		for (j = 0; j < iommu_unit->dev_count; j++) {
@@ -1081,6 +1546,13 @@
 						TTBR0));
 		}
 	}
+	kgsl_iommu_lock_rb_in_tlb(mmu);
+	msm_iommu_unlock();
+
+	/* Record the setstate NOP packet in the CFF dump so the capture is complete */
+	kgsl_cffdump_setmem(mmu->setstate_memory.gpuaddr +
+				KGSL_IOMMU_SETSTATE_NOP_OFFSET,
+				cp_nop_packet(1), sizeof(unsigned int));
 
 	kgsl_iommu_disable_clk_on_ts(mmu, 0, false);
 	mmu->flags |= KGSL_FLAGS_STARTED;
@@ -1094,13 +1566,13 @@
 }
 
 static int
-kgsl_iommu_unmap(void *mmu_specific_pt,
+kgsl_iommu_unmap(struct kgsl_pagetable *pt,
 		struct kgsl_memdesc *memdesc,
 		unsigned int *tlb_flags)
 {
 	int ret;
 	unsigned int range = kgsl_sg_size(memdesc->sg, memdesc->sglen);
-	struct kgsl_iommu_pt *iommu_pt = mmu_specific_pt;
+	struct kgsl_iommu_pt *iommu_pt = pt->priv;
 
 	/* All GPU addresses as assigned are page aligned, but some
 	   functions perturb the gpuaddr with an offset, so apply the
@@ -1117,43 +1589,58 @@
 			"with err: %d\n", iommu_pt->domain, gpuaddr,
 			range, ret);
 
-#ifdef CONFIG_KGSL_PER_PROCESS_PAGE_TABLE
 	/*
 	 * Flushing only required if per process pagetables are used. With
 	 * global case, flushing will happen inside iommu_map function
 	 */
-	if (!ret && msm_soc_version_supports_iommu_v1())
+	if (!ret && kgsl_mmu_is_perprocess(pt->mmu))
 		*tlb_flags = UINT_MAX;
-#endif
 	return 0;
 }
 
 static int
-kgsl_iommu_map(void *mmu_specific_pt,
+kgsl_iommu_map(struct kgsl_pagetable *pt,
 			struct kgsl_memdesc *memdesc,
 			unsigned int protflags,
 			unsigned int *tlb_flags)
 {
 	int ret;
 	unsigned int iommu_virt_addr;
-	struct kgsl_iommu_pt *iommu_pt = mmu_specific_pt;
+	struct kgsl_iommu_pt *iommu_pt = pt->priv;
 	int size = kgsl_sg_size(memdesc->sg, memdesc->sglen);
 
 	BUG_ON(NULL == iommu_pt);
 
+	/* if there's a guard page, we'll map it read only below */
+	if ((protflags & IOMMU_WRITE) && kgsl_memdesc_has_guard_page(memdesc))
+			size -= PAGE_SIZE;
 
 	iommu_virt_addr = memdesc->gpuaddr;
 
 	ret = iommu_map_range(iommu_pt->domain, iommu_virt_addr, memdesc->sg,
-				size, (IOMMU_READ | IOMMU_WRITE));
+				size, protflags);
 	if (ret) {
-		KGSL_CORE_ERR("iommu_map_range(%p, %x, %p, %d, %d) "
-				"failed with err: %d\n", iommu_pt->domain,
-				iommu_virt_addr, memdesc->sg, size,
-				(IOMMU_READ | IOMMU_WRITE), ret);
+		KGSL_CORE_ERR("iommu_map_range(%p, %x, %p, %d, %x) err: %d\n",
+			iommu_pt->domain, iommu_virt_addr, memdesc->sg, size,
+			protflags, ret);
 		return ret;
 	}
+	if ((protflags & IOMMU_WRITE) && kgsl_memdesc_has_guard_page(memdesc)) {
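+		/*
+		 * The guard page is the last scatterlist entry; map it
+		 * without IOMMU_WRITE so that an overrun faults instead
+		 * of silently writing past the buffer.
+		 */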
+		struct scatterlist *sg = &memdesc->sg[memdesc->sglen - 1];
 
+		ret = iommu_map(iommu_pt->domain, iommu_virt_addr + size,
+				kgsl_get_sg_pa(sg), PAGE_SIZE,
+				protflags & ~IOMMU_WRITE);
+		if (ret) {
+			KGSL_CORE_ERR("iommu_map(%p, %x, %x, %x) err: %d\n",
+				iommu_pt->domain, iommu_virt_addr + size,
+				kgsl_get_sg_pa(sg), protflags & ~IOMMU_WRITE,
+				ret);
+			/* cleanup the partial mapping */
+			iommu_unmap_range(iommu_pt->domain, iommu_virt_addr,
+					  size);
+		}
+	}
 	return ret;
 }
 
@@ -1166,7 +1653,6 @@
 	 *
 	 *  call this with the global lock held
 	 */
-
 	if (mmu->flags & KGSL_FLAGS_STARTED) {
 		/* detach iommu attachment */
 		kgsl_detach_pagetable_iommu_domain(mmu);
@@ -1181,10 +1667,12 @@
 				for (j = 0; j < iommu_unit->dev_count; j++) {
 					if (iommu_unit->dev[j].fault) {
 						kgsl_iommu_enable_clk(mmu, j);
+						msm_iommu_lock();
 						KGSL_IOMMU_SET_CTX_REG(iommu,
 						iommu_unit,
 						iommu_unit->dev[j].ctx_id,
 						RESUME, 1);
+						msm_iommu_unlock();
 						iommu_unit->dev[j].fault = 0;
 					}
 				}
@@ -1202,22 +1690,28 @@
 {
 	struct kgsl_iommu *iommu = mmu->priv;
 	int i;
-	for (i = 0; i < iommu->unit_count; i++) {
-		struct kgsl_pagetable *pagetable = (mmu->priv_bank_table ?
-			mmu->priv_bank_table : mmu->defaultpagetable);
-		if (iommu->iommu_units[i].reg_map.gpuaddr)
-			kgsl_mmu_unmap(pagetable,
-			&(iommu->iommu_units[i].reg_map));
-		if (iommu->iommu_units[i].reg_map.hostptr)
-			iounmap(iommu->iommu_units[i].reg_map.hostptr);
-		kgsl_sg_free(iommu->iommu_units[i].reg_map.sg,
-				iommu->iommu_units[i].reg_map.sglen);
+
+	if (mmu->priv_bank_table != NULL) {
+		kgsl_iommu_cleanup_regs(mmu, mmu->priv_bank_table);
+		kgsl_mmu_putpagetable(mmu->priv_bank_table);
 	}
 
-	if (mmu->priv_bank_table)
-		kgsl_mmu_putpagetable(mmu->priv_bank_table);
-	if (mmu->defaultpagetable)
+	if (mmu->defaultpagetable != NULL)
 		kgsl_mmu_putpagetable(mmu->defaultpagetable);
+
+	for (i = 0; i < iommu->unit_count; i++) {
+		struct kgsl_memdesc *reg_map = &iommu->iommu_units[i].reg_map;
+
+		if (reg_map->hostptr)
+			iounmap(reg_map->hostptr);
+		kgsl_sg_free(reg_map->sg, reg_map->sglen);
+		reg_map->priv &= ~KGSL_MEMDESC_GLOBAL;
+	}
+	/* clear IOMMU GPU CPU sync structures */
+	kgsl_sg_free(iommu->sync_lock_desc.sg, iommu->sync_lock_desc.sglen);
+	memset(&iommu->sync_lock_desc, 0, sizeof(iommu->sync_lock_desc));
+	iommu->sync_lock_vars = NULL;
+
 	kfree(iommu);
 
 	return 0;
@@ -1273,12 +1767,15 @@
 	pt_base &= (iommu->iommu_reg_list[KGSL_IOMMU_CTX_TTBR0].reg_mask <<
 			iommu->iommu_reg_list[KGSL_IOMMU_CTX_TTBR0].reg_shift);
 
-	//if (msm_soc_version_supports_iommu_v1())
+	/* For the v0 SMMU the GPU needs to be idle for TLB invalidate as well */
+	if (msm_soc_version_supports_iommu_v0())
+		kgsl_idle(mmu->device);
+
 	/* Acquire GPU-CPU sync Lock here */
 	msm_iommu_lock();
 
 	if (flags & KGSL_MMUFLAGS_PTUPDATE) {
-		if (!msm_soc_version_supports_iommu_v1())
+		if (!msm_soc_version_supports_iommu_v0())
 			kgsl_idle(mmu->device);
 		for (i = 0; i < iommu->unit_count; i++) {
 			/* get the lsb value which should not change when
@@ -1357,6 +1854,9 @@
 	.mmu_get_num_iommu_units = kgsl_iommu_get_num_iommu_units,
 	.mmu_pt_equal = kgsl_iommu_pt_equal,
 	.mmu_get_pt_base_addr = kgsl_iommu_get_pt_base_addr,
+	/* These callbacks will be set on some chipsets */
+	.mmu_setup_pt = NULL,
+	.mmu_cleanup_pt = NULL,
 	.mmu_sync_lock = kgsl_iommu_sync_lock,
 	.mmu_sync_unlock = kgsl_iommu_sync_unlock,
 };
diff --git a/drivers/gpu/msm/kgsl_iommu.h b/drivers/gpu/msm/kgsl_iommu.h
index 4507700..c09bc4b 100644
--- a/drivers/gpu/msm/kgsl_iommu.h
+++ b/drivers/gpu/msm/kgsl_iommu.h
@@ -1,4 +1,4 @@
-/* Copyright (c) 2012, The Linux Foundation. All rights reserved.
+/* Copyright (c) 2012-2013, The Linux Foundation. All rights reserved.
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License version 2 and
@@ -15,10 +15,37 @@
 
 #include <mach/iommu.h>
 
-#define KGSL_IOMMU_CTX_OFFSET_V1	0
-#define KGSL_IOMMU_CTX_OFFSET_V2	0x8000
+#define KGSL_IOMMU_CTX_OFFSET_V0	0
+#define KGSL_IOMMU_CTX_OFFSET_V1	0x8000
 #define KGSL_IOMMU_CTX_SHIFT		12
 
+/* TLBLKCR fields */
+#define KGSL_IOMMU_TLBLKCR_LKE_MASK		0x00000001
+#define KGSL_IOMMU_TLBLKCR_LKE_SHIFT		0
+#define KGSL_IOMMU_TLBLKCR_TLBIALLCFG_MASK	0x00000001
+#define KGSL_IOMMU_TLBLKCR_TLBIALLCFG_SHIFT	1
+#define KGSL_IOMMU_TLBLKCR_TLBIASIDCFG_MASK	0x00000001
+#define KGSL_IOMMU_TLBLKCR_TLBIASIDCFG_SHIFT	2
+#define KGSL_IOMMU_TLBLKCR_TLBIVAACFG_MASK	0x00000001
+#define KGSL_IOMMU_TLBLKCR_TLBIVAACFG_SHIFT	3
+#define KGSL_IOMMU_TLBLKCR_FLOOR_MASK		0x000000FF
+#define KGSL_IOMMU_TLBLKCR_FLOOR_SHIFT		8
+#define KGSL_IOMMU_TLBLKCR_VICTIM_MASK		0x000000FF
+#define KGSL_IOMMU_TLBLKCR_VICTIM_SHIFT		16
+
+/* V2PXX fields */
+#define KGSL_IOMMU_V2PXX_INDEX_MASK		0x000000FF
+#define KGSL_IOMMU_V2PXX_INDEX_SHIFT		0
+#define KGSL_IOMMU_V2PXX_VA_MASK		0x000FFFFF
+#define KGSL_IOMMU_V2PXX_VA_SHIFT		12
+
+/* FSYNR1 V0 fields */
+#define KGSL_IOMMU_FSYNR1_AWRITE_MASK		0x00000001
+#define KGSL_IOMMU_FSYNR1_AWRITE_SHIFT		8
+/* FSYNR0 V1 fields */
+#define KGSL_IOMMU_V1_FSYNR0_WNR_MASK		0x00000001
+#define KGSL_IOMMU_V1_FSYNR0_WNR_SHIFT		4
+
 enum kgsl_iommu_reg_map {
 	KGSL_IOMMU_GLOBAL_BASE = 0,
 	KGSL_IOMMU_CTX_TTBR0,
@@ -26,6 +53,10 @@
 	KGSL_IOMMU_CTX_FSR,
 	KGSL_IOMMU_CTX_TLBIALL,
 	KGSL_IOMMU_CTX_RESUME,
+	KGSL_IOMMU_CTX_TLBLKCR,
+	KGSL_IOMMU_CTX_V2PUR,
+	KGSL_IOMMU_CTX_FSYNR0,
+	KGSL_IOMMU_CTX_FSYNR1,
 	KGSL_IOMMU_REG_MAX
 };
 
@@ -124,6 +155,8 @@
  * IOMMU registers
  * @sync_lock_desc: GPU Memory descriptor for the memory containing the
  * spinlocks
+ * @sync_lock_offset - The offset within the page at which the sync
+ * variables are located
  * @sync_lock_initialized: True if the sync_lock feature is enabled
  */
 struct kgsl_iommu {
@@ -136,6 +169,7 @@
 	struct kgsl_iommu_register_list *iommu_reg_list;
 	struct remote_iommu_petersons_spinlock *sync_lock_vars;
 	struct kgsl_memdesc sync_lock_desc;
+	unsigned int sync_lock_offset;
 	bool sync_lock_initialized;
 };
 
diff --git a/drivers/gpu/msm/kgsl_log.h b/drivers/gpu/msm/kgsl_log.h
index 83d14f7..a7832e4 100644
--- a/drivers/gpu/msm/kgsl_log.h
+++ b/drivers/gpu/msm/kgsl_log.h
@@ -1,4 +1,4 @@
-/* Copyright (c) 2002,2008-2011, The Linux Foundation. All rights reserved.
+/* Copyright (c) 2002,2008-2011,2013, The Linux Foundation. All rights reserved.
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License version 2 and
diff --git a/drivers/gpu/msm/kgsl_mmu.c b/drivers/gpu/msm/kgsl_mmu.c
index d1f58c4..4e95373 100644
--- a/drivers/gpu/msm/kgsl_mmu.c
+++ b/drivers/gpu/msm/kgsl_mmu.c
@@ -1,4 +1,4 @@
-/* Copyright (c) 2002,2007-2012, The Linux Foundation. All rights reserved.
+/* Copyright (c) 2002,2007-2013, The Linux Foundation. All rights reserved.
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License version 2 and
@@ -23,13 +23,11 @@
 
 #include "kgsl.h"
 #include "kgsl_mmu.h"
+#include "kgsl_gpummu.h"
 #include "kgsl_device.h"
 #include "kgsl_sharedmem.h"
 #include "adreno.h"
 
-#define KGSL_MMU_ALIGN_SHIFT    13
-#define KGSL_MMU_ALIGN_MASK     (~((1 << KGSL_MMU_ALIGN_SHIFT) - 1))
-
 static enum kgsl_mmutype kgsl_mmu_type;
 
 static void pagetable_remove_sysfs_objects(struct kgsl_pagetable *pagetable);
@@ -37,17 +35,18 @@
 static int kgsl_cleanup_pt(struct kgsl_pagetable *pt)
 {
 	int i;
-	/* For IOMMU only unmap the global structures to global pt */
-	if ((KGSL_MMU_TYPE_NONE != kgsl_mmu_type) &&
-		(KGSL_MMU_TYPE_IOMMU == kgsl_mmu_type) &&
-		(KGSL_MMU_GLOBAL_PT !=  pt->name) &&
-		(KGSL_MMU_PRIV_BANK_TABLE_NAME !=  pt->name))
-		return 0;
+	struct kgsl_device *device;
+
 	for (i = 0; i < KGSL_DEVICE_MAX; i++) {
-		struct kgsl_device *device = kgsl_driver.devp[i];
+		device = kgsl_driver.devp[i];
 		if (device)
 			device->ftbl->cleanup_pt(device, pt);
 	}
+	/* Only the 3D device needs MMU-specific pagetable entries */
+	device = kgsl_driver.devp[KGSL_DEVICE_3D0];
+	if (device->mmu.mmu_ops->mmu_cleanup_pt != NULL)
+		device->mmu.mmu_ops->mmu_cleanup_pt(&device->mmu, pt);
+
 	return 0;
 }
 
@@ -56,21 +55,25 @@
 {
 	int i = 0;
 	int status = 0;
+	struct kgsl_device *device;
 
-	/* For IOMMU only map the global structures to global pt */
-	if ((KGSL_MMU_TYPE_NONE != kgsl_mmu_type) &&
-		(KGSL_MMU_TYPE_IOMMU == kgsl_mmu_type) &&
-		(KGSL_MMU_GLOBAL_PT !=  pt->name) &&
-		(KGSL_MMU_PRIV_BANK_TABLE_NAME !=  pt->name))
-		return 0;
 	for (i = 0; i < KGSL_DEVICE_MAX; i++) {
-		struct kgsl_device *device = kgsl_driver.devp[i];
+		device = kgsl_driver.devp[i];
 		if (device) {
 			status = device->ftbl->setup_pt(device, pt);
 			if (status)
 				goto error_pt;
 		}
 	}
+	/* Only the 3D device needs MMU-specific pagetable entries */
+	device = kgsl_driver.devp[KGSL_DEVICE_3D0];
+	if (device->mmu.mmu_ops->mmu_setup_pt != NULL) {
+		status = device->mmu.mmu_ops->mmu_setup_pt(&device->mmu, pt);
+		if (status) {
+			i = KGSL_DEVICE_MAX - 1;
+			goto error_pt;
+		}
+	}
 	return status;
 error_pt:
 	while (i >= 0) {
@@ -101,7 +104,7 @@
 	if (pagetable->pool)
 		gen_pool_destroy(pagetable->pool);
 
-	pagetable->pt_ops->mmu_destroy_pagetable(pagetable->priv);
+	pagetable->pt_ops->mmu_destroy_pagetable(pagetable);
 
 	kfree(pagetable);
 }
@@ -191,7 +194,7 @@
 
 	if (pt) {
 		ret += snprintf(buf, PAGE_SIZE, "0x%x\n",
-			kgsl_mmu_get_ptsize());
+			kgsl_mmu_get_ptsize(pt->mmu));
 	}
 
 	kgsl_put_pagetable(pt);
@@ -310,22 +313,6 @@
 	return ret;
 }
 
-unsigned int kgsl_mmu_get_ptsize(void)
-{
-	/*
-	 * For IOMMU, we could do up to 4G virtual range if we wanted to, but
-	 * it makes more sense to return a smaller range and leave the rest of
-	 * the virtual range for future improvements
-	 */
-
-	if (KGSL_MMU_TYPE_GPU == kgsl_mmu_type)
-		return CONFIG_MSM_KGSL_PAGE_TABLE_SIZE;
-	else if (KGSL_MMU_TYPE_IOMMU == kgsl_mmu_type)
-		return SZ_2G - KGSL_PAGETABLE_BASE;
-	else
-		return 0;
-}
-
 int
 kgsl_mmu_get_ptname_from_ptbase(struct kgsl_mmu *mmu, unsigned int pt_base)
 {
@@ -355,15 +342,15 @@
 	unsigned int ret = 0;
 
 	if (!mmu->mmu_ops || !mmu->mmu_ops->mmu_pt_equal)
-		return KGSL_MMU_GLOBAL_PT;
+		return 0;
 	spin_lock(&kgsl_driver.ptlock);
 	list_for_each_entry(pt, &kgsl_driver.pagetable_list, list) {
 		if (mmu->mmu_ops->mmu_pt_equal(mmu, pt, pt_base)) {
-			if ((addr & (PAGE_SIZE-1)) == pt->fault_addr) {
+			if ((addr & ~(PAGE_SIZE-1)) == pt->fault_addr) {
 				ret = 1;
 				break;
 			} else {
-				pt->fault_addr = (addr & (PAGE_SIZE-1));
+				pt->fault_addr = (addr & ~(PAGE_SIZE-1));
 				ret = 0;
 				break;
 			}
@@ -458,7 +445,8 @@
 }
 EXPORT_SYMBOL(kgsl_mh_intrcallback);
 
-static struct kgsl_pagetable *kgsl_mmu_createpagetableobject(
+static struct kgsl_pagetable *
+kgsl_mmu_createpagetableobject(struct kgsl_mmu *mmu,
 				unsigned int name)
 {
 	int status = 0;
@@ -477,8 +465,8 @@
 
 	spin_lock_init(&pagetable->lock);
 
-	ptsize = kgsl_mmu_get_ptsize();
-
+	ptsize = kgsl_mmu_get_ptsize(mmu);
+	pagetable->mmu = mmu;
 	pagetable->name = name;
 	pagetable->max_entries = KGSL_PAGETABLE_ENTRIES(ptsize);
 	pagetable->fault_addr = 0xFFFFFFFF;
@@ -490,10 +478,10 @@
 	if ((KGSL_MMU_TYPE_IOMMU == kgsl_mmu_get_mmutype()) &&
 		((KGSL_MMU_GLOBAL_PT == name) ||
 		(KGSL_MMU_PRIV_BANK_TABLE_NAME == name))) {
-		pagetable->kgsl_pool = gen_pool_create(PAGE_SHIFT, -1);
+		pagetable->kgsl_pool = gen_pool_create(ilog2(SZ_8K), -1);
 		if (pagetable->kgsl_pool == NULL) {
 			KGSL_CORE_ERR("gen_pool_create(%d) failed\n",
-					KGSL_MMU_ALIGN_SHIFT);
+					ilog2(SZ_8K));
 			goto err_alloc;
 		}
 		if (gen_pool_add(pagetable->kgsl_pool,
@@ -504,14 +492,14 @@
 		}
 	}
 
-	pagetable->pool = gen_pool_create(KGSL_MMU_ALIGN_SHIFT, -1);
+	pagetable->pool = gen_pool_create(PAGE_SHIFT, -1);
 	if (pagetable->pool == NULL) {
 		KGSL_CORE_ERR("gen_pool_create(%d) failed\n",
-			      KGSL_MMU_ALIGN_SHIFT);
+			      PAGE_SHIFT);
 		goto err_kgsl_pool;
 	}
 
-	if (gen_pool_add(pagetable->pool, KGSL_PAGETABLE_BASE,
+	if (gen_pool_add(pagetable->pool, kgsl_mmu_get_base_addr(mmu),
 				ptsize, -1)) {
 		KGSL_CORE_ERR("gen_pool_add failed\n");
 		goto err_pool;
@@ -540,7 +528,7 @@
 	return pagetable;
 
 err_mmu_create:
-	pagetable->pt_ops->mmu_destroy_pagetable(pagetable->priv);
+	pagetable->pt_ops->mmu_destroy_pagetable(pagetable);
 err_pool:
 	gen_pool_destroy(pagetable->pool);
 err_kgsl_pool:
@@ -552,24 +540,21 @@
 	return NULL;
 }
 
-struct kgsl_pagetable *kgsl_mmu_getpagetable(unsigned long name)
+struct kgsl_pagetable *kgsl_mmu_getpagetable(struct kgsl_mmu *mmu,
+						unsigned long name)
 {
 	struct kgsl_pagetable *pt;
 
 	if (KGSL_MMU_TYPE_NONE == kgsl_mmu_type)
 		return (void *)(-1);
 
-#ifndef CONFIG_KGSL_PER_PROCESS_PAGE_TABLE
-	name = KGSL_MMU_GLOBAL_PT;
-#endif
-	/* We presently do not support per-process for IOMMU-v2 */
-	if (!msm_soc_version_supports_iommu_v1())
+	if (!kgsl_mmu_is_perprocess(mmu))
 		name = KGSL_MMU_GLOBAL_PT;
 
 	pt = kgsl_get_pagetable(name);
 
 	if (pt == NULL)
-		pt = kgsl_mmu_createpagetableobject(name);
+		pt = kgsl_mmu_createpagetableobject(mmu, name);
 
 	return pt;
 }
@@ -626,24 +611,15 @@
 	 */
 }
 
-static inline struct gen_pool *
-_get_pool(struct kgsl_pagetable *pagetable, unsigned int flags)
-{
-	if (pagetable->kgsl_pool &&
-		(KGSL_MEMDESC_GLOBAL & flags))
-		return pagetable->kgsl_pool;
-	return pagetable->pool;
-}
-
 int
 kgsl_mmu_map(struct kgsl_pagetable *pagetable,
-				struct kgsl_memdesc *memdesc,
-				unsigned int protflags)
+				struct kgsl_memdesc *memdesc)
 {
 	int ret;
-	struct gen_pool *pool;
+	struct gen_pool *pool = NULL;
 	int size;
 	int page_align = ilog2(PAGE_SIZE);
+	unsigned int protflags = kgsl_memdesc_protflags(memdesc);
 
 	if (kgsl_mmu_type == KGSL_MMU_TYPE_NONE) {
 		if (memdesc->sglen == 1) {
@@ -665,33 +641,57 @@
 
 	size = kgsl_sg_size(memdesc->sg, memdesc->sglen);
 
-	/* Allocate from kgsl pool if it exists for global mappings */
-	pool = _get_pool(pagetable, memdesc->priv);
+	pool = pagetable->pool;
 
-	/* Allocate aligned virtual addresses for iommu. This allows
-	 * more efficient pagetable entries if the physical memory
-	 * is also aligned. Don't do this for GPUMMU, because
-	 * the address space is so small.
-	 */
-	if (KGSL_MMU_TYPE_IOMMU == kgsl_mmu_get_mmutype() &&
-	    kgsl_memdesc_get_align(memdesc) > 0)
-		page_align = kgsl_memdesc_get_align(memdesc);
-
-	memdesc->gpuaddr = gen_pool_alloc_aligned(pool, size, page_align);
-	if (memdesc->gpuaddr == 0) {
-		KGSL_CORE_ERR("gen_pool_alloc(%d) failed from pool: %s\n",
-			size,
-			(pool == pagetable->kgsl_pool) ?
-			"kgsl_pool" : "general_pool");
-		KGSL_CORE_ERR(" [%d] allocated=%d, entries=%d\n",
-				pagetable->name, pagetable->stats.mapped,
-				pagetable->stats.entries);
-		return -ENOMEM;
+	if (KGSL_MMU_TYPE_IOMMU == kgsl_mmu_get_mmutype()) {
+		/* Allocate aligned virtual addresses for iommu. This allows
+		 * more efficient pagetable entries if the physical memory
+		 * is also aligned. Don't do this for GPUMMU, because
+		 * the address space is so small.
+		 */
+		if (kgsl_memdesc_get_align(memdesc) > 0)
+			page_align = kgsl_memdesc_get_align(memdesc);
+		if (kgsl_memdesc_is_global(memdesc)) {
+			/*
+			 * Only the default pagetable has a kgsl_pool, and
+			 * it is responsible for creating the mapping for
+			 * each global buffer. The mapping will be reused
+			 * in all other pagetables and it must already exist
+			 * when we're creating other pagetables which do not
+			 * have a kgsl_pool.
+			 */
+			pool = pagetable->kgsl_pool;
+			if (pool == NULL && memdesc->gpuaddr == 0) {
+				KGSL_CORE_ERR(
+				  "No address for global mapping into pt %d\n",
+				  pagetable->name);
+				return -EINVAL;
+			}
+		} else if (kgsl_memdesc_use_cpu_map(memdesc)) {
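+			/*
+			 * With use_cpu_map the GPU address mirrors the
+			 * CPU mapping chosen at mmap time, so no pool
+			 * allocation is needed here.
+			 */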
+			if (memdesc->gpuaddr == 0)
+				return -EINVAL;
+			pool = NULL;
+		}
+	}
+	if (pool) {
+		memdesc->gpuaddr = gen_pool_alloc_aligned(pool, size,
+							  page_align);
+		if (memdesc->gpuaddr == 0) {
+			KGSL_CORE_ERR("gen_pool_alloc(%d) failed, pool: %s\n",
+					size,
+					(pool == pagetable->kgsl_pool) ?
+					"kgsl_pool" : "general_pool");
+			KGSL_CORE_ERR(" [%d] allocated=%d, entries=%d\n",
+					pagetable->name,
+					pagetable->stats.mapped,
+					pagetable->stats.entries);
+			return -ENOMEM;
+		}
 	}
 
 	if (KGSL_MMU_TYPE_IOMMU != kgsl_mmu_get_mmutype())
 		spin_lock(&pagetable->lock);
-	ret = pagetable->pt_ops->mmu_map(pagetable->priv, memdesc, protflags,
+	ret = pagetable->pt_ops->mmu_map(pagetable, memdesc, protflags,
 						&pagetable->tlb_flags);
 	if (KGSL_MMU_TYPE_IOMMU == kgsl_mmu_get_mmutype())
 		spin_lock(&pagetable->lock);
@@ -713,7 +713,8 @@
 
 err_free_gpuaddr:
 	spin_unlock(&pagetable->lock);
-	gen_pool_free(pool, memdesc->gpuaddr, size);
+	if (pool)
+		gen_pool_free(pool, memdesc->gpuaddr, size);
 	memdesc->gpuaddr = 0;
 	return ret;
 }
@@ -743,7 +744,7 @@
 
 	if (KGSL_MMU_TYPE_IOMMU != kgsl_mmu_get_mmutype())
 		spin_lock(&pagetable->lock);
-	pagetable->pt_ops->mmu_unmap(pagetable->priv, memdesc,
+	pagetable->pt_ops->mmu_unmap(pagetable, memdesc,
 					&pagetable->tlb_flags);
 
 	/* If buffer is unmapped 0 fault addr */
@@ -759,21 +760,29 @@
 
 	spin_unlock(&pagetable->lock);
 
-	pool = _get_pool(pagetable, memdesc->priv);
-	gen_pool_free(pool, memdesc->gpuaddr, size);
+	pool = pagetable->pool;
+
+	if (KGSL_MMU_TYPE_IOMMU == kgsl_mmu_get_mmutype()) {
+		if (kgsl_memdesc_is_global(memdesc))
+			pool = pagetable->kgsl_pool;
+		else if (kgsl_memdesc_use_cpu_map(memdesc))
+			pool = NULL;
+	}
+	if (pool)
+		gen_pool_free(pool, memdesc->gpuaddr, size);
 
 	/*
 	 * Don't clear the gpuaddr on global mappings because they
 	 * may be in use by other pagetables
 	 */
-	if (!(memdesc->priv & KGSL_MEMDESC_GLOBAL))
+	if (!kgsl_memdesc_is_global(memdesc))
 		memdesc->gpuaddr = 0;
 	return 0;
 }
 EXPORT_SYMBOL(kgsl_mmu_unmap);
 
 int kgsl_mmu_map_global(struct kgsl_pagetable *pagetable,
-			struct kgsl_memdesc *memdesc, unsigned int protflags)
+			struct kgsl_memdesc *memdesc)
 {
 	int result = -EINVAL;
 	unsigned int gpuaddr = 0;
@@ -785,19 +794,17 @@
 	/* Not all global mappings are needed for all MMU types */
 	if (!memdesc->size)
 		return 0;
-
 	gpuaddr = memdesc->gpuaddr;
 	memdesc->priv |= KGSL_MEMDESC_GLOBAL;
 
-	result = kgsl_mmu_map(pagetable, memdesc, protflags);
+	result = kgsl_mmu_map(pagetable, memdesc);
 	if (result)
 		goto error;
 
 	/*global mappings must have the same gpu address in all pagetables*/
 	if (gpuaddr && gpuaddr != memdesc->gpuaddr) {
-		KGSL_CORE_ERR("pt %p addr mismatch phys 0x%08x"
-			"gpu 0x%0x 0x%08x", pagetable, memdesc->physaddr,
-			gpuaddr, memdesc->gpuaddr);
+		KGSL_CORE_ERR("pt %p addr mismatch phys %pa gpu 0x%0x 0x%08x",
+		     pagetable, &memdesc->physaddr, gpuaddr, memdesc->gpuaddr);
 		goto error_unmap;
 	}
 	return result;
@@ -890,12 +897,18 @@
 }
 EXPORT_SYMBOL(kgsl_mmu_set_mmutype);
 
-int kgsl_mmu_gpuaddr_in_range(unsigned int gpuaddr)
+int kgsl_mmu_gpuaddr_in_range(struct kgsl_pagetable *pt, unsigned int gpuaddr)
 {
 	if (KGSL_MMU_TYPE_NONE == kgsl_mmu_type)
 		return 1;
-	return ((gpuaddr >= KGSL_PAGETABLE_BASE) &&
-		(gpuaddr < (KGSL_PAGETABLE_BASE + kgsl_mmu_get_ptsize())));
+	if (gpuaddr >= kgsl_mmu_get_base_addr(pt->mmu) &&
+		gpuaddr < kgsl_mmu_get_base_addr(pt->mmu) +
+		kgsl_mmu_get_ptsize(pt->mmu))
+		return 1;
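+	/*
+	 * With per-process pagetables, CPU-mirrored mappings may also
+	 * fall anywhere in the user address range.
+	 */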
+	if (kgsl_mmu_get_mmutype() == KGSL_MMU_TYPE_IOMMU
+		&& kgsl_mmu_is_perprocess(pt->mmu))
+		return (gpuaddr > 0 && gpuaddr < TASK_SIZE);
+	return 0;
 }
 EXPORT_SYMBOL(kgsl_mmu_gpuaddr_in_range);
 
diff --git a/drivers/gpu/msm/kgsl_mmu.h b/drivers/gpu/msm/kgsl_mmu.h
index 377f342..d7d9516 100644
--- a/drivers/gpu/msm/kgsl_mmu.h
+++ b/drivers/gpu/msm/kgsl_mmu.h
@@ -1,4 +1,4 @@
-/* Copyright (c) 2002,2007-2012, The Linux Foundation. All rights reserved.
+/* Copyright (c) 2002,2007-2013, The Linux Foundation. All rights reserved.
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License version 2 and
@@ -13,16 +13,23 @@
 #ifndef __KGSL_MMU_H
 #define __KGSL_MMU_H
 
-/*
- * These defines control the split between ttbr1 and ttbr0 pagetables of IOMMU
- * and what ranges of memory we map to them
- */
-#define KGSL_IOMMU_GLOBAL_MEM_BASE	0xC0000000
-#define KGSL_IOMMU_GLOBAL_MEM_SIZE	SZ_4M
-#define KGSL_IOMMU_TTBR1_SPLIT		2
+#include <mach/iommu.h>
 
-#define KGSL_MMU_ALIGN_SHIFT    13
-#define KGSL_MMU_ALIGN_MASK     (~((1 << KGSL_MMU_ALIGN_SHIFT) - 1))
+/*
+ * These defines control the address range for allocations that
+ * are mapped into all pagetables.
+ */
+#define KGSL_IOMMU_GLOBAL_MEM_BASE	0xf8000000
+#define KGSL_IOMMU_GLOBAL_MEM_SIZE	SZ_4M
+
+#define KGSL_MMU_ALIGN_MASK     (~((1 << PAGE_SHIFT) - 1))
+
+/* defconfig option controlling per-process pagetables */
+#ifdef CONFIG_KGSL_PER_PROCESS_PAGE_TABLE
+#define KGSL_MMU_USE_PER_PROCESS_PT true
+#else
+#define KGSL_MMU_USE_PER_PROCESS_PT false
+#endif
 
 /* Identifier for the global page table */
 /* Per process page tables will probably pass in the thread group
@@ -116,6 +123,7 @@
 	unsigned int tlb_flags;
 	unsigned int fault_addr;
 	void *priv;
+	struct kgsl_mmu *mmu;
 };
 
 struct kgsl_mmu;
@@ -149,24 +157,26 @@
 	unsigned int (*mmu_get_pt_base_addr)
 			(struct kgsl_mmu *mmu,
 			struct kgsl_pagetable *pt);
+	int (*mmu_setup_pt) (struct kgsl_mmu *mmu,
+			struct kgsl_pagetable *pt);
+	void (*mmu_cleanup_pt) (struct kgsl_mmu *mmu,
+			struct kgsl_pagetable *pt);
 	unsigned int (*mmu_sync_lock)
-			(struct kgsl_mmu *mmu,
-			unsigned int *cmds);
+			(struct kgsl_mmu *mmu, unsigned int *cmds);
 	unsigned int (*mmu_sync_unlock)
-			(struct kgsl_mmu *mmu,
-			unsigned int *cmds);
+			(struct kgsl_mmu *mmu, unsigned int *cmds);
 };
 
 struct kgsl_mmu_pt_ops {
-	int (*mmu_map) (void *mmu_pt,
+	int (*mmu_map) (struct kgsl_pagetable *pt,
 			struct kgsl_memdesc *memdesc,
 			unsigned int protflags,
 			unsigned int *tlb_flags);
-	int (*mmu_unmap) (void *mmu_pt,
+	int (*mmu_unmap) (struct kgsl_pagetable *pt,
 			struct kgsl_memdesc *memdesc,
 			unsigned int *tlb_flags);
 	void *(*mmu_create_pagetable) (void);
-	void (*mmu_destroy_pagetable) (void *pt);
+	void (*mmu_destroy_pagetable) (struct kgsl_pagetable *);
 };
 
 #define KGSL_MMU_FLAGS_IOMMU_SYNC BIT(31)
@@ -185,14 +195,19 @@
 	const struct kgsl_mmu_ops *mmu_ops;
 	void *priv;
 	int fault;
+	unsigned long pt_base;
+	unsigned long pt_size;
+	bool pt_per_process;
+	bool use_cpu_map;
 };
 
-#include "kgsl_gpummu.h"
-
 extern struct kgsl_mmu_ops iommu_ops;
 extern struct kgsl_mmu_pt_ops iommu_pt_ops;
+extern struct kgsl_mmu_ops gpummu_ops;
+extern struct kgsl_mmu_pt_ops gpummu_pt_ops;
 
-struct kgsl_pagetable *kgsl_mmu_getpagetable(unsigned long name);
+struct kgsl_pagetable *kgsl_mmu_getpagetable(struct kgsl_mmu *,
+						unsigned long name);
 void kgsl_mmu_putpagetable(struct kgsl_pagetable *pagetable);
 void kgsl_mh_start(struct kgsl_device *device);
 void kgsl_mh_intrcallback(struct kgsl_device *device);
@@ -200,10 +215,9 @@
 int kgsl_mmu_start(struct kgsl_device *device);
 int kgsl_mmu_close(struct kgsl_device *device);
 int kgsl_mmu_map(struct kgsl_pagetable *pagetable,
-		 struct kgsl_memdesc *memdesc,
-		 unsigned int protflags);
+		 struct kgsl_memdesc *memdesc);
 int kgsl_mmu_map_global(struct kgsl_pagetable *pagetable,
-			struct kgsl_memdesc *memdesc, unsigned int protflags);
+			struct kgsl_memdesc *memdesc);
 int kgsl_mmu_unmap(struct kgsl_pagetable *pagetable,
 		    struct kgsl_memdesc *memdesc);
 unsigned int kgsl_virtaddr_to_physaddr(void *virtaddr);
@@ -220,8 +234,7 @@
 int kgsl_mmu_enabled(void);
 void kgsl_mmu_set_mmutype(char *mmutype);
 enum kgsl_mmutype kgsl_mmu_get_mmutype(void);
-unsigned int kgsl_mmu_get_ptsize(void);
-int kgsl_mmu_gpuaddr_in_range(unsigned int gpuaddr);
+int kgsl_mmu_gpuaddr_in_range(struct kgsl_pagetable *pt, unsigned int gpuaddr);
 
 /*
  * Static inline functions of MMU that simply call the SMMU specific
@@ -332,6 +345,56 @@
 		return 0;
 }
 
+/*
+ * kgsl_mmu_is_perprocess() - Runtime check for per-process
+ * pagetables.
+ * @mmu: the mmu
+ *
+ * Returns true if per-process pagetables are enabled,
+ * false if not.
+ */
+static inline int kgsl_mmu_is_perprocess(struct kgsl_mmu *mmu)
+{
+	return mmu->pt_per_process;
+}
+
+/*
+ * kgsl_mmu_use_cpu_map() - Runtime check for matching the CPU
+ * address space on the GPU.
+ * @mmu: the mmu
+ *
+ * Returns true if supported, false if not.
+ */
+static inline int kgsl_mmu_use_cpu_map(struct kgsl_mmu *mmu)
+{
+	return mmu->pt_per_process;
+}
+
+/*
+ * kgsl_mmu_get_base_addr() - Get gpu virtual address base.
+ * @mmu: the mmu
+ *
+ * Returns the start address of the allocatable gpu
+ * virtual address space. Other mappings that mirror
+ * the CPU address space are possible outside this range.
+ */
+static inline unsigned int kgsl_mmu_get_base_addr(struct kgsl_mmu *mmu)
+{
+	return mmu->pt_base;
+}
+
+/*
+ * kgsl_mmu_get_ptsize() - Get gpu pagetable size
+ * @mmu: the mmu
+ *
+ * Returns the usable size of the gpu allocatable
+ * address space.
+ */
+static inline unsigned int kgsl_mmu_get_ptsize(struct kgsl_mmu *mmu)
+{
+	return mmu->pt_size;
+}
+
 static inline int kgsl_mmu_sync_lock(struct kgsl_mmu *mmu,
 				unsigned int *cmds)
 {
diff --git a/drivers/gpu/msm/kgsl_pwrctrl.c b/drivers/gpu/msm/kgsl_pwrctrl.c
index d489119..2f8d93e 100644
--- a/drivers/gpu/msm/kgsl_pwrctrl.c
+++ b/drivers/gpu/msm/kgsl_pwrctrl.c
@@ -1,4 +1,4 @@
-/* Copyright (c) 2010-2012, The Linux Foundation. All rights reserved.
+/* Copyright (c) 2010-2013, The Linux Foundation. All rights reserved.
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License version 2 and
@@ -23,13 +23,13 @@
 #include "kgsl_pwrscale.h"
 #include "kgsl_device.h"
 #include "kgsl_trace.h"
+#include "kgsl_sharedmem.h"
 
 #define KGSL_PWRFLAGS_POWER_ON 0
 #define KGSL_PWRFLAGS_CLK_ON   1
 #define KGSL_PWRFLAGS_AXI_ON   2
 #define KGSL_PWRFLAGS_IRQ_ON   3
 
-#define GPU_SWFI_LATENCY	3
 #define UPDATE_BUSY_VAL		1000000
 #define UPDATE_BUSY		50
 
@@ -130,6 +130,16 @@
 	 */
 
 	pwr->active_pwrlevel = new_level;
+	pwrlevel = &pwr->pwrlevels[pwr->active_pwrlevel];
+
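+	/* Vote the bus bandwidth for the new level before the GPU clock changes */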
+	if (test_bit(KGSL_PWRFLAGS_AXI_ON, &pwr->power_flags)) {
+
+		if (pwr->pcl)
+			msm_bus_scale_client_update_request(pwr->pcl,
+				pwrlevel->bus_freq);
+		else if (pwr->ebi1_clk)
+			clk_set_rate(pwr->ebi1_clk, pwrlevel->bus_freq);
+	}
 
 	if (test_bit(KGSL_PWRFLAGS_CLK_ON, &pwr->power_flags) ||
 		(device->state == KGSL_STATE_NAP)) {
@@ -156,16 +166,6 @@
 		}
 	}
 
-	pwrlevel = &pwr->pwrlevels[pwr->active_pwrlevel];
-
-	if (test_bit(KGSL_PWRFLAGS_AXI_ON, &pwr->power_flags)) {
-
-		if (pwr->pcl)
-			msm_bus_scale_client_update_request(pwr->pcl,
-				pwrlevel->bus_freq);
-		else if (pwr->ebi1_clk)
-			clk_set_rate(pwr->ebi1_clk, pwrlevel->bus_freq);
-	}
 
 	trace_kgsl_pwrlevel(device, pwr->active_pwrlevel, pwrlevel->gpu_freq);
 }
@@ -357,13 +357,13 @@
 	return snprintf(buf, PAGE_SIZE, "%d\n", pwr->num_pwrlevels - 1);
 }
 
-/* Given a GPU clock value, return the nearest powerlevel */
+/* Given a GPU clock value, return the lowest matching powerlevel */
 
 static int _get_nearest_pwrlevel(struct kgsl_pwrctrl *pwr, unsigned int clock)
 {
 	int i;
 
-	for (i = 0; i < pwr->num_pwrlevels - 1; i++) {
+	for (i = pwr->num_pwrlevels - 1; i >= 0; i--) {
 		if (abs(pwr->pwrlevels[i].gpu_freq - clock) < 5000000)
 			return i;
 	}
@@ -515,7 +515,6 @@
 	struct kgsl_device *device = kgsl_device_from_dev(dev);
 	struct kgsl_pwrctrl *pwr;
 	const long div = 1000/HZ;
-	static unsigned int org_interval_timeout = 1;
 	int rc;
 
 	if (device == NULL)
@@ -528,15 +527,11 @@
 	if (rc)
 		return rc;
 
-	if (org_interval_timeout == 1)
-		org_interval_timeout = pwr->interval_timeout;
-
 	mutex_lock(&device->mutex);
 
 	/* Let the timeout be requested in ms, but convert to jiffies. */
 	val /= div;
-	if (val >= org_interval_timeout)
-		pwr->interval_timeout = val;
+	pwr->interval_timeout = val;
 
 	mutex_unlock(&device->mutex);
 
@@ -548,10 +543,48 @@
 					char *buf)
 {
 	struct kgsl_device *device = kgsl_device_from_dev(dev);
+	int mul = 1000/HZ;
+	if (device == NULL)
+		return 0;
+	/* Show the idle_timeout converted to msec */
+	return snprintf(buf, PAGE_SIZE, "%d\n",
+		device->pwrctrl.interval_timeout * mul);
+}
+
+static int kgsl_pwrctrl_pmqos_latency_store(struct device *dev,
+					struct device_attribute *attr,
+					const char *buf, size_t count)
+{
+	char temp[20];
+	unsigned long val;
+	struct kgsl_device *device = kgsl_device_from_dev(dev);
+	int rc;
+
+	if (device == NULL)
+		return 0;
+
+	snprintf(temp, sizeof(temp), "%.*s",
+			(int)min(count, sizeof(temp) - 1), buf);
+	rc = kstrtoul(temp, 0, &val);
+	if (rc)
+		return rc;
+
+	mutex_lock(&device->mutex);
+	device->pwrctrl.pm_qos_latency = val;
+	mutex_unlock(&device->mutex);
+
+	return count;
+}
+
+static int kgsl_pwrctrl_pmqos_latency_show(struct device *dev,
+					   struct device_attribute *attr,
+					   char *buf)
+{
+	struct kgsl_device *device = kgsl_device_from_dev(dev);
 	if (device == NULL)
 		return 0;
 	return snprintf(buf, PAGE_SIZE, "%d\n",
-		device->pwrctrl.interval_timeout);
+		device->pwrctrl.pm_qos_latency);
 }
 
 static int kgsl_pwrctrl_gpubusy_show(struct device *dev,
@@ -615,6 +648,14 @@
 	return num_chars;
 }
 
+static int kgsl_pwrctrl_reset_count_show(struct device *dev,
+					struct device_attribute *attr,
+					char *buf)
+{
+	struct kgsl_device *device = kgsl_device_from_dev(dev);
+	return snprintf(buf, PAGE_SIZE, "%d\n", device->reset_counter);
+}
+
 DEVICE_ATTR(gpuclk, 0644, kgsl_pwrctrl_gpuclk_show, kgsl_pwrctrl_gpuclk_store);
 DEVICE_ATTR(max_gpuclk, 0644, kgsl_pwrctrl_max_gpuclk_show,
 	kgsl_pwrctrl_max_gpuclk_store);
@@ -640,6 +681,12 @@
 DEVICE_ATTR(num_pwrlevels, 0444,
 	kgsl_pwrctrl_num_pwrlevels_show,
 	NULL);
+DEVICE_ATTR(pmqos_latency, 0644,
+	kgsl_pwrctrl_pmqos_latency_show,
+	kgsl_pwrctrl_pmqos_latency_store);
+DEVICE_ATTR(reset_count, 0444,
+	kgsl_pwrctrl_reset_count_show,
+	NULL);
 
 static const struct device_attribute *pwrctrl_attr_list[] = {
 	&dev_attr_gpuclk,
@@ -653,6 +700,8 @@
 	&dev_attr_min_pwrlevel,
 	&dev_attr_thermal_pwrlevel,
 	&dev_attr_num_pwrlevels,
+	&dev_attr_pmqos_latency,
+	&dev_attr_reset_count,
 	NULL
 };
 
@@ -684,6 +733,9 @@
 	clkstats->on_time_old = on_time;
 	clkstats->elapsed_old = clkstats->elapsed;
 	clkstats->elapsed = 0;
+
+	trace_kgsl_gpubusy(device, clkstats->on_time_old,
+		clkstats->elapsed_old);
 }
 
 /* Track the amount of time the gpu is on vs the total system time. *
@@ -714,23 +766,23 @@
 			/* High latency clock maintenance. */
 			if ((pwr->pwrlevels[0].gpu_freq > 0) &&
 				(requested_state != KGSL_STATE_NAP)) {
-				clk_set_rate(pwr->grp_clks[0],
-					pwr->pwrlevels[pwr->num_pwrlevels - 1].
-					gpu_freq);
 				for (i = KGSL_MAX_CLKS - 1; i > 0; i--)
 					if (pwr->grp_clks[i])
 						clk_unprepare(pwr->grp_clks[i]);
+				clk_set_rate(pwr->grp_clks[0],
+					pwr->pwrlevels[pwr->num_pwrlevels - 1].
+					gpu_freq);
 			}
 			kgsl_pwrctrl_busy_time(device, true);
 		} else if (requested_state == KGSL_STATE_SLEEP) {
 			/* High latency clock maintenance. */
+			for (i = KGSL_MAX_CLKS - 1; i > 0; i--)
+				if (pwr->grp_clks[i])
+					clk_unprepare(pwr->grp_clks[i]);
 			if ((pwr->pwrlevels[0].gpu_freq > 0))
 				clk_set_rate(pwr->grp_clks[0],
 					pwr->pwrlevels[pwr->num_pwrlevels - 1].
 					gpu_freq);
-			for (i = KGSL_MAX_CLKS - 1; i > 0; i--)
-				if (pwr->grp_clks[i])
-					clk_unprepare(pwr->grp_clks[i]);
 		}
 	} else if (state == KGSL_PWRFLAGS_ON) {
 		if (!test_and_set_bit(KGSL_PWRFLAGS_CLK_ON,
@@ -738,15 +790,14 @@
 			trace_kgsl_clk(device, state);
 			/* High latency clock maintenance. */
 			if (device->state != KGSL_STATE_NAP) {
-				for (i = KGSL_MAX_CLKS - 1; i > 0; i--)
-					if (pwr->grp_clks[i])
-						clk_prepare(pwr->grp_clks[i]);
-
 				if (pwr->pwrlevels[0].gpu_freq > 0)
 					clk_set_rate(pwr->grp_clks[0],
 						pwr->pwrlevels
 						[pwr->active_pwrlevel].
 						gpu_freq);
+				for (i = KGSL_MAX_CLKS - 1; i > 0; i--)
+					if (pwr->grp_clks[i])
+						clk_prepare(pwr->grp_clks[i]);
 			}
 			/* as last step, enable grp_clk
 			   this is to let GPU interrupt to come */
@@ -878,7 +929,8 @@
 	if (pdata->set_grp_async != NULL)
 		pdata->set_grp_async();
 
-	if (pdata->num_levels > KGSL_MAX_PWRLEVELS) {
+	if (pdata->num_levels > KGSL_MAX_PWRLEVELS ||
+	    pdata->num_levels < 1) {
 		KGSL_PWR_ERR(device, "invalid power level count: %d\n",
 					 pdata->num_levels);
 		result = -EINVAL;
@@ -947,6 +999,11 @@
 		}
 	}
 
+	/* Set the power level step multiplier with 1 as the default */
+	pwr->step_mul = pdata->step_mul ? pdata->step_mul : 1;
+
+	/* Set the CPU latency to 501usec to allow low latency PC modes */
+	pwr->pm_qos_latency = 501;
 
 	pm_runtime_enable(device->parentdev);
 	register_early_suspend(&device->display_off);
@@ -1143,7 +1200,7 @@
 		_sleep_accounting(device);
 		kgsl_pwrctrl_clk(device, KGSL_PWRFLAGS_OFF, KGSL_STATE_SLEEP);
 		kgsl_pwrctrl_set_state(device, KGSL_STATE_SLEEP);
-		pm_qos_update_request(&device->pm_qos_req_dma,
+		pm_qos_update_request(&device->pwrctrl.pm_qos_req_dma,
 					PM_QOS_DEFAULT_VALUE);
 		break;
 	case KGSL_STATE_SLEEP:
@@ -1177,7 +1234,7 @@
 		device->ftbl->stop(device);
 		_sleep_accounting(device);
 		kgsl_pwrctrl_set_state(device, KGSL_STATE_SLUMBER);
-		pm_qos_update_request(&device->pm_qos_req_dma,
+		pm_qos_update_request(&device->pwrctrl.pm_qos_req_dma,
 						PM_QOS_DEFAULT_VALUE);
 		break;
 	case KGSL_STATE_SLUMBER:
@@ -1224,10 +1281,15 @@
 void kgsl_pwrctrl_wake(struct kgsl_device *device)
 {
 	int status;
+	unsigned int context_id;
+	unsigned int state = device->state;
+	unsigned int ts_processed = 0xdeaddead;
+	struct kgsl_context *context;
+
 	kgsl_pwrctrl_request_state(device, KGSL_STATE_ACTIVE);
 	switch (device->state) {
 	case KGSL_STATE_SLUMBER:
-		status = device->ftbl->start(device, 0);
+		status = device->ftbl->start(device);
 		if (status) {
 			kgsl_pwrctrl_request_state(device, KGSL_STATE_NONE);
 			KGSL_DRV_ERR(device, "start failed %d\n", status);
@@ -1237,6 +1299,17 @@
 	case KGSL_STATE_SLEEP:
 		kgsl_pwrctrl_axi(device, KGSL_PWRFLAGS_ON);
 		kgsl_pwrscale_wake(device);
+		kgsl_sharedmem_readl(&device->memstore,
+			(unsigned int *) &context_id,
+			KGSL_MEMSTORE_OFFSET(KGSL_MEMSTORE_GLOBAL,
+				current_context));
+		context = idr_find(&device->context_idr, context_id);
+		if (context)
+			ts_processed = kgsl_readtimestamp(device, context,
+				KGSL_TIMESTAMP_RETIRED);
+		KGSL_PWR_INFO(device, "Wake from %s state. CTXT: %d RTRD TS: %08X\n",
+			kgsl_pwrstate_to_str(state),
+			context ? context->id : -1, ts_processed);
 		/* fall through */
 	case KGSL_STATE_NAP:
 		/* Turn on the core clocks */
@@ -1247,8 +1320,8 @@
 		/* Re-enable HW access */
 		mod_timer(&device->idle_timer,
 				jiffies + device->pwrctrl.interval_timeout);
-		pm_qos_update_request(&device->pm_qos_req_dma,
-					GPU_SWFI_LATENCY);
+		pm_qos_update_request(&device->pwrctrl.pm_qos_req_dma,
+				device->pwrctrl.pm_qos_latency);
 	case KGSL_STATE_ACTIVE:
 		kgsl_pwrctrl_request_state(device, KGSL_STATE_NONE);
 		break;
diff --git a/drivers/gpu/msm/kgsl_pwrctrl.h b/drivers/gpu/msm/kgsl_pwrctrl.h
index 8d66505..ced52e1 100644
--- a/drivers/gpu/msm/kgsl_pwrctrl.h
+++ b/drivers/gpu/msm/kgsl_pwrctrl.h
@@ -1,4 +1,4 @@
-/* Copyright (c) 2010-2012, The Linux Foundation. All rights reserved.
+/* Copyright (c) 2010-2013, The Linux Foundation. All rights reserved.
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License version 2 and
@@ -60,6 +60,9 @@
  * @irq_name - resource name for the IRQ
  * @restore_slumber - Flag to indicate that we are in a suspend/restore sequence
  * @clk_stats - structure of clock statistics
+ * @pm_qos_req_dma - the power management quality of service structure
+ * @pm_qos_latency - allowed CPU latency in microseconds
+ * @step_mul - multiplier for moving between power levels
  */
 
 struct kgsl_pwrctrl {
@@ -85,6 +88,9 @@
 	s64 time;
 	unsigned int restore_slumber;
 	struct kgsl_clk_stats clk_stats;
+	struct pm_qos_request pm_qos_req_dma;
+	unsigned int pm_qos_latency;
+	unsigned int step_mul;
 };
 
 void kgsl_pwrctrl_irq(struct kgsl_device *device, int state);
diff --git a/drivers/gpu/msm/kgsl_pwrscale.c b/drivers/gpu/msm/kgsl_pwrscale.c
index dffae70..02ada38 100644
--- a/drivers/gpu/msm/kgsl_pwrscale.c
+++ b/drivers/gpu/msm/kgsl_pwrscale.c
@@ -237,16 +237,14 @@
 void kgsl_pwrscale_busy(struct kgsl_device *device)
 {
 	if (PWRSCALE_ACTIVE(device) && device->pwrscale.policy->busy)
-		if (device->requested_state != KGSL_STATE_SLUMBER)
-			device->pwrscale.policy->busy(device,
-					&device->pwrscale);
+		device->pwrscale.policy->busy(device,
+				&device->pwrscale);
 }
 
 void kgsl_pwrscale_idle(struct kgsl_device *device)
 {
 	if (PWRSCALE_ACTIVE(device) && device->pwrscale.policy->idle)
-		if (device->requested_state != KGSL_STATE_SLUMBER &&
-			device->requested_state != KGSL_STATE_SLEEP)
+		if (device->state == KGSL_STATE_ACTIVE)
 			device->pwrscale.policy->idle(device,
 					&device->pwrscale);
 }
diff --git a/drivers/gpu/msm/kgsl_pwrscale_trustzone.c b/drivers/gpu/msm/kgsl_pwrscale_trustzone.c
index aa6861e..9b2ac70 100644
--- a/drivers/gpu/msm/kgsl_pwrscale_trustzone.c
+++ b/drivers/gpu/msm/kgsl_pwrscale_trustzone.c
@@ -1,4 +1,4 @@
-/* Copyright (c) 2010-2012, The Linux Foundation. All rights reserved.
+/* Copyright (c) 2010-2013, The Linux Foundation. All rights reserved.
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License version 2 and
@@ -38,6 +38,10 @@
  * per frame for 60fps content.
  */
 #define FLOOR			5000
+/* CEILING is 50msec, larger than any standard
+ * frame length, but less than the idle timer.
+ */
+#define CEILING			50000
 #define SWITCH_OFF		200
 #define SWITCH_OFF_RESET_TH	40
 #define SKIP_COUNTER		500
@@ -163,11 +167,24 @@
 		priv->no_switch_cnt = 0;
 	}
 
-	idle = priv->bin.total_time - priv->bin.busy_time;
+	/* If there is an extended block of busy processing,
+	 * increase frequency.  Otherwise run the normal algorithm.
+	 */
+	if (priv->bin.busy_time > CEILING) {
+		val = -1;
+	} else {
+		idle = priv->bin.total_time - priv->bin.busy_time;
+		idle = (idle > 0) ? idle : 0;
+		val = __secure_tz_entry(TZ_UPDATE_ID, idle, device->id);
+	}
 	priv->bin.total_time = 0;
 	priv->bin.busy_time = 0;
-	idle = (idle > 0) ? idle : 0;
-	val = __secure_tz_entry(TZ_UPDATE_ID, idle, device->id);
+
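+	/*
+	 * A positive result selects a higher (slower) power level;
+	 * a negative one raises the clock.
+	 */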
+	/* If the decision is to move to a lower level, make sure the GPU
+	 * frequency drops.
+	 */
+	if (val > 0)
+		val *= pwr->step_mul;
 	if (val)
 		kgsl_pwrctrl_pwrlevel_change(device,
 					     pwr->active_pwrlevel + val);
diff --git a/drivers/gpu/msm/kgsl_sharedmem.c b/drivers/gpu/msm/kgsl_sharedmem.c
index a345e58..595f78f 100644
--- a/drivers/gpu/msm/kgsl_sharedmem.c
+++ b/drivers/gpu/msm/kgsl_sharedmem.c
@@ -1,4 +1,4 @@
-/* Copyright (c) 2002,2007-2012, The Linux Foundation. All rights reserved.
+/* Copyright (c) 2002,2007-2013, The Linux Foundation. All rights reserved.
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License version 2 and
@@ -154,9 +154,7 @@
 
 static struct mem_entry_stats mem_stats[] = {
 	MEM_ENTRY_STAT(KGSL_MEM_ENTRY_KERNEL, kernel),
-#ifdef CONFIG_ANDROID_PMEM
 	MEM_ENTRY_STAT(KGSL_MEM_ENTRY_PMEM, pmem),
-#endif
 #ifdef CONFIG_ASHMEM
 	MEM_ENTRY_STAT(KGSL_MEM_ENTRY_ASHMEM, ashmem),
 #endif
@@ -511,21 +509,29 @@
 
 void kgsl_cache_range_op(struct kgsl_memdesc *memdesc, int op)
 {
-	void *addr = memdesc->hostptr;
+	/*
+	 * If the buffer is mapped in the kernel, operate on that address;
+	 * otherwise use the user address
+	 */
+
+	void *addr = (memdesc->hostptr) ?
+		memdesc->hostptr : (void *) memdesc->useraddr;
+
 	int size = memdesc->size;
 
-	switch (op) {
-	case KGSL_CACHE_OP_FLUSH:
-		dmac_flush_range(addr, addr + size);
-		break;
-	case KGSL_CACHE_OP_CLEAN:
-		dmac_clean_range(addr, addr + size);
-		break;
-	case KGSL_CACHE_OP_INV:
-		dmac_inv_range(addr, addr + size);
-		break;
+	if (addr !=  NULL) {
+		switch (op) {
+		case KGSL_CACHE_OP_FLUSH:
+			dmac_flush_range(addr, addr + size);
+			break;
+		case KGSL_CACHE_OP_CLEAN:
+			dmac_clean_range(addr, addr + size);
+			break;
+		case KGSL_CACHE_OP_INV:
+			dmac_inv_range(addr, addr + size);
+			break;
+		}
 	}
-
 	outer_cache_range_op_sg(memdesc->sg, memdesc->sglen, op);
 }
 EXPORT_SYMBOL(kgsl_cache_range_op);
@@ -533,7 +539,7 @@
 static int
 _kgsl_sharedmem_page_alloc(struct kgsl_memdesc *memdesc,
 			struct kgsl_pagetable *pagetable,
-			size_t size, unsigned int protflags)
+			size_t size)
 {
 	int pcount = 0, order, ret = 0;
 	int j, len, page_size, sglen_alloc, sglen = 0;
@@ -541,13 +547,15 @@
 	pgprot_t page_prot = pgprot_writecombine(PAGE_KERNEL);
 	void *ptr;
 	unsigned int align;
+	int step = ((VMALLOC_END - VMALLOC_START)/8) >> PAGE_SHIFT;
 
 	align = (memdesc->flags & KGSL_MEMALIGN_MASK) >> KGSL_MEMALIGN_SHIFT;
 
 	page_size = (align >= ilog2(SZ_64K) && size >= SZ_64K)
 			? SZ_64K : PAGE_SIZE;
 	/* update align flags for what we actually use */
-	kgsl_memdesc_set_align(memdesc, ilog2(page_size));
+	if (page_size != PAGE_SIZE)
+		kgsl_memdesc_set_align(memdesc, ilog2(page_size));
 
 	/*
 	 * There needs to be enough room in the sg structure to be able to
@@ -568,11 +576,10 @@
 	memdesc->pagetable = pagetable;
 	memdesc->ops = &kgsl_page_alloc_ops;
 
-	memdesc->sg = kgsl_sg_alloc(sglen_alloc);
+	memdesc->sglen_alloc = sglen_alloc;
+	memdesc->sg = kgsl_sg_alloc(memdesc->sglen_alloc);
 
 	if (memdesc->sg == NULL) {
-		KGSL_CORE_ERR("vmalloc(%d) failed\n",
-			sglen_alloc * sizeof(struct scatterlist));
 		ret = -ENOMEM;
 		goto done;
 	}
@@ -584,26 +591,24 @@
 	 * two pages; well within the acceptable limits for using kmalloc.
 	 */
 
-	pages = kmalloc(sglen_alloc * sizeof(struct page *), GFP_KERNEL);
+	pages = kmalloc(memdesc->sglen_alloc * sizeof(struct page *),
+		GFP_KERNEL);
 
 	if (pages == NULL) {
-		KGSL_CORE_ERR("kmalloc (%d) failed\n",
-			sglen_alloc * sizeof(struct page *));
 		ret = -ENOMEM;
 		goto done;
 	}
 
 	kmemleak_not_leak(memdesc->sg);
 
-	memdesc->sglen_alloc = sglen_alloc;
-	sg_init_table(memdesc->sg, sglen_alloc);
+	sg_init_table(memdesc->sg, memdesc->sglen_alloc);
 
 	len = size;
 
 	while (len > 0) {
 		struct page *page;
 		unsigned int gfp_mask = GFP_KERNEL | __GFP_HIGHMEM |
-			__GFP_NOWARN;
+			__GFP_NOWARN | __GFP_NORETRY;
 		int j;
 
 		/* don't waste space at the end of the allocation*/
@@ -667,41 +672,42 @@
 	 * zeroed and unmapped each individual page, and then we had to turn
 	 * around and call flush_dcache_page() on that page to clear the caches.
 	 * This was killing us for performance. Instead, we found it is much
-	 * faster to allocate the pages without GFP_ZERO, map the entire range,
-	 * memset it, flush the range and then unmap - this results in a factor
-	 * of 4 improvement for speed for large buffers.  There is a small
-	 * increase in speed for small buffers, but only on the order of a few
-	 * microseconds at best.  The only downside is that there needs to be
-	 * enough temporary space in vmalloc to accomodate the map. This
-	 * shouldn't be a problem, but if it happens, fall back to a much slower
-	 * path
+	 * faster to allocate the pages without GFP_ZERO, map a chunk of the
+	 * range ('step' pages), memset it, flush it and then unmap; this
+	 * results in a factor of 4 improvement in speed for large buffers.
+	 * There is a small decrease in speed for small buffers, but only on
+	 * the order of a few microseconds at best. The 'step' size is based
+	 * on a guess at the amount of free vmalloc space, but will scale
+	 * down if there's not enough free space.
 	 */
+	for (j = 0; j < pcount; j += step) {
+		step = min(step, pcount - j);
 
-	ptr = vmap(pages, pcount, VM_IOREMAP, page_prot);
+		ptr = vmap(&pages[j], step, VM_IOREMAP, page_prot);
 
-	if (ptr != NULL) {
-		memset(ptr, 0, memdesc->size);
-		dmac_flush_range(ptr, ptr + memdesc->size);
-		vunmap(ptr);
-	} else {
-		/* Very, very, very slow path */
+		if (ptr != NULL) {
+			memset(ptr, 0, step * PAGE_SIZE);
+			dmac_flush_range(ptr, ptr + step * PAGE_SIZE);
+			vunmap(ptr);
+		} else {
+			int k;
+			/* Very, very, very slow path */
 
-		for (j = 0; j < pcount; j++) {
-			ptr = kmap_atomic(pages[j]);
-			memset(ptr, 0, PAGE_SIZE);
-			dmac_flush_range(ptr, ptr + PAGE_SIZE);
-			kunmap_atomic(ptr);
+			for (k = j; k < j + step; k++) {
+				ptr = kmap_atomic(pages[k]);
+				memset(ptr, 0, PAGE_SIZE);
+				dmac_flush_range(ptr, ptr + PAGE_SIZE);
+				kunmap_atomic(ptr);
+			}
+			/* scale down the step size to avoid this path */
+			if (step > 1)
+				step >>= 1;
 		}
 	}
 
 	outer_cache_range_op_sg(memdesc->sg, memdesc->sglen,
 				KGSL_CACHE_OP_FLUSH);
 
-	ret = kgsl_mmu_map(pagetable, memdesc, protflags);
-
-	if (ret)
-		goto done;
-
 	KGSL_STATS_ADD(size, kgsl_driver.stats.page_alloc,
 		kgsl_driver.stats.page_alloc_max);
 
@@ -728,8 +734,7 @@
 
 	size = ALIGN(size, PAGE_SIZE * 2);
 
-	ret =  _kgsl_sharedmem_page_alloc(memdesc, pagetable, size,
-		GSL_PT_PAGE_RV | GSL_PT_PAGE_WV);
+	ret =  _kgsl_sharedmem_page_alloc(memdesc, pagetable, size);
 	if (!ret)
 		ret = kgsl_page_alloc_map_kernel(memdesc);
 	if (ret)
@@ -743,17 +748,7 @@
 			    struct kgsl_pagetable *pagetable,
 			    size_t size)
 {
-	unsigned int protflags;
-
-	if (size == 0)
-		return -EINVAL;
-
-	protflags = GSL_PT_PAGE_RV;
-	if (!(memdesc->flags & KGSL_MEMFLAGS_GPUREADONLY))
-		protflags |= GSL_PT_PAGE_WV;
-
-	return _kgsl_sharedmem_page_alloc(memdesc, pagetable, size,
-		protflags);
+	return _kgsl_sharedmem_page_alloc(memdesc, pagetable, PAGE_ALIGN(size));
 }
 EXPORT_SYMBOL(kgsl_sharedmem_page_alloc_user);
 
@@ -831,12 +826,6 @@
 	if (result)
 		goto err;
 
-	result = kgsl_mmu_map(pagetable, memdesc,
-		GSL_PT_PAGE_RV | GSL_PT_PAGE_WV);
-
-	if (result)
-		goto err;
-
 	KGSL_STATS_ADD(size, kgsl_driver.stats.coherent,
 		kgsl_driver.stats.coherent_max);
 
diff --git a/drivers/gpu/msm/kgsl_sharedmem.h b/drivers/gpu/msm/kgsl_sharedmem.h
index 3109ef2..279490f 100644
--- a/drivers/gpu/msm/kgsl_sharedmem.h
+++ b/drivers/gpu/msm/kgsl_sharedmem.h
@@ -1,4 +1,4 @@
-/* Copyright (c) 2002,2007-2012, The Linux Foundation. All rights reserved.
+/* Copyright (c) 2002,2007-2013, The Linux Foundation. All rights reserved.
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License version 2 and
@@ -19,6 +19,7 @@
 #include "kgsl_mmu.h"
 #include <linux/slab.h>
 #include <linux/kmemleak.h>
+#include <linux/iommu.h>
 
 #include "kgsl_log.h"
 
@@ -83,6 +84,18 @@
 }
 
 /*
+ * kgsl_memdesc_get_cachemode - Get cache mode of a memdesc
+ * @memdesc: the memdesc
+ *
+ * Returns a KGSL_CACHEMODE* value.
+ */
+static inline int
+kgsl_memdesc_get_cachemode(const struct kgsl_memdesc *memdesc)
+{
+	return (memdesc->flags & KGSL_CACHEMODE_MASK) >> KGSL_CACHEMODE_SHIFT;
+}
+
+/*
  * kgsl_memdesc_set_align - Set alignment flags of a memdesc
  * @memdesc - the memdesc
  * @align - alignment requested, as a power of 2 exponent.
@@ -144,7 +157,7 @@
 		unsigned int physaddr, unsigned int size)
 {
 	memdesc->sg = kgsl_sg_alloc(1);
-	if (!memdesc->sg)
+	if (memdesc->sg == NULL)
 		return -ENOMEM;
 
 	kmemleak_not_leak(memdesc->sg);
@@ -157,14 +170,98 @@
 	return 0;
 }
 
+/*
+ * kgsl_memdesc_is_global - is this a globally mapped buffer?
+ * @memdesc: the memdesc
+ *
+ * Returns nonzero if this is a global mapping, 0 otherwise
+ */
+static inline int kgsl_memdesc_is_global(const struct kgsl_memdesc *memdesc)
+{
+	return (memdesc->priv & KGSL_MEMDESC_GLOBAL) != 0;
+}
+
+/*
+ * kgsl_memdesc_has_guard_page - is the last page a guard page?
+ * @memdesc - the memdesc
+ *
+ * Returns nonzero if there is a guard page, 0 otherwise
+ */
+static inline int
+kgsl_memdesc_has_guard_page(const struct kgsl_memdesc *memdesc)
+{
+	return (memdesc->priv & KGSL_MEMDESC_GUARD_PAGE) != 0;
+}
+
+/*
+ * kgsl_memdesc_protflags - get mmu protection flags
+ * @memdesc - the memdesc
+ * Returns a mask of GSL_PT_PAGE* or IOMMU* values based
+ * on the memdesc flags.
+ */
+static inline unsigned int
+kgsl_memdesc_protflags(const struct kgsl_memdesc *memdesc)
+{
+	unsigned int protflags = 0;
+	enum kgsl_mmutype mmutype = kgsl_mmu_get_mmutype();
+
+	if (mmutype == KGSL_MMU_TYPE_GPU) {
+		protflags = GSL_PT_PAGE_RV;
+		if (!(memdesc->flags & KGSL_MEMFLAGS_GPUREADONLY))
+			protflags |= GSL_PT_PAGE_WV;
+	} else if (mmutype == KGSL_MMU_TYPE_IOMMU) {
+		protflags = IOMMU_READ;
+		if (!(memdesc->flags & KGSL_MEMFLAGS_GPUREADONLY))
+			protflags |= IOMMU_WRITE;
+	}
+	return protflags;
+}
+
+/*
+ * kgsl_memdesc_use_cpu_map - use the same virtual mapping on CPU and GPU?
+ * @memdesc - the memdesc
+ */
+static inline int
+kgsl_memdesc_use_cpu_map(const struct kgsl_memdesc *memdesc)
+{
+	return (memdesc->flags & KGSL_MEMFLAGS_USE_CPU_MAP) != 0;
+}
+
+/*
+ * kgsl_memdesc_mmapsize - get the size of the mmap region
+ * @memdesc - the memdesc
+ *
+ * The entire memdesc must be mapped. Additionally if the
+ * CPU mapping is going to be mirrored, there must be room
+ * for the guard page to be mapped so that the address spaces
+ * match up.
+ */
+static inline unsigned int
+kgsl_memdesc_mmapsize(const struct kgsl_memdesc *memdesc)
+{
+	unsigned int size = memdesc->size;
+	if (kgsl_memdesc_use_cpu_map(memdesc) &&
+		kgsl_memdesc_has_guard_page(memdesc))
+		size += SZ_4K;
+	return size;
+}
+
 static inline int
 kgsl_allocate(struct kgsl_memdesc *memdesc,
 		struct kgsl_pagetable *pagetable, size_t size)
 {
+	int ret;
+	memdesc->priv |= (KGSL_MEMTYPE_KERNEL << KGSL_MEMTYPE_SHIFT);
 	if (kgsl_mmu_get_mmutype() == KGSL_MMU_TYPE_NONE)
 		return kgsl_sharedmem_ebimem(memdesc, pagetable, size);
-	memdesc->flags |= (KGSL_MEMTYPE_KERNEL << KGSL_MEMTYPE_SHIFT);
-	return kgsl_sharedmem_page_alloc(memdesc, pagetable, size);
+
+	ret = kgsl_sharedmem_page_alloc(memdesc, pagetable, size);
+	if (ret)
+		return ret;
+	ret = kgsl_mmu_map(pagetable, memdesc);
+	if (ret)
+		kgsl_sharedmem_free(memdesc);
+	return ret;
 }
 
 static inline int
@@ -174,6 +271,9 @@
 {
 	int ret;
 
+	if (size == 0)
+		return -EINVAL;
+
 	memdesc->flags = flags;
 
 	if (kgsl_mmu_get_mmutype() == KGSL_MMU_TYPE_NONE)
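
A hedged usage sketch of the new inline helpers above: print_memdesc() is hypothetical, the accessors are the ones added in this hunk, and the point is that callers no longer need to know the flags/priv bit layout or the GSL_PT_PAGE_* versus IOMMU_* split.

	static void print_memdesc(const struct kgsl_memdesc *memdesc)
	{
		/*
		 * cachemode, protflags and mmapsize are all derived from the
		 * memdesc itself; protflags yields GSL_PT_PAGE_* bits on the
		 * GPU MMU and IOMMU_READ/IOMMU_WRITE on the IOMMU.
		 */
		pr_debug("cachemode=%d global=%d guard=%d cpu_map=%d prot=0x%x mmapsize=%u\n",
			 kgsl_memdesc_get_cachemode(memdesc),
			 kgsl_memdesc_is_global(memdesc),
			 kgsl_memdesc_has_guard_page(memdesc),
			 kgsl_memdesc_use_cpu_map(memdesc),
			 kgsl_memdesc_protflags(memdesc),
			 kgsl_memdesc_mmapsize(memdesc));
	}
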
diff --git a/drivers/gpu/msm/kgsl_snapshot.c b/drivers/gpu/msm/kgsl_snapshot.c
index a5aa42f..4c9c744 100644
--- a/drivers/gpu/msm/kgsl_snapshot.c
+++ b/drivers/gpu/msm/kgsl_snapshot.c
@@ -1,4 +1,4 @@
-/* Copyright (c) 2012, The Linux Foundation. All rights reserved.
+/* Copyright (c) 2012-2013, The Linux Foundation. All rights reserved.
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License version 2 and
@@ -106,7 +106,12 @@
 {
 	struct kgsl_snapshot_linux_context *header = _ctxtptr;
 	struct kgsl_context *context = ptr;
-	struct kgsl_device *device = context->dev_priv->device;
+	struct kgsl_device *device;
+
+	if (context)
+		device = context->dev_priv->device;
+	else
+		device = (struct kgsl_device *)data;
 
 	header->id = id;
 
@@ -141,6 +146,9 @@
 
 	idr_for_each(&device->context_idr, snapshot_context_count, &ctxtcount);
 
+	/* Increment ctxtcount for the global memstore entry */
+	ctxtcount++;
+
 	size += ctxtcount * sizeof(struct kgsl_snapshot_linux_context);
 
 	/* Make sure there is enough room for the data */
@@ -169,8 +177,9 @@
 	header->grpclk = kgsl_get_clkrate(pwr->grp_clks[0]);
 	header->busclk = kgsl_get_clkrate(pwr->ebi1_clk);
 
-	/* Future proof for per-context timestamps */
-	header->current_context = -1;
+	/* Save the last active context */
+	kgsl_sharedmem_readl(&device->memstore, &header->current_context,
+		KGSL_MEMSTORE_OFFSET(KGSL_MEMSTORE_GLOBAL, current_context));
 
 	/* Get the current PT base */
 	header->ptbase = kgsl_mmu_get_current_ptbase(&device->mmu);
@@ -185,8 +194,10 @@
 
 	header->ctxtcount = ctxtcount;
 
-	/* append information for each context */
 	_ctxtptr = snapshot + sizeof(*header);
+	/* append information for the global context */
+	snapshot_context_info(KGSL_MEMSTORE_GLOBAL, NULL, device);
+	/* append information for each context */
 	idr_for_each(&device->context_idr, snapshot_context_info, NULL);
 
 	/* Return the size of the data segment */
@@ -283,7 +294,7 @@
 {
 	list_del(&obj->node);
 
-	obj->entry->flags &= ~KGSL_MEM_ENTRY_FROZEN;
+	obj->entry->memdesc.priv &= ~KGSL_MEMDESC_FROZEN;
 	kgsl_mem_entry_put(obj->entry);
 
 	kfree(obj);
@@ -375,8 +386,8 @@
 	/* If the buffer is already on the list, skip it */
 	list_for_each_entry(obj, &device->snapshot_obj_list, node) {
 		if (obj->gpuaddr == gpuaddr && obj->ptbase == ptbase) {
-			/* If the size is different, use the new size */
-			if (obj->size != size)
+			/* If the size is different, use the bigger size */
+			if (obj->size < size)
 				obj->size = size;
 
 			return 0;
@@ -416,10 +427,10 @@
 	 * 0 so it doesn't get counted twice
 	 */
 
-	if (entry->flags & KGSL_MEM_ENTRY_FROZEN)
+	if (entry->memdesc.priv & KGSL_MEMDESC_FROZEN)
 		return 0;
 
-	entry->flags |= KGSL_MEM_ENTRY_FROZEN;
+	entry->memdesc.priv |= KGSL_MEMDESC_FROZEN;
 
 	return entry->memdesc.size;
 }
diff --git a/drivers/gpu/msm/kgsl_snapshot.h b/drivers/gpu/msm/kgsl_snapshot.h
index 327d18a..4db2815 100644
--- a/drivers/gpu/msm/kgsl_snapshot.h
+++ b/drivers/gpu/msm/kgsl_snapshot.h
@@ -52,6 +52,7 @@
 #define KGSL_SNAPSHOT_SECTION_DEBUG        0x0901
 #define KGSL_SNAPSHOT_SECTION_DEBUGBUS     0x0A01
 #define KGSL_SNAPSHOT_SECTION_GPU_OBJECT   0x0B01
+#define KGSL_SNAPSHOT_SECTION_MEMLIST      0x0E01
 
 #define KGSL_SNAPSHOT_SECTION_END          0xFFFF
 
@@ -103,6 +104,17 @@
 	int count;  /* Number of dwords in the dump */
 } __packed;
 
+/* Replay or memory-list section; both sections share the same header */
+struct kgsl_snapshot_replay_mem_list {
+	/*
+	 * Number of IBs to replay for replay section or
+	 * number of memory list entries for mem list section
+	 */
+	int num_entries;
+	/* Pagetable base to which the replay IBs or memory entries belong */
+	__u32 ptbase;
+} __packed;
+
 /* Indirect buffer sub-section header */
 struct kgsl_snapshot_ib {
 	__u32 gpuaddr; /* GPU address of the the IB */
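
For tools that post-process snapshot dumps, here is a hedged user-space sketch of reading the new memory-list payload. Only the header defined above is known from this patch; the per-entry records that follow it are not shown here, so the sketch stops at the header.

	#include <stdio.h>
	#include <stdint.h>

	/* mirrors struct kgsl_snapshot_replay_mem_list above */
	struct memlist_header {
		int32_t num_entries;	/* memory entries that follow */
		uint32_t ptbase;	/* pagetable base they belong to */
	} __attribute__((packed));

	static const void *read_memlist_header(const void *payload)
	{
		const struct memlist_header *hdr = payload;

		printf("memlist: %d entries, ptbase=0x%08x\n",
		       hdr->num_entries, (unsigned int)hdr->ptbase);
		return hdr + 1;		/* entry records start here */
	}
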
diff --git a/drivers/gpu/msm/kgsl_sync.c b/drivers/gpu/msm/kgsl_sync.c
index d9ab081..0e3e046 100644
--- a/drivers/gpu/msm/kgsl_sync.c
+++ b/drivers/gpu/msm/kgsl_sync.c
@@ -69,7 +69,6 @@
 
 struct kgsl_fence_event_priv {
 	struct kgsl_context *context;
-	unsigned int timestamp;
 };
 
 /**
@@ -86,7 +85,7 @@
 	void *priv, u32 context_id, u32 timestamp)
 {
 	struct kgsl_fence_event_priv *ev = priv;
-	kgsl_sync_timeline_signal(ev->context->timeline, ev->timestamp);
+	kgsl_sync_timeline_signal(ev->context->timeline, timestamp);
 	kgsl_context_put(ev->context);
 	kfree(ev);
 }
@@ -126,7 +125,6 @@
 	if (event == NULL)
 		return -ENOMEM;
 	event->context = context;
-	event->timestamp = timestamp;
 	kgsl_context_get(context);
 
 	pt = kgsl_sync_pt_create(context->timeline, timestamp);
@@ -181,6 +179,7 @@
 }
 
 static const struct sync_timeline_ops kgsl_sync_timeline_ops = {
+	.driver_name = "kgsl-timeline",
 	.dup = kgsl_sync_pt_dup,
 	.has_signaled = kgsl_sync_pt_has_signaled,
 	.compare = kgsl_sync_pt_compare,
@@ -207,7 +206,7 @@
 	struct kgsl_sync_timeline *ktimeline =
 		(struct kgsl_sync_timeline *) timeline;
 
-	if (timestamp_cmp(timestamp, ktimeline->last_timestamp > 0))
+	if (timestamp_cmp(timestamp, ktimeline->last_timestamp) > 0)
 		ktimeline->last_timestamp = timestamp;
 	sync_timeline_signal(timeline);
 }
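
The one-character change in kgsl_sync_timeline_signal() is easy to miss and both forms compile, so it is worth spelling out. Assuming timestamp_cmp() follows the usual memcmp-style contract (positive when its first argument is newer, with wraparound handling):

	/*
	 * before: the '> 0' sat inside the argument list, so the new
	 * timestamp was compared against the boolean value of
	 * last_timestamp (0 or 1) and last_timestamp was overwritten
	 * almost every time, including with older values
	 */
	if (timestamp_cmp(timestamp, ktimeline->last_timestamp > 0))
		ktimeline->last_timestamp = timestamp;

	/* after: last_timestamp only advances when timestamp is newer */
	if (timestamp_cmp(timestamp, ktimeline->last_timestamp) > 0)
		ktimeline->last_timestamp = timestamp;
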
diff --git a/drivers/gpu/msm/kgsl_trace.h b/drivers/gpu/msm/kgsl_trace.h
index c54445c..5f7ee3c 100644
--- a/drivers/gpu/msm/kgsl_trace.h
+++ b/drivers/gpu/msm/kgsl_trace.h
@@ -24,6 +24,8 @@
 #include <linux/tracepoint.h>
 #include "kgsl_device.h"
 
+#include "adreno_drawctxt.h"
+
 struct kgsl_device;
 struct kgsl_ringbuffer_issueibcmds;
 struct kgsl_device_waittimestamp;
@@ -34,11 +36,16 @@
 TRACE_EVENT(kgsl_issueibcmds,
 
 	TP_PROTO(struct kgsl_device *device,
-			struct kgsl_ringbuffer_issueibcmds *cmd,
+			int drawctxt_id,
 			struct kgsl_ibdesc *ibdesc,
-			int result),
+			int numibs,
+			int timestamp,
+			int flags,
+			int result,
+			unsigned int type),
 
-	TP_ARGS(device, cmd, ibdesc, result),
+	TP_ARGS(device, drawctxt_id, ibdesc, numibs, timestamp, flags,
+		result, type),
 
 	TP_STRUCT__entry(
 		__string(device_name, device->name)
@@ -48,21 +55,23 @@
 		__field(unsigned int, timestamp)
 		__field(unsigned int, flags)
 		__field(int, result)
+		__field(unsigned int, drawctxt_type)
 	),
 
 	TP_fast_assign(
 		__assign_str(device_name, device->name);
-		__entry->drawctxt_id = cmd->drawctxt_id;
+		__entry->drawctxt_id = drawctxt_id;
 		__entry->ibdesc_addr = ibdesc[0].gpuaddr;
-		__entry->numibs = cmd->numibs;
-		__entry->timestamp = cmd->timestamp;
-		__entry->flags = cmd->flags;
+		__entry->numibs = numibs;
+		__entry->timestamp = timestamp;
+		__entry->flags = flags;
 		__entry->result = result;
+		__entry->drawctxt_type = type;
 	),
 
 	TP_printk(
 		"d_name=%s ctx=%u ib=0x%u numibs=%u timestamp=0x%x "
-		"flags=0x%x(%s) result=%d",
+		"flags=0x%x(%s) result=%d type=%s",
 		__get_str(device_name),
 		__entry->drawctxt_id,
 		__entry->ibdesc_addr,
@@ -74,7 +83,9 @@
 			{ KGSL_CONTEXT_SUBMIT_IB_LIST, "IB_LIST" },
 			{ KGSL_CONTEXT_CTX_SWITCH, "CTX_SWITCH" })
 			: "None",
-		__entry->result
+		__entry->result,
+		__print_symbolic(__entry->drawctxt_type,
+			ADRENO_DRAWCTXT_TYPES)
 	)
 );
 
@@ -274,6 +285,32 @@
 	)
 );
 
+TRACE_EVENT(kgsl_gpubusy,
+	TP_PROTO(struct kgsl_device *device, unsigned int busy,
+		unsigned int elapsed),
+
+	TP_ARGS(device, busy, elapsed),
+
+	TP_STRUCT__entry(
+		__string(device_name, device->name)
+		__field(unsigned int, busy)
+		__field(unsigned int, elapsed)
+	),
+
+	TP_fast_assign(
+		__assign_str(device_name, device->name);
+		__entry->busy = busy;
+		__entry->elapsed = elapsed;
+	),
+
+	TP_printk(
+		"d_name=%s busy=%d elapsed=%d",
+		__get_str(device_name),
+		__entry->busy,
+		__entry->elapsed
+	)
+);
+
 DECLARE_EVENT_CLASS(kgsl_pwrstate_template,
 	TP_PROTO(struct kgsl_device *device, unsigned int state),
 
@@ -317,6 +354,8 @@
 		__field(unsigned int, size)
 		__field(unsigned int, tgid)
 		__array(char, usage, 16)
+		__field(unsigned int, id)
+		__field(unsigned int, flags)
 	),
 
 	TP_fast_assign(
@@ -325,12 +364,76 @@
 		__entry->tgid = mem_entry->priv->pid;
 		kgsl_get_memory_usage(__entry->usage, sizeof(__entry->usage),
 				     mem_entry->memdesc.flags);
+		__entry->id = mem_entry->id;
+		__entry->flags = mem_entry->memdesc.flags;
 	),
 
 	TP_printk(
-		"gpuaddr=0x%08x size=%d tgid=%d usage=%s",
+		"gpuaddr=0x%08x size=%d tgid=%d usage=%s id=%d flags=0x%08x",
 		__entry->gpuaddr, __entry->size, __entry->tgid,
-		__entry->usage
+		__entry->usage, __entry->id, __entry->flags
+	)
+);
+
+TRACE_EVENT(kgsl_mem_mmap,
+
+	TP_PROTO(struct kgsl_mem_entry *mem_entry),
+
+	TP_ARGS(mem_entry),
+
+	TP_STRUCT__entry(
+		__field(unsigned long, useraddr)
+		__field(unsigned int, gpuaddr)
+		__field(unsigned int, size)
+		__array(char, usage, 16)
+		__field(unsigned int, id)
+		__field(unsigned int, flags)
+	),
+
+	TP_fast_assign(
+		__entry->useraddr = mem_entry->memdesc.useraddr;
+		__entry->gpuaddr = mem_entry->memdesc.gpuaddr;
+		__entry->size = mem_entry->memdesc.size;
+		kgsl_get_memory_usage(__entry->usage, sizeof(__entry->usage),
+				     mem_entry->memdesc.flags);
+		__entry->id = mem_entry->id;
+		__entry->flags = mem_entry->memdesc.flags;
+	),
+
+	TP_printk(
+		"useraddr=%lx gpuaddr=0x%08x size=%d usage=%s id=%d"
+		" flags=0x%08x",
+		__entry->useraddr, __entry->gpuaddr, __entry->size,
+		__entry->usage, __entry->id, __entry->flags
+	)
+);
+
+TRACE_EVENT(kgsl_mem_unmapped_area_collision,
+
+	TP_PROTO(struct kgsl_mem_entry *mem_entry,
+		 unsigned long hint,
+		 unsigned long len,
+		 unsigned long addr),
+
+	TP_ARGS(mem_entry, hint, len, addr),
+
+	TP_STRUCT__entry(
+		__field(unsigned int, id)
+		__field(unsigned long, hint)
+		__field(unsigned long, len)
+		__field(unsigned long, addr)
+	),
+
+	TP_fast_assign(
+		__entry->id = mem_entry->id;
+		__entry->hint  = hint;
+		__entry->len = len;
+		__entry->addr = addr;
+	),
+
+	TP_printk(
+		"id=%d hint=0x%lx len=%ld addr=0x%lx",
+		__entry->id, __entry->hint, __entry->len, __entry->addr
 	)
 );
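
TRACE_EVENT() generates a trace_<name>() inline for each definition, so the two events above would presumably be emitted from the mmap path roughly like the hypothetical hook below; only the trace_*() calls and memdesc.useraddr come from the driver.

	static unsigned long note_user_mapping(struct kgsl_mem_entry *entry,
					       unsigned long hint,
					       unsigned long len,
					       unsigned long addr)
	{
		/* the hint could not be honoured, record the collision */
		if (hint && addr != hint)
			trace_kgsl_mem_unmapped_area_collision(entry, hint,
							       len, addr);

		entry->memdesc.useraddr = addr;
		trace_kgsl_mem_mmap(entry);
		return addr;
	}
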
 
@@ -347,6 +450,7 @@
 		__field(int, type)
 		__field(unsigned int, tgid)
 		__array(char, usage, 16)
+		__field(unsigned int, id)
 	),
 
 	TP_fast_assign(
@@ -357,13 +461,14 @@
 		__entry->tgid = mem_entry->priv->pid;
 		kgsl_get_memory_usage(__entry->usage, sizeof(__entry->usage),
 				     mem_entry->memdesc.flags);
+		__entry->id = mem_entry->id;
 	),
 
 	TP_printk(
-		"gpuaddr=0x%08x size=%d type=%d fd=%d tgid=%d usage %s",
+		"gpuaddr=0x%08x size=%d type=%d fd=%d tgid=%d usage=%s id=%d",
 		__entry->gpuaddr, __entry->size,
 		__entry->type, __entry->fd, __entry->tgid,
-		__entry->usage
+		__entry->usage, __entry->id
 	)
 );
 
@@ -380,6 +485,7 @@
 		__field(int, fd)
 		__field(unsigned int, tgid)
 		__array(char, usage, 16)
+		__field(unsigned int, id)
 	),
 
 	TP_fast_assign(
@@ -389,12 +495,47 @@
 		__entry->tgid = mem_entry->priv->pid;
 		kgsl_get_memory_usage(__entry->usage, sizeof(__entry->usage),
 				     mem_entry->memdesc.flags);
+		__entry->id = mem_entry->id;
 	),
 
 	TP_printk(
-		"gpuaddr=0x%08x size=%d type=%d tgid=%d usage=%s",
+		"gpuaddr=0x%08x size=%d type=%d tgid=%d usage=%s id=%d",
 		__entry->gpuaddr, __entry->size, __entry->type,
-		__entry->tgid, __entry->usage
+		__entry->tgid, __entry->usage, __entry->id
+	)
+);
+
+TRACE_EVENT(kgsl_mem_sync_cache,
+
+	TP_PROTO(struct kgsl_mem_entry *mem_entry, unsigned int op),
+
+	TP_ARGS(mem_entry, op),
+
+	TP_STRUCT__entry(
+		__field(unsigned int, gpuaddr)
+		__field(unsigned int, size)
+		__array(char, usage, 16)
+		__field(unsigned int, tgid)
+		__field(unsigned int, id)
+		__field(unsigned int, op)
+	),
+
+	TP_fast_assign(
+		__entry->gpuaddr = mem_entry->memdesc.gpuaddr;
+		__entry->size = mem_entry->memdesc.size;
+		__entry->tgid = mem_entry->priv->pid;
+		__entry->id = mem_entry->id;
+		kgsl_get_memory_usage(__entry->usage, sizeof(__entry->usage),
+				     mem_entry->memdesc.flags);
+		__entry->op = op;
+	),
+
+	TP_printk(
+		"gpuaddr=0x%08x size=%d tgid=%d usage=%s id=%d op=%c%c",
+		__entry->gpuaddr, __entry->size, __entry->tgid, __entry->usage,
+		__entry->id,
+		(__entry->op & KGSL_GPUMEM_CACHE_CLEAN) ? 'c' : '.',
+		(__entry->op & KGSL_GPUMEM_CACHE_INV) ? 'i' : '.'
 	)
 );
 
@@ -411,6 +552,7 @@
 		__field(unsigned int, size)
 		__field(int, type)
 		__array(char, usage, 16)
+		__field(unsigned int, id)
 		__field(unsigned int, drawctxt_id)
 		__field(unsigned int, curr_ts)
 		__field(unsigned int, free_ts)
@@ -422,6 +564,7 @@
 		__entry->size = mem_entry->memdesc.size;
 		kgsl_get_memory_usage(__entry->usage, sizeof(__entry->usage),
 				     mem_entry->memdesc.flags);
+		__entry->id = mem_entry->id;
 		__entry->drawctxt_id = id;
 		__entry->type = mem_entry->memtype;
 		__entry->curr_ts = curr_ts;
@@ -429,13 +572,14 @@
 	),
 
 	TP_printk(
-		"d_name=%s gpuaddr=0x%08x size=%d type=%d usage=%s ctx=%u"
+		"d_name=%s gpuaddr=0x%08x size=%d type=%d usage=%s id=%d ctx=%u"
 		" curr_ts=0x%x free_ts=0x%x",
 		__get_str(device_name),
 		__entry->gpuaddr,
 		__entry->size,
 		__entry->type,
 		__entry->usage,
+		__entry->id,
 		__entry->drawctxt_id,
 		__entry->curr_ts,
 		__entry->free_ts
@@ -535,6 +679,31 @@
 	)
 );
 
+TRACE_EVENT(kgsl_regwrite,
+
+	TP_PROTO(struct kgsl_device *device, unsigned int offset,
+		unsigned int value),
+
+	TP_ARGS(device, offset, value),
+
+	TP_STRUCT__entry(
+		__string(device_name, device->name)
+		__field(unsigned int, offset)
+		__field(unsigned int, value)
+	),
+
+	TP_fast_assign(
+		__assign_str(device_name, device->name);
+		__entry->offset = offset;
+		__entry->value = value;
+	),
+
+	TP_printk(
+		"d_name=%s reg=%x value=%x",
+		__get_str(device_name), __entry->offset, __entry->value
+	)
+);
+
 TRACE_EVENT(kgsl_register_event,
 		TP_PROTO(unsigned int id, unsigned int timestamp),
 		TP_ARGS(id, timestamp),
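
Likewise, a hedged sketch of call sites for the other new events in this file; the wrapper and the literal values are illustrative, while the trace_*() helpers and the KGSL_GPUMEM_CACHE_* flags are the real ones referenced above.

	static void emit_example_events(struct kgsl_device *device,
					struct kgsl_mem_entry *entry)
	{
		/* busy cycles out of an elapsed window, for power accounting */
		trace_kgsl_gpubusy(device, 1200000, 1600000);

		/* user-requested cache maintenance on a GPU buffer */
		trace_kgsl_mem_sync_cache(entry,
			KGSL_GPUMEM_CACHE_CLEAN | KGSL_GPUMEM_CACHE_INV);

		/* a register write, with an illustrative offset/value pair */
		trace_kgsl_regwrite(device, 0x0E40, 0x00000001);
	}
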
diff --git a/drivers/gpu/msm/z180.c b/drivers/gpu/msm/z180.c
index 484630d..a07959b 100644
--- a/drivers/gpu/msm/z180.c
+++ b/drivers/gpu/msm/z180.c
@@ -1,4 +1,4 @@
-/* Copyright (c) 2002,2007-2012, The Linux Foundation. All rights reserved.
+/* Copyright (c) 2002,2007-2013, The Linux Foundation. All rights reserved.
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License version 2 and
@@ -17,6 +17,7 @@
 #include "kgsl.h"
 #include "kgsl_cffdump.h"
 #include "kgsl_sharedmem.h"
+#include "kgsl_trace.h"
 
 #include "z180.h"
 #include "z180_reg.h"
@@ -93,7 +94,8 @@
 #define Z180_CMDWINDOW_TARGET_SHIFT		0
 #define Z180_CMDWINDOW_ADDR_SHIFT		8
 
-static int z180_start(struct kgsl_device *device, unsigned int init_ram);
+static int z180_init(struct kgsl_device *device);
+static int z180_start(struct kgsl_device *device);
 static int z180_stop(struct kgsl_device *device);
 static int z180_wait(struct kgsl_device *device,
 				struct kgsl_context *context,
@@ -212,10 +214,6 @@
 
 			queue_work(device->work_queue, &device->ts_expired_ws);
 			wake_up_interruptible(&device->wait_queue);
-
-			atomic_notifier_call_chain(
-				&(device->ts_notifier_list),
-				device->id, NULL);
 		}
 	}
 
@@ -248,22 +246,26 @@
 	int result = 0;
 	struct z180_device *z180_dev = Z180_DEVICE(device);
 
-	result = kgsl_mmu_map_global(pagetable, &device->mmu.setstate_memory,
-				     GSL_PT_PAGE_RV | GSL_PT_PAGE_WV);
+	result = kgsl_mmu_map_global(pagetable, &device->mmu.setstate_memory);
 
 	if (result)
 		goto error;
 
-	result = kgsl_mmu_map_global(pagetable, &device->memstore,
-				     GSL_PT_PAGE_RV | GSL_PT_PAGE_WV);
+	result = kgsl_mmu_map_global(pagetable, &device->memstore);
 	if (result)
 		goto error_unmap_dummy;
 
 	result = kgsl_mmu_map_global(pagetable,
-				     &z180_dev->ringbuffer.cmdbufdesc,
-				     GSL_PT_PAGE_RV);
+				     &z180_dev->ringbuffer.cmdbufdesc);
 	if (result)
 		goto error_unmap_memstore;
+	/*
+	 * Set the mpu end to the last "normal" global memory we use.
+	 * For the IOMMU, this will be used to restrict access to the
+	 * mapped registers.
+	 */
+	device->mh.mpu_range = z180_dev->ringbuffer.cmdbufdesc.gpuaddr +
+				z180_dev->ringbuffer.cmdbufdesc.size;
 	return result;
 
 error_unmap_dummy:
@@ -319,16 +321,11 @@
 	*p++ = ADDR_VGV3_LAST << 24;
 }
 
-static void z180_cmdstream_start(struct kgsl_device *device, int init_ram)
+static void z180_cmdstream_start(struct kgsl_device *device)
 {
 	struct z180_device *z180_dev = Z180_DEVICE(device);
 	unsigned int cmd = VGV3_NEXTCMD_JUMP << VGV3_NEXTCMD_NEXTCMD_FSHIFT;
 
-	if (init_ram) {
-		z180_dev->timestamp = 0;
-		z180_dev->current_timestamp = 0;
-	}
-
 	addmarker(&z180_dev->ringbuffer, 0);
 
 	z180_cmdwindow_write(device, ADDR_VGV3_MODE, 4);
@@ -487,6 +484,10 @@
 	z180_cmdwindow_write(device, ADDR_VGV3_CONTROL, cmd);
 	z180_cmdwindow_write(device, ADDR_VGV3_CONTROL, 0);
 error:
+
+	trace_kgsl_issueibcmds(device, context->id, ibdesc, numibs,
+		*timestamp, ctrl, result, 0);
+
 	return (int)result;
 }
 
@@ -495,6 +496,7 @@
 	struct z180_device *z180_dev = Z180_DEVICE(device);
 	memset(&z180_dev->ringbuffer, 0, sizeof(struct z180_ringbuffer));
 	z180_dev->ringbuffer.prevctx = Z180_INVALID_CONTEXT;
+	z180_dev->ringbuffer.cmdbufdesc.flags = KGSL_MEMFLAGS_GPUREADONLY;
 	return kgsl_allocate_contiguous(&z180_dev->ringbuffer.cmdbufdesc,
 		Z180_RB_SIZE);
 }
@@ -551,7 +553,17 @@
 	return 0;
 }
 
-static int z180_start(struct kgsl_device *device, unsigned int init_ram)
+static int z180_init(struct kgsl_device *device)
+{
+	struct z180_device *z180_dev = Z180_DEVICE(device);
+
+	z180_dev->timestamp = 0;
+	z180_dev->current_timestamp = 0;
+
+	return 0;
+}
+
+static int z180_start(struct kgsl_device *device)
 {
 	int status = 0;
 
@@ -568,11 +580,14 @@
 	if (status)
 		goto error_clk_off;
 
-	z180_cmdstream_start(device, init_ram);
+	z180_cmdstream_start(device);
 
 	mod_timer(&device->idle_timer, jiffies + FIRST_TIMEOUT);
 	kgsl_pwrctrl_irq(device, KGSL_PWRFLAGS_ON);
 	device->ftbl->irqctrl(device, 1);
+
+	device->reset_counter++;
+
 	return 0;
 
 error_clk_off:
@@ -811,9 +826,9 @@
 {
 	int status = -EINVAL;
 
-	/* Don't wait forever, set a max (10 sec) value for now */
+	/* Don't wait forever, set a max of Z180_IDLE_TIMEOUT */
 	if (msecs == -1)
-		msecs = 10 * MSEC_PER_SEC;
+		msecs = Z180_IDLE_TIMEOUT;
 
 	mutex_unlock(&device->mutex);
 	status = z180_wait(device, context, timestamp, msecs);
@@ -915,6 +930,7 @@
 	.idle = z180_idle,
 	.isidle = z180_isidle,
 	.suspend_context = z180_suspend_context,
+	.init = z180_init,
 	.start = z180_start,
 	.stop = z180_stop,
 	.getproperty = z180_getproperty,
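
The init/start split changes when the z180 timestamps get reset: the one-time state that used to hinge on the init_ram argument now lives in ->init(), while ->start() runs on every power-up and bumps reset_counter. A hedged sketch of the expected call ordering follows; the wrapper name and the first_open flag are assumptions, the ftbl hooks are the ones wired up above.

	static int bring_up_z180(struct kgsl_device *device, bool first_open)
	{
		int ret = 0;

		if (first_open)
			ret = device->ftbl->init(device);	/* zero the timestamps once */
		if (!ret)
			ret = device->ftbl->start(device);	/* clocks, IRQs, reset_counter++ */
		return ret;
	}
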
diff --git a/drivers/gpu/msm/z180.h b/drivers/gpu/msm/z180.h
index 268aac3..1be0870 100644
--- a/drivers/gpu/msm/z180.h
+++ b/drivers/gpu/msm/z180.h
@@ -1,4 +1,4 @@
-/* Copyright (c) 2008-2012, The Linux Foundation. All rights reserved.
+/* Copyright (c) 2008-2013, The Linux Foundation. All rights reserved.
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License version 2 and
@@ -29,7 +29,7 @@
 #define Z180_DEFAULT_PWRSCALE_POLICY  NULL
 
-/* Wait a maximum of 10 seconds when trying to idle the core */
-#define Z180_IDLE_TIMEOUT (10 * 1000)
+/* Wait a maximum of 20 seconds when trying to idle the core */
+#define Z180_IDLE_TIMEOUT (20 * 1000)
 
 struct z180_ringbuffer {
 	unsigned int prevctx;