[SPARC64]: Fix streaming buffer flushing on PCI and SBUS.

First, if the DMA direction is TODEVICE, dirty data in the
streaming cache is impossible, so we can elide the flush-flag
synchronization in that case.
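
To see the shape of the change outside the kernel, here is a
minimal, standalone C sketch of the direction test; the enum values
and the two helper functions are hypothetical stand-ins for the real
streaming buffer machinery, not the driver API:

	#include <stdio.h>

	enum dma_direction { TODEVICE, FROMDEVICE, BIDIRECTIONAL };

	/* Hypothetical stand-ins for the real strbuf operations. */
	static void flush_strbuf_lines(void) { puts("flush lines"); }
	static void wait_on_flush_flag(void) { puts("sync flush-flag"); }

	static void strbuf_flush(enum dma_direction dir)
	{
		flush_strbuf_lines();

		/* A device-bound mapping cannot leave dirty data in
		 * the streaming cache, so the flush-flag wait can be
		 * skipped entirely.
		 */
		if (dir == TODEVICE)
			return;

		wait_on_flush_flag();
	}

	int main(void)
	{
		strbuf_flush(TODEVICE);		/* no synchronization */
		strbuf_flush(FROMDEVICE);	/* full synchronization */
		return 0;
	}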

Next, the context allocator is broken: it is a bare incrementing
counter, so it is highly likely that a context number gets handed
out again while an earlier DMA mapping is still using it.  That
confuses the strbuf flushing code and makes it run inefficiently.
Replace it with a bitmap of in-use contexts plus a lowest-free
search hint.
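
The replacement tracks live contexts in a bitmap with a lowest-free
search hint, as the hunks below show.  The following standalone C
sketch models the same scheme in userspace; NUM_CTXS, the linear
find_next_zero_bit() and main() are simplified stand-ins for the
kernel versions:

	#include <stdio.h>

	#define NUM_CTXS	64	/* kernel: IOMMU_NUM_CTXS */
	#define BITS_PER_LONG	(8 * (int)sizeof(unsigned long))

	static unsigned long ctx_bitmap[NUM_CTXS / 32 + 1];
	static int ctx_lowest_free = 1;	/* ctx 0 means "no context" */

	/* Linear stand-in for the kernel helper: first zero bit in
	 * [offset, size), or size if every bit in range is set.
	 */
	static int find_next_zero_bit(const unsigned long *map,
				      int size, int offset)
	{
		int n;

		for (n = offset; n < size; n++)
			if (!(map[n / BITS_PER_LONG] &
			      (1UL << (n % BITS_PER_LONG))))
				return n;
		return size;
	}

	static int alloc_ctx(void)
	{
		int lowest = ctx_lowest_free;
		int n = find_next_zero_bit(ctx_bitmap, NUM_CTXS, lowest);

		if (n == NUM_CTXS) {
			/* Nothing free above the hint, wrap to 1. */
			n = find_next_zero_bit(ctx_bitmap, lowest, 1);
			if (n == lowest)
				return 0;	/* out of contexts */
		}
		ctx_bitmap[n / BITS_PER_LONG] |=
			1UL << (n % BITS_PER_LONG);
		return n;
	}

	static void free_ctx(int ctx)
	{
		if (ctx) {
			ctx_bitmap[ctx / BITS_PER_LONG] &=
				~(1UL << (ctx % BITS_PER_LONG));
			if (ctx < ctx_lowest_free)
				ctx_lowest_free = ctx;	/* lower hint */
		}
	}

	int main(void)
	{
		int a = alloc_ctx(), b = alloc_ctx();

		printf("a=%d b=%d\n", a, b);	/* distinct contexts */
		free_ctx(a);
		printf("c=%d\n", alloc_ctx());	/* a's slot reused */
		return 0;
	}

Context 0 is deliberately never handed out: it doubles as the "no
context" value, so the exhaustion fallback and iommu_free_ctx()'s
early exit keep the existing ctx == 0 paths working unchanged.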

Signed-off-by: David S. Miller <davem@davemloft.net>
diff --git a/arch/sparc64/kernel/pci_iommu.c b/arch/sparc64/kernel/pci_iommu.c
index 33ca56c..1807876 100644
--- a/arch/sparc64/kernel/pci_iommu.c
+++ b/arch/sparc64/kernel/pci_iommu.c
@@ -196,6 +196,34 @@
 	return NULL;
 }
 
+static int iommu_alloc_ctx(struct pci_iommu *iommu)
+{
+	int lowest = iommu->ctx_lowest_free;
+	int n = find_next_zero_bit(iommu->ctx_bitmap,
+				   IOMMU_NUM_CTXS, lowest);
+
+	if (unlikely(n == IOMMU_NUM_CTXS)) {
+		n = find_next_zero_bit(iommu->ctx_bitmap, lowest, 1);
+		if (unlikely(n == lowest)) {
+			printk(KERN_WARNING "IOMMU: Ran out of contexts.\n");
+			n = 0;
+		}
+	}
+	if (n)
+		__set_bit(n, iommu->ctx_bitmap);
+
+	return n;
+}
+
+static inline void iommu_free_ctx(struct pci_iommu *iommu, int ctx)
+{
+	if (likely(ctx)) {
+		__clear_bit(ctx, iommu->ctx_bitmap);
+		if (ctx < iommu->ctx_lowest_free)
+			iommu->ctx_lowest_free = ctx;
+	}
+}
+
 /* Allocate and map kernel buffer of size SIZE using consistent mode
  * DMA for PCI device PDEV.  Return non-NULL cpu-side address if
  * successful and set *DMA_ADDRP to the PCI side dma address.
@@ -236,7 +264,7 @@
 	npages = size >> IO_PAGE_SHIFT;
 	ctx = 0;
 	if (iommu->iommu_ctxflush)
-		ctx = iommu->iommu_cur_ctx++;
+		ctx = iommu_alloc_ctx(iommu);
 	first_page = __pa(first_page);
 	while (npages--) {
 		iopte_val(*iopte) = (IOPTE_CONSISTENT(ctx) |
@@ -317,6 +345,8 @@
 		}
 	}
 
+	iommu_free_ctx(iommu, ctx);
+
 	spin_unlock_irqrestore(&iommu->lock, flags);
 
 	order = get_order(size);
@@ -360,7 +390,7 @@
 	base_paddr = __pa(oaddr & IO_PAGE_MASK);
 	ctx = 0;
 	if (iommu->iommu_ctxflush)
-		ctx = iommu->iommu_cur_ctx++;
+		ctx = iommu_alloc_ctx(iommu);
 	if (strbuf->strbuf_enabled)
 		iopte_protection = IOPTE_STREAMING(ctx);
 	else
@@ -380,39 +410,55 @@
 	return PCI_DMA_ERROR_CODE;
 }
 
-static void pci_strbuf_flush(struct pci_strbuf *strbuf, struct pci_iommu *iommu, u32 vaddr, unsigned long ctx, unsigned long npages)
+static void pci_strbuf_flush(struct pci_strbuf *strbuf, struct pci_iommu *iommu, u32 vaddr, unsigned long ctx, unsigned long npages, int direction)
 {
 	int limit;
 
-	PCI_STC_FLUSHFLAG_INIT(strbuf);
 	if (strbuf->strbuf_ctxflush &&
 	    iommu->iommu_ctxflush) {
 		unsigned long matchreg, flushreg;
+		u64 val;
 
 		flushreg = strbuf->strbuf_ctxflush;
 		matchreg = PCI_STC_CTXMATCH_ADDR(strbuf, ctx);
 
-		limit = 100000;
+		if (pci_iommu_read(matchreg) == 0)
+			goto do_flush_sync;
+
 		pci_iommu_write(flushreg, ctx);
-		for(;;) {
-			if (((long)pci_iommu_read(matchreg)) >= 0L)
-				break;
-			limit--;
-			if (!limit)
-				break;
-			udelay(1);
+		if ((val = pci_iommu_read(matchreg)) == 0)
+			goto do_flush_sync;
+
+		val &= 0xffff;
+		while (val) {
+			if (val & 0x1)
+				pci_iommu_write(flushreg, ctx);
+			val >>= 1;
 		}
-		if (!limit)
+		val = pci_iommu_read(matchreg);
+		if (unlikely(val)) {
 			printk(KERN_WARNING "pci_strbuf_flush: ctx flush "
-			       "timeout vaddr[%08x] ctx[%lx]\n",
-			       vaddr, ctx);
+			       "timeout matchreg[%lx] ctx[%lx]\n",
+			       val, ctx);
+			goto do_page_flush;
+		}
 	} else {
 		unsigned long i;
 
+	do_page_flush:
 		for (i = 0; i < npages; i++, vaddr += IO_PAGE_SIZE)
 			pci_iommu_write(strbuf->strbuf_pflush, vaddr);
 	}
 
+do_flush_sync:
+	/* If the device could not have possibly put dirty data into
+	 * the streaming cache, no flush-flag synchronization needs
+	 * to be performed.
+	 */
+	if (direction == PCI_DMA_TODEVICE)
+		return;
+
+	PCI_STC_FLUSHFLAG_INIT(strbuf);
 	pci_iommu_write(strbuf->strbuf_fsync, strbuf->strbuf_flushflag_pa);
 	(void) pci_iommu_read(iommu->write_complete_reg);
 
 	limit = 100000;
 	while (!PCI_STC_FLUSHFLAG_SET(strbuf)) {
 		limit--;
@@ -466,7 +512,7 @@
 
 	/* Step 1: Kick data out of streaming buffers if necessary. */
 	if (strbuf->strbuf_enabled)
-		pci_strbuf_flush(strbuf, iommu, bus_addr, ctx, npages);
+		pci_strbuf_flush(strbuf, iommu, bus_addr, ctx, npages, direction);
 
 	/* Step 2: Clear out first TSB entry. */
 	iopte_make_dummy(iommu, base);
@@ -474,6 +520,8 @@
 	free_streaming_cluster(iommu, bus_addr - iommu->page_table_map_base,
 			       npages, ctx);
 
+	iommu_free_ctx(iommu, ctx);
+
 	spin_unlock_irqrestore(&iommu->lock, flags);
 }
 
@@ -613,7 +661,7 @@
 	/* Step 4: Choose a context if necessary. */
 	ctx = 0;
 	if (iommu->iommu_ctxflush)
-		ctx = iommu->iommu_cur_ctx++;
+		ctx = iommu_alloc_ctx(iommu);
 
 	/* Step 5: Create the mappings. */
 	if (strbuf->strbuf_enabled)
@@ -678,7 +726,7 @@
 
 	/* Step 1: Kick data out of streaming buffers if necessary. */
 	if (strbuf->strbuf_enabled)
-		pci_strbuf_flush(strbuf, iommu, bus_addr, ctx, npages);
+		pci_strbuf_flush(strbuf, iommu, bus_addr, ctx, npages, direction);
 
 	/* Step 2: Clear out first TSB entry. */
 	iopte_make_dummy(iommu, base);
@@ -686,6 +734,8 @@
 	free_streaming_cluster(iommu, bus_addr - iommu->page_table_map_base,
 			       npages, ctx);
 
+	iommu_free_ctx(iommu, ctx);
+
 	spin_unlock_irqrestore(&iommu->lock, flags);
 }
 
@@ -724,7 +774,7 @@
 	}
 
 	/* Step 2: Kick data out of streaming buffers. */
-	pci_strbuf_flush(strbuf, iommu, bus_addr, ctx, npages);
+	pci_strbuf_flush(strbuf, iommu, bus_addr, ctx, npages, direction);
 
 	spin_unlock_irqrestore(&iommu->lock, flags);
 }
@@ -768,7 +818,7 @@
 	i--;
 	npages = (IO_PAGE_ALIGN(sglist[i].dma_address + sglist[i].dma_length)
 		  - bus_addr) >> IO_PAGE_SHIFT;
-	pci_strbuf_flush(strbuf, iommu, bus_addr, ctx, npages);
+	pci_strbuf_flush(strbuf, iommu, bus_addr, ctx, npages, direction);
 
 	spin_unlock_irqrestore(&iommu->lock, flags);
 }
diff --git a/arch/sparc64/kernel/pci_psycho.c b/arch/sparc64/kernel/pci_psycho.c
index 3567fa8..534320e 100644
--- a/arch/sparc64/kernel/pci_psycho.c
+++ b/arch/sparc64/kernel/pci_psycho.c
@@ -1212,7 +1212,7 @@
 
 	/* Setup initial software IOMMU state. */
 	spin_lock_init(&iommu->lock);
-	iommu->iommu_cur_ctx = 0;
+	iommu->ctx_lowest_free = 1;
 
 	/* Register addresses. */
 	iommu->iommu_control  = p->pbm_A.controller_regs + PSYCHO_IOMMU_CONTROL;
diff --git a/arch/sparc64/kernel/pci_sabre.c b/arch/sparc64/kernel/pci_sabre.c
index 5525d1e..53d333b 100644
--- a/arch/sparc64/kernel/pci_sabre.c
+++ b/arch/sparc64/kernel/pci_sabre.c
@@ -1265,7 +1265,7 @@
 
 	/* Setup initial software IOMMU state. */
 	spin_lock_init(&iommu->lock);
-	iommu->iommu_cur_ctx = 0;
+	iommu->ctx_lowest_free = 1;
 
 	/* Register addresses. */
 	iommu->iommu_control  = p->pbm_A.controller_regs + SABRE_IOMMU_CONTROL;
diff --git a/arch/sparc64/kernel/pci_schizo.c b/arch/sparc64/kernel/pci_schizo.c
index e93fcad..5753175 100644
--- a/arch/sparc64/kernel/pci_schizo.c
+++ b/arch/sparc64/kernel/pci_schizo.c
@@ -1753,7 +1753,7 @@
 
 	/* Setup initial software IOMMU state. */
 	spin_lock_init(&iommu->lock);
-	iommu->iommu_cur_ctx = 0;
+	iommu->ctx_lowest_free = 1;
 
 	/* Register addresses, SCHIZO has iommu ctx flushing. */
 	iommu->iommu_control  = pbm->pbm_regs + SCHIZO_IOMMU_CONTROL;
diff --git a/arch/sparc64/kernel/sbus.c b/arch/sparc64/kernel/sbus.c
index 76ea645..89f5e01 100644
--- a/arch/sparc64/kernel/sbus.c
+++ b/arch/sparc64/kernel/sbus.c
@@ -117,17 +117,25 @@
 
 #define STRBUF_TAG_VALID	0x02UL
 
-static void sbus_strbuf_flush(struct sbus_iommu *iommu, u32 base, unsigned long npages)
+static void sbus_strbuf_flush(struct sbus_iommu *iommu, u32 base, unsigned long npages, int direction)
 {
 	unsigned long n;
 	int limit;
 
-	iommu->strbuf_flushflag = 0UL;
 	n = npages;
 	while (n--)
 		upa_writeq(base + (n << IO_PAGE_SHIFT),
 			   iommu->strbuf_regs + STRBUF_PFLUSH);
 
+	/* If the device could not have possibly put dirty data into
+	 * the streaming cache, no flush-flag synchronization needs
+	 * to be performed.
+	 */
+	if (direction == SBUS_DMA_TODEVICE)
+		return;
+
+	iommu->strbuf_flushflag = 0UL;
+
 	/* Whoopee cushion! */
 	upa_writeq(__pa(&iommu->strbuf_flushflag),
 		   iommu->strbuf_regs + STRBUF_FSYNC);
@@ -421,7 +429,7 @@
 
 	spin_lock_irqsave(&iommu->lock, flags);
 	free_streaming_cluster(iommu, dma_base, size >> IO_PAGE_SHIFT);
-	sbus_strbuf_flush(iommu, dma_base, size >> IO_PAGE_SHIFT);
+	sbus_strbuf_flush(iommu, dma_base, size >> IO_PAGE_SHIFT, direction);
 	spin_unlock_irqrestore(&iommu->lock, flags);
 }
 
@@ -584,7 +592,7 @@
 	iommu = sdev->bus->iommu;
 	spin_lock_irqsave(&iommu->lock, flags);
 	free_streaming_cluster(iommu, dvma_base, size >> IO_PAGE_SHIFT);
-	sbus_strbuf_flush(iommu, dvma_base, size >> IO_PAGE_SHIFT);
+	sbus_strbuf_flush(iommu, dvma_base, size >> IO_PAGE_SHIFT, direction);
 	spin_unlock_irqrestore(&iommu->lock, flags);
 }
 
@@ -596,7 +604,7 @@
 	size = (IO_PAGE_ALIGN(base + size) - (base & IO_PAGE_MASK));
 
 	spin_lock_irqsave(&iommu->lock, flags);
-	sbus_strbuf_flush(iommu, base & IO_PAGE_MASK, size >> IO_PAGE_SHIFT);
+	sbus_strbuf_flush(iommu, base & IO_PAGE_MASK, size >> IO_PAGE_SHIFT, direction);
 	spin_unlock_irqrestore(&iommu->lock, flags);
 }
 
@@ -620,7 +628,7 @@
 	size = IO_PAGE_ALIGN(sg[i].dma_address + sg[i].dma_length) - base;
 
 	spin_lock_irqsave(&iommu->lock, flags);
-	sbus_strbuf_flush(iommu, base, size >> IO_PAGE_SHIFT);
+	sbus_strbuf_flush(iommu, base, size >> IO_PAGE_SHIFT, direction);
 	spin_unlock_irqrestore(&iommu->lock, flags);
 }