ioat2,3: cacheline align software descriptor allocations

All the necessary fields for handling an ioat2,3 ring entry can fit into
one cacheline.  Move ->len prior to ->txd in struct ioat_ring_ent so the
small bookkeeping fields share the leading cacheline ahead of the larger
txd member, and move allocation of these entries to a hw-cache-aligned
kmem cache to reduce the number of cachelines dirtied for descriptor
management.

Signed-off-by: Dan Williams <dan.j.williams@intel.com>
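---
The hunks below cover only dma_v2.h: they reorder the struct fields and
declare ioat2_cache, but the creation of the cache and the switch at the
allocation site are not part of this excerpt.  As a hedged sketch, the
missing pieces could look roughly like the following, assuming the cache
is set up from the driver's module init path (the function names and
file placement here are illustrative, not confirmed by this excerpt):

	#include <linux/module.h>
	#include <linux/slab.h>
	#include "dma_v2.h"	/* struct ioat_ring_ent */

	struct kmem_cache *ioat2_cache;

	static int __init ioat_init_module(void)
	{
		/* SLAB_HWCACHE_ALIGN starts every object on its own
		 * cacheline, so touching one ring entry's bookkeeping
		 * fields dirties a single line.
		 */
		ioat2_cache = kmem_cache_create("ioat2",
						sizeof(struct ioat_ring_ent),
						0, SLAB_HWCACHE_ALIGN, NULL);
		if (!ioat2_cache)
			return -ENOMEM;
		return 0;
	}

	static void __exit ioat_exit_module(void)
	{
		kmem_cache_destroy(ioat2_cache);
	}

	module_init(ioat_init_module);
	module_exit(ioat_exit_module);

The per-entry allocation would then move from kzalloc()/kfree() to
kmem_cache_zalloc(ioat2_cache, flags) and
kmem_cache_free(ioat2_cache, desc) in the ring entry alloc/free paths.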
diff --git a/drivers/dma/ioat/dma_v2.h b/drivers/dma/ioat/dma_v2.h
index 9baa3d6..ac00adc 100644
--- a/drivers/dma/ioat/dma_v2.h
+++ b/drivers/dma/ioat/dma_v2.h
@@ -116,8 +116,8 @@
 
 struct ioat_ring_ent {
 	struct ioat_dma_descriptor *hw;
-	struct dma_async_tx_descriptor txd;
 	size_t len;
+	struct dma_async_tx_descriptor txd;
 	#ifdef DEBUG
 	int id;
 	#endif
@@ -143,4 +143,5 @@
 int __devinit ioat3_dma_probe(struct ioatdma_device *dev, int dca);
 struct dca_provider * __devinit ioat2_dca_init(struct pci_dev *pdev, void __iomem *iobase);
 struct dca_provider * __devinit ioat3_dca_init(struct pci_dev *pdev, void __iomem *iobase);
+extern struct kmem_cache *ioat2_cache;
 #endif /* IOATDMA_V2_H */