IPoIB: Use separate CQ for UD send completions

Use a dedicated CQ for UD send completions. Also, do not arm the UD
send CQ, which reduces the number of interrupts generated.  This patch
farther reduces overhead by not calling poll CQ for every posted send
WR -- it does polls only when there 16 or more outstanding work requests.

Signed-off-by: Eli Cohen <eli@mellanox.co.il>
Signed-off-by: Roland Dreier <rolandd@cisco.com>
diff --git a/drivers/infiniband/ulp/ipoib/ipoib.h b/drivers/infiniband/ulp/ipoib/ipoib.h
index f1f142d..9044f88 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib.h
+++ b/drivers/infiniband/ulp/ipoib/ipoib.h
@@ -95,6 +95,8 @@
 	IPOIB_MCAST_FLAG_SENDONLY = 1,
 	IPOIB_MCAST_FLAG_BUSY	  = 2,	/* joining or already joined */
 	IPOIB_MCAST_FLAG_ATTACHED = 3,
+
+	MAX_SEND_CQE		  = 16,
 };
 
 #define	IPOIB_OP_RECV   (1ul << 31)
@@ -285,7 +287,8 @@
 	u16		  pkey_index;
 	struct ib_pd	 *pd;
 	struct ib_mr	 *mr;
-	struct ib_cq	 *cq;
+	struct ib_cq	 *recv_cq;
+	struct ib_cq	 *send_cq;
 	struct ib_qp	 *qp;
 	u32		  qkey;
 
@@ -305,6 +308,7 @@
 	struct ib_sge	     tx_sge[MAX_SKB_FRAGS + 1];
 	struct ib_send_wr    tx_wr;
 	unsigned	     tx_outstanding;
+	struct ib_wc	     send_wc[MAX_SEND_CQE];
 
 	struct ib_recv_wr    rx_wr;
 	struct ib_sge	     rx_sge[IPOIB_UD_RX_SG];
@@ -662,7 +666,6 @@
 static inline void ipoib_unregister_debugfs(void) { }
 #endif
 
-
 #define ipoib_printk(level, priv, format, arg...)	\
 	printk(level "%s: " format, ((struct ipoib_dev_priv *) priv)->dev->name , ## arg)
 #define ipoib_warn(priv, format, arg...)		\