RDMA/cxgb4: Support variable sized work requests

T4 EQ entries are in multiples of 64 bytes.  Currently the RDMA SQ and
RQ use fixed sized entries composed of 4 EQ entries for the SQ and 2
EQ entries for the RQ.  For optimial latency with small IO, we need to
change this so the HW only needs to DMA the EQ entries actually used
by a given work request.

Implementation:

- add wq_pidx counter to track where we are in the EQ.  cidx/pidx are
  used for the sw sq/rq tracking and flow control.

- the variable part of work requests is the SGL.  Add new functions to
  build the SGL and/or immediate data directly in the EQ memory
  wrapping when needed.

- adjust the min burst size for the EQ contexts to 64B.

Signed-off-by: Steve Wise <swise@opengridcomputing.com>
Signed-off-by: Roland Dreier <rolandd@cisco.com>
diff --git a/drivers/infiniband/hw/cxgb4/t4.h b/drivers/infiniband/hw/cxgb4/t4.h
index 9cf8d85..aef55f4 100644
--- a/drivers/infiniband/hw/cxgb4/t4.h
+++ b/drivers/infiniband/hw/cxgb4/t4.h
@@ -65,10 +65,10 @@
 	u8 db_off;
 };
 
-#define T4_EQ_SIZE 64
+#define T4_EQ_ENTRY_SIZE 64
 
 #define T4_SQ_NUM_SLOTS 4
-#define T4_SQ_NUM_BYTES (T4_EQ_SIZE * T4_SQ_NUM_SLOTS)
+#define T4_SQ_NUM_BYTES (T4_EQ_ENTRY_SIZE * T4_SQ_NUM_SLOTS)
 #define T4_MAX_SEND_SGE ((T4_SQ_NUM_BYTES - sizeof(struct fw_ri_send_wr) - \
 			sizeof(struct fw_ri_isgl)) / sizeof(struct fw_ri_sge))
 #define T4_MAX_SEND_INLINE ((T4_SQ_NUM_BYTES - sizeof(struct fw_ri_send_wr) - \
@@ -84,7 +84,7 @@
 #define T4_MAX_FR_DEPTH (T4_MAX_FR_IMMD / sizeof(u64))
 
 #define T4_RQ_NUM_SLOTS 2
-#define T4_RQ_NUM_BYTES (T4_EQ_SIZE * T4_RQ_NUM_SLOTS)
+#define T4_RQ_NUM_BYTES (T4_EQ_ENTRY_SIZE * T4_RQ_NUM_SLOTS)
 #define T4_MAX_RECV_SGE 4
 
 union t4_wr {
@@ -97,20 +97,18 @@
 	struct fw_ri_fr_nsmr_wr fr;
 	struct fw_ri_inv_lstag_wr inv;
 	struct t4_status_page status;
-	__be64 flits[T4_EQ_SIZE / sizeof(__be64) * T4_SQ_NUM_SLOTS];
+	__be64 flits[T4_EQ_ENTRY_SIZE / sizeof(__be64) * T4_SQ_NUM_SLOTS];
 };
 
 union t4_recv_wr {
 	struct fw_ri_recv_wr recv;
 	struct t4_status_page status;
-	__be64 flits[T4_EQ_SIZE / sizeof(__be64) * T4_RQ_NUM_SLOTS];
+	__be64 flits[T4_EQ_ENTRY_SIZE / sizeof(__be64) * T4_RQ_NUM_SLOTS];
 };
 
 static inline void init_wr_hdr(union t4_wr *wqe, u16 wrid,
 			       enum fw_wr_opcodes opcode, u8 flags, u8 len16)
 {
-	int slots_used;
-
 	wqe->send.opcode = (u8)opcode;
 	wqe->send.flags = flags;
 	wqe->send.wrid = wrid;
@@ -118,12 +116,6 @@
 	wqe->send.r1[1] = 0;
 	wqe->send.r1[2] = 0;
 	wqe->send.len16 = len16;
-
-	slots_used = DIV_ROUND_UP(len16*16, T4_EQ_SIZE);
-	while (slots_used < T4_SQ_NUM_SLOTS) {
-		wqe->flits[slots_used * T4_EQ_SIZE / sizeof(__be64)] = 0;
-		slots_used++;
-	}
 }
 
 /* CQE/AE status codes */
@@ -289,6 +281,7 @@
 	u16 size;
 	u16 cidx;
 	u16 pidx;
+	u16 wq_pidx;
 };
 
 struct t4_swrqe {
@@ -310,6 +303,7 @@
 	u16 size;
 	u16 cidx;
 	u16 pidx;
+	u16 wq_pidx;
 };
 
 struct t4_wq {
@@ -340,11 +334,14 @@
 	return wq->rq.size - 1 - wq->rq.in_use;
 }
 
-static inline void t4_rq_produce(struct t4_wq *wq)
+static inline void t4_rq_produce(struct t4_wq *wq, u8 len16)
 {
 	wq->rq.in_use++;
 	if (++wq->rq.pidx == wq->rq.size)
 		wq->rq.pidx = 0;
+	wq->rq.wq_pidx += DIV_ROUND_UP(len16*16, T4_EQ_ENTRY_SIZE);
+	if (wq->rq.wq_pidx >= wq->rq.size * T4_RQ_NUM_SLOTS)
+		wq->rq.wq_pidx %= wq->rq.size * T4_RQ_NUM_SLOTS;
 }
 
 static inline void t4_rq_consume(struct t4_wq *wq)
@@ -370,11 +367,14 @@
 	return wq->sq.size - 1 - wq->sq.in_use;
 }
 
-static inline void t4_sq_produce(struct t4_wq *wq)
+static inline void t4_sq_produce(struct t4_wq *wq, u8 len16)
 {
 	wq->sq.in_use++;
 	if (++wq->sq.pidx == wq->sq.size)
 		wq->sq.pidx = 0;
+	wq->sq.wq_pidx += DIV_ROUND_UP(len16*16, T4_EQ_ENTRY_SIZE);
+	if (wq->sq.wq_pidx >= wq->sq.size * T4_SQ_NUM_SLOTS)
+		wq->sq.wq_pidx %= wq->sq.size * T4_SQ_NUM_SLOTS;
 }
 
 static inline void t4_sq_consume(struct t4_wq *wq)
@@ -386,14 +386,12 @@
 
 static inline void t4_ring_sq_db(struct t4_wq *wq, u16 inc)
 {
-	inc *= T4_SQ_NUM_SLOTS;
 	wmb();
 	writel(QID(wq->sq.qid) | PIDX(inc), wq->db);
 }
 
 static inline void t4_ring_rq_db(struct t4_wq *wq, u16 inc)
 {
-	inc *= T4_RQ_NUM_SLOTS;
 	wmb();
 	writel(QID(wq->rq.qid) | PIDX(inc), wq->db);
 }