RDMA/cxgb4: Fastreg NSMR fixes

- Remove DSGL support - it doesn't work on T4.
- Wrap the immediate PBL around the end of the SQ as needed when
  building it in the WR (see the sketch below).
- Adjust the max PBL depth allowed based on ULPTX alignment
  requirements.
- Bump the SQ slots per WR to 5 to allow fast registers of up to
  128MB (see the slot math below).
- Advertise fastreg support by default.
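
The common thread in the PBL changes is treating the SQ as the ring
buffer it is: build_fastreg() now copies the immediate PBL into the
WQE 64 bits at a time, wraps the write pointer back to the head of
the queue when it runs off the end, and zero-fills the remainder up
to the 32-byte-rounded pbllen.  A minimal stand-alone sketch of that
technique (the ring size, types, and names here are illustrative,
not the driver's):

    #include <stdint.h>

    #define RING_U64S 64                 /* stand-in for the SQ size */
    static uint64_t ring[RING_U64S];     /* stand-in for sq->queue */

    /*
     * Copy n page addresses into the ring at *pp, wrapping at the
     * end of the ring, then zero-fill up to pbllen bytes (pbllen is
     * n * 8 rounded up to a 32-byte multiple, as in the patch).
     */
    static void copy_pbl(uint64_t **pp, const uint64_t *pbl, int n,
                         int pbllen)
    {
        uint64_t *p = *pp;
        int rem = pbllen;
        int i;

        for (i = 0; i < n; i++) {
            *p = pbl[i];                 /* driver byte-swaps here */
            rem -= sizeof(*p);
            if (++p == &ring[RING_U64S])
                p = ring;                /* wrap to the queue head */
        }
        while (rem > 0) {                /* pad for ULPTX alignment */
            *p = 0;
            rem -= sizeof(*p);
            if (++p == &ring[RING_U64S])
                p = ring;
        }
        *pp = p;
    }

Without the wrap, a PBL built in the last slots of the queue would
run past the end of the queue memory; the zero padding keeps the
immediate data length at the 32-byte multiple ULPTX expects.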

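For scale on the 128MB figure: five 64-byte SQ slots leave roughly
256 bytes for the immediate PBL once the fastreg WR and IMMD headers
are taken out, i.e. 32 u64 page addresses, and 32 pages of 4MB each
cover 128MB.  (The 64-byte slot and 4MB page sizes are inferred from
the 128MB claim; the defines live in t4.h, outside the hunks shown.)
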
Signed-off-by: Steve Wise <swise@opengridcomputing.com>
Signed-off-by: Roland Dreier <rolandd@cisco.com>
diff --git a/drivers/infiniband/hw/cxgb4/qp.c b/drivers/infiniband/hw/cxgb4/qp.c
index ff04e5c..057cb25 100644
--- a/drivers/infiniband/hw/cxgb4/qp.c
+++ b/drivers/infiniband/hw/cxgb4/qp.c
@@ -505,13 +505,15 @@
 	return 0;
 }
 
-static int build_fastreg(union t4_wr *wqe, struct ib_send_wr *wr, u8 *len16)
+static int build_fastreg(struct t4_sq *sq, union t4_wr *wqe,
+			 struct ib_send_wr *wr, u8 *len16)
 {
 
 	struct fw_ri_immd *imdp;
 	__be64 *p;
 	int i;
 	int pbllen = roundup(wr->wr.fast_reg.page_list_len * sizeof(u64), 32);
+	int rem;
 
 	if (wr->wr.fast_reg.page_list_len > T4_MAX_FR_DEPTH)
 		return -EINVAL;
@@ -526,32 +528,28 @@
 	wqe->fr.va_hi = cpu_to_be32(wr->wr.fast_reg.iova_start >> 32);
 	wqe->fr.va_lo_fbo = cpu_to_be32(wr->wr.fast_reg.iova_start &
 					0xffffffff);
-	if (pbllen > T4_MAX_FR_IMMD) {
-		struct c4iw_fr_page_list *c4pl =
-				to_c4iw_fr_page_list(wr->wr.fast_reg.page_list);
-		struct fw_ri_dsgl *sglp;
-
-		sglp = (struct fw_ri_dsgl *)(&wqe->fr + 1);
-		sglp->op = FW_RI_DATA_DSGL;
-		sglp->r1 = 0;
-		sglp->nsge = cpu_to_be16(1);
-		sglp->addr0 = cpu_to_be64(c4pl->dma_addr);
-		sglp->len0 = cpu_to_be32(pbllen);
-
-		*len16 = DIV_ROUND_UP(sizeof wqe->fr + sizeof *sglp, 16);
-	} else {
-		imdp = (struct fw_ri_immd *)(&wqe->fr + 1);
-		imdp->op = FW_RI_DATA_IMMD;
-		imdp->r1 = 0;
-		imdp->r2 = 0;
-		imdp->immdlen = cpu_to_be32(pbllen);
-		p = (__be64 *)(imdp + 1);
-		for (i = 0; i < wr->wr.fast_reg.page_list_len; i++, p++)
-			*p = cpu_to_be64(
-				(u64)wr->wr.fast_reg.page_list->page_list[i]);
-		*len16 = DIV_ROUND_UP(sizeof wqe->fr + sizeof *imdp + pbllen,
-				      16);
+	WARN_ON(pbllen > T4_MAX_FR_IMMD);
+	imdp = (struct fw_ri_immd *)(&wqe->fr + 1);
+	imdp->op = FW_RI_DATA_IMMD;
+	imdp->r1 = 0;
+	imdp->r2 = 0;
+	imdp->immdlen = cpu_to_be32(pbllen);
+	p = (__be64 *)(imdp + 1);
+	rem = pbllen;
+	for (i = 0; i < wr->wr.fast_reg.page_list_len; i++) {
+		*p = cpu_to_be64((u64)wr->wr.fast_reg.page_list->page_list[i]);
+		rem -= sizeof *p;
+		if (++p == (__be64 *)&sq->queue[sq->size])
+			p = (__be64 *)sq->queue;
 	}
+	BUG_ON(rem < 0);
+	while (rem) {
+		*p = 0;
+		rem -= sizeof *p;
+		if (++p == (__be64 *)&sq->queue[sq->size])
+			p = (__be64 *)sq->queue;
+	}
+	*len16 = DIV_ROUND_UP(sizeof wqe->fr + sizeof *imdp + pbllen, 16);
 	return 0;
 }
 
@@ -652,7 +650,7 @@
 		case IB_WR_FAST_REG_MR:
 			fw_opcode = FW_RI_FR_NSMR_WR;
 			swsqe->opcode = FW_RI_FAST_REGISTER;
-			err = build_fastreg(wqe, wr, &len16);
+			err = build_fastreg(&qhp->wq.sq, wqe, wr, &len16);
 			break;
 		case IB_WR_LOCAL_INV:
 			if (wr->send_flags & IB_SEND_FENCE)
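
For context, the IB_WR_FAST_REG_MR case in the last hunk is reached
when a kernel ULP posts a fast-register work request.  A hedged
sketch using the ib_verbs API of this era (error handling omitted;
pd, qp, npages, and the dma_pages[] array are assumed to exist):

    struct ib_mr *mr;
    struct ib_fast_reg_page_list *pl;
    struct ib_send_wr wr, *bad_wr;
    int i;

    mr = ib_alloc_fast_reg_mr(pd, npages);
    pl = ib_alloc_fast_reg_page_list(qp->device, npages);
    for (i = 0; i < npages; i++)
        pl->page_list[i] = dma_pages[i];  /* DMA page addresses */

    memset(&wr, 0, sizeof wr);
    wr.opcode = IB_WR_FAST_REG_MR;
    wr.send_flags = IB_SEND_SIGNALED;
    wr.wr.fast_reg.iova_start = dma_pages[0];
    wr.wr.fast_reg.page_list = pl;
    wr.wr.fast_reg.page_list_len = npages;
    wr.wr.fast_reg.page_shift = PAGE_SHIFT;
    wr.wr.fast_reg.length = npages * PAGE_SIZE;
    wr.wr.fast_reg.access_flags = IB_ACCESS_LOCAL_WRITE |
                                  IB_ACCESS_REMOTE_WRITE;
    wr.wr.fast_reg.rkey = mr->rkey;

    ib_post_send(qp, &wr, &bad_wr);

With page_list_len capped at the new T4_MAX_FR_DEPTH, the whole PBL
fits as immediate data and build_fastreg() takes the wrap-aware path
shown above.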