blob: 570f08dc0b03982306440ebaf86e1f5a8ca34f41 [file] [log] [blame]
\"Talpey, Thomas\f58851e2007-09-10 13:50:12 -04001/*
\"Talpey, Thomas\c56c65f2007-09-10 13:51:18 -04002 * Copyright (c) 2003-2007 Network Appliance, Inc. All rights reserved.
3 *
4 * This software is available to you under a choice of one of two
5 * licenses. You may choose to be licensed under the terms of the GNU
6 * General Public License (GPL) Version 2, available from the file
7 * COPYING in the main directory of this source tree, or the BSD-type
8 * license below:
9 *
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
12 * are met:
13 *
14 * Redistributions of source code must retain the above copyright
15 * notice, this list of conditions and the following disclaimer.
16 *
17 * Redistributions in binary form must reproduce the above
18 * copyright notice, this list of conditions and the following
19 * disclaimer in the documentation and/or other materials provided
20 * with the distribution.
21 *
22 * Neither the name of the Network Appliance, Inc. nor the names of
23 * its contributors may be used to endorse or promote products
24 * derived from this software without specific prior written
25 * permission.
26 *
27 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
28 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
29 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
30 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
31 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
32 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
33 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
34 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
35 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
36 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
37 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
\"Talpey, Thomas\f58851e2007-09-10 13:50:12 -040038 */
39
\"Talpey, Thomas\c56c65f2007-09-10 13:51:18 -040040/*
41 * verbs.c
42 *
43 * Encapsulates the major functions managing:
44 * o adapters
45 * o endpoints
46 * o connections
47 * o buffer memory
48 */
49
50#include <linux/pci.h> /* for Tavor hack below */
Tejun Heo5a0e3ad2010-03-24 17:04:11 +090051#include <linux/slab.h>
\"Talpey, Thomas\c56c65f2007-09-10 13:51:18 -040052
\"Talpey, Thomas\f58851e2007-09-10 13:50:12 -040053#include "xprt_rdma.h"
54
\"Talpey, Thomas\c56c65f2007-09-10 13:51:18 -040055/*
56 * Globals/Macros
57 */
58
59#ifdef RPC_DEBUG
60# define RPCDBG_FACILITY RPCDBG_TRANS
61#endif
62
63/*
64 * internal functions
65 */
66
67/*
68 * handle replies in tasklet context, using a single, global list
69 * rdma tasklet function -- just turn around and call the func
70 * for all replies on the list
71 */
72
73static DEFINE_SPINLOCK(rpcrdma_tk_lock_g);
74static LIST_HEAD(rpcrdma_tasklets_g);
75
76static void
77rpcrdma_run_tasklet(unsigned long data)
78{
79 struct rpcrdma_rep *rep;
80 void (*func)(struct rpcrdma_rep *);
81 unsigned long flags;
82
83 data = data;
84 spin_lock_irqsave(&rpcrdma_tk_lock_g, flags);
85 while (!list_empty(&rpcrdma_tasklets_g)) {
86 rep = list_entry(rpcrdma_tasklets_g.next,
87 struct rpcrdma_rep, rr_list);
88 list_del(&rep->rr_list);
89 func = rep->rr_func;
90 rep->rr_func = NULL;
91 spin_unlock_irqrestore(&rpcrdma_tk_lock_g, flags);
92
93 if (func)
94 func(rep);
95 else
96 rpcrdma_recv_buffer_put(rep);
97
98 spin_lock_irqsave(&rpcrdma_tk_lock_g, flags);
99 }
100 spin_unlock_irqrestore(&rpcrdma_tk_lock_g, flags);
101}
102
103static DECLARE_TASKLET(rpcrdma_tasklet_g, rpcrdma_run_tasklet, 0UL);
104
105static inline void
106rpcrdma_schedule_tasklet(struct rpcrdma_rep *rep)
107{
108 unsigned long flags;
109
110 spin_lock_irqsave(&rpcrdma_tk_lock_g, flags);
111 list_add_tail(&rep->rr_list, &rpcrdma_tasklets_g);
112 spin_unlock_irqrestore(&rpcrdma_tk_lock_g, flags);
113 tasklet_schedule(&rpcrdma_tasklet_g);
114}
115
116static void
117rpcrdma_qp_async_error_upcall(struct ib_event *event, void *context)
118{
119 struct rpcrdma_ep *ep = context;
120
121 dprintk("RPC: %s: QP error %X on device %s ep %p\n",
122 __func__, event->event, event->device->name, context);
123 if (ep->rep_connected == 1) {
124 ep->rep_connected = -EIO;
125 ep->rep_func(ep);
126 wake_up_all(&ep->rep_connect_wait);
127 }
128}
129
130static void
131rpcrdma_cq_async_error_upcall(struct ib_event *event, void *context)
132{
133 struct rpcrdma_ep *ep = context;
134
135 dprintk("RPC: %s: CQ error %X on device %s ep %p\n",
136 __func__, event->event, event->device->name, context);
137 if (ep->rep_connected == 1) {
138 ep->rep_connected = -EIO;
139 ep->rep_func(ep);
140 wake_up_all(&ep->rep_connect_wait);
141 }
142}
143
144static inline
145void rpcrdma_event_process(struct ib_wc *wc)
146{
Tom Tucker5c635e02011-02-09 19:45:34 +0000147 struct rpcrdma_mw *frmr;
\"Talpey, Thomas\c56c65f2007-09-10 13:51:18 -0400148 struct rpcrdma_rep *rep =
149 (struct rpcrdma_rep *)(unsigned long) wc->wr_id;
150
151 dprintk("RPC: %s: event rep %p status %X opcode %X length %u\n",
152 __func__, rep, wc->status, wc->opcode, wc->byte_len);
153
154 if (!rep) /* send or bind completion that we don't care about */
155 return;
156
157 if (IB_WC_SUCCESS != wc->status) {
Tom Tucker5c635e02011-02-09 19:45:34 +0000158 dprintk("RPC: %s: WC opcode %d status %X, connection lost\n",
159 __func__, wc->opcode, wc->status);
\"Talpey, Thomas\c56c65f2007-09-10 13:51:18 -0400160 rep->rr_len = ~0U;
Tom Tucker5c635e02011-02-09 19:45:34 +0000161 if (wc->opcode != IB_WC_FAST_REG_MR && wc->opcode != IB_WC_LOCAL_INV)
162 rpcrdma_schedule_tasklet(rep);
\"Talpey, Thomas\c56c65f2007-09-10 13:51:18 -0400163 return;
164 }
165
166 switch (wc->opcode) {
Tom Tucker5c635e02011-02-09 19:45:34 +0000167 case IB_WC_FAST_REG_MR:
168 frmr = (struct rpcrdma_mw *)(unsigned long)wc->wr_id;
169 frmr->r.frmr.state = FRMR_IS_VALID;
170 break;
171 case IB_WC_LOCAL_INV:
172 frmr = (struct rpcrdma_mw *)(unsigned long)wc->wr_id;
173 frmr->r.frmr.state = FRMR_IS_INVALID;
174 break;
\"Talpey, Thomas\c56c65f2007-09-10 13:51:18 -0400175 case IB_WC_RECV:
176 rep->rr_len = wc->byte_len;
177 ib_dma_sync_single_for_cpu(
178 rdmab_to_ia(rep->rr_buffer)->ri_id->device,
179 rep->rr_iov.addr, rep->rr_len, DMA_FROM_DEVICE);
180 /* Keep (only) the most recent credits, after check validity */
181 if (rep->rr_len >= 16) {
182 struct rpcrdma_msg *p =
183 (struct rpcrdma_msg *) rep->rr_base;
184 unsigned int credits = ntohl(p->rm_credit);
185 if (credits == 0) {
186 dprintk("RPC: %s: server"
187 " dropped credits to 0!\n", __func__);
188 /* don't deadlock */
189 credits = 1;
190 } else if (credits > rep->rr_buffer->rb_max_requests) {
191 dprintk("RPC: %s: server"
192 " over-crediting: %d (%d)\n",
193 __func__, credits,
194 rep->rr_buffer->rb_max_requests);
195 credits = rep->rr_buffer->rb_max_requests;
196 }
197 atomic_set(&rep->rr_buffer->rb_credits, credits);
198 }
199 /* fall through */
200 case IB_WC_BIND_MW:
201 rpcrdma_schedule_tasklet(rep);
202 break;
203 default:
204 dprintk("RPC: %s: unexpected WC event %X\n",
205 __func__, wc->opcode);
206 break;
207 }
208}
209
210static inline int
211rpcrdma_cq_poll(struct ib_cq *cq)
212{
213 struct ib_wc wc;
214 int rc;
215
216 for (;;) {
217 rc = ib_poll_cq(cq, 1, &wc);
218 if (rc < 0) {
219 dprintk("RPC: %s: ib_poll_cq failed %i\n",
220 __func__, rc);
221 return rc;
222 }
223 if (rc == 0)
224 break;
225
226 rpcrdma_event_process(&wc);
227 }
228
229 return 0;
230}
231
232/*
233 * rpcrdma_cq_event_upcall
234 *
235 * This upcall handles recv, send, bind and unbind events.
236 * It is reentrant but processes single events in order to maintain
237 * ordering of receives to keep server credits.
238 *
239 * It is the responsibility of the scheduled tasklet to return
240 * recv buffers to the pool. NOTE: this affects synchronization of
241 * connection shutdown. That is, the structures required for
242 * the completion of the reply handler must remain intact until
243 * all memory has been reclaimed.
244 *
245 * Note that send events are suppressed and do not result in an upcall.
246 */
247static void
248rpcrdma_cq_event_upcall(struct ib_cq *cq, void *context)
249{
250 int rc;
251
252 rc = rpcrdma_cq_poll(cq);
253 if (rc)
254 return;
255
256 rc = ib_req_notify_cq(cq, IB_CQ_NEXT_COMP);
257 if (rc) {
258 dprintk("RPC: %s: ib_req_notify_cq failed %i\n",
259 __func__, rc);
260 return;
261 }
262
263 rpcrdma_cq_poll(cq);
264}
265
#ifdef RPC_DEBUG
/*
 * Human-readable names for RDMA CM events, indexed by the
 * RDMA_CM_EVENT_* value; used only by debug output in
 * rpcrdma_conn_upcall().  The "<= 11" bound checked there must
 * match the number of entries here.
 */
static const char * const conn[] = {
	"address resolved",
	"address error",
	"route resolved",
	"route error",
	"connect request",
	"connect response",
	"connect error",
	"unreachable",
	"rejected",
	"established",
	"disconnected",
	"device removal"
};
#endif
282
/*
 * rpcrdma_conn_upcall - RDMA CM event handler for a client transport.
 *
 * Two classes of events are handled:
 *  - address/route resolution results: record the outcome in
 *    ia->ri_async_rc and complete ia->ri_done, waking the waiter
 *    in rpcrdma_create_id().
 *  - connection state changes: translate the CM event into a
 *    connstate value (1 = connected, negative errno = failed),
 *    store it in ep->rep_connected, notify the transport through
 *    ep->rep_func, and wake rep_connect_wait sleepers.
 *
 * Always returns 0 (event consumed).
 */
static int
rpcrdma_conn_upcall(struct rdma_cm_id *id, struct rdma_cm_event *event)
{
	struct rpcrdma_xprt *xprt = id->context;
	struct rpcrdma_ia *ia = &xprt->rx_ia;
	struct rpcrdma_ep *ep = &xprt->rx_ep;
#ifdef RPC_DEBUG
	/* NOTE(review): assumes the remote address is IPv4 — the %pI4
	 * debug formats below only make sense for AF_INET. */
	struct sockaddr_in *addr = (struct sockaddr_in *) &ep->rep_remote_addr;
#endif
	struct ib_qp_attr attr;
	struct ib_qp_init_attr iattr;
	int connstate = 0;

	switch (event->event) {
	case RDMA_CM_EVENT_ADDR_RESOLVED:
	case RDMA_CM_EVENT_ROUTE_RESOLVED:
		/* success: clear the -ETIMEDOUT preset and wake the waiter */
		ia->ri_async_rc = 0;
		complete(&ia->ri_done);
		break;
	case RDMA_CM_EVENT_ADDR_ERROR:
		ia->ri_async_rc = -EHOSTUNREACH;
		dprintk("RPC: %s: CM address resolution error, ep 0x%p\n",
			__func__, ep);
		complete(&ia->ri_done);
		break;
	case RDMA_CM_EVENT_ROUTE_ERROR:
		ia->ri_async_rc = -ENETUNREACH;
		dprintk("RPC: %s: CM route resolution error, ep 0x%p\n",
			__func__, ep);
		complete(&ia->ri_done);
		break;
	case RDMA_CM_EVENT_ESTABLISHED:
		connstate = 1;
		/* query only for the debug printout of RD atomic depths */
		ib_query_qp(ia->ri_id->qp, &attr,
			IB_QP_MAX_QP_RD_ATOMIC | IB_QP_MAX_DEST_RD_ATOMIC,
			&iattr);
		dprintk("RPC: %s: %d responder resources"
			" (%d initiator)\n",
			__func__, attr.max_dest_rd_atomic, attr.max_rd_atomic);
		goto connected;
	case RDMA_CM_EVENT_CONNECT_ERROR:
		connstate = -ENOTCONN;
		goto connected;
	case RDMA_CM_EVENT_UNREACHABLE:
		connstate = -ENETDOWN;
		goto connected;
	case RDMA_CM_EVENT_REJECTED:
		connstate = -ECONNREFUSED;
		goto connected;
	case RDMA_CM_EVENT_DISCONNECTED:
		connstate = -ECONNABORTED;
		goto connected;
	case RDMA_CM_EVENT_DEVICE_REMOVAL:
		connstate = -ENODEV;
connected:
		dprintk("RPC: %s: %s: %pI4:%u (ep 0x%p event 0x%x)\n",
			__func__,
			(event->event <= 11) ? conn[event->event] :
						"unknown connection error",
			&addr->sin_addr.s_addr,
			ntohs(addr->sin_port),
			ep, event->event);
		/* reset flow-control credits; the server re-grants them */
		atomic_set(&rpcx_to_rdmax(ep->rep_xprt)->rx_buf.rb_credits, 1);
		dprintk("RPC: %s: %sconnected\n",
			__func__, connstate > 0 ? "" : "dis");
		ep->rep_connected = connstate;
		ep->rep_func(ep);
		wake_up_all(&ep->rep_connect_wait);
		break;
	default:
		dprintk("RPC: %s: unexpected CM event %d\n",
			__func__, event->event);
		break;
	}

#ifdef RPC_DEBUG
	/* one-line connection summary for debug kernels */
	if (connstate == 1) {
		int ird = attr.max_dest_rd_atomic;
		int tird = ep->rep_remote_cma.responder_resources;
		printk(KERN_INFO "rpcrdma: connection to %pI4:%u "
			"on %s, memreg %d slots %d ird %d%s\n",
			&addr->sin_addr.s_addr,
			ntohs(addr->sin_port),
			ia->ri_id->device->name,
			ia->ri_memreg_strategy,
			xprt->rx_buf.rb_max_requests,
			ird, ird < 4 && ird < tird / 2 ? " (low!)" : "");
	} else if (connstate < 0) {
		printk(KERN_INFO "rpcrdma: connection to %pI4:%u closed (%d)\n",
			&addr->sin_addr.s_addr,
			ntohs(addr->sin_port),
			connstate);
	}
#endif

	return 0;
}
380
/*
 * rpcrdma_create_id - create a CM id and resolve address and route.
 *
 * Synchronously drives the two asynchronous rdma_cm steps
 * (rdma_resolve_addr, rdma_resolve_route): each step pre-sets
 * ia->ri_async_rc to -ETIMEDOUT, waits on ia->ri_done (completed by
 * rpcrdma_conn_upcall with the real outcome), then reads the result.
 * If the upcall never arrives, the preset -ETIMEDOUT is returned.
 *
 * Returns the new rdma_cm_id, or an ERR_PTR; the id is destroyed
 * on any failure.
 */
static struct rdma_cm_id *
rpcrdma_create_id(struct rpcrdma_xprt *xprt,
			struct rpcrdma_ia *ia, struct sockaddr *addr)
{
	struct rdma_cm_id *id;
	int rc;

	init_completion(&ia->ri_done);

	id = rdma_create_id(rpcrdma_conn_upcall, xprt, RDMA_PS_TCP);
	if (IS_ERR(id)) {
		rc = PTR_ERR(id);
		dprintk("RPC: %s: rdma_create_id() failed %i\n",
			__func__, rc);
		return id;
	}

	/* assume timeout; the upcall overwrites this on completion */
	ia->ri_async_rc = -ETIMEDOUT;
	rc = rdma_resolve_addr(id, NULL, addr, RDMA_RESOLVE_TIMEOUT);
	if (rc) {
		dprintk("RPC: %s: rdma_resolve_addr() failed %i\n",
			__func__, rc);
		goto out;
	}
	wait_for_completion_interruptible_timeout(&ia->ri_done,
				msecs_to_jiffies(RDMA_RESOLVE_TIMEOUT) + 1);
	rc = ia->ri_async_rc;
	if (rc)
		goto out;

	/* same pattern for route resolution */
	ia->ri_async_rc = -ETIMEDOUT;
	rc = rdma_resolve_route(id, RDMA_RESOLVE_TIMEOUT);
	if (rc) {
		dprintk("RPC: %s: rdma_resolve_route() failed %i\n",
			__func__, rc);
		goto out;
	}
	wait_for_completion_interruptible_timeout(&ia->ri_done,
				msecs_to_jiffies(RDMA_RESOLVE_TIMEOUT) + 1);
	rc = ia->ri_async_rc;
	if (rc)
		goto out;

	return id;

out:
	rdma_destroy_id(id);
	return ERR_PTR(rc);
}
430
431/*
432 * Drain any cq, prior to teardown.
433 */
434static void
435rpcrdma_clean_cq(struct ib_cq *cq)
436{
437 struct ib_wc wc;
438 int count = 0;
439
440 while (1 == ib_poll_cq(cq, 1, &wc))
441 ++count;
442
443 if (count)
444 dprintk("RPC: %s: flushed %d events (last 0x%x)\n",
445 __func__, count, wc.opcode);
446}
447
448/*
449 * Exported functions.
450 */
451
452/*
453 * Open and initialize an Interface Adapter.
454 * o initializes fields of struct rpcrdma_ia, including
455 * interface and provider attributes and protection zone.
456 */
/*
 * rpcrdma_ia_open - open an interface adapter for a transport.
 *
 * Creates the CM id (with address/route resolved), allocates the
 * protection domain, then validates the requested @memreg strategy
 * against the device's capabilities, downgrading it to a supported
 * mode when necessary.  Finally, for modes that need one, obtains a
 * DMA MR covering all of memory (ri_bind_mem).
 *
 * Returns 0, or a negative errno; on failure nothing is left
 * allocated (ri_id is destroyed and NULLed).
 */
int
rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg)
{
	int rc, mem_priv;
	struct ib_device_attr devattr;
	struct rpcrdma_ia *ia = &xprt->rx_ia;

	ia->ri_id = rpcrdma_create_id(xprt, ia, addr);
	if (IS_ERR(ia->ri_id)) {
		rc = PTR_ERR(ia->ri_id);
		goto out1;
	}

	ia->ri_pd = ib_alloc_pd(ia->ri_id->device);
	if (IS_ERR(ia->ri_pd)) {
		rc = PTR_ERR(ia->ri_pd);
		dprintk("RPC: %s: ib_alloc_pd() failed %i\n",
			__func__, rc);
		goto out2;
	}

	/*
	 * Query the device to determine if the requested memory
	 * registration strategy is supported. If it isn't, set the
	 * strategy to a globally supported model.
	 */
	rc = ib_query_device(ia->ri_id->device, &devattr);
	if (rc) {
		dprintk("RPC: %s: ib_query_device failed %d\n",
			__func__, rc);
		goto out2;
	}

	/* remember the device-wide DMA lkey if the HCA provides one */
	if (devattr.device_cap_flags & IB_DEVICE_LOCAL_DMA_LKEY) {
		ia->ri_have_dma_lkey = 1;
		ia->ri_dma_lkey = ia->ri_id->device->local_dma_lkey;
	}

	/* first pass: downgrade unsupported strategies */
	switch (memreg) {
	case RPCRDMA_MEMWINDOWS:
	case RPCRDMA_MEMWINDOWS_ASYNC:
		if (!(devattr.device_cap_flags & IB_DEVICE_MEM_WINDOW)) {
			dprintk("RPC: %s: MEMWINDOWS registration "
				"specified but not supported by adapter, "
				"using slower RPCRDMA_REGISTER\n",
				__func__);
			memreg = RPCRDMA_REGISTER;
		}
		break;
	case RPCRDMA_MTHCAFMR:
		if (!ia->ri_id->device->alloc_fmr) {
#if RPCRDMA_PERSISTENT_REGISTRATION
			dprintk("RPC: %s: MTHCAFMR registration "
				"specified but not supported by adapter, "
				"using riskier RPCRDMA_ALLPHYSICAL\n",
				__func__);
			memreg = RPCRDMA_ALLPHYSICAL;
#else
			dprintk("RPC: %s: MTHCAFMR registration "
				"specified but not supported by adapter, "
				"using slower RPCRDMA_REGISTER\n",
				__func__);
			memreg = RPCRDMA_REGISTER;
#endif
		}
		break;
	case RPCRDMA_FRMR:
		/* Requires both frmr reg and local dma lkey */
		if ((devattr.device_cap_flags &
		     (IB_DEVICE_MEM_MGT_EXTENSIONS|IB_DEVICE_LOCAL_DMA_LKEY)) !=
		    (IB_DEVICE_MEM_MGT_EXTENSIONS|IB_DEVICE_LOCAL_DMA_LKEY)) {
#if RPCRDMA_PERSISTENT_REGISTRATION
			dprintk("RPC: %s: FRMR registration "
				"specified but not supported by adapter, "
				"using riskier RPCRDMA_ALLPHYSICAL\n",
				__func__);
			memreg = RPCRDMA_ALLPHYSICAL;
#else
			dprintk("RPC: %s: FRMR registration "
				"specified but not supported by adapter, "
				"using slower RPCRDMA_REGISTER\n",
				__func__);
			memreg = RPCRDMA_REGISTER;
#endif
		}
		break;
	}

	/*
	 * Optionally obtain an underlying physical identity mapping in
	 * order to do a memory window-based bind. This base registration
	 * is protected from remote access - that is enabled only by binding
	 * for the specific bytes targeted during each RPC operation, and
	 * revoked after the corresponding completion similar to a storage
	 * adapter.
	 */
	/* second pass: per-mode access flags; modes needing a DMA MR
	 * funnel through the register_setup label below */
	switch (memreg) {
	case RPCRDMA_BOUNCEBUFFERS:
	case RPCRDMA_REGISTER:
	case RPCRDMA_FRMR:
		break;
#if RPCRDMA_PERSISTENT_REGISTRATION
	case RPCRDMA_ALLPHYSICAL:
		mem_priv = IB_ACCESS_LOCAL_WRITE |
				IB_ACCESS_REMOTE_WRITE |
				IB_ACCESS_REMOTE_READ;
		goto register_setup;
#endif
	case RPCRDMA_MEMWINDOWS_ASYNC:
	case RPCRDMA_MEMWINDOWS:
		mem_priv = IB_ACCESS_LOCAL_WRITE |
				IB_ACCESS_MW_BIND;
		goto register_setup;
	case RPCRDMA_MTHCAFMR:
		if (ia->ri_have_dma_lkey)
			break;	/* device lkey suffices; no MR needed */
		mem_priv = IB_ACCESS_LOCAL_WRITE;
	register_setup:
		ia->ri_bind_mem = ib_get_dma_mr(ia->ri_pd, mem_priv);
		if (IS_ERR(ia->ri_bind_mem)) {
			/* non-fatal: fall back to per-chunk registration */
			printk(KERN_ALERT "%s: ib_get_dma_mr for "
				"phys register failed with %lX\n\t"
				"Will continue with degraded performance\n",
				__func__, PTR_ERR(ia->ri_bind_mem));
			memreg = RPCRDMA_REGISTER;
			ia->ri_bind_mem = NULL;
		}
		break;
	default:
		printk(KERN_ERR "%s: invalid memory registration mode %d\n",
				__func__, memreg);
		rc = -EINVAL;
		goto out2;
	}
	dprintk("RPC: %s: memory registration strategy is %d\n",
		__func__, memreg);

	/* Else will do memory reg/dereg for each chunk */
	ia->ri_memreg_strategy = memreg;

	return 0;
out2:
	rdma_destroy_id(ia->ri_id);
	ia->ri_id = NULL;
out1:
	return rc;
}
604
605/*
606 * Clean up/close an IA.
607 * o if event handles and PD have been initialized, free them.
608 * o close the IA
609 */
/*
 * rpcrdma_ia_close - tear down an interface adapter.
 *
 * Releases, in dependency order: the all-memory DMA MR (if one was
 * obtained in rpcrdma_ia_open), the QP and CM id, then the protection
 * domain.  Each step is guarded so the function is safe to call on a
 * partially-initialized ia.  Deregistration/destroy failures are only
 * logged; there is no way to recover from them here.
 */
void
rpcrdma_ia_close(struct rpcrdma_ia *ia)
{
	int rc;

	dprintk("RPC: %s: entering\n", __func__);
	if (ia->ri_bind_mem != NULL) {
		rc = ib_dereg_mr(ia->ri_bind_mem);
		dprintk("RPC: %s: ib_dereg_mr returned %i\n",
			__func__, rc);
	}
	if (ia->ri_id != NULL && !IS_ERR(ia->ri_id)) {
		if (ia->ri_id->qp)
			rdma_destroy_qp(ia->ri_id);
		rdma_destroy_id(ia->ri_id);
		ia->ri_id = NULL;
	}
	/* PD must go last: the MR and QP above were created under it */
	if (ia->ri_pd != NULL && !IS_ERR(ia->ri_pd)) {
		rc = ib_dealloc_pd(ia->ri_pd);
		dprintk("RPC: %s: ib_dealloc_pd returned %i\n",
			__func__, rc);
	}
}
633
634/*
635 * Create unconnected endpoint.
636 */
/*
 * rpcrdma_ep_create - build an unconnected endpoint.
 *
 * Sizes the QP attributes from @cdata and the device limits (scaling
 * the send queue for the registration strategy's extra WRs), creates
 * a single completion queue shared by send and receive, arms it, and
 * fills in the rdma_cm connection parameters.  The QP itself is not
 * created here; rpcrdma_ep_connect() does that.
 *
 * Returns 0 or a negative errno; on failure the CQ is destroyed.
 */
int
rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia,
				struct rpcrdma_create_data_internal *cdata)
{
	struct ib_device_attr devattr;
	int rc, err;

	rc = ib_query_device(ia->ri_id->device, &devattr);
	if (rc) {
		dprintk("RPC: %s: ib_query_device failed %d\n",
			__func__, rc);
		return rc;
	}

	/* check provider's send/recv wr limits */
	if (cdata->max_requests > devattr.max_qp_wr)
		cdata->max_requests = devattr.max_qp_wr;

	ep->rep_attr.event_handler = rpcrdma_qp_async_error_upcall;
	ep->rep_attr.qp_context = ep;
	/* send_cq and recv_cq initialized below */
	ep->rep_attr.srq = NULL;
	ep->rep_attr.cap.max_send_wr = cdata->max_requests;
	switch (ia->ri_memreg_strategy) {
	case RPCRDMA_FRMR:
		/* Add room for frmr register and invalidate WRs.
		 * 1. FRMR reg WR for head
		 * 2. FRMR invalidate WR for head
		 * 3. FRMR reg WR for pagelist
		 * 4. FRMR invalidate WR for pagelist
		 * 5. FRMR reg WR for tail
		 * 6. FRMR invalidate WR for tail
		 * 7. The RDMA_SEND WR
		 */
		ep->rep_attr.cap.max_send_wr *= 7;
		if (ep->rep_attr.cap.max_send_wr > devattr.max_qp_wr) {
			/* shrink the request count to fit the device */
			cdata->max_requests = devattr.max_qp_wr / 7;
			if (!cdata->max_requests)
				return -EINVAL;
			ep->rep_attr.cap.max_send_wr = cdata->max_requests * 7;
		}
		break;
	case RPCRDMA_MEMWINDOWS_ASYNC:
	case RPCRDMA_MEMWINDOWS:
		/* Add room for mw_binds+unbinds - overkill! */
		ep->rep_attr.cap.max_send_wr++;
		ep->rep_attr.cap.max_send_wr *= (2 * RPCRDMA_MAX_SEGS);
		if (ep->rep_attr.cap.max_send_wr > devattr.max_qp_wr)
			return -EINVAL;
		break;
	default:
		break;
	}
	ep->rep_attr.cap.max_recv_wr = cdata->max_requests;
	/* 4 send SGEs when a padding buffer is in play, else 2 */
	ep->rep_attr.cap.max_send_sge = (cdata->padding ? 4 : 2);
	ep->rep_attr.cap.max_recv_sge = 1;
	ep->rep_attr.cap.max_inline_data = 0;
	ep->rep_attr.sq_sig_type = IB_SIGNAL_REQ_WR;
	ep->rep_attr.qp_type = IB_QPT_RC;
	ep->rep_attr.port_num = ~0;

	dprintk("RPC: %s: requested max: dtos: send %d recv %d; "
		"iovs: send %d recv %d\n",
		__func__,
		ep->rep_attr.cap.max_send_wr,
		ep->rep_attr.cap.max_recv_wr,
		ep->rep_attr.cap.max_send_sge,
		ep->rep_attr.cap.max_recv_sge);

	/* set trigger for requesting send completion */
	ep->rep_cqinit = ep->rep_attr.cap.max_send_wr/2 /* - 1*/;
	switch (ia->ri_memreg_strategy) {
	case RPCRDMA_MEMWINDOWS_ASYNC:
	case RPCRDMA_MEMWINDOWS:
		ep->rep_cqinit -= RPCRDMA_MAX_SEGS;
		break;
	default:
		break;
	}
	if (ep->rep_cqinit <= 2)
		ep->rep_cqinit = 0;	/* too small: signal every send */
	INIT_CQCOUNT(ep);
	ep->rep_ia = ia;
	init_waitqueue_head(&ep->rep_connect_wait);

	/*
	 * Create a single cq for receive dto and mw_bind (only ever
	 * care about unbind, really). Send completions are suppressed.
	 * Use single threaded tasklet upcalls to maintain ordering.
	 */
	ep->rep_cq = ib_create_cq(ia->ri_id->device, rpcrdma_cq_event_upcall,
				  rpcrdma_cq_async_error_upcall, NULL,
				  ep->rep_attr.cap.max_recv_wr +
				  ep->rep_attr.cap.max_send_wr + 1, 0);
	if (IS_ERR(ep->rep_cq)) {
		rc = PTR_ERR(ep->rep_cq);
		dprintk("RPC: %s: ib_create_cq failed: %i\n",
			__func__, rc);
		goto out1;
	}

	rc = ib_req_notify_cq(ep->rep_cq, IB_CQ_NEXT_COMP);
	if (rc) {
		dprintk("RPC: %s: ib_req_notify_cq failed: %i\n",
			__func__, rc);
		goto out2;
	}

	ep->rep_attr.send_cq = ep->rep_cq;
	ep->rep_attr.recv_cq = ep->rep_cq;

	/* Initialize cma parameters */

	/* RPC/RDMA does not use private data */
	ep->rep_remote_cma.private_data = NULL;
	ep->rep_remote_cma.private_data_len = 0;

	/* Client offers RDMA Read but does not initiate */
	ep->rep_remote_cma.initiator_depth = 0;
	if (ia->ri_memreg_strategy == RPCRDMA_BOUNCEBUFFERS)
		ep->rep_remote_cma.responder_resources = 0;
	else if (devattr.max_qp_rd_atom > 32)	/* arbitrary but <= 255 */
		ep->rep_remote_cma.responder_resources = 32;
	else
		ep->rep_remote_cma.responder_resources = devattr.max_qp_rd_atom;

	ep->rep_remote_cma.retry_count = 7;
	ep->rep_remote_cma.flow_control = 0;
	ep->rep_remote_cma.rnr_retry_count = 0;

	return 0;

out2:
	err = ib_destroy_cq(ep->rep_cq);
	if (err)
		dprintk("RPC: %s: ib_destroy_cq returned %i\n",
			__func__, err);
out1:
	return rc;
}
777
778/*
779 * rpcrdma_ep_destroy
780 *
781 * Disconnect and destroy endpoint. After this, the only
782 * valid operations on the ep are to free it (if dynamically
783 * allocated) or re-create it.
784 *
785 * The caller's error handling must be sure to not leak the endpoint
786 * if this function fails.
787 */
/*
 * rpcrdma_ep_destroy - disconnect and destroy an endpoint.
 *
 * If a QP exists: disconnect (best-effort, errors only logged),
 * destroy the QP and clear the pointer.  Then release the padding
 * MR, drain the CQ of stale completions, and destroy it.
 *
 * Returns the ib_destroy_cq() result (0 on success).  The caller
 * must not leak the endpoint if this fails.
 */
int
rpcrdma_ep_destroy(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia)
{
	int rc;

	dprintk("RPC: %s: entering, connected is %d\n",
		__func__, ep->rep_connected);

	if (ia->ri_id->qp) {
		rc = rpcrdma_ep_disconnect(ep, ia);
		if (rc)
			dprintk("RPC: %s: rpcrdma_ep_disconnect"
				" returned %i\n", __func__, rc);
		rdma_destroy_qp(ia->ri_id);
		ia->ri_id->qp = NULL;
	}

	/* padding - could be done in rpcrdma_buffer_destroy... */
	if (ep->rep_pad_mr) {
		rpcrdma_deregister_internal(ia, ep->rep_pad_mr, &ep->rep_pad);
		ep->rep_pad_mr = NULL;
	}

	/* flush leftover completions before tearing the CQ down */
	rpcrdma_clean_cq(ep->rep_cq);
	rc = ib_destroy_cq(ep->rep_cq);
	if (rc)
		dprintk("RPC: %s: ib_destroy_cq returned %i\n",
			__func__, rc);

	return rc;
}
819
820/*
821 * Connect unconnected endpoint.
822 */
/*
 * rpcrdma_ep_connect - connect (or reconnect) an unconnected endpoint.
 *
 * For a reconnect (rep_connected != 0) the old QP and CM id are torn
 * down and a fresh id is resolved first; reconnecting onto a different
 * device is refused.  A QP is then created, rdma_connect() is issued,
 * and the caller sleeps until rpcrdma_conn_upcall() sets
 * ep->rep_connected.  Peer rejects and ORD/IRD mismatches are retried
 * (bounded by RDMA_CONNECT_RETRY_MAX) via the retry label.
 *
 * Returns 0 on success or a negative errno, which is also latched
 * into ep->rep_connected on the way out.
 */
int
rpcrdma_ep_connect(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia)
{
	struct rdma_cm_id *id;
	int rc = 0;
	int retry_count = 0;

	if (ep->rep_connected != 0) {
		struct rpcrdma_xprt *xprt;
retry:
		/* reconnect path: discard the old QP/id and resolve anew */
		rc = rpcrdma_ep_disconnect(ep, ia);
		if (rc && rc != -ENOTCONN)
			dprintk("RPC: %s: rpcrdma_ep_disconnect"
				" status %i\n", __func__, rc);
		rpcrdma_clean_cq(ep->rep_cq);

		xprt = container_of(ia, struct rpcrdma_xprt, rx_ia);
		id = rpcrdma_create_id(xprt, ia,
				(struct sockaddr *)&xprt->rx_data.addr);
		if (IS_ERR(id)) {
			rc = PTR_ERR(id);
			goto out;
		}
		/* TEMP TEMP TEMP - fail if new device:
		 * Deregister/remarshal *all* requests!
		 * Close and recreate adapter, pd, etc!
		 * Re-determine all attributes still sane!
		 * More stuff I haven't thought of!
		 * Rrrgh!
		 */
		if (ia->ri_id->device != id->device) {
			printk("RPC: %s: can't reconnect on "
				"different device!\n", __func__);
			rdma_destroy_id(id);
			rc = -ENETDOWN;
			goto out;
		}
		/* END TEMP */
		rdma_destroy_qp(ia->ri_id);
		rdma_destroy_id(ia->ri_id);
		ia->ri_id = id;
	}

	rc = rdma_create_qp(ia->ri_id, ia->ri_pd, &ep->rep_attr);
	if (rc) {
		dprintk("RPC: %s: rdma_create_qp failed %i\n",
			__func__, rc);
		goto out;
	}

/* XXX Tavor device performs badly with 2K MTU! */
/* Workaround: clamp the path MTU to 1024 on Mellanox/Topspin Tavor
 * HCAs; any ib_modify_qp error here is deliberately ignored. */
if (strnicmp(ia->ri_id->device->dma_device->bus->name, "pci", 3) == 0) {
	struct pci_dev *pcid = to_pci_dev(ia->ri_id->device->dma_device);
	if (pcid->device == PCI_DEVICE_ID_MELLANOX_TAVOR &&
	    (pcid->vendor == PCI_VENDOR_ID_MELLANOX ||
	     pcid->vendor == PCI_VENDOR_ID_TOPSPIN)) {
		struct ib_qp_attr attr = {
			.path_mtu = IB_MTU_1024
		};
		rc = ib_modify_qp(ia->ri_id->qp, &attr, IB_QP_PATH_MTU);
	}
}

	ep->rep_connected = 0;

	rc = rdma_connect(ia->ri_id, &ep->rep_remote_cma);
	if (rc) {
		dprintk("RPC: %s: rdma_connect() failed with %i\n",
				__func__, rc);
		goto out;
	}

	/* the CM upcall sets rep_connected to 1 or a negative errno */
	wait_event_interruptible(ep->rep_connect_wait, ep->rep_connected != 0);

	/*
	 * Check state. A non-peer reject indicates no listener
	 * (ECONNREFUSED), which may be a transient state. All
	 * others indicate a transport condition which has already
	 * undergone a best-effort.
	 */
	if (ep->rep_connected == -ECONNREFUSED &&
	    ++retry_count <= RDMA_CONNECT_RETRY_MAX) {
		dprintk("RPC: %s: non-peer_reject, retry\n", __func__);
		goto retry;
	}
	if (ep->rep_connected <= 0) {
		/* Sometimes, the only way to reliably connect to remote
		 * CMs is to use same nonzero values for ORD and IRD. */
		if (retry_count++ <= RDMA_CONNECT_RETRY_MAX + 1 &&
		    (ep->rep_remote_cma.responder_resources == 0 ||
		     ep->rep_remote_cma.initiator_depth !=
		     ep->rep_remote_cma.responder_resources)) {
			if (ep->rep_remote_cma.responder_resources == 0)
				ep->rep_remote_cma.responder_resources = 1;
			ep->rep_remote_cma.initiator_depth =
				ep->rep_remote_cma.responder_resources;
			goto retry;
		}
		rc = ep->rep_connected;
	} else {
		dprintk("RPC: %s: connected\n", __func__);
	}

out:
	if (rc)
		ep->rep_connected = rc;
	return rc;
}
931
932/*
933 * rpcrdma_ep_disconnect
934 *
935 * This is separate from destroy to facilitate the ability
936 * to reconnect without recreating the endpoint.
937 *
938 * This call is not reentrant, and must not be made in parallel
939 * on the same endpoint.
940 */
941int
942rpcrdma_ep_disconnect(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia)
943{
944 int rc;
945
946 rpcrdma_clean_cq(ep->rep_cq);
947 rc = rdma_disconnect(ia->ri_id);
948 if (!rc) {
949 /* returns without wait if not connected */
950 wait_event_interruptible(ep->rep_connect_wait,
951 ep->rep_connected != 1);
952 dprintk("RPC: %s: after wait, %sconnected\n", __func__,
953 (ep->rep_connected == 1) ? "still " : "dis");
954 } else {
955 dprintk("RPC: %s: rdma_disconnect %i\n", __func__, rc);
956 ep->rep_connected = rc;
957 }
958 return rc;
959}
960
961/*
962 * Initialize buffer memory
963 */
/*
 * rpcrdma_buffer_create - allocate and register the transport's buffer pool.
 *
 * Builds the send/recv request arrays, the optional zeroed pad buffer,
 * and the per-strategy pool of MWs/FMRs/FRMRs, then allocates and
 * registers one send and one reply buffer per request slot.
 *
 * Returns 0 on success or a negative errno; on any failure the partially
 * built pool is torn down via rpcrdma_buffer_destroy() before returning.
 */
int
rpcrdma_buffer_create(struct rpcrdma_buffer *buf, struct rpcrdma_ep *ep,
	struct rpcrdma_ia *ia, struct rpcrdma_create_data_internal *cdata)
{
	char *p;
	size_t len;
	int i, rc;
	struct rpcrdma_mw *r;

	buf->rb_max_requests = cdata->max_requests;
	spin_lock_init(&buf->rb_lock);
	atomic_set(&buf->rb_credits, 1);

	/* Need to allocate:
	 * 1. arrays for send and recv pointers
	 * 2. arrays of struct rpcrdma_req to fill in pointers
	 * 3. array of struct rpcrdma_rep for replies
	 * 4. padding, if any
	 * 5. mw's, fmr's or frmr's, if any
	 * Send/recv buffers in req/rep need to be registered
	 */

	/* Size the single backing allocation: pointer arrays first,
	 * then pad space, then the MW array for the active strategy.
	 */
	len = buf->rb_max_requests *
		(sizeof(struct rpcrdma_req *) + sizeof(struct rpcrdma_rep *));
	len += cdata->padding;
	switch (ia->ri_memreg_strategy) {
	case RPCRDMA_FRMR:
		len += buf->rb_max_requests * RPCRDMA_MAX_SEGS *
				sizeof(struct rpcrdma_mw);
		break;
	case RPCRDMA_MTHCAFMR:
		/* TBD we are perhaps overallocating here */
		len += (buf->rb_max_requests + 1) * RPCRDMA_MAX_SEGS *
				sizeof(struct rpcrdma_mw);
		break;
	case RPCRDMA_MEMWINDOWS_ASYNC:
	case RPCRDMA_MEMWINDOWS:
		len += (buf->rb_max_requests + 1) * RPCRDMA_MAX_SEGS *
				sizeof(struct rpcrdma_mw);
		break;
	default:
		break;
	}

	/* allocate 1, 4 and 5 in one shot */
	p = kzalloc(len, GFP_KERNEL);
	if (p == NULL) {
		dprintk("RPC: %s: req_t/rep_t/pad kzalloc(%zd) failed\n",
			__func__, len);
		rc = -ENOMEM;
		goto out;
	}
	buf->rb_pool = p;	/* for freeing it later */

	/* Carve the pointer arrays out of the front of the allocation. */
	buf->rb_send_bufs = (struct rpcrdma_req **) p;
	p = (char *) &buf->rb_send_bufs[buf->rb_max_requests];
	buf->rb_recv_bufs = (struct rpcrdma_rep **) p;
	p = (char *) &buf->rb_recv_bufs[buf->rb_max_requests];

	/*
	 * Register the zeroed pad buffer, if any.
	 */
	if (cdata->padding) {
		rc = rpcrdma_register_internal(ia, p, cdata->padding,
					    &ep->rep_pad_mr, &ep->rep_pad);
		if (rc)
			goto out;
	}
	p += cdata->padding;

	/*
	 * Allocate the fmr's, or mw's for mw_bind chunk registration.
	 * We "cycle" the mw's in order to minimize rkey reuse,
	 * and also reduce unbind-to-bind collision.
	 */
	INIT_LIST_HEAD(&buf->rb_mws);
	r = (struct rpcrdma_mw *)p;
	switch (ia->ri_memreg_strategy) {
	case RPCRDMA_FRMR:
		/* One FRMR (plus its page list) per possible segment. */
		for (i = buf->rb_max_requests * RPCRDMA_MAX_SEGS; i; i--) {
			r->r.frmr.fr_mr = ib_alloc_fast_reg_mr(ia->ri_pd,
							 RPCRDMA_MAX_SEGS);
			if (IS_ERR(r->r.frmr.fr_mr)) {
				rc = PTR_ERR(r->r.frmr.fr_mr);
				dprintk("RPC: %s: ib_alloc_fast_reg_mr"
					" failed %i\n", __func__, rc);
				goto out;
			}
			r->r.frmr.fr_pgl =
				ib_alloc_fast_reg_page_list(ia->ri_id->device,
							    RPCRDMA_MAX_SEGS);
			if (IS_ERR(r->r.frmr.fr_pgl)) {
				rc = PTR_ERR(r->r.frmr.fr_pgl);
				dprintk("RPC: %s: "
					"ib_alloc_fast_reg_page_list "
					"failed %i\n", __func__, rc);
				goto out;
			}
			list_add(&r->mw_list, &buf->rb_mws);
			++r;
		}
		break;
	case RPCRDMA_MTHCAFMR:
		/* TBD we are perhaps overallocating here */
		for (i = (buf->rb_max_requests+1) * RPCRDMA_MAX_SEGS; i; i--) {
			static struct ib_fmr_attr fa =
				{ RPCRDMA_MAX_DATA_SEGS, 1, PAGE_SHIFT };
			r->r.fmr = ib_alloc_fmr(ia->ri_pd,
				IB_ACCESS_REMOTE_WRITE | IB_ACCESS_REMOTE_READ,
				&fa);
			if (IS_ERR(r->r.fmr)) {
				rc = PTR_ERR(r->r.fmr);
				dprintk("RPC: %s: ib_alloc_fmr"
					" failed %i\n", __func__, rc);
				goto out;
			}
			list_add(&r->mw_list, &buf->rb_mws);
			++r;
		}
		break;
	case RPCRDMA_MEMWINDOWS_ASYNC:
	case RPCRDMA_MEMWINDOWS:
		/* Allocate one extra request's worth, for full cycling */
		for (i = (buf->rb_max_requests+1) * RPCRDMA_MAX_SEGS; i; i--) {
			r->r.mw = ib_alloc_mw(ia->ri_pd);
			if (IS_ERR(r->r.mw)) {
				rc = PTR_ERR(r->r.mw);
				dprintk("RPC: %s: ib_alloc_mw"
					" failed %i\n", __func__, rc);
				goto out;
			}
			list_add(&r->mw_list, &buf->rb_mws);
			++r;
		}
		break;
	default:
		break;
	}

	/*
	 * Allocate/init the request/reply buffers. Doing this
	 * using kmalloc for now -- one for each buf.
	 */
	for (i = 0; i < buf->rb_max_requests; i++) {
		struct rpcrdma_req *req;
		struct rpcrdma_rep *rep;

		len = cdata->inline_wsize + sizeof(struct rpcrdma_req);
		/* RPC layer requests *double* size + 1K RPC_SLACK_SPACE! */
		/* Typical ~2400b, so rounding up saves work later */
		if (len < 4096)
			len = 4096;
		req = kmalloc(len, GFP_KERNEL);
		if (req == NULL) {
			dprintk("RPC: %s: request buffer %d alloc"
				" failed\n", __func__, i);
			rc = -ENOMEM;
			goto out;
		}
		/* Only the struct header needs zeroing; the trailing
		 * inline-send area is overwritten before use.
		 */
		memset(req, 0, sizeof(struct rpcrdma_req));
		buf->rb_send_bufs[i] = req;
		buf->rb_send_bufs[i]->rl_buffer = buf;

		/* Register the inline-send region for local access. */
		rc = rpcrdma_register_internal(ia, req->rl_base,
				len - offsetof(struct rpcrdma_req, rl_base),
				&buf->rb_send_bufs[i]->rl_handle,
				&buf->rb_send_bufs[i]->rl_iov);
		if (rc)
			goto out;

		buf->rb_send_bufs[i]->rl_size = len-sizeof(struct rpcrdma_req);

		len = cdata->inline_rsize + sizeof(struct rpcrdma_rep);
		rep = kmalloc(len, GFP_KERNEL);
		if (rep == NULL) {
			dprintk("RPC: %s: reply buffer %d alloc failed\n",
				__func__, i);
			rc = -ENOMEM;
			goto out;
		}
		memset(rep, 0, sizeof(struct rpcrdma_rep));
		buf->rb_recv_bufs[i] = rep;
		buf->rb_recv_bufs[i]->rr_buffer = buf;
		init_waitqueue_head(&rep->rr_unbind);

		/* Register the inline-receive region for local access. */
		rc = rpcrdma_register_internal(ia, rep->rr_base,
				len - offsetof(struct rpcrdma_rep, rr_base),
				&buf->rb_recv_bufs[i]->rr_handle,
				&buf->rb_recv_bufs[i]->rr_iov);
		if (rc)
			goto out;

	}
	dprintk("RPC: %s: max_requests %d\n",
		__func__, buf->rb_max_requests);
	/* done */
	return 0;
out:
	/* Tear down whatever was built; destroy handles partial state. */
	rpcrdma_buffer_destroy(buf);
	return rc;
}
1165
1166/*
1167 * Unregister and destroy buffer memory. Need to deal with
1168 * partial initialization, so it's callable from failed create.
1169 * Must be called before destroying endpoint, as registrations
1170 * reference it.
1171 */
/*
 * rpcrdma_buffer_destroy - release everything rpcrdma_buffer_create() built.
 *
 * Safe to call on a partially constructed pool (NULL array entries are
 * skipped). The MW/FMR/FRMR free list is drained inside the loop over
 * send buffers; the while() empties it entirely on the first non-NULL
 * send buffer and is a no-op thereafter.
 */
void
rpcrdma_buffer_destroy(struct rpcrdma_buffer *buf)
{
	int rc, i;
	struct rpcrdma_ia *ia = rdmab_to_ia(buf);
	struct rpcrdma_mw *r;

	/* clean up in reverse order from create
	 * 1. recv mr memory (mr free, then kfree)
	 * 1a. bind mw memory
	 * 2. send mr memory (mr free, then kfree)
	 * 3. padding (if any) [moved to rpcrdma_ep_destroy]
	 * 4. arrays
	 */
	dprintk("RPC: %s: entering\n", __func__);

	for (i = 0; i < buf->rb_max_requests; i++) {
		if (buf->rb_recv_bufs && buf->rb_recv_bufs[i]) {
			rpcrdma_deregister_internal(ia,
					buf->rb_recv_bufs[i]->rr_handle,
					&buf->rb_recv_bufs[i]->rr_iov);
			kfree(buf->rb_recv_bufs[i]);
		}
		if (buf->rb_send_bufs && buf->rb_send_bufs[i]) {
			/* Drain the MW free list, releasing each entry
			 * according to the registration strategy in use.
			 */
			while (!list_empty(&buf->rb_mws)) {
				r = list_entry(buf->rb_mws.next,
					struct rpcrdma_mw, mw_list);
				list_del(&r->mw_list);
				switch (ia->ri_memreg_strategy) {
				case RPCRDMA_FRMR:
					rc = ib_dereg_mr(r->r.frmr.fr_mr);
					if (rc)
						dprintk("RPC: %s:"
							" ib_dereg_mr"
							" failed %i\n",
							__func__, rc);
					ib_free_fast_reg_page_list(r->r.frmr.fr_pgl);
					break;
				case RPCRDMA_MTHCAFMR:
					rc = ib_dealloc_fmr(r->r.fmr);
					if (rc)
						dprintk("RPC: %s:"
							" ib_dealloc_fmr"
							" failed %i\n",
							__func__, rc);
					break;
				case RPCRDMA_MEMWINDOWS_ASYNC:
				case RPCRDMA_MEMWINDOWS:
					rc = ib_dealloc_mw(r->r.mw);
					if (rc)
						dprintk("RPC: %s:"
							" ib_dealloc_mw"
							" failed %i\n",
							__func__, rc);
					break;
				default:
					break;
				}
			}
			rpcrdma_deregister_internal(ia,
					buf->rb_send_bufs[i]->rl_handle,
					&buf->rb_send_bufs[i]->rl_iov);
			kfree(buf->rb_send_bufs[i]);
		}
	}

	/* The pointer arrays, pad, and MW structs share one allocation. */
	kfree(buf->rb_pool);
}
1240
1241/*
1242 * Get a set of request/reply buffers.
1243 *
1244 * Reply buffer (if needed) is attached to send buffer upon return.
1245 * Rule:
1246 * rb_send_index and rb_recv_index MUST always be pointing to the
1247 * *next* available buffer (non-NULL). They are incremented after
1248 * removing buffers, and decremented *before* returning them.
1249 */
/*
 * rpcrdma_buffer_get - take one send buffer (and usually a reply buffer
 * plus a set of MWs) out of the pool.
 *
 * Returns the request with rl_reply attached when a reply buffer is
 * available, or NULL when the send pool is exhausted. rb_lock guards
 * the indices and the MW free list.
 */
struct rpcrdma_req *
rpcrdma_buffer_get(struct rpcrdma_buffer *buffers)
{
	struct rpcrdma_req *req;
	unsigned long flags;
	int i;
	struct rpcrdma_mw *r;

	spin_lock_irqsave(&buffers->rb_lock, flags);
	if (buffers->rb_send_index == buffers->rb_max_requests) {
		spin_unlock_irqrestore(&buffers->rb_lock, flags);
		dprintk("RPC: %s: out of request buffers\n", __func__);
		return ((struct rpcrdma_req *)NULL);
	}

	req = buffers->rb_send_bufs[buffers->rb_send_index];
	if (buffers->rb_send_index < buffers->rb_recv_index) {
		/* More receives than sends outstanding: no reply buffer
		 * to pair with this request.
		 */
		dprintk("RPC: %s: %d extra receives outstanding (ok)\n",
			__func__,
			buffers->rb_recv_index - buffers->rb_send_index);
		req->rl_reply = NULL;
	} else {
		req->rl_reply = buffers->rb_recv_bufs[buffers->rb_recv_index];
		buffers->rb_recv_bufs[buffers->rb_recv_index++] = NULL;
	}
	buffers->rb_send_bufs[buffers->rb_send_index++] = NULL;
	if (!list_empty(&buffers->rb_mws)) {
		/* Detach RPCRDMA_MAX_SEGS MWs from the free list, filling
		 * rl_segments from the highest index down.
		 * NOTE(review): this loop assumes a non-empty rb_mws holds
		 * at least RPCRDMA_MAX_SEGS entries -- confirm against the
		 * accounting in rpcrdma_buffer_create()/_put().
		 */
		i = RPCRDMA_MAX_SEGS - 1;
		do {
			r = list_entry(buffers->rb_mws.next,
					struct rpcrdma_mw, mw_list);
			list_del(&r->mw_list);
			req->rl_segments[i].mr_chunk.rl_mw = r;
		} while (--i >= 0);
	}
	spin_unlock_irqrestore(&buffers->rb_lock, flags);
	return req;
}
1288
1289/*
1290 * Put request/reply buffers back into pool.
1291 * Pre-decrement counter/array index.
1292 */
/*
 * rpcrdma_buffer_put - return a request (and its reply buffer and MWs)
 * to the pool. Counterpart of rpcrdma_buffer_get().
 */
void
rpcrdma_buffer_put(struct rpcrdma_req *req)
{
	struct rpcrdma_buffer *buffers = req->rl_buffer;
	struct rpcrdma_ia *ia = rdmab_to_ia(buffers);
	int i;
	unsigned long flags;

	/* All chunks must have been deregistered before the req returns. */
	BUG_ON(req->rl_nchunks != 0);
	spin_lock_irqsave(&buffers->rb_lock, flags);
	buffers->rb_send_bufs[--buffers->rb_send_index] = req;
	req->rl_niovs = 0;
	if (req->rl_reply) {
		/* Return the attached reply buffer as well. */
		buffers->rb_recv_bufs[--buffers->rb_recv_index] = req->rl_reply;
		init_waitqueue_head(&req->rl_reply->rr_unbind);
		req->rl_reply->rr_func = NULL;
		req->rl_reply = NULL;
	}
	switch (ia->ri_memreg_strategy) {
	case RPCRDMA_FRMR:
	case RPCRDMA_MTHCAFMR:
	case RPCRDMA_MEMWINDOWS_ASYNC:
	case RPCRDMA_MEMWINDOWS:
		/*
		 * Cycle mw's back in reverse order, and "spin" them.
		 * This delays and scrambles reuse as much as possible.
		 */
		i = 1;
		do {
			struct rpcrdma_mw **mw;
			mw = &req->rl_segments[i].mr_chunk.rl_mw;
			list_add_tail(&(*mw)->mw_list, &buffers->rb_mws);
			*mw = NULL;
		} while (++i < RPCRDMA_MAX_SEGS);
		/* Segment 0's MW goes back last. */
		list_add_tail(&req->rl_segments[0].mr_chunk.rl_mw->mw_list,
			&buffers->rb_mws);
		req->rl_segments[0].mr_chunk.rl_mw = NULL;
		break;
	default:
		break;
	}
	spin_unlock_irqrestore(&buffers->rb_lock, flags);
}
1336
1337/*
1338 * Recover reply buffers from pool.
1339 * This happens when recovering from error conditions.
1340 * Post-increment counter/array index.
1341 */
1342void
1343rpcrdma_recv_buffer_get(struct rpcrdma_req *req)
1344{
1345 struct rpcrdma_buffer *buffers = req->rl_buffer;
1346 unsigned long flags;
1347
1348 if (req->rl_iov.length == 0) /* special case xprt_rdma_allocate() */
1349 buffers = ((struct rpcrdma_req *) buffers)->rl_buffer;
1350 spin_lock_irqsave(&buffers->rb_lock, flags);
1351 if (buffers->rb_recv_index < buffers->rb_max_requests) {
1352 req->rl_reply = buffers->rb_recv_bufs[buffers->rb_recv_index];
1353 buffers->rb_recv_bufs[buffers->rb_recv_index++] = NULL;
1354 }
1355 spin_unlock_irqrestore(&buffers->rb_lock, flags);
1356}
1357
1358/*
1359 * Put reply buffers back into pool when not attached to
1360 * request. This happens in error conditions, and when
1361 * aborting unbinds. Pre-decrement counter/array index.
1362 */
1363void
1364rpcrdma_recv_buffer_put(struct rpcrdma_rep *rep)
1365{
1366 struct rpcrdma_buffer *buffers = rep->rr_buffer;
1367 unsigned long flags;
1368
1369 rep->rr_func = NULL;
1370 spin_lock_irqsave(&buffers->rb_lock, flags);
1371 buffers->rb_recv_bufs[--buffers->rb_recv_index] = rep;
1372 spin_unlock_irqrestore(&buffers->rb_lock, flags);
1373}
1374
1375/*
1376 * Wrappers for internal-use kmalloc memory registration, used by buffer code.
1377 */
1378
/*
 * rpcrdma_register_internal - DMA-map and register a kmalloc'ed buffer
 * for local access, filling in @iov for use in an SGE.
 *
 * Sets *mrp to NULL when an existing lkey (device DMA lkey or the
 * bind_mem MR) can be used; otherwise registers a new phys MR and
 * returns it in *mrp for the caller to release later.
 *
 * Returns 0 on success or a negative errno from ib_reg_phys_mr().
 * NOTE(review): the ib_dma_map_single() result is not checked with
 * ib_dma_mapping_error() here -- confirm whether that is acceptable
 * for these kmalloc'ed internal buffers.
 */
int
rpcrdma_register_internal(struct rpcrdma_ia *ia, void *va, int len,
				struct ib_mr **mrp, struct ib_sge *iov)
{
	struct ib_phys_buf ipb;
	struct ib_mr *mr;
	int rc;

	/*
	 * All memory passed here was kmalloc'ed, therefore phys-contiguous.
	 */
	iov->addr = ib_dma_map_single(ia->ri_id->device,
			va, len, DMA_BIDIRECTIONAL);
	iov->length = len;

	if (ia->ri_have_dma_lkey) {
		/* Device supplies a global DMA lkey; no MR needed. */
		*mrp = NULL;
		iov->lkey = ia->ri_dma_lkey;
		return 0;
	} else if (ia->ri_bind_mem != NULL) {
		/* Reuse the all-memory bind MR's lkey. */
		*mrp = NULL;
		iov->lkey = ia->ri_bind_mem->lkey;
		return 0;
	}

	/* Fall back to registering a one-element phys MR. */
	ipb.addr = iov->addr;
	ipb.size = iov->length;
	mr = ib_reg_phys_mr(ia->ri_pd, &ipb, 1,
			IB_ACCESS_LOCAL_WRITE, &iov->addr);

	dprintk("RPC: %s: phys convert: 0x%llx "
			"registered 0x%llx length %d\n",
			__func__, (unsigned long long)ipb.addr,
			(unsigned long long)iov->addr, len);

	if (IS_ERR(mr)) {
		*mrp = NULL;
		rc = PTR_ERR(mr);
		dprintk("RPC: %s: failed with %i\n", __func__, rc);
	} else {
		*mrp = mr;
		iov->lkey = mr->lkey;
		rc = 0;
	}

	return rc;
}
1426
1427int
1428rpcrdma_deregister_internal(struct rpcrdma_ia *ia,
1429 struct ib_mr *mr, struct ib_sge *iov)
1430{
1431 int rc;
1432
1433 ib_dma_unmap_single(ia->ri_id->device,
1434 iov->addr, iov->length, DMA_BIDIRECTIONAL);
1435
1436 if (NULL == mr)
1437 return 0;
1438
1439 rc = ib_dereg_mr(mr);
1440 if (rc)
1441 dprintk("RPC: %s: ib_dereg_mr failed %i\n", __func__, rc);
1442 return rc;
1443}
1444
1445/*
1446 * Wrappers for chunk registration, shared by read/write chunk code.
1447 */
1448
/*
 * rpcrdma_map_one - DMA-map a single chunk segment for the device.
 *
 * Chooses map_page vs map_single depending on whether the segment
 * carries a page or a kernel virtual address, and records direction
 * and length in the segment for the matching unmap.
 * NOTE(review): a mapping error is only logged via dprintk and not
 * propagated to the caller -- the segment is still used afterwards;
 * confirm whether callers can tolerate this.
 */
static void
rpcrdma_map_one(struct rpcrdma_ia *ia, struct rpcrdma_mr_seg *seg, int writing)
{
	seg->mr_dir = writing ? DMA_FROM_DEVICE : DMA_TO_DEVICE;
	seg->mr_dmalen = seg->mr_len;
	if (seg->mr_page)
		seg->mr_dma = ib_dma_map_page(ia->ri_id->device,
				seg->mr_page, offset_in_page(seg->mr_offset),
				seg->mr_dmalen, seg->mr_dir);
	else
		seg->mr_dma = ib_dma_map_single(ia->ri_id->device,
				seg->mr_offset,
				seg->mr_dmalen, seg->mr_dir);
	if (ib_dma_mapping_error(ia->ri_id->device, seg->mr_dma)) {
		dprintk("RPC: %s: mr_dma %llx mr_offset %p mr_dma_len %zu\n",
			__func__,
			seg->mr_dma, seg->mr_offset, seg->mr_dmalen);
	}
}
1468
1469static void
1470rpcrdma_unmap_one(struct rpcrdma_ia *ia, struct rpcrdma_mr_seg *seg)
1471{
1472 if (seg->mr_page)
1473 ib_dma_unmap_page(ia->ri_id->device,
1474 seg->mr_dma, seg->mr_dmalen, seg->mr_dir);
1475 else
1476 ib_dma_unmap_single(ia->ri_id->device,
1477 seg->mr_dma, seg->mr_dmalen, seg->mr_dir);
1478}
1479
/*
 * rpcrdma_register_frmr_external - map up to *nsegs segments with a
 * fast-register MR and post the FAST_REG_MR work request.
 *
 * On return, *nsegs holds the number of segments actually covered
 * (the loop stops early at a page-alignment "hole"). On success the
 * first segment carries the rkey/base/length for the whole chunk.
 * Returns 0 or the ib_post_send() status; on failure all mapped
 * segments are unmapped again.
 */
static int
rpcrdma_register_frmr_external(struct rpcrdma_mr_seg *seg,
			int *nsegs, int writing, struct rpcrdma_ia *ia,
			struct rpcrdma_xprt *r_xprt)
{
	struct rpcrdma_mr_seg *seg1 = seg;
	struct ib_send_wr invalidate_wr, frmr_wr, *bad_wr, *post_wr;

	u8 key;
	int len, pageoff;
	int i, rc;

	/* Align the first segment down to a page boundary; the initial
	 * offset is folded back into the length (len starts negative).
	 */
	pageoff = offset_in_page(seg1->mr_offset);
	seg1->mr_offset -= pageoff;	/* start of page */
	seg1->mr_len += pageoff;
	len = -pageoff;
	if (*nsegs > RPCRDMA_MAX_DATA_SEGS)
		*nsegs = RPCRDMA_MAX_DATA_SEGS;
	for (i = 0; i < *nsegs;) {
		rpcrdma_map_one(ia, seg, writing);
		seg1->mr_chunk.rl_mw->r.frmr.fr_pgl->page_list[i] = seg->mr_dma;
		len += seg->mr_len;
		BUG_ON(seg->mr_len > PAGE_SIZE);
		++seg;
		++i;
		/* Check for holes */
		if ((i < *nsegs && offset_in_page(seg->mr_offset)) ||
		    offset_in_page((seg-1)->mr_offset + (seg-1)->mr_len))
			break;
	}
	dprintk("RPC: %s: Using frmr %p to map %d segments\n",
		__func__, seg1->mr_chunk.rl_mw, i);

	if (unlikely(seg1->mr_chunk.rl_mw->r.frmr.state == FRMR_IS_VALID)) {
		dprintk("RPC: %s: frmr %x left valid, posting invalidate.\n",
			__func__,
			seg1->mr_chunk.rl_mw->r.frmr.fr_mr->rkey);
		/* Invalidate before using. */
		/* Chain a LOCAL_INV ahead of the FAST_REG_MR so the stale
		 * registration is retired first.
		 */
		memset(&invalidate_wr, 0, sizeof invalidate_wr);
		invalidate_wr.wr_id = (unsigned long)(void *)seg1->mr_chunk.rl_mw;
		invalidate_wr.next = &frmr_wr;
		invalidate_wr.opcode = IB_WR_LOCAL_INV;
		invalidate_wr.send_flags = IB_SEND_SIGNALED;
		invalidate_wr.ex.invalidate_rkey =
			seg1->mr_chunk.rl_mw->r.frmr.fr_mr->rkey;
		DECR_CQCOUNT(&r_xprt->rx_ep);
		post_wr = &invalidate_wr;
	} else
		post_wr = &frmr_wr;

	/* Bump the key */
	key = (u8)(seg1->mr_chunk.rl_mw->r.frmr.fr_mr->rkey & 0x000000FF);
	ib_update_fast_reg_key(seg1->mr_chunk.rl_mw->r.frmr.fr_mr, ++key);

	/* Prepare FRMR WR */
	memset(&frmr_wr, 0, sizeof frmr_wr);
	frmr_wr.wr_id = (unsigned long)(void *)seg1->mr_chunk.rl_mw;
	frmr_wr.opcode = IB_WR_FAST_REG_MR;
	frmr_wr.send_flags = IB_SEND_SIGNALED;
	frmr_wr.wr.fast_reg.iova_start = seg1->mr_dma;
	frmr_wr.wr.fast_reg.page_list = seg1->mr_chunk.rl_mw->r.frmr.fr_pgl;
	frmr_wr.wr.fast_reg.page_list_len = i;
	frmr_wr.wr.fast_reg.page_shift = PAGE_SHIFT;
	/* Whole pages are registered; len (which includes pageoff)
	 * must fit within them.
	 */
	frmr_wr.wr.fast_reg.length = i << PAGE_SHIFT;
	BUG_ON(frmr_wr.wr.fast_reg.length < len);
	frmr_wr.wr.fast_reg.access_flags = (writing ?
				IB_ACCESS_REMOTE_WRITE | IB_ACCESS_LOCAL_WRITE :
				IB_ACCESS_REMOTE_READ);
	frmr_wr.wr.fast_reg.rkey = seg1->mr_chunk.rl_mw->r.frmr.fr_mr->rkey;
	DECR_CQCOUNT(&r_xprt->rx_ep);

	rc = ib_post_send(ia->ri_id->qp, post_wr, &bad_wr);

	if (rc) {
		dprintk("RPC: %s: failed ib_post_send for register,"
			" status %i\n", __func__, rc);
		while (i--)
			rpcrdma_unmap_one(ia, --seg);
	} else {
		seg1->mr_rkey = seg1->mr_chunk.rl_mw->r.frmr.fr_mr->rkey;
		seg1->mr_base = seg1->mr_dma + pageoff;
		seg1->mr_nsegs = i;
		seg1->mr_len = len;
	}
	*nsegs = i;
	return rc;
}
1567
1568static int
1569rpcrdma_deregister_frmr_external(struct rpcrdma_mr_seg *seg,
1570 struct rpcrdma_ia *ia, struct rpcrdma_xprt *r_xprt)
1571{
1572 struct rpcrdma_mr_seg *seg1 = seg;
1573 struct ib_send_wr invalidate_wr, *bad_wr;
1574 int rc;
1575
1576 while (seg1->mr_nsegs--)
1577 rpcrdma_unmap_one(ia, seg++);
1578
1579 memset(&invalidate_wr, 0, sizeof invalidate_wr);
Tom Tucker5c635e02011-02-09 19:45:34 +00001580 invalidate_wr.wr_id = (unsigned long)(void *)seg1->mr_chunk.rl_mw;
Tom Talpey3197d302008-10-09 15:00:20 -04001581 invalidate_wr.opcode = IB_WR_LOCAL_INV;
Tom Tucker5c635e02011-02-09 19:45:34 +00001582 invalidate_wr.send_flags = IB_SEND_SIGNALED;
Tom Talpey3197d302008-10-09 15:00:20 -04001583 invalidate_wr.ex.invalidate_rkey = seg1->mr_chunk.rl_mw->r.frmr.fr_mr->rkey;
1584 DECR_CQCOUNT(&r_xprt->rx_ep);
1585
1586 rc = ib_post_send(ia->ri_id->qp, &invalidate_wr, &bad_wr);
1587 if (rc)
1588 dprintk("RPC: %s: failed ib_post_send for invalidate,"
1589 " status %i\n", __func__, rc);
1590 return rc;
1591}
1592
/*
 * rpcrdma_register_fmr_external - map up to *nsegs segments through a
 * pre-allocated FMR.
 *
 * Gathers page-aligned DMA addresses (stopping early at an alignment
 * "hole"), then maps them with ib_map_phys_fmr(). On return *nsegs
 * holds the count actually mapped; on failure all segments are
 * unmapped again. Returns 0 or the ib_map_phys_fmr() status.
 */
static int
rpcrdma_register_fmr_external(struct rpcrdma_mr_seg *seg,
			int *nsegs, int writing, struct rpcrdma_ia *ia)
{
	struct rpcrdma_mr_seg *seg1 = seg;
	u64 physaddrs[RPCRDMA_MAX_DATA_SEGS];
	int len, pageoff, i, rc;

	/* Align the first segment down to a page boundary; the initial
	 * offset is folded back into the length (len starts negative).
	 */
	pageoff = offset_in_page(seg1->mr_offset);
	seg1->mr_offset -= pageoff;	/* start of page */
	seg1->mr_len += pageoff;
	len = -pageoff;
	if (*nsegs > RPCRDMA_MAX_DATA_SEGS)
		*nsegs = RPCRDMA_MAX_DATA_SEGS;
	for (i = 0; i < *nsegs;) {
		rpcrdma_map_one(ia, seg, writing);
		physaddrs[i] = seg->mr_dma;
		len += seg->mr_len;
		++seg;
		++i;
		/* Check for holes */
		if ((i < *nsegs && offset_in_page(seg->mr_offset)) ||
		    offset_in_page((seg-1)->mr_offset + (seg-1)->mr_len))
			break;
	}
	rc = ib_map_phys_fmr(seg1->mr_chunk.rl_mw->r.fmr,
				physaddrs, i, seg1->mr_dma);
	if (rc) {
		dprintk("RPC: %s: failed ib_map_phys_fmr "
			"%u@0x%llx+%i (%d)... status %i\n", __func__,
			len, (unsigned long long)seg1->mr_dma,
			pageoff, i, rc);
		while (i--)
			rpcrdma_unmap_one(ia, --seg);
	} else {
		seg1->mr_rkey = seg1->mr_chunk.rl_mw->r.fmr->rkey;
		seg1->mr_base = seg1->mr_dma + pageoff;
		seg1->mr_nsegs = i;
		seg1->mr_len = len;
	}
	*nsegs = i;
	return rc;
}
1636
1637static int
1638rpcrdma_deregister_fmr_external(struct rpcrdma_mr_seg *seg,
1639 struct rpcrdma_ia *ia)
1640{
1641 struct rpcrdma_mr_seg *seg1 = seg;
1642 LIST_HEAD(l);
1643 int rc;
1644
1645 list_add(&seg1->mr_chunk.rl_mw->r.fmr->list, &l);
1646 rc = ib_unmap_fmr(&l);
1647 while (seg1->mr_nsegs--)
1648 rpcrdma_unmap_one(ia, seg++);
1649 if (rc)
1650 dprintk("RPC: %s: failed ib_unmap_fmr,"
1651 " status %i\n", __func__, rc);
1652 return rc;
1653}
1654
/*
 * rpcrdma_register_memwin_external - bind a memory window over one
 * segment (memory windows cover a single segment only; *nsegs is
 * forced to 1).
 *
 * Returns 0 or the ib_bind_mw() status; on failure the segment's DMA
 * mapping is undone.
 */
static int
rpcrdma_register_memwin_external(struct rpcrdma_mr_seg *seg,
			int *nsegs, int writing, struct rpcrdma_ia *ia,
			struct rpcrdma_xprt *r_xprt)
{
	int mem_priv = (writing ? IB_ACCESS_REMOTE_WRITE :
				  IB_ACCESS_REMOTE_READ);
	struct ib_mw_bind param;
	int rc;

	*nsegs = 1;
	rpcrdma_map_one(ia, seg, writing);
	/* Bind over the all-memory MR, unsignaled. */
	param.mr = ia->ri_bind_mem;
	param.wr_id = 0ULL;	/* no send cookie */
	param.addr = seg->mr_dma;
	param.length = seg->mr_len;
	param.send_flags = 0;
	param.mw_access_flags = mem_priv;

	DECR_CQCOUNT(&r_xprt->rx_ep);
	rc = ib_bind_mw(ia->ri_id->qp, seg->mr_chunk.rl_mw->r.mw, &param);
	if (rc) {
		dprintk("RPC: %s: failed ib_bind_mw "
			"%u@0x%llx status %i\n",
			__func__, seg->mr_len,
			(unsigned long long)seg->mr_dma, rc);
		rpcrdma_unmap_one(ia, seg);
	} else {
		seg->mr_rkey = seg->mr_chunk.rl_mw->r.mw->rkey;
		seg->mr_base = param.addr;
		seg->mr_nsegs = 1;
	}
	return rc;
}
1689
/*
 * rpcrdma_deregister_memwin_external - unbind a memory window
 * (zero-length bind).
 *
 * When *r is non-NULL the unbind is posted signaled with *r as the wr
 * cookie, and *r is cleared on success so the caller defers its
 * callback until the unbind completes. Returns the ib_bind_mw()
 * status.
 */
static int
rpcrdma_deregister_memwin_external(struct rpcrdma_mr_seg *seg,
			struct rpcrdma_ia *ia,
			struct rpcrdma_xprt *r_xprt, void **r)
{
	struct ib_mw_bind param;
	LIST_HEAD(l);
	int rc;

	BUG_ON(seg->mr_nsegs != 1);
	param.mr = ia->ri_bind_mem;
	param.addr = 0ULL;	/* unbind */
	param.length = 0;
	param.mw_access_flags = 0;
	if (*r) {
		/* Signaled unbind: completion will carry *r as cookie. */
		param.wr_id = (u64) (unsigned long) *r;
		param.send_flags = IB_SEND_SIGNALED;
		INIT_CQCOUNT(&r_xprt->rx_ep);
	} else {
		param.wr_id = 0ULL;
		param.send_flags = 0;
		DECR_CQCOUNT(&r_xprt->rx_ep);
	}
	rc = ib_bind_mw(ia->ri_id->qp, seg->mr_chunk.rl_mw->r.mw, &param);
	rpcrdma_unmap_one(ia, seg);
	if (rc)
		dprintk("RPC: %s: failed ib_(un)bind_mw,"
			" status %i\n", __func__, rc);
	else
		*r = NULL;	/* will upcall on completion */
	return rc;
}
1722
/*
 * rpcrdma_register_default_external - register up to *nsegs segments
 * with a fresh phys MR (the fallback strategy).
 *
 * Gathers physically described buffers (stopping early at a
 * page-alignment "hole"), registers them in one ib_reg_phys_mr()
 * call. On return *nsegs holds the count actually registered; on
 * failure all segments are unmapped again. Returns 0 or the
 * ib_reg_phys_mr() error.
 */
static int
rpcrdma_register_default_external(struct rpcrdma_mr_seg *seg,
			int *nsegs, int writing, struct rpcrdma_ia *ia)
{
	int mem_priv = (writing ? IB_ACCESS_REMOTE_WRITE :
				  IB_ACCESS_REMOTE_READ);
	struct rpcrdma_mr_seg *seg1 = seg;
	struct ib_phys_buf ipb[RPCRDMA_MAX_DATA_SEGS];
	int len, i, rc = 0;

	if (*nsegs > RPCRDMA_MAX_DATA_SEGS)
		*nsegs = RPCRDMA_MAX_DATA_SEGS;
	for (len = 0, i = 0; i < *nsegs;) {
		rpcrdma_map_one(ia, seg, writing);
		ipb[i].addr = seg->mr_dma;
		ipb[i].size = seg->mr_len;
		len += seg->mr_len;
		++seg;
		++i;
		/* Check for holes */
		if ((i < *nsegs && offset_in_page(seg->mr_offset)) ||
		    offset_in_page((seg-1)->mr_offset+(seg-1)->mr_len))
			break;
	}
	/* mr_base is in/out: ib_reg_phys_mr() may adjust the iova. */
	seg1->mr_base = seg1->mr_dma;
	seg1->mr_chunk.rl_mr = ib_reg_phys_mr(ia->ri_pd,
				ipb, i, mem_priv, &seg1->mr_base);
	if (IS_ERR(seg1->mr_chunk.rl_mr)) {
		rc = PTR_ERR(seg1->mr_chunk.rl_mr);
		dprintk("RPC: %s: failed ib_reg_phys_mr "
			"%u@0x%llx (%d)... status %i\n",
			__func__, len,
			(unsigned long long)seg1->mr_dma, i, rc);
		while (i--)
			rpcrdma_unmap_one(ia, --seg);
	} else {
		seg1->mr_rkey = seg1->mr_chunk.rl_mr->rkey;
		seg1->mr_nsegs = i;
		seg1->mr_len = len;
	}
	*nsegs = i;
	return rc;
}
1766
1767static int
1768rpcrdma_deregister_default_external(struct rpcrdma_mr_seg *seg,
1769 struct rpcrdma_ia *ia)
1770{
1771 struct rpcrdma_mr_seg *seg1 = seg;
1772 int rc;
1773
1774 rc = ib_dereg_mr(seg1->mr_chunk.rl_mr);
1775 seg1->mr_chunk.rl_mr = NULL;
1776 while (seg1->mr_nsegs--)
1777 rpcrdma_unmap_one(ia, seg++);
1778 if (rc)
1779 dprintk("RPC: %s: failed ib_dereg_mr,"
1780 " status %i\n", __func__, rc);
1781 return rc;
1782}
1783
/*
 * rpcrdma_register_external - register a chunk of up to @nsegs segments
 * using whichever memory-registration strategy the IA selected.
 *
 * Returns the number of segments actually registered, or -1 on error.
 */
int
rpcrdma_register_external(struct rpcrdma_mr_seg *seg,
			int nsegs, int writing, struct rpcrdma_xprt *r_xprt)
{
	struct rpcrdma_ia *ia = &r_xprt->rx_ia;
	int rc = 0;

	switch (ia->ri_memreg_strategy) {

#if RPCRDMA_PERSISTENT_REGISTRATION
	case RPCRDMA_ALLPHYSICAL:
		/* All memory is pre-registered; just map and use the
		 * bind_mem rkey directly. Covers one segment.
		 */
		rpcrdma_map_one(ia, seg, writing);
		seg->mr_rkey = ia->ri_bind_mem->rkey;
		seg->mr_base = seg->mr_dma;
		seg->mr_nsegs = 1;
		nsegs = 1;
		break;
#endif

	/* Registration using frmr registration */
	case RPCRDMA_FRMR:
		rc = rpcrdma_register_frmr_external(seg, &nsegs, writing, ia, r_xprt);
		break;

	/* Registration using fmr memory registration */
	case RPCRDMA_MTHCAFMR:
		rc = rpcrdma_register_fmr_external(seg, &nsegs, writing, ia);
		break;

	/* Registration using memory windows */
	case RPCRDMA_MEMWINDOWS_ASYNC:
	case RPCRDMA_MEMWINDOWS:
		rc = rpcrdma_register_memwin_external(seg, &nsegs, writing, ia, r_xprt);
		break;

	/* Default registration each time */
	default:
		rc = rpcrdma_register_default_external(seg, &nsegs, writing, ia);
		break;
	}
	if (rc)
		return -1;

	return nsegs;
}
1829
/*
 * rpcrdma_deregister_external - release a chunk registration made by
 * rpcrdma_register_external(), dispatching on the same strategy.
 *
 * @r, when non-NULL, is a reply buffer whose rr_func callback is
 * invoked once deregistration is done (the memwin path may instead
 * defer it to the unbind completion by clearing r).
 * Returns the number of segments that were registered.
 */
int
rpcrdma_deregister_external(struct rpcrdma_mr_seg *seg,
		struct rpcrdma_xprt *r_xprt, void *r)
{
	struct rpcrdma_ia *ia = &r_xprt->rx_ia;
	int nsegs = seg->mr_nsegs, rc;

	switch (ia->ri_memreg_strategy) {

#if RPCRDMA_PERSISTENT_REGISTRATION
	case RPCRDMA_ALLPHYSICAL:
		BUG_ON(nsegs != 1);
		rpcrdma_unmap_one(ia, seg);
		rc = 0;
		break;
#endif

	case RPCRDMA_FRMR:
		rc = rpcrdma_deregister_frmr_external(seg, ia, r_xprt);
		break;

	case RPCRDMA_MTHCAFMR:
		rc = rpcrdma_deregister_fmr_external(seg, ia);
		break;

	case RPCRDMA_MEMWINDOWS_ASYNC:
	case RPCRDMA_MEMWINDOWS:
		rc = rpcrdma_deregister_memwin_external(seg, ia, r_xprt, &r);
		break;

	default:
		rc = rpcrdma_deregister_default_external(seg, ia);
		break;
	}
	if (r) {
		struct rpcrdma_rep *rep = r;
		void (*func)(struct rpcrdma_rep *) = rep->rr_func;
		rep->rr_func = NULL;
		/* NOTE(review): func is called unconditionally; a rep with
		 * a NULL rr_func would oops here -- confirm all callers
		 * passing r guarantee rr_func is set.
		 */
		func(rep);	/* dereg done, callback now */
	}
	return nsegs;
}
1872
1873/*
1874 * Prepost any receive buffer, then post send.
1875 *
1876 * Receive buffer is donated to hardware, reclaimed upon recv completion.
1877 */
/*
 * rpcrdma_ep_post - post a send work request, first pre-posting the
 * request's reply buffer (if attached) as a receive.
 *
 * The receive buffer is donated to hardware and reclaimed on recv
 * completion. Send completions are requested only every INIT_CQCOUNT
 * posts to limit completion traffic. Returns 0 or a posting error.
 */
int
rpcrdma_ep_post(struct rpcrdma_ia *ia,
		struct rpcrdma_ep *ep,
		struct rpcrdma_req *req)
{
	struct ib_send_wr send_wr, *send_wr_fail;
	struct rpcrdma_rep *rep = req->rl_reply;
	int rc;

	if (rep) {
		/* Hand the reply buffer to the device before sending. */
		rc = rpcrdma_ep_post_recv(ia, ep, rep);
		if (rc)
			goto out;
		req->rl_reply = NULL;
	}

	send_wr.next = NULL;
	send_wr.wr_id = 0ULL;	/* no send cookie */
	send_wr.sg_list = req->rl_send_iov;
	send_wr.num_sge = req->rl_niovs;
	send_wr.opcode = IB_WR_SEND;
	/* Sync the populated SGEs for device access: iov[3] only when a
	 * pad is present (4 SGEs); iov[2] is the constant pad itself.
	 */
	if (send_wr.num_sge == 4)	/* no need to sync any pad (constant) */
		ib_dma_sync_single_for_device(ia->ri_id->device,
			req->rl_send_iov[3].addr, req->rl_send_iov[3].length,
			DMA_TO_DEVICE);
	ib_dma_sync_single_for_device(ia->ri_id->device,
		req->rl_send_iov[1].addr, req->rl_send_iov[1].length,
		DMA_TO_DEVICE);
	ib_dma_sync_single_for_device(ia->ri_id->device,
		req->rl_send_iov[0].addr, req->rl_send_iov[0].length,
		DMA_TO_DEVICE);

	if (DECR_CQCOUNT(ep) > 0)
		send_wr.send_flags = 0;
	else { /* Provider must take a send completion every now and then */
		INIT_CQCOUNT(ep);
		send_wr.send_flags = IB_SEND_SIGNALED;
	}

	rc = ib_post_send(ia->ri_id->qp, &send_wr, &send_wr_fail);
	if (rc)
		dprintk("RPC: %s: ib_post_send returned %i\n", __func__,
			rc);
out:
	return rc;
}
1924
1925/*
1926 * (Re)post a receive buffer.
1927 */
1928int
1929rpcrdma_ep_post_recv(struct rpcrdma_ia *ia,
1930 struct rpcrdma_ep *ep,
1931 struct rpcrdma_rep *rep)
1932{
1933 struct ib_recv_wr recv_wr, *recv_wr_fail;
1934 int rc;
1935
1936 recv_wr.next = NULL;
1937 recv_wr.wr_id = (u64) (unsigned long) rep;
1938 recv_wr.sg_list = &rep->rr_iov;
1939 recv_wr.num_sge = 1;
1940
1941 ib_dma_sync_single_for_cpu(ia->ri_id->device,
1942 rep->rr_iov.addr, rep->rr_iov.length, DMA_BIDIRECTIONAL);
1943
1944 DECR_CQCOUNT(ep);
1945 rc = ib_post_recv(ia->ri_id->qp, &recv_wr, &recv_wr_fail);
1946
1947 if (rc)
1948 dprintk("RPC: %s: ib_post_recv returned %i\n", __func__,
1949 rc);
1950 return rc;
1951}