| \"Talpey, Thomas\ | f58851e | 2007-09-10 13:50:12 -0400 | [diff] [blame] | 1 | /* | 
|  | 2 | * Copyright (c) 2003-2007 Network Appliance, Inc. All rights reserved. | 
|  | 3 | * | 
|  | 4 | * This software is available to you under a choice of one of two | 
|  | 5 | * licenses.  You may choose to be licensed under the terms of the GNU | 
|  | 6 | * General Public License (GPL) Version 2, available from the file | 
|  | 7 | * COPYING in the main directory of this source tree, or the BSD-type | 
|  | 8 | * license below: | 
|  | 9 | * | 
|  | 10 | * Redistribution and use in source and binary forms, with or without | 
|  | 11 | * modification, are permitted provided that the following conditions | 
|  | 12 | * are met: | 
|  | 13 | * | 
|  | 14 | *      Redistributions of source code must retain the above copyright | 
|  | 15 | *      notice, this list of conditions and the following disclaimer. | 
|  | 16 | * | 
|  | 17 | *      Redistributions in binary form must reproduce the above | 
|  | 18 | *      copyright notice, this list of conditions and the following | 
|  | 19 | *      disclaimer in the documentation and/or other materials provided | 
|  | 20 | *      with the distribution. | 
|  | 21 | * | 
|  | 22 | *      Neither the name of the Network Appliance, Inc. nor the names of | 
|  | 23 | *      its contributors may be used to endorse or promote products | 
|  | 24 | *      derived from this software without specific prior written | 
|  | 25 | *      permission. | 
|  | 26 | * | 
|  | 27 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS | 
|  | 28 | * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT | 
|  | 29 | * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR | 
|  | 30 | * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT | 
|  | 31 | * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, | 
|  | 32 | * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT | 
|  | 33 | * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, | 
|  | 34 | * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY | 
|  | 35 | * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT | 
|  | 36 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE | 
|  | 37 | * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | 
|  | 38 | */ | 
|  | 39 |  | 
|  | 40 | #ifndef _LINUX_SUNRPC_XPRT_RDMA_H | 
|  | 41 | #define _LINUX_SUNRPC_XPRT_RDMA_H | 
|  | 42 |  | 
|  | 43 | #include <linux/wait.h> 		/* wait_queue_head_t, etc */ | 
|  | 44 | #include <linux/spinlock.h> 		/* spinlock_t, etc */ | 
|  | 45 | #include <asm/atomic.h>			/* atomic_t, etc */ | 
|  | 46 |  | 
|  | 47 | #include <rdma/rdma_cm.h>		/* RDMA connection api */ | 
|  | 48 | #include <rdma/ib_verbs.h>		/* RDMA verbs api */ | 
|  | 49 |  | 
|  | 50 | #include <linux/sunrpc/clnt.h> 		/* rpc_xprt */ | 
|  | 51 | #include <linux/sunrpc/rpc_rdma.h> 	/* RPC/RDMA protocol */ | 
|  | 52 | #include <linux/sunrpc/xprtrdma.h> 	/* xprt parameters */ | 
|  | 53 |  | 
/*
 * Connection-setup tunables.
 * NOTE(review): the "5 seconds" annotation implies RDMA_RESOLVE_TIMEOUT is
 * expressed in milliseconds -- confirm against the callers in verbs.c.
 */
#define RDMA_RESOLVE_TIMEOUT	(5000)	/* 5 seconds */
#define RDMA_CONNECT_RETRY_MAX	(2)	/* retries if no listener backlog */
|  | 56 |  | 
| \"Talpey, Thomas\ | f58851e | 2007-09-10 13:50:12 -0400 | [diff] [blame] | 57 | /* | 
|  | 58 | * Interface Adapter -- one per transport instance | 
|  | 59 | */ | 
|  | 60 | struct rpcrdma_ia { | 
|  | 61 | struct rdma_cm_id 	*ri_id; | 
|  | 62 | struct ib_pd		*ri_pd; | 
|  | 63 | struct ib_mr		*ri_bind_mem; | 
| Tom Talpey | fe9053b | 2008-10-09 14:59:59 -0400 | [diff] [blame] | 64 | u32			ri_dma_lkey; | 
|  | 65 | int			ri_have_dma_lkey; | 
| \"Talpey, Thomas\ | f58851e | 2007-09-10 13:50:12 -0400 | [diff] [blame] | 66 | struct completion	ri_done; | 
|  | 67 | int			ri_async_rc; | 
|  | 68 | enum rpcrdma_memreg	ri_memreg_strategy; | 
|  | 69 | }; | 
|  | 70 |  | 
|  | 71 | /* | 
|  | 72 | * RDMA Endpoint -- one per transport instance | 
|  | 73 | */ | 
|  | 74 |  | 
|  | 75 | struct rpcrdma_ep { | 
|  | 76 | atomic_t		rep_cqcount; | 
|  | 77 | int			rep_cqinit; | 
|  | 78 | int			rep_connected; | 
|  | 79 | struct rpcrdma_ia	*rep_ia; | 
|  | 80 | struct ib_cq		*rep_cq; | 
|  | 81 | struct ib_qp_init_attr	rep_attr; | 
|  | 82 | wait_queue_head_t 	rep_connect_wait; | 
|  | 83 | struct ib_sge		rep_pad;	/* holds zeroed pad */ | 
|  | 84 | struct ib_mr		*rep_pad_mr;	/* holds zeroed pad */ | 
|  | 85 | void			(*rep_func)(struct rpcrdma_ep *); | 
|  | 86 | struct rpc_xprt		*rep_xprt;	/* for rep_func */ | 
|  | 87 | struct rdma_conn_param	rep_remote_cma; | 
|  | 88 | struct sockaddr_storage	rep_remote_addr; | 
|  | 89 | }; | 
|  | 90 |  | 
/*
 * rep_cqcount helpers: INIT_CQCOUNT() reloads the counter from rep_cqinit;
 * DECR_CQCOUNT() subtracts one and evaluates to the new value.
 */
#define INIT_CQCOUNT(ep) atomic_set(&(ep)->rep_cqcount, (ep)->rep_cqinit)
#define DECR_CQCOUNT(ep) atomic_sub_return(1, &(ep)->rep_cqcount)
|  | 93 |  | 
/*
 * struct rpcrdma_rep -- this structure encapsulates state required to recv
 * and complete a reply, asynchronously. It needs several pieces of
 * state:
 *   o recv buffer (posted to provider)
 *   o ib_sge (also donated to provider)
 *   o status of reply (length, success or not)
 *   o bookkeeping state to get run by tasklet (list, etc)
 *
 * These are allocated during initialization, per-transport instance;
 * however, the tasklet execution list itself is global, as it should
 * always be pretty short.
 *
 * N of these are associated with a transport instance, and stored in
 * struct rpcrdma_buffer. N is the max number of outstanding requests.
 */
|  | 110 |  | 
/* temporary static scatter/gather max */
#define RPCRDMA_MAX_DATA_SEGS	(8)	/* max scatter/gather */
#define RPCRDMA_MAX_SEGS 	(RPCRDMA_MAX_DATA_SEGS + 2) /* head+tail = 2 */

/*
 * Max supported RPC/RDMA header: the fixed rpcrdma_msg, two extra u32
 * words, one rpcrdma_read_chunk per possible segment, and one more
 * trailing u32.
 */
#define MAX_RPCRDMAHDR	(\
	sizeof(struct rpcrdma_msg) + (2 * sizeof(u32)) + \
	(sizeof(struct rpcrdma_read_chunk) * RPCRDMA_MAX_SEGS) + sizeof(u32))
|  | 118 |  | 
|  | 119 | struct rpcrdma_buffer; | 
|  | 120 |  | 
|  | 121 | struct rpcrdma_rep { | 
|  | 122 | unsigned int	rr_len;		/* actual received reply length */ | 
|  | 123 | struct rpcrdma_buffer *rr_buffer; /* home base for this structure */ | 
|  | 124 | struct rpc_xprt	*rr_xprt;	/* needed for request/reply matching */ | 
|  | 125 | void (*rr_func)(struct rpcrdma_rep *);/* called by tasklet in softint */ | 
|  | 126 | struct list_head rr_list;	/* tasklet list */ | 
|  | 127 | wait_queue_head_t rr_unbind;	/* optional unbind wait */ | 
|  | 128 | struct ib_sge	rr_iov;		/* for posting */ | 
|  | 129 | struct ib_mr	*rr_handle;	/* handle for mem in rr_iov */ | 
|  | 130 | char	rr_base[MAX_RPCRDMAHDR]; /* minimal inline receive buffer */ | 
|  | 131 | }; | 
|  | 132 |  | 
/*
 * struct rpcrdma_req -- structure central to the request/reply sequence.
 *
 * N of these are associated with a transport instance, and stored in
 * struct rpcrdma_buffer. N is the max number of outstanding requests.
 *
 * It includes pre-registered buffer memory for send AND recv.
 * The recv buffer, however, is not owned by this structure, and
 * is "donated" to the hardware when a recv is posted. When a
 * reply is handled, the recv buffer used is given back to the
 * struct rpcrdma_req associated with the request.
 *
 * In addition to the basic memory, this structure includes an array
 * of iovs for send operations. The reason is that the iovs passed to
 * ib_post_{send,recv} must not be modified until the work request
 * completes.
 *
 * NOTES:
 *   o RPCRDMA_MAX_SEGS is the max number of addressable chunk elements we
 *     marshal. The number needed varies depending on the iov lists that
 *     are passed to us, the memory registration mode we are in, and if
 *     physical addressing is used, the layout.
 */
|  | 156 |  | 
|  | 157 | struct rpcrdma_mr_seg {		/* chunk descriptors */ | 
|  | 158 | union {				/* chunk memory handles */ | 
|  | 159 | struct ib_mr	*rl_mr;		/* if registered directly */ | 
|  | 160 | struct rpcrdma_mw {		/* if registered from region */ | 
|  | 161 | union { | 
|  | 162 | struct ib_mw	*mw; | 
|  | 163 | struct ib_fmr	*fmr; | 
| Tom Talpey | fe9053b | 2008-10-09 14:59:59 -0400 | [diff] [blame] | 164 | struct { | 
|  | 165 | struct ib_fast_reg_page_list *fr_pgl; | 
|  | 166 | struct ib_mr *fr_mr; | 
|  | 167 | } frmr; | 
| \"Talpey, Thomas\ | f58851e | 2007-09-10 13:50:12 -0400 | [diff] [blame] | 168 | } r; | 
|  | 169 | struct list_head mw_list; | 
|  | 170 | } *rl_mw; | 
|  | 171 | } mr_chunk; | 
|  | 172 | u64		mr_base;	/* registration result */ | 
|  | 173 | u32		mr_rkey;	/* registration result */ | 
|  | 174 | u32		mr_len;		/* length of chunk or segment */ | 
|  | 175 | int		mr_nsegs;	/* number of segments in chunk or 0 */ | 
|  | 176 | enum dma_data_direction	mr_dir;	/* segment mapping direction */ | 
|  | 177 | dma_addr_t	mr_dma;		/* segment mapping address */ | 
|  | 178 | size_t		mr_dmalen;	/* segment mapping length */ | 
|  | 179 | struct page	*mr_page;	/* owning page, if any */ | 
|  | 180 | char		*mr_offset;	/* kva if no page, else offset */ | 
|  | 181 | }; | 
|  | 182 |  | 
|  | 183 | struct rpcrdma_req { | 
|  | 184 | size_t 		rl_size;	/* actual length of buffer */ | 
|  | 185 | unsigned int	rl_niovs;	/* 0, 2 or 4 */ | 
|  | 186 | unsigned int	rl_nchunks;	/* non-zero if chunks */ | 
| Tom Talpey | 575448b | 2008-10-09 15:00:40 -0400 | [diff] [blame] | 187 | unsigned int	rl_connect_cookie;	/* retry detection */ | 
| \"Talpey, Thomas\ | f58851e | 2007-09-10 13:50:12 -0400 | [diff] [blame] | 188 | struct rpcrdma_buffer *rl_buffer; /* home base for this structure */ | 
|  | 189 | struct rpcrdma_rep	*rl_reply;/* holder for reply buffer */ | 
|  | 190 | struct rpcrdma_mr_seg rl_segments[RPCRDMA_MAX_SEGS];/* chunk segments */ | 
|  | 191 | struct ib_sge	rl_send_iov[4];	/* for active requests */ | 
|  | 192 | struct ib_sge	rl_iov;		/* for posting */ | 
|  | 193 | struct ib_mr	*rl_handle;	/* handle for mem in rl_iov */ | 
|  | 194 | char		rl_base[MAX_RPCRDMAHDR]; /* start of actual buffer */ | 
|  | 195 | __u32 		rl_xdr_buf[0];	/* start of returned rpc rq_buffer */ | 
|  | 196 | }; | 
|  | 197 | #define rpcr_to_rdmar(r) \ | 
|  | 198 | container_of((r)->rq_buffer, struct rpcrdma_req, rl_xdr_buf[0]) | 
|  | 199 |  | 
|  | 200 | /* | 
|  | 201 | * struct rpcrdma_buffer -- holds list/queue of pre-registered memory for | 
|  | 202 | * inline requests/replies, and client/server credits. | 
|  | 203 | * | 
|  | 204 | * One of these is associated with a transport instance | 
|  | 205 | */ | 
|  | 206 | struct rpcrdma_buffer { | 
|  | 207 | spinlock_t	rb_lock;	/* protects indexes */ | 
|  | 208 | atomic_t	rb_credits;	/* most recent server credits */ | 
|  | 209 | unsigned long	rb_cwndscale;	/* cached framework rpc_cwndscale */ | 
|  | 210 | int		rb_max_requests;/* client max requests */ | 
| Tom Talpey | fe9053b | 2008-10-09 14:59:59 -0400 | [diff] [blame] | 211 | struct list_head rb_mws;	/* optional memory windows/fmrs/frmrs */ | 
| \"Talpey, Thomas\ | f58851e | 2007-09-10 13:50:12 -0400 | [diff] [blame] | 212 | int		rb_send_index; | 
|  | 213 | struct rpcrdma_req	**rb_send_bufs; | 
|  | 214 | int		rb_recv_index; | 
|  | 215 | struct rpcrdma_rep	**rb_recv_bufs; | 
|  | 216 | char		*rb_pool; | 
|  | 217 | }; | 
|  | 218 | #define rdmab_to_ia(b) (&container_of((b), struct rpcrdma_xprt, rx_buf)->rx_ia) | 
|  | 219 |  | 
|  | 220 | /* | 
|  | 221 | * Internal structure for transport instance creation. This | 
|  | 222 | * exists primarily for modularity. | 
|  | 223 | * | 
|  | 224 | * This data should be set with mount options | 
|  | 225 | */ | 
|  | 226 | struct rpcrdma_create_data_internal { | 
|  | 227 | struct sockaddr_storage	addr;	/* RDMA server address */ | 
|  | 228 | unsigned int	max_requests;	/* max requests (slots) in flight */ | 
|  | 229 | unsigned int	rsize;		/* mount rsize - max read hdr+data */ | 
|  | 230 | unsigned int	wsize;		/* mount wsize - max write hdr+data */ | 
|  | 231 | unsigned int	inline_rsize;	/* max non-rdma read data payload */ | 
|  | 232 | unsigned int	inline_wsize;	/* max non-rdma write data payload */ | 
|  | 233 | unsigned int	padding;	/* non-rdma write header padding */ | 
|  | 234 | }; | 
|  | 235 |  | 
/*
 * Per-request accessors for the creation parameters of the transport a
 * request belongs to. Each walks rq->rq_task->tk_xprt and reads one field
 * of that transport's rx_data.
 *
 * The argument and the whole expansion are parenthesized so that any
 * pointer expression (e.g. &req) can be passed and the result can be used
 * inside a larger expression without precedence surprises.
 */
#define RPCRDMA_INLINE_READ_THRESHOLD(rq) \
	(rpcx_to_rdmad((rq)->rq_task->tk_xprt).inline_rsize)

#define RPCRDMA_INLINE_WRITE_THRESHOLD(rq)\
	(rpcx_to_rdmad((rq)->rq_task->tk_xprt).inline_wsize)

#define RPCRDMA_INLINE_PAD_VALUE(rq)\
	(rpcx_to_rdmad((rq)->rq_task->tk_xprt).padding)
|  | 244 |  | 
/*
 * Statistics for RPCRDMA
 *
 * Plain counters; any locking or atomicity is up to the code that
 * updates them.
 */
struct rpcrdma_stats {
	unsigned long		read_chunk_count;
	unsigned long		write_chunk_count;
	unsigned long		reply_chunk_count;

	unsigned long long	total_rdma_request;
	unsigned long long	total_rdma_reply;

	unsigned long long	pullup_copy_count;
	unsigned long long	fixup_copy_count;
	unsigned long		hardway_register_count;
	unsigned long		failed_marshal_count;
	unsigned long		bad_reply_count;
};
|  | 262 |  | 
|  | 263 | /* | 
|  | 264 | * RPCRDMA transport -- encapsulates the structures above for | 
|  | 265 | * integration with RPC. | 
|  | 266 | * | 
|  | 267 | * The contained structures are embedded, not pointers, | 
|  | 268 | * for convenience. This structure need not be visible externally. | 
|  | 269 | * | 
|  | 270 | * It is allocated and initialized during mount, and released | 
|  | 271 | * during unmount. | 
|  | 272 | */ | 
|  | 273 | struct rpcrdma_xprt { | 
|  | 274 | struct rpc_xprt		xprt; | 
|  | 275 | struct rpcrdma_ia	rx_ia; | 
|  | 276 | struct rpcrdma_ep	rx_ep; | 
|  | 277 | struct rpcrdma_buffer	rx_buf; | 
|  | 278 | struct rpcrdma_create_data_internal rx_data; | 
|  | 279 | struct delayed_work	rdma_connect; | 
|  | 280 | struct rpcrdma_stats	rx_stats; | 
|  | 281 | }; | 
|  | 282 |  | 
/*
 * rpcx_to_rdmax(): given a pointer to the rpc_xprt embedded as the xprt
 * member of a struct rpcrdma_xprt, return the containing rpcrdma_xprt.
 * rpcx_to_rdmad(): same lookup, yielding that transport's rx_data.
 */
#define rpcx_to_rdmax(x) container_of((x), struct rpcrdma_xprt, xprt)
#define rpcx_to_rdmad(x) (rpcx_to_rdmax(x)->rx_data)
|  | 285 |  | 
/* Setting this to 0 ensures interoperability with early servers.
 * Setting this to 1 enhances certain unaligned read/write performance.
 * Default is 0, see sysctl entry and rpc_rdma.c rpcrdma_convert_iovs() */
extern int xprt_rdma_pad_optimize;
|  | 290 |  | 
| \"Talpey, Thomas\ | f58851e | 2007-09-10 13:50:12 -0400 | [diff] [blame] | 291 | /* | 
|  | 292 | * Interface Adapter calls - xprtrdma/verbs.c | 
|  | 293 | */ | 
/* Open and close the interface adapter of a transport (xprtrdma/verbs.c). */
int rpcrdma_ia_open(struct rpcrdma_xprt *, struct sockaddr *, int);
void rpcrdma_ia_close(struct rpcrdma_ia *);
|  | 296 |  | 
/*
 * Endpoint calls - xprtrdma/verbs.c
 */
int rpcrdma_ep_create(struct rpcrdma_ep *, struct rpcrdma_ia *,
				struct rpcrdma_create_data_internal *);
int rpcrdma_ep_destroy(struct rpcrdma_ep *, struct rpcrdma_ia *);
int rpcrdma_ep_connect(struct rpcrdma_ep *, struct rpcrdma_ia *);
int rpcrdma_ep_disconnect(struct rpcrdma_ep *, struct rpcrdma_ia *);

/* Post a request (send) or a reply buffer (recv) on the endpoint. */
int rpcrdma_ep_post(struct rpcrdma_ia *, struct rpcrdma_ep *,
				struct rpcrdma_req *);
int rpcrdma_ep_post_recv(struct rpcrdma_ia *, struct rpcrdma_ep *,
				struct rpcrdma_rep *);
|  | 310 |  | 
/*
 * Buffer calls - xprtrdma/verbs.c
 */
int rpcrdma_buffer_create(struct rpcrdma_buffer *, struct rpcrdma_ep *,
				struct rpcrdma_ia *,
				struct rpcrdma_create_data_internal *);
void rpcrdma_buffer_destroy(struct rpcrdma_buffer *);

/* Get/put of request and reply buffers from the per-transport pool. */
struct rpcrdma_req *rpcrdma_buffer_get(struct rpcrdma_buffer *);
void rpcrdma_buffer_put(struct rpcrdma_req *);
void rpcrdma_recv_buffer_get(struct rpcrdma_req *);
void rpcrdma_recv_buffer_put(struct rpcrdma_rep *);

/* Registration of the transport's own (internal) buffers. */
int rpcrdma_register_internal(struct rpcrdma_ia *, void *, int,
				struct ib_mr **, struct ib_sge *);
int rpcrdma_deregister_internal(struct rpcrdma_ia *,
				struct ib_mr *, struct ib_sge *);

/* Registration of chunk segments (struct rpcrdma_mr_seg). */
int rpcrdma_register_external(struct rpcrdma_mr_seg *,
				int, int, struct rpcrdma_xprt *);
int rpcrdma_deregister_external(struct rpcrdma_mr_seg *,
				struct rpcrdma_xprt *, void *);
|  | 333 |  | 
/*
 * RPC/RDMA connection management calls - xprtrdma/rpc_rdma.c
 *
 * The signatures match the rep_func and rr_func callback slots in
 * struct rpcrdma_ep and struct rpcrdma_rep respectively.
 */
void rpcrdma_conn_func(struct rpcrdma_ep *);
void rpcrdma_reply_handler(struct rpcrdma_rep *);

/*
 * RPC/RDMA protocol calls - xprtrdma/rpc_rdma.c
 */
int rpcrdma_marshal_req(struct rpc_rqst *);
|  | 344 |  | 
|  | 345 | #endif				/* _LINUX_SUNRPC_XPRT_RDMA_H */ |