| Tom Tucker | d21b05f | 2007-12-12 16:13:17 -0600 | [diff] [blame] | 1 | /* | 
 | 2 |  * Copyright (c) 2005-2006 Network Appliance, Inc. All rights reserved. | 
 | 3 |  * | 
 | 4 |  * This software is available to you under a choice of one of two | 
 | 5 |  * licenses.  You may choose to be licensed under the terms of the GNU | 
 | 6 |  * General Public License (GPL) Version 2, available from the file | 
 | 7 |  * COPYING in the main directory of this source tree, or the BSD-type | 
 | 8 |  * license below: | 
 | 9 |  * | 
 | 10 |  * Redistribution and use in source and binary forms, with or without | 
 | 11 |  * modification, are permitted provided that the following conditions | 
 | 12 |  * are met: | 
 | 13 |  * | 
 | 14 |  *      Redistributions of source code must retain the above copyright | 
 | 15 |  *      notice, this list of conditions and the following disclaimer. | 
 | 16 |  * | 
 | 17 |  *      Redistributions in binary form must reproduce the above | 
 | 18 |  *      copyright notice, this list of conditions and the following | 
 | 19 |  *      disclaimer in the documentation and/or other materials provided | 
 | 20 |  *      with the distribution. | 
 | 21 |  * | 
 | 22 |  *      Neither the name of the Network Appliance, Inc. nor the names of | 
 | 23 |  *      its contributors may be used to endorse or promote products | 
 | 24 |  *      derived from this software without specific prior written | 
 | 25 |  *      permission. | 
 | 26 |  * | 
 | 27 |  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS | 
 | 28 |  * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT | 
 | 29 |  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR | 
 | 30 |  * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT | 
 | 31 |  * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, | 
 | 32 |  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT | 
 | 33 |  * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, | 
 | 34 |  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY | 
 | 35 |  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT | 
 | 36 |  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE | 
 | 37 |  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | 
 | 38 |  * | 
 | 39 |  * Author: Tom Tucker <tom@opengridcomputing.com> | 
 | 40 |  */ | 
 | 41 |  | 
 | 42 | #ifndef SVC_RDMA_H | 
 | 43 | #define SVC_RDMA_H | 
 | 44 | #include <linux/sunrpc/xdr.h> | 
 | 45 | #include <linux/sunrpc/svcsock.h> | 
 | 46 | #include <linux/sunrpc/rpc_rdma.h> | 
 | 47 | #include <rdma/ib_verbs.h> | 
 | 48 | #include <rdma/rdma_cm.h> | 
 | 49 | #define SVCRDMA_DEBUG | 
 | 50 |  | 
 | 51 | /* RPC/RDMA parameters and stats */ | 
 | 52 | extern unsigned int svcrdma_ord; | 
 | 53 | extern unsigned int svcrdma_max_requests; | 
 | 54 | extern unsigned int svcrdma_max_req_size; | 
 | 55 |  | 
 | 56 | extern atomic_t rdma_stat_recv; | 
 | 57 | extern atomic_t rdma_stat_read; | 
 | 58 | extern atomic_t rdma_stat_write; | 
 | 59 | extern atomic_t rdma_stat_sq_starve; | 
 | 60 | extern atomic_t rdma_stat_rq_starve; | 
 | 61 | extern atomic_t rdma_stat_rq_poll; | 
 | 62 | extern atomic_t rdma_stat_rq_prod; | 
 | 63 | extern atomic_t rdma_stat_sq_poll; | 
 | 64 | extern atomic_t rdma_stat_sq_prod; | 
 | 65 |  | 
 | 66 | #define RPCRDMA_VERSION 1 | 
 | 67 |  | 
 | 68 | /* | 
 | 69 |  * Contexts are built when an RDMA request is created and are a | 
 | 70 |  * record of the resources that can be recovered when the request | 
 | 71 |  * completes. | 
 | 72 |  */ | 
 | 73 | struct svc_rdma_op_ctxt { | 
| Tom Tucker | 02e7452 | 2008-04-30 19:50:56 -0500 | [diff] [blame] | 74 | 	struct svc_rdma_op_ctxt *read_hdr; | 
| Tom Tucker | 0d3ebb9 | 2008-09-30 13:06:13 -0500 | [diff] [blame] | 75 | 	struct svc_rdma_fastreg_mr *frmr; | 
| Tom Tucker | f820c57 | 2008-05-27 17:03:14 -0500 | [diff] [blame] | 76 | 	int hdr_count; | 
| Tom Tucker | d21b05f | 2007-12-12 16:13:17 -0600 | [diff] [blame] | 77 | 	struct xdr_buf arg; | 
 | 78 | 	struct list_head dto_q; | 
 | 79 | 	enum ib_wr_opcode wr_op; | 
 | 80 | 	enum ib_wc_status wc_status; | 
 | 81 | 	u32 byte_len; | 
 | 82 | 	struct svcxprt_rdma *xprt; | 
 | 83 | 	unsigned long flags; | 
 | 84 | 	enum dma_data_direction direction; | 
 | 85 | 	int count; | 
 | 86 | 	struct ib_sge sge[RPCSVC_MAXPAGES]; | 
 | 87 | 	struct page *pages[RPCSVC_MAXPAGES]; | 
 | 88 | }; | 
 | 89 |  | 
/*
 * NFS_ requests are mapped on the client side by the chunk lists in
 * the RPCRDMA header. During the fetching of the RPC from the client
 * and the writing of the reply to the client, the memory in the
 * client and the memory in the server must be mapped as contiguous
 * vaddr/len for access by the hardware. These data structures keep
 * these mappings.
 *
 * For an RDMA_WRITE, the 'sge' maps the RPC REPLY. For RDMA_READ, the
 * 'sge' in the svc_rdma_req_map maps the server side RPC reply and the
 * 'ch' field maps the read-list of the RPCRDMA header to the 'sge'
 * mapping of the reply.
 */
 | 103 | struct svc_rdma_chunk_sge { | 
 | 104 | 	int start;		/* sge no for this chunk */ | 
 | 105 | 	int count;		/* sge count for this chunk */ | 
 | 106 | }; | 
| Tom Tucker | 0d3ebb9 | 2008-09-30 13:06:13 -0500 | [diff] [blame] | 107 | struct svc_rdma_fastreg_mr { | 
 | 108 | 	struct ib_mr *mr; | 
 | 109 | 	void *kva; | 
 | 110 | 	struct ib_fast_reg_page_list *page_list; | 
 | 111 | 	int page_list_len; | 
 | 112 | 	unsigned long access_flags; | 
 | 113 | 	unsigned long map_len; | 
 | 114 | 	enum dma_data_direction direction; | 
 | 115 | 	struct list_head frmr_list; | 
 | 116 | }; | 
| Tom Tucker | ab96ddd | 2008-05-28 13:54:04 -0500 | [diff] [blame] | 117 | struct svc_rdma_req_map { | 
| Tom Tucker | 0d3ebb9 | 2008-09-30 13:06:13 -0500 | [diff] [blame] | 118 | 	struct svc_rdma_fastreg_mr *frmr; | 
| Tom Tucker | ab96ddd | 2008-05-28 13:54:04 -0500 | [diff] [blame] | 119 | 	unsigned long count; | 
 | 120 | 	union { | 
 | 121 | 		struct kvec sge[RPCSVC_MAXPAGES]; | 
 | 122 | 		struct svc_rdma_chunk_sge ch[RPCSVC_MAXPAGES]; | 
 | 123 | 	}; | 
 | 124 | }; | 
/* Op-context flag values (presumably for svc_rdma_op_ctxt.flags - confirm) */
#define RDMACTXT_F_FAST_UNREG	1
#define RDMACTXT_F_LAST_CTXT	2

/* Device capability values (presumably for svcxprt_rdma.sc_dev_caps - confirm) */
#define	SVCRDMA_DEVCAP_FAST_REG		1	/* fast mr registration */
#define	SVCRDMA_DEVCAP_READ_W_INV	2	/* read w/ invalidate */
 | 130 |  | 
| Tom Tucker | d21b05f | 2007-12-12 16:13:17 -0600 | [diff] [blame] | 131 | struct svcxprt_rdma { | 
 | 132 | 	struct svc_xprt      sc_xprt;		/* SVC transport structure */ | 
 | 133 | 	struct rdma_cm_id    *sc_cm_id;		/* RDMA connection id */ | 
 | 134 | 	struct list_head     sc_accept_q;	/* Conn. waiting accept */ | 
 | 135 | 	int		     sc_ord;		/* RDMA read limit */ | 
| Tom Tucker | d21b05f | 2007-12-12 16:13:17 -0600 | [diff] [blame] | 136 | 	int                  sc_max_sge; | 
 | 137 |  | 
 | 138 | 	int                  sc_sq_depth;	/* Depth of SQ */ | 
 | 139 | 	atomic_t             sc_sq_count;	/* Number of SQ WR on queue */ | 
 | 140 |  | 
 | 141 | 	int                  sc_max_requests;	/* Depth of RQ */ | 
 | 142 | 	int                  sc_max_req_size;	/* Size of each RQ WR buf */ | 
 | 143 |  | 
 | 144 | 	struct ib_pd         *sc_pd; | 
 | 145 |  | 
| Tom Tucker | 87295b6 | 2008-05-28 13:17:44 -0500 | [diff] [blame] | 146 | 	atomic_t	     sc_dma_used; | 
| Tom Tucker | 8740767 | 2008-04-30 20:44:39 -0500 | [diff] [blame] | 147 | 	atomic_t	     sc_ctxt_used; | 
| Tom Tucker | d21b05f | 2007-12-12 16:13:17 -0600 | [diff] [blame] | 148 | 	struct list_head     sc_rq_dto_q; | 
 | 149 | 	spinlock_t	     sc_rq_dto_lock; | 
 | 150 | 	struct ib_qp         *sc_qp; | 
 | 151 | 	struct ib_cq         *sc_rq_cq; | 
 | 152 | 	struct ib_cq         *sc_sq_cq; | 
 | 153 | 	struct ib_mr         *sc_phys_mr;	/* MR for server memory */ | 
| Tom Tucker | 0d3ebb9 | 2008-09-30 13:06:13 -0500 | [diff] [blame] | 154 | 	u32		     sc_dev_caps;	/* distilled device caps */ | 
 | 155 | 	u32		     sc_dma_lkey;	/* local dma key */ | 
 | 156 | 	unsigned int	     sc_frmr_pg_list_len; | 
 | 157 | 	struct list_head     sc_frmr_q; | 
 | 158 | 	spinlock_t	     sc_frmr_q_lock; | 
| Tom Tucker | d21b05f | 2007-12-12 16:13:17 -0600 | [diff] [blame] | 159 |  | 
 | 160 | 	spinlock_t	     sc_lock;		/* transport lock */ | 
 | 161 |  | 
 | 162 | 	wait_queue_head_t    sc_send_wait;	/* SQ exhaustion waitlist */ | 
 | 163 | 	unsigned long	     sc_flags; | 
 | 164 | 	struct list_head     sc_dto_q;		/* DTO tasklet I/O pending Q */ | 
 | 165 | 	struct list_head     sc_read_complete_q; | 
| Tom Tucker | 8da91ea | 2008-04-30 22:00:46 -0500 | [diff] [blame] | 166 | 	struct work_struct   sc_work; | 
| Tom Tucker | d21b05f | 2007-12-12 16:13:17 -0600 | [diff] [blame] | 167 | }; | 
 | 168 | /* sc_flags */ | 
 | 169 | #define RDMAXPRT_RQ_PENDING	1 | 
 | 170 | #define RDMAXPRT_SQ_PENDING	2 | 
 | 171 | #define RDMAXPRT_CONN_PENDING	3 | 
 | 172 |  | 
 | 173 | #define RPCRDMA_LISTEN_BACKLOG  10 | 
 | 174 | /* The default ORD value is based on two outstanding full-size writes with a | 
 | 175 |  * page size of 4k, or 32k * 2 ops / 4k = 16 outstanding RDMA_READ.  */ | 
 | 176 | #define RPCRDMA_ORD             (64/4) | 
 | 177 | #define RPCRDMA_SQ_DEPTH_MULT   8 | 
 | 178 | #define RPCRDMA_MAX_THREADS     16 | 
 | 179 | #define RPCRDMA_MAX_REQUESTS    16 | 
 | 180 | #define RPCRDMA_MAX_REQ_SIZE    4096 | 
 | 181 |  | 
 | 182 | /* svc_rdma_marshal.c */ | 
 | 183 | extern void svc_rdma_rcl_chunk_counts(struct rpcrdma_read_chunk *, | 
 | 184 | 				      int *, int *); | 
 | 185 | extern int svc_rdma_xdr_decode_req(struct rpcrdma_msg **, struct svc_rqst *); | 
 | 186 | extern int svc_rdma_xdr_decode_deferred_req(struct svc_rqst *); | 
 | 187 | extern int svc_rdma_xdr_encode_error(struct svcxprt_rdma *, | 
 | 188 | 				     struct rpcrdma_msg *, | 
 | 189 | 				     enum rpcrdma_errcode, u32 *); | 
 | 190 | extern void svc_rdma_xdr_encode_write_list(struct rpcrdma_msg *, int); | 
 | 191 | extern void svc_rdma_xdr_encode_reply_array(struct rpcrdma_write_array *, int); | 
 | 192 | extern void svc_rdma_xdr_encode_array_chunk(struct rpcrdma_write_array *, int, | 
| Dan Carpenter | 1fa9c44 | 2012-02-21 10:28:04 +0300 | [diff] [blame] | 193 | 					    __be32, __be64, u32); | 
| Tom Tucker | d21b05f | 2007-12-12 16:13:17 -0600 | [diff] [blame] | 194 | extern void svc_rdma_xdr_encode_reply_header(struct svcxprt_rdma *, | 
 | 195 | 					     struct rpcrdma_msg *, | 
 | 196 | 					     struct rpcrdma_msg *, | 
 | 197 | 					     enum rpcrdma_proc); | 
 | 198 | extern int svc_rdma_xdr_get_reply_hdr_len(struct rpcrdma_msg *); | 
 | 199 |  | 
 | 200 | /* svc_rdma_recvfrom.c */ | 
 | 201 | extern int svc_rdma_recvfrom(struct svc_rqst *); | 
 | 202 |  | 
 | 203 | /* svc_rdma_sendto.c */ | 
 | 204 | extern int svc_rdma_sendto(struct svc_rqst *); | 
 | 205 |  | 
 | 206 | /* svc_rdma_transport.c */ | 
 | 207 | extern int svc_rdma_send(struct svcxprt_rdma *, struct ib_send_wr *); | 
| Tom Tucker | 008fdbc | 2008-05-07 15:47:42 -0500 | [diff] [blame] | 208 | extern void svc_rdma_send_error(struct svcxprt_rdma *, struct rpcrdma_msg *, | 
 | 209 | 				enum rpcrdma_errcode); | 
| Tom Tucker | d21b05f | 2007-12-12 16:13:17 -0600 | [diff] [blame] | 210 | struct page *svc_rdma_get_page(void); | 
 | 211 | extern int svc_rdma_post_recv(struct svcxprt_rdma *); | 
 | 212 | extern int svc_rdma_create_listen(struct svc_serv *, int, struct sockaddr *); | 
 | 213 | extern struct svc_rdma_op_ctxt *svc_rdma_get_context(struct svcxprt_rdma *); | 
 | 214 | extern void svc_rdma_put_context(struct svc_rdma_op_ctxt *, int); | 
| Tom Tucker | 146b6df | 2008-08-12 15:12:10 -0500 | [diff] [blame] | 215 | extern void svc_rdma_unmap_dma(struct svc_rdma_op_ctxt *ctxt); | 
| Tom Tucker | ab96ddd | 2008-05-28 13:54:04 -0500 | [diff] [blame] | 216 | extern struct svc_rdma_req_map *svc_rdma_get_req_map(void); | 
 | 217 | extern void svc_rdma_put_req_map(struct svc_rdma_req_map *); | 
| Tom Tucker | e118321 | 2008-10-03 15:22:18 -0500 | [diff] [blame] | 218 | extern int svc_rdma_fastreg(struct svcxprt_rdma *, struct svc_rdma_fastreg_mr *); | 
| Tom Tucker | 64be8608 | 2008-10-06 14:45:18 -0500 | [diff] [blame] | 219 | extern struct svc_rdma_fastreg_mr *svc_rdma_get_frmr(struct svcxprt_rdma *); | 
 | 220 | extern void svc_rdma_put_frmr(struct svcxprt_rdma *, | 
 | 221 | 			      struct svc_rdma_fastreg_mr *); | 
| Tom Tucker | d21b05f | 2007-12-12 16:13:17 -0600 | [diff] [blame] | 222 | extern void svc_sq_reap(struct svcxprt_rdma *); | 
 | 223 | extern void svc_rq_reap(struct svcxprt_rdma *); | 
 | 224 | extern struct svc_xprt_class svc_rdma_class; | 
 | 225 | extern void svc_rdma_prep_reply_hdr(struct svc_rqst *); | 
 | 226 |  | 
 | 227 | /* svc_rdma.c */ | 
 | 228 | extern int svc_rdma_init(void); | 
 | 229 | extern void svc_rdma_cleanup(void); | 
 | 230 |  | 
 | 231 | /* | 
 | 232 |  * Returns the address of the first read chunk or <nul> if no read chunk is | 
 | 233 |  * present | 
 | 234 |  */ | 
 | 235 | static inline struct rpcrdma_read_chunk * | 
 | 236 | svc_rdma_get_read_chunk(struct rpcrdma_msg *rmsgp) | 
 | 237 | { | 
 | 238 | 	struct rpcrdma_read_chunk *ch = | 
 | 239 | 		(struct rpcrdma_read_chunk *)&rmsgp->rm_body.rm_chunks[0]; | 
 | 240 |  | 
 | 241 | 	if (ch->rc_discrim == 0) | 
 | 242 | 		return NULL; | 
 | 243 |  | 
 | 244 | 	return ch; | 
 | 245 | } | 
 | 246 |  | 
 | 247 | /* | 
 | 248 |  * Returns the address of the first read write array element or <nul> if no | 
 | 249 |  * write array list is present | 
 | 250 |  */ | 
 | 251 | static inline struct rpcrdma_write_array * | 
 | 252 | svc_rdma_get_write_array(struct rpcrdma_msg *rmsgp) | 
 | 253 | { | 
 | 254 | 	if (rmsgp->rm_body.rm_chunks[0] != 0 | 
 | 255 | 	    || rmsgp->rm_body.rm_chunks[1] == 0) | 
 | 256 | 		return NULL; | 
 | 257 |  | 
 | 258 | 	return (struct rpcrdma_write_array *)&rmsgp->rm_body.rm_chunks[1]; | 
 | 259 | } | 
 | 260 |  | 
 | 261 | /* | 
 | 262 |  * Returns the address of the first reply array element or <nul> if no | 
 | 263 |  * reply array is present | 
 | 264 |  */ | 
 | 265 | static inline struct rpcrdma_write_array * | 
 | 266 | svc_rdma_get_reply_array(struct rpcrdma_msg *rmsgp) | 
 | 267 | { | 
 | 268 | 	struct rpcrdma_read_chunk *rch; | 
 | 269 | 	struct rpcrdma_write_array *wr_ary; | 
 | 270 | 	struct rpcrdma_write_array *rp_ary; | 
 | 271 |  | 
 | 272 | 	/* XXX: Need to fix when reply list may occur with read-list and/or | 
 | 273 | 	 * write list */ | 
 | 274 | 	if (rmsgp->rm_body.rm_chunks[0] != 0 || | 
 | 275 | 	    rmsgp->rm_body.rm_chunks[1] != 0) | 
 | 276 | 		return NULL; | 
 | 277 |  | 
 | 278 | 	rch = svc_rdma_get_read_chunk(rmsgp); | 
 | 279 | 	if (rch) { | 
 | 280 | 		while (rch->rc_discrim) | 
 | 281 | 			rch++; | 
 | 282 |  | 
 | 283 | 		/* The reply list follows an empty write array located | 
 | 284 | 		 * at 'rc_position' here. The reply array is at rc_target. | 
 | 285 | 		 */ | 
 | 286 | 		rp_ary = (struct rpcrdma_write_array *)&rch->rc_target; | 
 | 287 |  | 
 | 288 | 		goto found_it; | 
 | 289 | 	} | 
 | 290 |  | 
 | 291 | 	wr_ary = svc_rdma_get_write_array(rmsgp); | 
 | 292 | 	if (wr_ary) { | 
 | 293 | 		rp_ary = (struct rpcrdma_write_array *) | 
 | 294 | 			&wr_ary-> | 
| Tom Tucker | cec56c8 | 2012-02-15 11:30:00 -0600 | [diff] [blame] | 295 | 			wc_array[ntohl(wr_ary->wc_nchunks)].wc_target.rs_length; | 
| Tom Tucker | d21b05f | 2007-12-12 16:13:17 -0600 | [diff] [blame] | 296 |  | 
 | 297 | 		goto found_it; | 
 | 298 | 	} | 
 | 299 |  | 
 | 300 | 	/* No read list, no write list */ | 
 | 301 | 	rp_ary = (struct rpcrdma_write_array *) | 
 | 302 | 		&rmsgp->rm_body.rm_chunks[2]; | 
 | 303 |  | 
 | 304 |  found_it: | 
 | 305 | 	if (rp_ary->wc_discrim == 0) | 
 | 306 | 		return NULL; | 
 | 307 |  | 
 | 308 | 	return rp_ary; | 
 | 309 | } | 
 | 310 | #endif |