| Tom Tucker | d21b05f | 2007-12-12 16:13:17 -0600 | [diff] [blame] | 1 | /* | 
|  | 2 | * Copyright (c) 2005-2006 Network Appliance, Inc. All rights reserved. | 
|  | 3 | * | 
|  | 4 | * This software is available to you under a choice of one of two | 
|  | 5 | * licenses.  You may choose to be licensed under the terms of the GNU | 
|  | 6 | * General Public License (GPL) Version 2, available from the file | 
|  | 7 | * COPYING in the main directory of this source tree, or the BSD-type | 
|  | 8 | * license below: | 
|  | 9 | * | 
|  | 10 | * Redistribution and use in source and binary forms, with or without | 
|  | 11 | * modification, are permitted provided that the following conditions | 
|  | 12 | * are met: | 
|  | 13 | * | 
|  | 14 | *      Redistributions of source code must retain the above copyright | 
|  | 15 | *      notice, this list of conditions and the following disclaimer. | 
|  | 16 | * | 
|  | 17 | *      Redistributions in binary form must reproduce the above | 
|  | 18 | *      copyright notice, this list of conditions and the following | 
|  | 19 | *      disclaimer in the documentation and/or other materials provided | 
|  | 20 | *      with the distribution. | 
|  | 21 | * | 
|  | 22 | *      Neither the name of the Network Appliance, Inc. nor the names of | 
|  | 23 | *      its contributors may be used to endorse or promote products | 
|  | 24 | *      derived from this software without specific prior written | 
|  | 25 | *      permission. | 
|  | 26 | * | 
|  | 27 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS | 
|  | 28 | * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT | 
|  | 29 | * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR | 
|  | 30 | * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT | 
|  | 31 | * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, | 
|  | 32 | * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT | 
|  | 33 | * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, | 
|  | 34 | * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY | 
|  | 35 | * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT | 
|  | 36 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE | 
|  | 37 | * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | 
|  | 38 | * | 
|  | 39 | * Author: Tom Tucker <tom@opengridcomputing.com> | 
|  | 40 | */ | 
|  | 41 |  | 
|  | 42 | #ifndef SVC_RDMA_H | 
|  | 43 | #define SVC_RDMA_H | 
|  | 44 | #include <linux/sunrpc/xdr.h> | 
|  | 45 | #include <linux/sunrpc/svcsock.h> | 
|  | 46 | #include <linux/sunrpc/rpc_rdma.h> | 
|  | 47 | #include <rdma/ib_verbs.h> | 
|  | 48 | #include <rdma/rdma_cm.h> | 
|  | 49 | #define SVCRDMA_DEBUG | 
|  | 50 |  | 
/* RPC/RDMA parameters and stats */
extern unsigned int svcrdma_ord;		/* default RDMA read limit (ORD); see sc_ord */
extern unsigned int svcrdma_max_requests;	/* default RQ depth; see sc_max_requests */
extern unsigned int svcrdma_max_req_size;	/* default RQ WR buffer size; see sc_max_req_size */

/* Global, transport-wide statistic counters, updated atomically
 * from the hot paths. */
extern atomic_t rdma_stat_recv;		/* receives completed */
extern atomic_t rdma_stat_read;		/* RDMA_READ operations */
extern atomic_t rdma_stat_write;	/* RDMA_WRITE operations */
extern atomic_t rdma_stat_sq_starve;	/* SQ exhausted; sender had to wait */
extern atomic_t rdma_stat_rq_starve;	/* RQ had no request to hand out */
extern atomic_t rdma_stat_rq_poll;	/* RQ completion-queue polls */
extern atomic_t rdma_stat_rq_prod;	/* RQ completions reaped */
extern atomic_t rdma_stat_sq_poll;	/* SQ completion-queue polls */
extern atomic_t rdma_stat_sq_prod;	/* SQ completions reaped */

/* Protocol version advertised in RPCRDMA headers */
#define RPCRDMA_VERSION 1
|  | 67 |  | 
/*
 * Contexts are built when an RDMA request is created and are a
 * record of the resources that can be recovered when the request
 * completes.
 */
struct svc_rdma_op_ctxt {
	struct svc_rdma_op_ctxt *read_hdr;	/* ctxt holding the RPCRDMA header for
						 * this request — presumably linked when
						 * a read-list completes; confirm in
						 * svc_rdma_recvfrom.c */
	int hdr_count;				/* header pages in use for this ctxt */
	struct xdr_buf arg;			/* XDR view of the RPC payload */
	struct list_head dto_q;			/* entry on a transport DTO queue
						 * (sc_rq_dto_q / sc_read_complete_q) */
	enum ib_wr_opcode wr_op;		/* opcode this work request was posted with */
	enum ib_wc_status wc_status;		/* status from the work completion */
	u32 byte_len;				/* NOTE(review): presumably the wc byte
						 * count — confirm against completion
						 * handler */
	struct svcxprt_rdma *xprt;		/* owning transport */
	unsigned long flags;			/* RDMACTXT_F_* bits */
	enum dma_data_direction direction;	/* DMA mapping direction for sge[] */
	int count;				/* valid entries in sge[]/pages[] */
	struct ib_sge sge[RPCSVC_MAXPAGES];	/* scatter/gather list for the WR */
	struct page *pages[RPCSVC_MAXPAGES];	/* pages backing sge[] */
};
|  | 88 |  | 
/*
 * NFS requests are mapped on the client side by the chunk lists in
 * the RPCRDMA header. During the fetching of the RPC from the client
 * and the writing of the reply to the client, the memory in the
 * client and the memory in the server must be mapped as contiguous
 * vaddr/len for access by the hardware. These data structures keep
 * these mappings.
 *
 * For an RDMA_WRITE, the 'sge' maps the RPC REPLY. For RDMA_READ, the
 * 'sge' in the svc_rdma_req_map maps the server side RPC reply and the
 * 'ch' field maps the read-list of the RPCRDMA header to the 'sge'
 * mapping of the reply.
 */
/* Maps one chunk onto a contiguous run of sge entries. */
struct svc_rdma_chunk_sge {
	int start;		/* sge no for this chunk */
	int count;		/* sge count for this chunk */
};

/* Per-request mapping scratch area, obtained via svc_rdma_get_req_map()
 * and released via svc_rdma_put_req_map(). The union reflects the two
 * uses: 'sge' for mapping the reply for RDMA_WRITE, 'ch' for mapping
 * the read-list onto the reply's sge entries. */
struct svc_rdma_req_map {
	unsigned long count;	/* valid entries in sge[] or ch[] */
	union {
		struct kvec sge[RPCSVC_MAXPAGES];
		struct svc_rdma_chunk_sge ch[RPCSVC_MAXPAGES];
	};
};
|  | 113 |  | 
/* Bit number for svc_rdma_op_ctxt.flags — presumably marks the last
 * ctxt of a chunk list; confirm at the set_bit/test_bit call sites. */
#define RDMACTXT_F_LAST_CTXT	2

/*
 * Per-connection server-side RDMA transport state, embedding the
 * generic svc_xprt it is registered with.
 */
struct svcxprt_rdma {
	struct svc_xprt      sc_xprt;		/* SVC transport structure */
	struct rdma_cm_id    *sc_cm_id;		/* RDMA connection id */
	struct list_head     sc_accept_q;	/* Conn. waiting accept */
	int		     sc_ord;		/* RDMA read limit */
	int                  sc_max_sge;	/* max SGEs per work request */

	int                  sc_sq_depth;	/* Depth of SQ */
	atomic_t             sc_sq_count;	/* Number of SQ WR on queue */

	int                  sc_max_requests;	/* Depth of RQ */
	int                  sc_max_req_size;	/* Size of each RQ WR buf */

	struct ib_pd         *sc_pd;		/* protection domain */

	atomic_t	     sc_dma_used;	/* outstanding DMA mappings */
	atomic_t	     sc_ctxt_used;	/* outstanding svc_rdma_op_ctxts */
	struct list_head     sc_rq_dto_q;	/* RQ completions pending processing */
	spinlock_t	     sc_rq_dto_lock;	/* protects sc_rq_dto_q */
	struct ib_qp         *sc_qp;		/* queue pair */
	struct ib_cq         *sc_rq_cq;		/* receive completion queue */
	struct ib_cq         *sc_sq_cq;		/* send completion queue */
	struct ib_mr         *sc_phys_mr;	/* MR for server memory */

	spinlock_t	     sc_lock;		/* transport lock */

	wait_queue_head_t    sc_send_wait;	/* SQ exhaustion waitlist */
	unsigned long	     sc_flags;		/* RDMAXPRT_* bits, defined below */
	struct list_head     sc_dto_q;		/* DTO tasklet I/O pending Q */
	struct list_head     sc_read_complete_q; /* ctxts whose RDMA_READs finished —
						  * presumably; confirm in recvfrom path */
	struct work_struct   sc_work;		/* deferred work item */
};
/* sc_flags — bit numbers for the svcxprt_rdma.sc_flags word */
#define RDMAXPRT_RQ_PENDING	1	/* RQ completions awaiting service */
#define RDMAXPRT_SQ_PENDING	2	/* SQ completions awaiting service */
#define RDMAXPRT_CONN_PENDING	3	/* connection setup in progress */

#define RPCRDMA_LISTEN_BACKLOG  10	/* listen backlog for the CM id */
/* The default ORD value is based on two outstanding full-size writes with a
 * page size of 4k, or 32k * 2 ops / 4k = 16 outstanding RDMA_READ.  */
#define RPCRDMA_ORD             (64/4)
#define RPCRDMA_SQ_DEPTH_MULT   8	/* SQ depth multiplier — presumably
					 * applied to the RQ depth; confirm in
					 * svc_rdma_transport.c */
#define RPCRDMA_MAX_THREADS     16
#define RPCRDMA_MAX_REQUESTS    16	/* default for svcrdma_max_requests */
#define RPCRDMA_MAX_REQ_SIZE    4096	/* default for svcrdma_max_req_size */
|  | 161 |  | 
/* svc_rdma_marshal.c — RPCRDMA header parsing and construction */
extern void svc_rdma_rcl_chunk_counts(struct rpcrdma_read_chunk *,
				      int *, int *);
extern int svc_rdma_xdr_decode_req(struct rpcrdma_msg **, struct svc_rqst *);
extern int svc_rdma_xdr_decode_deferred_req(struct svc_rqst *);
extern int svc_rdma_xdr_encode_error(struct svcxprt_rdma *,
				     struct rpcrdma_msg *,
				     enum rpcrdma_errcode, u32 *);
extern void svc_rdma_xdr_encode_write_list(struct rpcrdma_msg *, int);
extern void svc_rdma_xdr_encode_reply_array(struct rpcrdma_write_array *, int);
extern void svc_rdma_xdr_encode_array_chunk(struct rpcrdma_write_array *, int,
					    u32, u64, u32);
extern void svc_rdma_xdr_encode_reply_header(struct svcxprt_rdma *,
					     struct rpcrdma_msg *,
					     struct rpcrdma_msg *,
					     enum rpcrdma_proc);
extern int svc_rdma_xdr_get_reply_hdr_len(struct rpcrdma_msg *);

/* svc_rdma_recvfrom.c — receive side of the transport */
extern int svc_rdma_recvfrom(struct svc_rqst *);

/* svc_rdma_sendto.c — send side of the transport */
extern int svc_rdma_sendto(struct svc_rqst *);

/* svc_rdma_transport.c — connection, queue, and context management */
extern int svc_rdma_send(struct svcxprt_rdma *, struct ib_send_wr *);
extern void svc_rdma_send_error(struct svcxprt_rdma *, struct rpcrdma_msg *,
				enum rpcrdma_errcode);
struct page *svc_rdma_get_page(void);
extern int svc_rdma_post_recv(struct svcxprt_rdma *);
extern int svc_rdma_create_listen(struct svc_serv *, int, struct sockaddr *);
extern struct svc_rdma_op_ctxt *svc_rdma_get_context(struct svcxprt_rdma *);
extern void svc_rdma_put_context(struct svc_rdma_op_ctxt *, int);
extern struct svc_rdma_req_map *svc_rdma_get_req_map(void);
extern void svc_rdma_put_req_map(struct svc_rdma_req_map *);
extern void svc_sq_reap(struct svcxprt_rdma *);
extern void svc_rq_reap(struct svcxprt_rdma *);
extern struct svc_xprt_class svc_rdma_class;
extern void svc_rdma_prep_reply_hdr(struct svc_rqst *);

/* svc_rdma.c — module init/teardown */
extern int svc_rdma_init(void);
extern void svc_rdma_cleanup(void);
|  | 205 |  | 
|  | 206 | /* | 
|  | 207 | * Returns the address of the first read chunk or <nul> if no read chunk is | 
|  | 208 | * present | 
|  | 209 | */ | 
|  | 210 | static inline struct rpcrdma_read_chunk * | 
|  | 211 | svc_rdma_get_read_chunk(struct rpcrdma_msg *rmsgp) | 
|  | 212 | { | 
|  | 213 | struct rpcrdma_read_chunk *ch = | 
|  | 214 | (struct rpcrdma_read_chunk *)&rmsgp->rm_body.rm_chunks[0]; | 
|  | 215 |  | 
|  | 216 | if (ch->rc_discrim == 0) | 
|  | 217 | return NULL; | 
|  | 218 |  | 
|  | 219 | return ch; | 
|  | 220 | } | 
|  | 221 |  | 
|  | 222 | /* | 
|  | 223 | * Returns the address of the first read write array element or <nul> if no | 
|  | 224 | * write array list is present | 
|  | 225 | */ | 
|  | 226 | static inline struct rpcrdma_write_array * | 
|  | 227 | svc_rdma_get_write_array(struct rpcrdma_msg *rmsgp) | 
|  | 228 | { | 
|  | 229 | if (rmsgp->rm_body.rm_chunks[0] != 0 | 
|  | 230 | || rmsgp->rm_body.rm_chunks[1] == 0) | 
|  | 231 | return NULL; | 
|  | 232 |  | 
|  | 233 | return (struct rpcrdma_write_array *)&rmsgp->rm_body.rm_chunks[1]; | 
|  | 234 | } | 
|  | 235 |  | 
|  | 236 | /* | 
|  | 237 | * Returns the address of the first reply array element or <nul> if no | 
|  | 238 | * reply array is present | 
|  | 239 | */ | 
|  | 240 | static inline struct rpcrdma_write_array * | 
|  | 241 | svc_rdma_get_reply_array(struct rpcrdma_msg *rmsgp) | 
|  | 242 | { | 
|  | 243 | struct rpcrdma_read_chunk *rch; | 
|  | 244 | struct rpcrdma_write_array *wr_ary; | 
|  | 245 | struct rpcrdma_write_array *rp_ary; | 
|  | 246 |  | 
|  | 247 | /* XXX: Need to fix when reply list may occur with read-list and/or | 
|  | 248 | * write list */ | 
|  | 249 | if (rmsgp->rm_body.rm_chunks[0] != 0 || | 
|  | 250 | rmsgp->rm_body.rm_chunks[1] != 0) | 
|  | 251 | return NULL; | 
|  | 252 |  | 
|  | 253 | rch = svc_rdma_get_read_chunk(rmsgp); | 
|  | 254 | if (rch) { | 
|  | 255 | while (rch->rc_discrim) | 
|  | 256 | rch++; | 
|  | 257 |  | 
|  | 258 | /* The reply list follows an empty write array located | 
|  | 259 | * at 'rc_position' here. The reply array is at rc_target. | 
|  | 260 | */ | 
|  | 261 | rp_ary = (struct rpcrdma_write_array *)&rch->rc_target; | 
|  | 262 |  | 
|  | 263 | goto found_it; | 
|  | 264 | } | 
|  | 265 |  | 
|  | 266 | wr_ary = svc_rdma_get_write_array(rmsgp); | 
|  | 267 | if (wr_ary) { | 
|  | 268 | rp_ary = (struct rpcrdma_write_array *) | 
|  | 269 | &wr_ary-> | 
|  | 270 | wc_array[wr_ary->wc_nchunks].wc_target.rs_length; | 
|  | 271 |  | 
|  | 272 | goto found_it; | 
|  | 273 | } | 
|  | 274 |  | 
|  | 275 | /* No read list, no write list */ | 
|  | 276 | rp_ary = (struct rpcrdma_write_array *) | 
|  | 277 | &rmsgp->rm_body.rm_chunks[2]; | 
|  | 278 |  | 
|  | 279 | found_it: | 
|  | 280 | if (rp_ary->wc_discrim == 0) | 
|  | 281 | return NULL; | 
|  | 282 |  | 
|  | 283 | return rp_ary; | 
|  | 284 | } | 
|  | 285 | #endif |