/*
 * linux/fs/9p/trans_rdma.c
 *
 * RDMA transport layer based on the trans_fd.c implementation.
 *
 *  Copyright (C) 2008 by Tom Tucker <tom@opengridcomputing.com>
 *  Copyright (C) 2006 by Russ Cox <rsc@swtch.com>
 *  Copyright (C) 2004-2005 by Latchesar Ionkov <lucho@ionkov.net>
 *  Copyright (C) 2004-2008 by Eric Van Hensbergen <ericvh@gmail.com>
 *  Copyright (C) 1997-2002 by Ron Minnich <rminnich@sarnoff.com>
 *
 *  This program is free software; you can redistribute it and/or modify
 *  it under the terms of the GNU General Public License version 2
 *  as published by the Free Software Foundation.
 *
 *  This program is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU General Public License for more details.
 *
 *  You should have received a copy of the GNU General Public License
 *  along with this program; if not, write to:
 *  Free Software Foundation
 *  51 Franklin Street, Fifth Floor
 *  Boston, MA  02111-1301  USA
 *
 */

#include <linux/in.h>
#include <linux/module.h>
#include <linux/net.h>
#include <linux/ipv6.h>
#include <linux/kthread.h>
#include <linux/errno.h>
#include <linux/kernel.h>
#include <linux/un.h>
#include <linux/uaccess.h>
#include <linux/inet.h>
#include <linux/idr.h>
#include <linux/file.h>
#include <linux/parser.h>
#include <linux/semaphore.h>
#include <net/9p/9p.h>
#include <net/9p/client.h>
#include <net/9p/transport.h>
#include <rdma/ib_verbs.h>
#include <rdma/rdma_cm.h>

#define P9_PORT			5640
#define P9_RDMA_SQ_DEPTH	32
#define P9_RDMA_RQ_DEPTH	32
#define P9_RDMA_SEND_SGE	4
#define P9_RDMA_RECV_SGE	4
#define P9_RDMA_IRD		0
#define P9_RDMA_ORD		0
#define P9_RDMA_TIMEOUT		30000		/* 30 seconds */
#define P9_RDMA_MAXSIZE		(4*4096)	/* Min SGE is 4, so we can
						 * safely advertise a maxsize
						 * of 16k (4 * 4096 bytes) */

#define P9_RDMA_MAX_SGE (P9_RDMA_MAXSIZE >> PAGE_SHIFT)
/**
 * struct p9_trans_rdma - RDMA transport instance
 *
 * @state: tracks the transport state machine for connection setup and tear down
 * @cm_id: The RDMA CM ID
 * @pd: Protection Domain pointer
 * @qp: Queue Pair pointer
 * @cq: Completion Queue pointer
 * @dma_mr: DMA Memory Region pointer (only used when there is no local DMA lkey)
 * @lkey: The local access only memory region key
 * @timeout: Number of msecs to wait for connection management events
 * @sq_depth: The depth of the Send Queue
 * @sq_sem: Semaphore for the SQ
 * @rq_depth: The depth of the Receive Queue.
 * @rq_count: Count of receive buffers currently posted to the RQ
 * @addr: The remote peer's address
 * @req_lock: Protects the active request list
 * @cm_done: Completion event for connection management tracking
 */
struct p9_trans_rdma {
	enum {
		P9_RDMA_INIT,
		P9_RDMA_ADDR_RESOLVED,
		P9_RDMA_ROUTE_RESOLVED,
		P9_RDMA_CONNECTED,
		P9_RDMA_FLUSHING,
		P9_RDMA_CLOSING,
		P9_RDMA_CLOSED,
	} state;
	struct rdma_cm_id *cm_id;
	struct ib_pd *pd;
	struct ib_qp *qp;
	struct ib_cq *cq;
	struct ib_mr *dma_mr;
	u32 lkey;
	long timeout;
	int sq_depth;
	struct semaphore sq_sem;
	int rq_depth;
	atomic_t rq_count;
	struct sockaddr_in addr;
	spinlock_t req_lock;

	struct completion cm_done;
};
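
/*
 * Rough connection life cycle, as driven by p9_cm_event_handler() below.
 * This is a summary sketch rather than an exhaustive list of transitions:
 *
 *   P9_RDMA_INIT -> P9_RDMA_ADDR_RESOLVED -> P9_RDMA_ROUTE_RESOLVED
 *                -> P9_RDMA_CONNECTED
 *
 * Completion errors and disconnects push the transport toward
 * P9_RDMA_FLUSHING (handle_recv), P9_RDMA_CLOSING (rdma_request's error
 * path) and finally P9_RDMA_CLOSED (RDMA_CM_EVENT_DISCONNECTED).
 */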

/**
 * p9_rdma_context - Keeps track of in-process WR
 *
 * @wc_op: The original WR op for when the CQE completes in error.
 * @busa: Bus address to unmap when the WR completes
 * @req: Keeps track of requests (send)
 * @rc: Keeps track of replies (receive)
 */
struct p9_rdma_req;
struct p9_rdma_context {
	enum ib_wc_opcode wc_op;
	dma_addr_t busa;
	union {
		struct p9_req_t *req;
		struct p9_fcall *rc;
	};
};

/**
 * p9_rdma_opts - Collection of mount options
 * @port: port of connection
 * @sq_depth: The requested depth of the SQ. This really doesn't need
 * to be any deeper than the number of threads used in the client
 * @rq_depth: The depth of the RQ. Should be greater than or equal to SQ depth
 * @timeout: Time to wait in msecs for CM events
 */
struct p9_rdma_opts {
	short port;
	int sq_depth;
	int rq_depth;
	long timeout;
};
/*
 * Option Parsing (code inspired by NFS code)
 */
enum {
	/* Options that take integer arguments */
	Opt_port, Opt_rq_depth, Opt_sq_depth, Opt_timeout, Opt_err,
};

static match_table_t tokens = {
	{Opt_port, "port=%u"},
	{Opt_sq_depth, "sq=%u"},
	{Opt_rq_depth, "rq=%u"},
	{Opt_timeout, "timeout=%u"},
	{Opt_err, NULL},
};
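
/*
 * For illustration only: a hypothetical mount invocation exercising the
 * options above (server address and mount point are made up):
 *
 *   mount -t 9p -o trans=rdma,port=5640,sq=32,rq=32,timeout=30000 \
 *         192.168.1.100 /mnt/9p
 *
 * Any option left out falls back to the P9_RDMA_* defaults assigned at
 * the top of parse_opts() below.
 */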

/**
 * parse_opts - parse mount options into transport options structure
 * @params: options string passed from mount
 * @opts: transport-specific structure to parse options into
 *
 * Returns 0 upon success, -ERRNO upon failure
 */
static int parse_opts(char *params, struct p9_rdma_opts *opts)
{
	char *p;
	substring_t args[MAX_OPT_ARGS];
	int option;
	char *options, *tmp_options;

	opts->port = P9_PORT;
	opts->sq_depth = P9_RDMA_SQ_DEPTH;
	opts->rq_depth = P9_RDMA_RQ_DEPTH;
	opts->timeout = P9_RDMA_TIMEOUT;

	if (!params)
		return 0;

	tmp_options = kstrdup(params, GFP_KERNEL);
	if (!tmp_options) {
		P9_DPRINTK(P9_DEBUG_ERROR,
			   "failed to allocate copy of option string\n");
		return -ENOMEM;
	}
	options = tmp_options;

	while ((p = strsep(&options, ",")) != NULL) {
		int token;
		int r;
		if (!*p)
			continue;
		token = match_token(p, tokens, args);
		if (token == Opt_err)
			continue;
		r = match_int(&args[0], &option);
		if (r < 0) {
			P9_DPRINTK(P9_DEBUG_ERROR,
				   "integer field, but no integer?\n");
			continue;
		}
		switch (token) {
		case Opt_port:
			opts->port = option;
			break;
		case Opt_sq_depth:
			opts->sq_depth = option;
			break;
		case Opt_rq_depth:
			opts->rq_depth = option;
			break;
		case Opt_timeout:
			opts->timeout = option;
			break;
		default:
			continue;
		}
	}
	/* RQ must be at least as large as the SQ */
	opts->rq_depth = max(opts->rq_depth, opts->sq_depth);
	/* strsep() advanced 'options', so free the original copy */
	kfree(tmp_options);
	return 0;
}

static int
p9_cm_event_handler(struct rdma_cm_id *id, struct rdma_cm_event *event)
{
	struct p9_client *c = id->context;
	struct p9_trans_rdma *rdma = c->trans;

	switch (event->event) {
	case RDMA_CM_EVENT_ADDR_RESOLVED:
		BUG_ON(rdma->state != P9_RDMA_INIT);
		rdma->state = P9_RDMA_ADDR_RESOLVED;
		break;

	case RDMA_CM_EVENT_ROUTE_RESOLVED:
		BUG_ON(rdma->state != P9_RDMA_ADDR_RESOLVED);
		rdma->state = P9_RDMA_ROUTE_RESOLVED;
		break;

	case RDMA_CM_EVENT_ESTABLISHED:
		BUG_ON(rdma->state != P9_RDMA_ROUTE_RESOLVED);
		rdma->state = P9_RDMA_CONNECTED;
		break;

	case RDMA_CM_EVENT_DISCONNECTED:
		if (rdma)
			rdma->state = P9_RDMA_CLOSED;
		if (c)
			c->status = Disconnected;
		break;

	case RDMA_CM_EVENT_TIMEWAIT_EXIT:
		break;

	case RDMA_CM_EVENT_ADDR_CHANGE:
	case RDMA_CM_EVENT_ROUTE_ERROR:
	case RDMA_CM_EVENT_DEVICE_REMOVAL:
	case RDMA_CM_EVENT_MULTICAST_JOIN:
	case RDMA_CM_EVENT_MULTICAST_ERROR:
	case RDMA_CM_EVENT_REJECTED:
	case RDMA_CM_EVENT_CONNECT_REQUEST:
	case RDMA_CM_EVENT_CONNECT_RESPONSE:
	case RDMA_CM_EVENT_CONNECT_ERROR:
	case RDMA_CM_EVENT_ADDR_ERROR:
	case RDMA_CM_EVENT_UNREACHABLE:
		c->status = Disconnected;
		rdma_disconnect(rdma->cm_id);
		break;
	default:
		BUG();
	}
	complete(&rdma->cm_done);
	return 0;
}

static void
handle_recv(struct p9_client *client, struct p9_trans_rdma *rdma,
	    struct p9_rdma_context *c, enum ib_wc_status status, u32 byte_len)
{
	struct p9_req_t *req;
	int err = 0;
	int16_t tag;

	req = NULL;
	ib_dma_unmap_single(rdma->cm_id->device, c->busa, client->msize,
							 DMA_FROM_DEVICE);

	if (status != IB_WC_SUCCESS)
		goto err_out;

	err = p9_parse_header(c->rc, NULL, NULL, &tag, 1);
	if (err)
		goto err_out;

	req = p9_tag_lookup(client, tag);
	if (!req)
		goto err_out;

	req->rc = c->rc;
	p9_client_cb(client, req);

	return;

 err_out:
	P9_DPRINTK(P9_DEBUG_ERROR, "req %p err %d status %d\n",
		   req, err, status);
	rdma->state = P9_RDMA_FLUSHING;
	client->status = Disconnected;
	return;
}

static void
handle_send(struct p9_client *client, struct p9_trans_rdma *rdma,
	    struct p9_rdma_context *c, enum ib_wc_status status, u32 byte_len)
{
	ib_dma_unmap_single(rdma->cm_id->device,
			    c->busa, c->req->tc->size,
			    DMA_TO_DEVICE);
}

static void qp_event_handler(struct ib_event *event, void *context)
{
	P9_DPRINTK(P9_DEBUG_ERROR, "QP event %d context %p\n", event->event,
								context);
}

static void cq_comp_handler(struct ib_cq *cq, void *cq_context)
{
	struct p9_client *client = cq_context;
	struct p9_trans_rdma *rdma = client->trans;
	int ret;
	struct ib_wc wc;

	ib_req_notify_cq(rdma->cq, IB_CQ_NEXT_COMP);
	while ((ret = ib_poll_cq(cq, 1, &wc)) > 0) {
		struct p9_rdma_context *c = (void *) (unsigned long) wc.wr_id;

		switch (c->wc_op) {
		case IB_WC_RECV:
			atomic_dec(&rdma->rq_count);
			handle_recv(client, rdma, c, wc.status, wc.byte_len);
			break;

		case IB_WC_SEND:
			handle_send(client, rdma, c, wc.status, wc.byte_len);
			up(&rdma->sq_sem);
			break;

		default:
			printk(KERN_ERR "9prdma: unexpected completion type, "
			       "c->wc_op=%d, wc.opcode=%d, status=%d\n",
			       c->wc_op, wc.opcode, wc.status);
			break;
		}
		kfree(c);
	}
}

static void cq_event_handler(struct ib_event *e, void *v)
{
	P9_DPRINTK(P9_DEBUG_ERROR, "CQ event %d context %p\n", e->event, v);
}

static void rdma_destroy_trans(struct p9_trans_rdma *rdma)
{
	if (!rdma)
		return;

	if (rdma->dma_mr && !IS_ERR(rdma->dma_mr))
		ib_dereg_mr(rdma->dma_mr);

	if (rdma->qp && !IS_ERR(rdma->qp))
		ib_destroy_qp(rdma->qp);

	if (rdma->pd && !IS_ERR(rdma->pd))
		ib_dealloc_pd(rdma->pd);

	if (rdma->cq && !IS_ERR(rdma->cq))
		ib_destroy_cq(rdma->cq);

	if (rdma->cm_id && !IS_ERR(rdma->cm_id))
		rdma_destroy_id(rdma->cm_id);

	kfree(rdma);
}

static int
post_recv(struct p9_client *client, struct p9_rdma_context *c)
{
	struct p9_trans_rdma *rdma = client->trans;
	struct ib_recv_wr wr, *bad_wr;
	struct ib_sge sge;

	c->busa = ib_dma_map_single(rdma->cm_id->device,
				    c->rc->sdata, client->msize,
				    DMA_FROM_DEVICE);
	if (ib_dma_mapping_error(rdma->cm_id->device, c->busa))
		goto error;

	sge.addr = c->busa;
	sge.length = client->msize;
	sge.lkey = rdma->lkey;

	wr.next = NULL;
	c->wc_op = IB_WC_RECV;
	wr.wr_id = (unsigned long) c;
	wr.sg_list = &sge;
	wr.num_sge = 1;
	return ib_post_recv(rdma->qp, &wr, &bad_wr);

 error:
	P9_DPRINTK(P9_DEBUG_ERROR, "EIO\n");
	return -EIO;
}

static int rdma_request(struct p9_client *client, struct p9_req_t *req)
{
	struct p9_trans_rdma *rdma = client->trans;
	struct ib_send_wr wr, *bad_wr;
	struct ib_sge sge;
	int err = 0;
	unsigned long flags;
	struct p9_rdma_context *c = NULL;
	struct p9_rdma_context *rpl_context = NULL;

	/* Allocate an fcall for the reply */
	rpl_context = kmalloc(sizeof *rpl_context, GFP_KERNEL);
	if (!rpl_context) {
		err = -ENOMEM;
		goto err_close;
	}

	/*
	 * If the request has a buffer, steal it, otherwise
	 * allocate a new one.  Typically, requests should already
	 * have receive buffers allocated and just swap them around
	 */
	if (!req->rc) {
		req->rc = kmalloc(sizeof(struct p9_fcall)+client->msize,
								GFP_KERNEL);
		if (req->rc) {
			req->rc->sdata = (char *) req->rc +
						sizeof(struct p9_fcall);
			req->rc->capacity = client->msize;
		}
	}
	rpl_context->rc = req->rc;
	if (!rpl_context->rc) {
		err = -ENOMEM;
		kfree(rpl_context);
		goto err_close;
	}

	/*
	 * Post a receive buffer for this request. We need to ensure
	 * there is a reply buffer available for every outstanding
	 * request. A flushed request can result in no reply for an
	 * outstanding request, so we must keep a count to avoid
	 * overflowing the RQ.
	 */
	if (atomic_inc_return(&rdma->rq_count) <= rdma->rq_depth) {
		err = post_recv(client, rpl_context);
		if (err) {
			kfree(rpl_context->rc);
			kfree(rpl_context);
			goto err_close;
		}
	} else
		atomic_dec(&rdma->rq_count);

	/* remove posted receive buffer from request structure */
	req->rc = NULL;

	/* Post the request */
	c = kmalloc(sizeof *c, GFP_KERNEL);
	if (!c) {
		err = -ENOMEM;
		goto err_close;
	}
	c->req = req;

	c->busa = ib_dma_map_single(rdma->cm_id->device,
				    c->req->tc->sdata, c->req->tc->size,
				    DMA_TO_DEVICE);
	if (ib_dma_mapping_error(rdma->cm_id->device, c->busa))
		goto error;

	sge.addr = c->busa;
	sge.length = c->req->tc->size;
	sge.lkey = rdma->lkey;

	wr.next = NULL;
	c->wc_op = IB_WC_SEND;
	wr.wr_id = (unsigned long) c;
	wr.opcode = IB_WR_SEND;
	wr.send_flags = IB_SEND_SIGNALED;
	wr.sg_list = &sge;
	wr.num_sge = 1;

	if (down_interruptible(&rdma->sq_sem))
		goto error;

	return ib_post_send(rdma->qp, &wr, &bad_wr);

 error:
	P9_DPRINTK(P9_DEBUG_ERROR, "EIO\n");
	return -EIO;

 err_close:
	spin_lock_irqsave(&rdma->req_lock, flags);
	if (rdma->state < P9_RDMA_CLOSING) {
		rdma->state = P9_RDMA_CLOSING;
		spin_unlock_irqrestore(&rdma->req_lock, flags);
		rdma_disconnect(rdma->cm_id);
	} else
		spin_unlock_irqrestore(&rdma->req_lock, flags);
	return err;
}

static void rdma_close(struct p9_client *client)
{
	struct p9_trans_rdma *rdma;

	if (!client)
		return;

	rdma = client->trans;
	if (!rdma)
		return;

	client->status = Disconnected;
	rdma_disconnect(rdma->cm_id);
	rdma_destroy_trans(rdma);
}

/**
 * alloc_rdma - Allocate and initialize the rdma transport structure
 * @opts: Mount options structure
 */
static struct p9_trans_rdma *alloc_rdma(struct p9_rdma_opts *opts)
{
	struct p9_trans_rdma *rdma;

	rdma = kzalloc(sizeof(struct p9_trans_rdma), GFP_KERNEL);
	if (!rdma)
		return NULL;

	rdma->sq_depth = opts->sq_depth;
	rdma->rq_depth = opts->rq_depth;
	rdma->timeout = opts->timeout;
	spin_lock_init(&rdma->req_lock);
	init_completion(&rdma->cm_done);
	sema_init(&rdma->sq_sem, rdma->sq_depth);
	atomic_set(&rdma->rq_count, 0);

	return rdma;
}
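
/*
 * Note on flow control: sq_sem is initialized above to sq_depth, so
 * rdma_request() can have at most sq_depth sends in flight before its
 * down_interruptible() blocks; cq_comp_handler() releases one slot with
 * up() on each send completion.  rq_count similarly caps the number of
 * posted receive buffers at rq_depth.
 */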

/* it's not clear to me we can do anything after send has been posted */
static int rdma_cancel(struct p9_client *client, struct p9_req_t *req)
{
	return 1;
}

/**
 * rdma_create_trans - Transport method for creating a transport instance
 * @client: client instance
 * @addr: IP address string
 * @args: Mount options string
 */
static int
rdma_create_trans(struct p9_client *client, const char *addr, char *args)
{
	int err;
	struct p9_rdma_opts opts;
	struct p9_trans_rdma *rdma;
	struct rdma_conn_param conn_param;
	struct ib_qp_init_attr qp_attr;
	struct ib_device_attr devattr;

	/* Parse the transport specific mount options */
	err = parse_opts(args, &opts);
	if (err < 0)
		return err;

	/* Create and initialize the RDMA transport structure */
	rdma = alloc_rdma(&opts);
	if (!rdma)
		return -ENOMEM;

	/* Create the RDMA CM ID */
	rdma->cm_id = rdma_create_id(p9_cm_event_handler, client, RDMA_PS_TCP);
	if (IS_ERR(rdma->cm_id))
		goto error;

	/* Associate the client with the transport */
	client->trans = rdma;

	/* Resolve the server's address */
	rdma->addr.sin_family = AF_INET;
	rdma->addr.sin_addr.s_addr = in_aton(addr);
	rdma->addr.sin_port = htons(opts.port);
	err = rdma_resolve_addr(rdma->cm_id, NULL,
				(struct sockaddr *)&rdma->addr,
				rdma->timeout);
	if (err)
		goto error;
	err = wait_for_completion_interruptible(&rdma->cm_done);
	if (err || (rdma->state != P9_RDMA_ADDR_RESOLVED))
		goto error;

	/* Resolve the route to the server */
	err = rdma_resolve_route(rdma->cm_id, rdma->timeout);
	if (err)
		goto error;
	err = wait_for_completion_interruptible(&rdma->cm_done);
	if (err || (rdma->state != P9_RDMA_ROUTE_RESOLVED))
		goto error;

	/* Query the device attributes */
	err = ib_query_device(rdma->cm_id->device, &devattr);
	if (err)
		goto error;

	/* Create the Completion Queue */
	rdma->cq = ib_create_cq(rdma->cm_id->device, cq_comp_handler,
				cq_event_handler, client,
				opts.sq_depth + opts.rq_depth + 1, 0);
	if (IS_ERR(rdma->cq))
		goto error;
	ib_req_notify_cq(rdma->cq, IB_CQ_NEXT_COMP);

	/* Create the Protection Domain */
	rdma->pd = ib_alloc_pd(rdma->cm_id->device);
	if (IS_ERR(rdma->pd))
		goto error;

	/* Cache the DMA lkey in the transport */
	rdma->dma_mr = NULL;
	if (devattr.device_cap_flags & IB_DEVICE_LOCAL_DMA_LKEY)
		rdma->lkey = rdma->cm_id->device->local_dma_lkey;
	else {
		rdma->dma_mr = ib_get_dma_mr(rdma->pd, IB_ACCESS_LOCAL_WRITE);
		if (IS_ERR(rdma->dma_mr))
			goto error;
		rdma->lkey = rdma->dma_mr->lkey;
	}

	/* Create the Queue Pair */
	memset(&qp_attr, 0, sizeof qp_attr);
	qp_attr.event_handler = qp_event_handler;
	qp_attr.qp_context = client;
	qp_attr.cap.max_send_wr = opts.sq_depth;
	qp_attr.cap.max_recv_wr = opts.rq_depth;
	qp_attr.cap.max_send_sge = P9_RDMA_SEND_SGE;
	qp_attr.cap.max_recv_sge = P9_RDMA_RECV_SGE;
	qp_attr.sq_sig_type = IB_SIGNAL_REQ_WR;
	qp_attr.qp_type = IB_QPT_RC;
	qp_attr.send_cq = rdma->cq;
	qp_attr.recv_cq = rdma->cq;
	err = rdma_create_qp(rdma->cm_id, rdma->pd, &qp_attr);
	if (err)
		goto error;
	rdma->qp = rdma->cm_id->qp;

	/* Request a connection */
	memset(&conn_param, 0, sizeof(conn_param));
	conn_param.private_data = NULL;
	conn_param.private_data_len = 0;
	conn_param.responder_resources = P9_RDMA_IRD;
	conn_param.initiator_depth = P9_RDMA_ORD;
	err = rdma_connect(rdma->cm_id, &conn_param);
	if (err)
		goto error;
	err = wait_for_completion_interruptible(&rdma->cm_done);
	if (err || (rdma->state != P9_RDMA_CONNECTED))
		goto error;

	client->status = Connected;

	return 0;

error:
	rdma_destroy_trans(rdma);
	return -ENOTCONN;
}

static struct p9_trans_module p9_rdma_trans = {
	.name = "rdma",
	.maxsize = P9_RDMA_MAXSIZE,
	.def = 0,
	.owner = THIS_MODULE,
	.create = rdma_create_trans,
	.close = rdma_close,
	.request = rdma_request,
	.cancel = rdma_cancel,
};
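
/*
 * With .def left at zero this transport is not chosen by default; a mount
 * has to request it explicitly (e.g. with the trans=rdma option), which
 * directs the 9P client to the p9_rdma_trans operations registered below.
 */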

/**
 * p9_trans_rdma_init - Register the 9P RDMA transport driver
 */
static int __init p9_trans_rdma_init(void)
{
	v9fs_register_trans(&p9_rdma_trans);
	return 0;
}

static void __exit p9_trans_rdma_exit(void)
{
	v9fs_unregister_trans(&p9_rdma_trans);
}

module_init(p9_trans_rdma_init);
module_exit(p9_trans_rdma_exit);

MODULE_AUTHOR("Tom Tucker <tom@opengridcomputing.com>");
MODULE_DESCRIPTION("RDMA Transport for 9P");
MODULE_LICENSE("Dual BSD/GPL");