| Andy Grover | 39de828 | 2009-02-24 15:30:19 +0000 | [diff] [blame] | 1 | #ifndef _RDS_RDS_H | 
|  | 2 | #define _RDS_RDS_H | 
|  | 3 |  | 
|  | 4 | #include <net/sock.h> | 
|  | 5 | #include <linux/scatterlist.h> | 
|  | 6 | #include <linux/highmem.h> | 
|  | 7 | #include <rdma/rdma_cm.h> | 
|  | 8 | #include <linux/mutex.h> | 
|  | 9 | #include <linux/rds.h> | 
|  | 10 |  | 
|  | 11 | #include "info.h" | 
|  | 12 |  | 
|  | 13 | /* | 
|  | 14 | * RDS Network protocol version | 
|  | 15 | */ | 
#define RDS_PROTOCOL_3_0	0x0300
#define RDS_PROTOCOL_3_1	0x0301
#define RDS_PROTOCOL_VERSION	RDS_PROTOCOL_3_1
/*
 * A version number keeps the minor number in its low 8 bits and the major
 * number in the bits above them.  Every macro argument is parenthesized so
 * that expression arguments (e.g. a conditional) are not re-associated.
 */
#define RDS_PROTOCOL_MAJOR(v)	((v) >> 8)
#define RDS_PROTOCOL_MINOR(v)	((v) & 255)
#define RDS_PROTOCOL(maj, min)	(((maj) << 8) | (min))
|  | 22 |  | 
/*
 * Port number used by RDS.
 *
 * XXX randomly chosen, but at least seems to be unused:
 * #               18464-18768 Unassigned
 * We should do better.  We want a reserved port to discourage unpriv'ed
 * userspace from listening.
 */
#define RDS_PORT	18634
|  | 30 |  | 
/* Define KERNEL_HAS_ATOMIC64 whenever ATOMIC64_INIT is defined. */
#ifdef ATOMIC64_INIT
#define KERNEL_HAS_ATOMIC64
#endif
|  | 34 |  | 
/*
 * rdsdebug() - printf-style debug output.
 *
 * With DEBUG defined this expands to pr_debug() with the calling function's
 * name prepended to the message.  Without DEBUG it is an empty inline
 * function that still gets its format string checked against its arguments
 * by the compiler (format attribute), and that "uses" its arguments.
 *
 * The format string is never written through, so it is taken as a
 * const char *; callers pass string literals.
 */
#ifdef DEBUG
#define rdsdebug(fmt, args...) pr_debug("%s(): " fmt, __func__ , ##args)
#else
/* sigh, pr_debug() causes unused variable warnings */
static inline void __attribute__ ((format (printf, 1, 2)))
rdsdebug(const char *fmt, ...)
{
}
#endif
|  | 44 |  | 
/*
 * ceil(x, y) - divide x by y, rounding the result up.
 *
 * Both arguments are evaluated exactly once and converted to unsigned long.
 * The result is computed as quotient plus "is there a remainder", which
 * cannot wrap around the way (x + y - 1) / y does for large x.
 * y must be non-zero.
 *
 * XXX the kernel's DIV_ROUND_UP() could replace this for the common case.
 */
#define ceil(x, y) \
	({ unsigned long __x = (x), __y = (y); __x / __y + (__x % __y != 0); })
|  | 48 |  | 
/* Fragment size as a power of two: 1 << 12 = 4096, as an unsigned int. */
#define RDS_FRAG_SHIFT	12
#define RDS_FRAG_SIZE	((unsigned int)(1 << RDS_FRAG_SHIFT))
|  | 51 |  | 
/*
 * Congestion map sizing.  The map is a bitmap of 65536 bits (8192 bytes),
 * expressed here in bytes, longs and pages, plus the number of bits that
 * fit in one page.
 */
#define RDS_CONG_MAP_BYTES	(65536 / 8)
#define RDS_CONG_MAP_LONGS	(RDS_CONG_MAP_BYTES / sizeof(unsigned long))
#define RDS_CONG_MAP_PAGES	(PAGE_ALIGN(RDS_CONG_MAP_BYTES) / PAGE_SIZE)
#define RDS_CONG_MAP_PAGE_BITS	(PAGE_SIZE * 8)

/*
 * struct rds_cong_map - a congestion bitmap for one address.
 *
 * The bitmap storage is referenced through m_page_addrs, one entry per
 * page of the map.
 */
struct rds_cong_map {
	struct rb_node		m_rb_node;	/* rb-tree linkage */
	__be32			m_addr;		/* big-endian address of this map */
	wait_queue_head_t	m_waitq;
	/* NOTE(review): presumably the connections using this map -- confirm in cong.c */
	struct list_head	m_conn_list;
	unsigned long		m_page_addrs[RDS_CONG_MAP_PAGES];
};
|  | 64 |  | 
|  | 65 |  | 
|  | 66 | /* | 
|  | 67 | * This is how we will track the connection state: | 
|  | 68 | * A connection is always in one of the following | 
|  | 69 | * states. Updates to the state are atomic and imply | 
|  | 70 | * a memory barrier. | 
|  | 71 | */ | 
/*
 * Connection states.  The current state is kept in
 * rds_connection.c_state and read with rds_conn_state().
 * RDS_CONN_DOWN is explicitly 0; the rest number upwards from it.
 */
enum {
	RDS_CONN_DOWN = 0,
	RDS_CONN_CONNECTING,
	RDS_CONN_DISCONNECTING,
	RDS_CONN_UP,
	RDS_CONN_ERROR,
};
|  | 79 |  | 
/*
 * Bits for c_flags.
 * NOTE(review): the values look like bit numbers rather than masks
 * (one of them is 0) -- confirm against the users of c_flags.
 */
#define RDS_LL_SEND_FULL	0
#define RDS_RECONNECT_PENDING	1
|  | 83 |  | 
/*
 * struct rds_connection - state of one RDS connection.
 *
 * A connection is created for an (laddr, faddr, transport) triple, see
 * rds_conn_create().  c_state holds one of the RDS_CONN_* values; within
 * this header it is accessed with atomic_cmpxchg() and atomic_read()
 * through rds_conn_transition() and rds_conn_state().
 */
struct rds_connection {
	struct hlist_node	c_hash_node;
	__be32			c_laddr;
	__be32			c_faddr;
	unsigned int		c_loopback:1;
	struct rds_connection	*c_passive;

	/* NOTE(review): presumably the local and foreign congestion maps -- confirm */
	struct rds_cong_map	*c_lcong;
	struct rds_cong_map	*c_fcong;

	/* transmit progress */
	struct mutex		c_send_lock;	/* protect send ring */
	struct rds_message	*c_xmit_rm;
	unsigned long		c_xmit_sg;
	unsigned int		c_xmit_hdr_off;
	unsigned int		c_xmit_data_off;
	unsigned int		c_xmit_rdma_sent;

	spinlock_t		c_lock;		/* protect msg queues */
	u64			c_next_tx_seq;
	struct list_head	c_send_queue;
	struct list_head	c_retrans;

	u64			c_next_rx_seq;

	/* transport operations and the transport's private per-connection data */
	struct rds_transport	*c_trans;
	void			*c_transport_data;

	atomic_t		c_state;	/* one of RDS_CONN_* */
	unsigned long		c_flags;	/* RDS_LL_SEND_FULL, RDS_RECONNECT_PENDING */
	unsigned long		c_reconnect_jiffies;
	struct delayed_work	c_send_w;
	struct delayed_work	c_recv_w;
	struct delayed_work	c_conn_w;
	struct work_struct	c_down_w;
	struct mutex		c_cm_lock;	/* protect conn state & cm */

	struct list_head	c_map_item;
	unsigned long		c_map_queued;
	unsigned long		c_map_offset;
	unsigned long		c_map_bytes;

	unsigned int		c_unacked_packets;
	unsigned int		c_unacked_bytes;

	/* Protocol version */
	unsigned int		c_version;
};
|  | 131 |  | 
/*
 * Single-bit flag masks.
 * NOTE(review): presumably carried in rds_header.h_flags -- confirm.
 */
#define RDS_FLAG_CONG_BITMAP	0x01
#define RDS_FLAG_ACK_REQUIRED	0x02
#define RDS_FLAG_RETRANSMITTED	0x04
/* NOTE(review): 255 is the largest value a u8 such as h_credit can hold -- confirm the link. */
#define RDS_MAX_ADV_CREDIT	255
| Andy Grover | 39de828 | 2009-02-24 15:30:19 +0000 | [diff] [blame] | 136 |  | 
/*
 * Maximum space available for extension headers.
 */
#define RDS_HEADER_EXT_SPACE	16

/*
 * struct rds_header - the RDS message header.
 *
 * Multi-byte fields are big-endian.  h_csum is computed over the whole
 * structure in 32-bit words (see rds_message_make_checksum()), so the
 * field order and sizes must not change.  h_exthdr provides
 * RDS_HEADER_EXT_SPACE bytes for extension headers.
 */
struct rds_header {
	__be64	h_sequence;
	__be64	h_ack;
	__be32	h_len;
	__be16	h_sport;
	__be16	h_dport;
	u8	h_flags;
	u8	h_credit;
	u8	h_padding[4];
	__sum16	h_csum;

	u8	h_exthdr[RDS_HEADER_EXT_SPACE];
};
|  | 155 |  | 
/*
 * Reserved extension type - indicates end of extensions
 */
#define RDS_EXTHDR_NONE		0
|  | 160 |  | 
|  | 161 | /* | 
|  | 162 | * This extension header is included in the very | 
|  | 163 | * first message that is sent on a new connection, | 
|  | 164 | * and identifies the protocol level. This will help | 
|  | 165 | * rolling updates if a future change requires breaking | 
|  | 166 | * the protocol. | 
|  | 167 | * NB: This is no longer true for IB, where we do a version | 
|  | 168 | * negotiation during the connection setup phase (protocol | 
|  | 169 | * version information is included in the RDMA CM private data). | 
|  | 170 | */ | 
/* Extension type and payload (a big-endian 32-bit protocol version). */
#define RDS_EXTHDR_VERSION	1
struct rds_ext_header_version {
	__be32			h_version;
};
|  | 175 |  | 
|  | 176 | /* | 
|  | 177 | * This extension header is included in the RDS message | 
|  | 178 | * chasing an RDMA operation. | 
|  | 179 | */ | 
/* Extension type and payload (a big-endian 32-bit R_Key). */
#define RDS_EXTHDR_RDMA		2
struct rds_ext_header_rdma {
	__be32			h_rdma_rkey;
};
|  | 184 |  | 
|  | 185 | /* | 
|  | 186 | * This extension header tells the peer about the | 
|  | 187 | * destination <R_Key,offset> of the requested RDMA | 
|  | 188 | * operation. | 
|  | 189 | */ | 
/* Extension type and payload (big-endian 32-bit R_Key and offset). */
#define RDS_EXTHDR_RDMA_DEST	3
struct rds_ext_header_rdma_dest {
	__be32			h_rdma_rkey;
	__be32			h_rdma_offset;
};

/* Upper bound on extension type numbers. */
#define __RDS_EXTHDR_MAX	16 /* for now */
|  | 197 |  | 
/*
 * struct rds_incoming - a received message.
 *
 * Reference counted through i_refcount (see rds_inc_addref() and
 * rds_inc_put()).  It carries a full copy of the message header in i_hdr
 * and is also embedded in struct rds_message as m_inc.
 */
struct rds_incoming {
	atomic_t		i_refcount;
	struct list_head	i_item;
	struct rds_connection	*i_conn;	/* connection given to rds_inc_init() */
	struct rds_header	i_hdr;
	unsigned long		i_rx_jiffies;
	__be32			i_saddr;	/* source address given to rds_inc_init() */

	rds_rdma_cookie_t	i_rdma_cookie;
};
|  | 208 |  | 
|  | 209 | /* | 
|  | 210 | * m_sock_item and m_conn_item are on lists that are serialized under | 
|  | 211 | * conn->c_lock.  m_sock_item has additional meaning in that once it is empty | 
|  | 212 | * the message will not be put back on the retransmit list after being sent. | 
|  | 213 | * messages that are canceled while being sent rely on this. | 
|  | 214 | * | 
|  | 215 | * m_inc is used by loopback so that it can pass an incoming message straight | 
|  | 216 | * back up into the rx path.  It embeds a wire header which is also used by | 
|  | 217 | * the send path, which is kind of awkward. | 
|  | 218 | * | 
|  | 219 | * m_sock_item indicates the message's presence on a socket's send or receive | 
|  | 220 | * queue.  m_rs will point to that socket. | 
|  | 221 | * | 
|  | 222 | * m_daddr is used by cancellation to prune messages to a given destination. | 
|  | 223 | * | 
|  | 224 | * The RDS_MSG_ON_SOCK and RDS_MSG_ON_CONN flags are used to avoid lock | 
|  | 225 | * nesting.  As paths iterate over messages on a sock, or conn, they must | 
|  | 226 | * also lock the conn, or sock, to remove the message from those lists too. | 
|  | 227 | * Testing the flag to determine if the message is still on the lists lets | 
|  | 228 | * us avoid testing the list_head directly.  That means each path can use | 
|  | 229 | * the message's list_head to keep it on a local list while juggling locks | 
|  | 230 | * without confusing the other path. | 
|  | 231 | * | 
|  | 232 | * m_ack_seq is an optional field set by transports who need a different | 
|  | 233 | * sequence number range to invalidate.  They can use this in a callback | 
|  | 234 | * that they pass to rds_send_drop_acked() to see if each message has been | 
|  | 235 | * acked.  The HAS_ACK_SEQ flag can be used to detect messages which haven't | 
|  | 236 | * had ack_seq set yet. | 
|  | 237 | */ | 
/*
 * Message flags.
 * NOTE(review): these look like bit numbers for rds_message.m_flags
 * (consecutive small integers, not masks) -- confirm against the users.
 */
#define RDS_MSG_ON_SOCK		1
#define RDS_MSG_ON_CONN		2
#define RDS_MSG_HAS_ACK_SEQ	3
#define RDS_MSG_ACK_REQUIRED	4
#define RDS_MSG_RETRANSMITTED	5
#define RDS_MSG_MAPPED		6
#define RDS_MSG_PAGEVEC		7
|  | 245 |  | 
/*
 * struct rds_message - an outgoing message.
 *
 * Reference counted through m_refcount (see rds_message_addref() and
 * rds_message_put()).  The scatterlist entries are stored inline at the
 * end of the structure in m_sg, so m_sg must remain the last member;
 * rds_message_alloc() takes the number of entries.
 */
struct rds_message {
	atomic_t		m_refcount;
	struct list_head	m_sock_item;
	struct list_head	m_conn_item;
	struct rds_incoming	m_inc;		/* embeds the wire header */
	u64			m_ack_seq;
	__be32			m_daddr;
	unsigned long		m_flags;	/* RDS_MSG_* */

	/* Never access m_rs without holding m_rs_lock.
	 * Lock nesting is
	 *  rm->m_rs_lock
	 *   -> rs->rs_lock
	 */
	spinlock_t		m_rs_lock;
	struct rds_sock		*m_rs;
	struct rds_rdma_op	*m_rdma_op;
	rds_rdma_cookie_t	m_rdma_cookie;
	struct rds_mr		*m_rdma_mr;
	unsigned int		m_nents;
	unsigned int		m_count;
	struct scatterlist	m_sg[0];	/* trailing variable-length array */
};
|  | 269 |  | 
|  | 270 | /* | 
|  | 271 | * The RDS notifier is used (optionally) to tell the application about | 
|  | 272 | * completed RDMA operations. Rather than keeping the whole rds message | 
|  | 273 | * around on the queue, we allocate a small notifier that is put on the | 
|  | 274 | * socket's notifier_list. Notifications are delivered to the application | 
|  | 275 | * through control messages. | 
|  | 276 | */ | 
/* One queued notification: list linkage, a 64-bit user token and a status. */
struct rds_notifier {
	struct list_head	n_list;
	uint64_t		n_user_token;
	int			n_status;
};
|  | 282 |  | 
|  | 283 | /** | 
|  | 284 | * struct rds_transport -  transport specific behavioural hooks | 
|  | 285 | * | 
|  | 286 | * @xmit: .xmit is called by rds_send_xmit() to tell the transport to send | 
|  | 287 | *        part of a message.  The caller serializes on the send_sem so this | 
|  | 288 | *        doesn't need to be reentrant for a given conn.  The header must be | 
|  | 289 | *        sent before the data payload.  .xmit must be prepared to send a | 
|  | 290 | *        message with no data payload.  .xmit should return the number of | 
|  | 291 | *        bytes that were sent down the connection, including header bytes. | 
|  | 292 | *        Returning 0 tells the caller that it doesn't need to perform any | 
|  | 293 | *        additional work now.  This is usually the case when the transport has | 
|  | 294 | *        filled the sending queue for its connection and will handle | 
|  | 295 | *        triggering the rds thread to continue the send when space becomes | 
|  | 296 | *        available.  Returning -EAGAIN tells the caller to retry the send | 
|  | 297 | *        immediately.  Returning -ENOMEM tells the caller to retry the send at | 
|  | 298 | *        some point in the future. | 
|  | 299 | * | 
|  | 300 | * @conn_shutdown: conn_shutdown stops traffic on the given connection.  Once | 
|  | 301 | *                 it returns the connection can not call rds_recv_incoming(). | 
 *                 This will only be called once after conn_connect returns
 *                 non-zero success.  The caller serializes this with
 *                 the send and connecting paths (xmit_* and conn_*).  The
|  | 305 | *                 transport is responsible for other serialization, including | 
|  | 306 | *                 rds_recv_incoming().  This is called in process context but | 
|  | 307 | *                 should try hard not to block. | 
|  | 308 | * | 
|  | 309 | * @xmit_cong_map: This asks the transport to send the local bitmap down the | 
|  | 310 | * 		   given connection.  XXX get a better story about the bitmap | 
|  | 311 | * 		   flag and header. | 
|  | 312 | */ | 
|  | 313 |  | 
/*
 * Table of operations implemented by a transport.  A pointer to one of
 * these is handed to rds_trans_register()/rds_trans_unregister() and is
 * stored per connection in rds_connection.c_trans.
 */
struct rds_transport {
	char			t_name[TRANSNAMSIZ];
	struct list_head	t_item;
	struct module		*t_owner;
	unsigned int		t_prefer_loopback:1;

	/* address check and connection lifetime */
	int (*laddr_check)(__be32 addr);
	int (*conn_alloc)(struct rds_connection *conn, gfp_t gfp);
	void (*conn_free)(void *data);
	int (*conn_connect)(struct rds_connection *conn);
	void (*conn_shutdown)(struct rds_connection *conn);

	/* send path */
	void (*xmit_prepare)(struct rds_connection *conn);
	void (*xmit_complete)(struct rds_connection *conn);
	int (*xmit)(struct rds_connection *conn, struct rds_message *rm,
		    unsigned int hdr_off, unsigned int sg, unsigned int off);
	int (*xmit_cong_map)(struct rds_connection *conn,
			     struct rds_cong_map *map, unsigned long offset);
	int (*xmit_rdma)(struct rds_connection *conn, struct rds_rdma_op *op);

	/* receive path */
	int (*recv)(struct rds_connection *conn);
	int (*inc_copy_to_user)(struct rds_incoming *inc, struct iovec *iov,
				size_t size);
	void (*inc_purge)(struct rds_incoming *inc);
	void (*inc_free)(struct rds_incoming *inc);

	/* RDMA connection manager callbacks */
	int (*cm_handle_connect)(struct rdma_cm_id *cm_id,
				 struct rdma_cm_event *event);
	int (*cm_initiate_connect)(struct rdma_cm_id *cm_id);
	void (*cm_connect_complete)(struct rds_connection *conn,
				    struct rdma_cm_event *event);

	unsigned int (*stats_info_copy)(struct rds_info_iterator *iter,
					unsigned int avail);
	void (*exit)(void);

	/* memory region (MR) handling */
	void *(*get_mr)(struct scatterlist *sg, unsigned long nr_sg,
			struct rds_sock *rs, u32 *key_ret);
	void (*sync_mr)(void *trans_private, int direction);
	void (*free_mr)(void *trans_private, int invalidate);
	void (*flush_mrs)(void);
};
|  | 353 |  | 
/*
 * struct rds_sock - an RDS socket.
 *
 * The generic struct sock is embedded as rs_sk; rds_sk_to_rs() and
 * rds_rs_to_sk() convert between the two views.
 */
struct rds_sock {
	struct sock		rs_sk;

	u64			rs_user_addr;
	u64			rs_user_bytes;

	/*
	 * bound_addr used for both incoming and outgoing, no INADDR_ANY
	 * support.
	 */
	struct rb_node		rs_bound_node;
	__be32			rs_bound_addr;
	__be32			rs_conn_addr;
	__be16			rs_bound_port;
	__be16			rs_conn_port;

	/*
	 * This is only used to communicate the transport between bind and
	 * initiating connections.  All other trans use is referenced through
	 * the connection.
	 */
	struct rds_transport    *rs_transport;

	/*
	 * rds_sendmsg caches the conn it used the last time around.
	 * This helps avoid costly lookups.
	 */
	struct rds_connection	*rs_conn;

	/* flag indicating we were congested or not */
	int			rs_congested;

	/* rs_lock protects all these adjacent members before the newline */
	spinlock_t		rs_lock;
	struct list_head	rs_send_queue;
	u32			rs_snd_bytes;
	int			rs_rcv_bytes;
	struct list_head	rs_notify_queue;	/* currently used for failed RDMAs */

	/* Congestion wake_up. If rs_cong_monitor is set, we use cong_mask
	 * to decide whether the application should be woken up.
	 * If not set, we use rs_cong_track to find out whether a cong map
	 * update arrived.
	 */
	uint64_t		rs_cong_mask;
	uint64_t		rs_cong_notify;
	struct list_head	rs_cong_list;
	unsigned long		rs_cong_track;

	/*
	 * rs_recv_lock protects the receive queue, and is
	 * used to serialize with rds_release.
	 */
	rwlock_t		rs_recv_lock;
	struct list_head	rs_recv_queue;

	/* just for stats reporting */
	struct list_head	rs_item;

	/* these have their own lock */
	spinlock_t		rs_rdma_lock;
	struct rb_root		rs_rdma_keys;

	/* Socket options - in case there will be more */
	unsigned char		rs_recverr,
				rs_cong_monitor;
};
|  | 421 |  | 
|  | 422 | static inline struct rds_sock *rds_sk_to_rs(const struct sock *sk) | 
|  | 423 | { | 
|  | 424 | return container_of(sk, struct rds_sock, rs_sk); | 
|  | 425 | } | 
|  | 426 | static inline struct sock *rds_rs_to_sk(struct rds_sock *rs) | 
|  | 427 | { | 
|  | 428 | return &rs->rs_sk; | 
|  | 429 | } | 
|  | 430 |  | 
|  | 431 | /* | 
|  | 432 | * The stack assigns sk_sndbuf and sk_rcvbuf to twice the specified value | 
|  | 433 | * to account for overhead.  We don't account for overhead, we just apply | 
|  | 434 | * the number of payload bytes to the specified value. | 
|  | 435 | */ | 
|  | 436 | static inline int rds_sk_sndbuf(struct rds_sock *rs) | 
|  | 437 | { | 
|  | 438 | return rds_rs_to_sk(rs)->sk_sndbuf / 2; | 
|  | 439 | } | 
|  | 440 | static inline int rds_sk_rcvbuf(struct rds_sock *rs) | 
|  | 441 | { | 
|  | 442 | return rds_rs_to_sk(rs)->sk_rcvbuf / 2; | 
|  | 443 | } | 
|  | 444 |  | 
/*
 * Counters for the RDS core.  An instance named rds_stats is declared
 * per CPU further down in this header and is updated through
 * rds_stats_inc() and rds_stats_add().  Every member is a 64-bit counter.
 */
struct rds_statistics {
	uint64_t	s_conn_reset;
	uint64_t	s_recv_drop_bad_checksum;
	uint64_t	s_recv_drop_old_seq;
	uint64_t	s_recv_drop_no_sock;
	uint64_t	s_recv_drop_dead_sock;
	uint64_t	s_recv_deliver_raced;
	uint64_t	s_recv_delivered;
	uint64_t	s_recv_queued;
	uint64_t	s_recv_immediate_retry;
	uint64_t	s_recv_delayed_retry;
	uint64_t	s_recv_ack_required;
	uint64_t	s_recv_rdma_bytes;
	uint64_t	s_recv_ping;
	uint64_t	s_send_queue_empty;
	uint64_t	s_send_queue_full;
	uint64_t	s_send_sem_contention;
	uint64_t	s_send_sem_queue_raced;
	uint64_t	s_send_immediate_retry;
	uint64_t	s_send_delayed_retry;
	uint64_t	s_send_drop_acked;
	uint64_t	s_send_ack_required;
	uint64_t	s_send_queued;
	uint64_t	s_send_rdma;
	uint64_t	s_send_rdma_bytes;
	uint64_t	s_send_pong;
	uint64_t	s_page_remainder_hit;
	uint64_t	s_page_remainder_miss;
	uint64_t	s_copy_to_user;
	uint64_t	s_copy_from_user;
	uint64_t	s_cong_update_queued;
	uint64_t	s_cong_update_received;
	uint64_t	s_cong_send_error;
	uint64_t	s_cong_send_blocked;
};
|  | 480 |  | 
/* af_rds.c: socket reference counting and wake-up (implemented there) */
void rds_sock_addref(struct rds_sock *rs);
void rds_sock_put(struct rds_sock *rs);
void rds_wake_sk_sleep(struct rds_sock *rs);
|  | 485 | static inline void __rds_wake_sk_sleep(struct sock *sk) | 
|  | 486 | { | 
|  | 487 | wait_queue_head_t *waitq = sk->sk_sleep; | 
|  | 488 |  | 
|  | 489 | if (!sock_flag(sk, SOCK_DEAD) && waitq) | 
|  | 490 | wake_up(waitq); | 
|  | 491 | } | 
/* Wait queue shared across the RDS code; defined outside this header. */
extern wait_queue_head_t rds_poll_waitq;
|  | 493 |  | 
|  | 494 |  | 
/* bind.c: binding sockets and looking them up by (address, port) */
int rds_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len);
void rds_remove_bound(struct rds_sock *rs);
struct rds_sock *rds_find_bound(__be32 addr, __be16 port);
|  | 499 |  | 
/*
 * cong.c: congestion map handling.  The set/clear/wait functions take the
 * port as a big-endian 16-bit value.
 */
int rds_cong_get_maps(struct rds_connection *conn);
void rds_cong_add_conn(struct rds_connection *conn);
void rds_cong_remove_conn(struct rds_connection *conn);
void rds_cong_set_bit(struct rds_cong_map *map, __be16 port);
void rds_cong_clear_bit(struct rds_cong_map *map, __be16 port);
int rds_cong_wait(struct rds_cong_map *map, __be16 port, int nonblock, struct rds_sock *rs);
void rds_cong_queue_updates(struct rds_cong_map *map);
void rds_cong_map_updated(struct rds_cong_map *map, uint64_t);
int rds_cong_updated_since(unsigned long *recent);
void rds_cong_add_socket(struct rds_sock *);
void rds_cong_remove_socket(struct rds_sock *);
void rds_cong_exit(void);
struct rds_message *rds_cong_update_alloc(struct rds_connection *conn);
|  | 514 |  | 
/* conn.c: connection creation, teardown and error reporting */
int __init rds_conn_init(void);
void rds_conn_exit(void);
struct rds_connection *rds_conn_create(__be32 laddr, __be32 faddr,
				       struct rds_transport *trans, gfp_t gfp);
struct rds_connection *rds_conn_create_outgoing(__be32 laddr, __be32 faddr,
			       struct rds_transport *trans, gfp_t gfp);
void rds_conn_destroy(struct rds_connection *conn);
void rds_conn_reset(struct rds_connection *conn);
void rds_conn_drop(struct rds_connection *conn);
void rds_for_each_conn_info(struct socket *sock, unsigned int len,
			  struct rds_info_iterator *iter,
			  struct rds_info_lengths *lens,
			  int (*visitor)(struct rds_connection *, void *),
			  size_t item_len);
/* printf-style; the compiler checks the format (argument 2) against the varargs */
void __rds_conn_error(struct rds_connection *conn, const char *, ...)
				__attribute__ ((format (printf, 2, 3)));
/* Prepends KERN_WARNING "RDS: " to the caller's format string literal. */
#define rds_conn_error(conn, fmt...) \
	__rds_conn_error(conn, KERN_WARNING "RDS: " fmt)
|  | 534 |  | 
|  | 535 | static inline int | 
|  | 536 | rds_conn_transition(struct rds_connection *conn, int old, int new) | 
|  | 537 | { | 
|  | 538 | return atomic_cmpxchg(&conn->c_state, old, new) == old; | 
|  | 539 | } | 
|  | 540 |  | 
|  | 541 | static inline int | 
|  | 542 | rds_conn_state(struct rds_connection *conn) | 
|  | 543 | { | 
|  | 544 | return atomic_read(&conn->c_state); | 
|  | 545 | } | 
|  | 546 |  | 
|  | 547 | static inline int | 
|  | 548 | rds_conn_up(struct rds_connection *conn) | 
|  | 549 | { | 
|  | 550 | return atomic_read(&conn->c_state) == RDS_CONN_UP; | 
|  | 551 | } | 
|  | 552 |  | 
|  | 553 | static inline int | 
|  | 554 | rds_conn_connecting(struct rds_connection *conn) | 
|  | 555 | { | 
|  | 556 | return atomic_read(&conn->c_state) == RDS_CONN_CONNECTING; | 
|  | 557 | } | 
|  | 558 |  | 
/* message.c: message allocation, header/extension helpers, refcounting */
struct rds_message *rds_message_alloc(unsigned int nents, gfp_t gfp);
struct rds_message *rds_message_copy_from_user(struct iovec *first_iov,
					       size_t total_len);
struct rds_message *rds_message_map_pages(unsigned long *page_addrs, unsigned int total_len);
void rds_message_populate_header(struct rds_header *hdr, __be16 sport,
				 __be16 dport, u64 seq);
/* extension headers stored in rds_header.h_exthdr */
int rds_message_add_extension(struct rds_header *hdr,
			      unsigned int type, const void *data, unsigned int len);
int rds_message_next_extension(struct rds_header *hdr,
			       unsigned int *pos, void *buf, unsigned int *buflen);
int rds_message_add_version_extension(struct rds_header *hdr, unsigned int version);
int rds_message_get_version_extension(struct rds_header *hdr, unsigned int *version);
int rds_message_add_rdma_dest_extension(struct rds_header *hdr, u32 r_key, u32 offset);
/* these three match the inc_* members of struct rds_transport */
int rds_message_inc_copy_to_user(struct rds_incoming *inc,
				 struct iovec *first_iov, size_t size);
void rds_message_inc_purge(struct rds_incoming *inc);
void rds_message_inc_free(struct rds_incoming *inc);
void rds_message_addref(struct rds_message *rm);
void rds_message_put(struct rds_message *rm);
void rds_message_wait(struct rds_message *rm);
void rds_message_unmapped(struct rds_message *rm);
|  | 581 |  | 
|  | 582 | static inline void rds_message_make_checksum(struct rds_header *hdr) | 
|  | 583 | { | 
|  | 584 | hdr->h_csum = 0; | 
|  | 585 | hdr->h_csum = ip_fast_csum((void *) hdr, sizeof(*hdr) >> 2); | 
|  | 586 | } | 
|  | 587 |  | 
|  | 588 | static inline int rds_message_verify_checksum(const struct rds_header *hdr) | 
|  | 589 | { | 
|  | 590 | return !hdr->h_csum || ip_fast_csum((void *) hdr, sizeof(*hdr) >> 2) == 0; | 
|  | 591 | } | 
|  | 592 |  | 
|  | 593 |  | 
/* page.c: page allocation and user copy helpers */
int rds_page_remainder_alloc(struct scatterlist *scat, unsigned long bytes,
			     gfp_t gfp);
int rds_page_copy_user(struct page *page, unsigned long offset,
		       void __user *ptr, unsigned long bytes,
		       int to_user);
/* Direction wrappers: to_user is 1 for copy-to-user and 0 for copy-from-user. */
#define rds_page_copy_to_user(page, offset, ptr, bytes) \
	rds_page_copy_user(page, offset, ptr, bytes, 1)
#define rds_page_copy_from_user(page, offset, ptr, bytes) \
	rds_page_copy_user(page, offset, ptr, bytes, 0)
void rds_page_exit(void);
|  | 605 |  | 
/* recv.c: receive path and rds_incoming reference counting */
void rds_inc_init(struct rds_incoming *inc, struct rds_connection *conn,
		  __be32 saddr);
void rds_inc_addref(struct rds_incoming *inc);
void rds_inc_put(struct rds_incoming *inc);
void rds_recv_incoming(struct rds_connection *conn, __be32 saddr, __be32 daddr,
		       struct rds_incoming *inc, gfp_t gfp, enum km_type km);
int rds_recvmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg,
		size_t size, int msg_flags);
void rds_clear_recv_queue(struct rds_sock *rs);
int rds_notify_queue_get(struct rds_sock *rs, struct msghdr *msg);
void rds_inc_info_copy(struct rds_incoming *inc,
		       struct rds_info_iterator *iter,
		       __be32 saddr, __be32 daddr, int flip);
|  | 620 |  | 
/* send.c: send path */
int rds_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg,
		size_t payload_len);
void rds_send_reset(struct rds_connection *conn);
int rds_send_xmit(struct rds_connection *conn);
/* forward declaration: only a pointer to it is needed below */
struct sockaddr_in;
void rds_send_drop_to(struct rds_sock *rs, struct sockaddr_in *dest);
/* callback type passed to rds_send_drop_acked(): takes a message and an ack value */
typedef int (*is_acked_func)(struct rds_message *rm, uint64_t ack);
void rds_send_drop_acked(struct rds_connection *conn, u64 ack,
			 is_acked_func is_acked);
int rds_send_acked_before(struct rds_connection *conn, u64 seq);
void rds_send_remove_from_sock(struct list_head *messages, int status);
int rds_send_pong(struct rds_connection *conn, __be16 dport);
struct rds_message *rds_send_get_message(struct rds_connection *,
					 struct rds_rdma_op *);
|  | 636 |  | 
/* rdma.c: the key is identified by its 32-bit r_key on the given socket */
void rds_rdma_unuse(struct rds_sock *rs, u32 r_key, int force);
|  | 639 |  | 
/* stats.c */
/* Per-CPU instance of the core counters. */
DECLARE_PER_CPU_SHARED_ALIGNED(struct rds_statistics, rds_stats);
/*
 * Increment / add to one member of a per-CPU statistics structure.
 * The update targets the copy of the CPU returned by get_cpu() and is
 * bracketed by get_cpu()/put_cpu().  'which' names the per-CPU variable,
 * 'member' the field inside it.
 */
#define rds_stats_inc_which(which, member) do {		\
	per_cpu(which, get_cpu()).member++;		\
	put_cpu();					\
} while (0)
#define rds_stats_inc(member) rds_stats_inc_which(rds_stats, member)
#define rds_stats_add_which(which, member, count) do {		\
	per_cpu(which, get_cpu()).member += count;	\
	put_cpu();					\
} while (0)
#define rds_stats_add(member, count) rds_stats_add_which(rds_stats, member, count)
int __init rds_stats_init(void);
void rds_stats_exit(void);
void rds_stats_info_copy(struct rds_info_iterator *iter,
			 uint64_t *values, char **names, size_t nr);
|  | 656 |  | 
/* sysctl.c: setup/teardown and the tunables defined there */
int __init rds_sysctl_init(void);
void rds_sysctl_exit(void);
extern unsigned long rds_sysctl_sndbuf_min;
extern unsigned long rds_sysctl_sndbuf_default;
extern unsigned long rds_sysctl_sndbuf_max;
extern unsigned long rds_sysctl_reconnect_min_jiffies;
extern unsigned long rds_sysctl_reconnect_max_jiffies;
extern unsigned int  rds_sysctl_max_unacked_packets;
extern unsigned int  rds_sysctl_max_unacked_bytes;
extern unsigned int  rds_sysctl_ping_enable;
extern unsigned long rds_sysctl_trace_flags;
extern unsigned int  rds_sysctl_trace_level;
|  | 670 |  | 
/* threads.c: the RDS workqueue and its work functions */
int __init rds_threads_init(void);
void rds_threads_exit(void);
extern struct workqueue_struct *rds_wq;
/*
 * Work handlers.
 * NOTE(review): presumably bound to the c_conn_w, c_down_w, c_send_w and
 * c_recv_w members of struct rds_connection -- confirm in conn.c.
 */
void rds_connect_worker(struct work_struct *);
void rds_shutdown_worker(struct work_struct *);
void rds_send_worker(struct work_struct *);
void rds_recv_worker(struct work_struct *);
void rds_connect_complete(struct rds_connection *conn);
|  | 680 |  | 
/* transport.c: transport registration and lookup by address */
int rds_trans_register(struct rds_transport *trans);
void rds_trans_unregister(struct rds_transport *trans);
struct rds_transport *rds_trans_get_preferred(__be32 addr);
unsigned int rds_trans_stats_info_copy(struct rds_info_iterator *iter,
				       unsigned int avail);
int __init rds_trans_init(void);
void rds_trans_exit(void);
|  | 689 |  | 
|  | 690 | #endif |