Blame - net/rds/send.c - android_kernel_oneplus_msm8996

blob: 28c88ff3d038a04e94b6a99d705c3b9a8fb6a056 [file] [log] [blame]

Andy Grover	5c11559	2009-02-24 15:30:27 +0000	[diff] [blame]	1	/*
				2	* Copyright (c) 2006 Oracle. All rights reserved.
				3	*
				4	* This software is available to you under a choice of one of two
				5	* licenses. You may choose to be licensed under the terms of the GNU
				6	* General Public License (GPL) Version 2, available from the file
				7	* COPYING in the main directory of this source tree, or the
				8	* OpenIB.org BSD license below:
				9	*
				10	* Redistribution and use in source and binary forms, with or
				11	* without modification, are permitted provided that the following
				12	* conditions are met:
				13	*
				14	* - Redistributions of source code must retain the above
				15	* copyright notice, this list of conditions and the following
				16	* disclaimer.
				17	*
				18	* - Redistributions in binary form must reproduce the above
				19	* copyright notice, this list of conditions and the following
				20	* disclaimer in the documentation and/or other materials
				21	* provided with the distribution.
				22	*
				23	* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
				24	* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
				25	* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
				26	* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
				27	* BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
				28	* ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
				29	* CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
				30	* SOFTWARE.
				31	*
				32	*/
				33	#include <linux/kernel.h>
				34	#include <net/sock.h>
				35	#include <linux/in.h>
				36	#include <linux/list.h>
				37
				38	#include "rds.h"
				39	#include "rdma.h"
				40
				41	/* When transmitting messages in rds_send_xmit, we need to emerge from
				42	* time to time and briefly release the CPU. Otherwise the soft lockup
				43	* watchdog will kick our shin.
				44	* Also, it seems fairer to not let one busy connection stall all the
				45	* others.
				46	*
				47	* send_batch_count bounds how often we'll loop in send_xmit (the loop
				48	* pre-decrements its quota before each pass). Setting it to 0 will
				49	* restore the old behavior (where we looped until we had drained the queue).
				50	*/
				51	static int send_batch_count = 64;
				52	module_param(send_batch_count, int, 0444);
				53	MODULE_PARM_DESC(send_batch_count, " batch factor when working the send queue");
				54
				55	/*
				56	* Reset the send state. Caller must hold c_send_lock when calling here.
				57	*/
				58	void rds_send_reset(struct rds_connection *conn)
				59	{
				60	struct rds_message rm, tmp;
				61	unsigned long flags;
				62
				63	if (conn->c_xmit_rm) {
				64	/* Tell the user the RDMA op is no longer mapped by the
				65	* transport. This isn't entirely true (it's flushed out
				66	* independently) but as the connection is down, there's
				67	* no ongoing RDMA to/from that memory */
				68	rds_message_unmapped(conn->c_xmit_rm);
				69	rds_message_put(conn->c_xmit_rm);
				70	conn->c_xmit_rm = NULL;
				71	}
				72	conn->c_xmit_sg = 0;
				73	conn->c_xmit_hdr_off = 0;
				74	conn->c_xmit_data_off = 0;
				75	conn->c_xmit_rdma_sent = 0;
				76
				77	conn->c_map_queued = 0;
				78
				79	conn->c_unacked_packets = rds_sysctl_max_unacked_packets;
				80	conn->c_unacked_bytes = rds_sysctl_max_unacked_bytes;
				81
				82	/* Mark messages as retransmissions, and move them to the send q */
				83	spin_lock_irqsave(&conn->c_lock, flags);
				84	list_for_each_entry_safe(rm, tmp, &conn->c_retrans, m_conn_item) {
				85	set_bit(RDS_MSG_ACK_REQUIRED, &rm->m_flags);
				86	set_bit(RDS_MSG_RETRANSMITTED, &rm->m_flags);
				87	}
				88	list_splice_init(&conn->c_retrans, &conn->c_send_queue);
				89	spin_unlock_irqrestore(&conn->c_lock, flags);
				90	}
				91
				92	/*
				93	* We're making the concious trade-off here to only send one message
				94	* down the connection at a time.
				95	* Pro:
				96	* - tx queueing is a simple fifo list
				97	* - reassembly is optional and easily done by transports per conn
				98	* - no per flow rx lookup at all, straight to the socket
				99	* - less per-frag memory and wire overhead
				100	* Con:
				101	* - queued acks can be delayed behind large messages
				102	* Depends:
				103	* - small message latency is higher behind queued large messages
				104	* - large message latency isn't starved by intervening small sends
				105	*/
				106	int rds_send_xmit(struct rds_connection *conn)
				107	{
				108	struct rds_message *rm;
				109	unsigned long flags;
				110	unsigned int tmp;
				111	unsigned int send_quota = send_batch_count;
				112	struct scatterlist *sg;
				113	int ret = 0;
				114	int was_empty = 0;
				115	LIST_HEAD(to_be_dropped);
				116
				117	/*
				118	* sendmsg calls here after having queued its message on the send
				119	* queue. We only have one task feeding the connection at a time. If
				120	* another thread is already feeding the queue then we back off. This
				121	* avoids blocking the caller and trading per-connection data between
				122	* caches per message.
				123	*
				124	* The sem holder will issue a retry if they notice that someone queued
				125	* a message after they stopped walking the send queue but before they
				126	* dropped the sem.
				127	*/
				128	if (!mutex_trylock(&conn->c_send_lock)) {
				129	rds_stats_inc(s_send_sem_contention);
				130	ret = -ENOMEM;
				131	goto out;
				132	}
				133
				134	if (conn->c_trans->xmit_prepare)
				135	conn->c_trans->xmit_prepare(conn);
				136
				137	/*
				138	* spin trying to push headers and data down the connection until
				139	* the connection doens't make forward progress.
				140	*/
				141	while (--send_quota) {
				142	/*
				143	* See if need to send a congestion map update if we're
				144	* between sending messages. The send_sem protects our sole
				145	* use of c_map_offset and _bytes.
				146	* Note this is used only by transports that define a special
				147	* xmit_cong_map function. For all others, we create allocate
				148	* a cong_map message and treat it just like any other send.
				149	*/
				150	if (conn->c_map_bytes) {
				151	ret = conn->c_trans->xmit_cong_map(conn, conn->c_lcong,
				152	conn->c_map_offset);
				153	if (ret <= 0)
				154	break;
				155
				156	conn->c_map_offset += ret;
				157	conn->c_map_bytes -= ret;
				158	if (conn->c_map_bytes)
				159	continue;
				160	}
				161
				162	/* If we're done sending the current message, clear the
				163	* offset and S/G temporaries.
				164	*/
				165	rm = conn->c_xmit_rm;
				166	if (rm != NULL &&
				167	conn->c_xmit_hdr_off == sizeof(struct rds_header) &&
				168	conn->c_xmit_sg == rm->m_nents) {
				169	conn->c_xmit_rm = NULL;
				170	conn->c_xmit_sg = 0;
				171	conn->c_xmit_hdr_off = 0;
				172	conn->c_xmit_data_off = 0;
				173	conn->c_xmit_rdma_sent = 0;
				174
				175	/* Release the reference to the previous message. */
				176	rds_message_put(rm);
				177	rm = NULL;
				178	}
				179
				180	/* If we're asked to send a cong map update, do so.
				181	*/
				182	if (rm == NULL && test_and_clear_bit(0, &conn->c_map_queued)) {
				183	if (conn->c_trans->xmit_cong_map != NULL) {
				184	conn->c_map_offset = 0;
				185	conn->c_map_bytes = sizeof(struct rds_header) +
				186	RDS_CONG_MAP_BYTES;
				187	continue;
				188	}
				189
				190	rm = rds_cong_update_alloc(conn);
				191	if (IS_ERR(rm)) {
				192	ret = PTR_ERR(rm);
				193	break;
				194	}
				195
				196	conn->c_xmit_rm = rm;
				197	}
				198
				199	/*
				200	* Grab the next message from the send queue, if there is one.
				201	*
				202	* c_xmit_rm holds a ref while we're sending this message down
				203	* the connction. We can use this ref while holding the
				204	* send_sem.. rds_send_reset() is serialized with it.
				205	*/
				206	if (rm == NULL) {
				207	unsigned int len;
				208
				209	spin_lock_irqsave(&conn->c_lock, flags);
				210
				211	if (!list_empty(&conn->c_send_queue)) {
				212	rm = list_entry(conn->c_send_queue.next,
				213	struct rds_message,
				214	m_conn_item);
				215	rds_message_addref(rm);
				216
				217	/*
				218	* Move the message from the send queue to the retransmit
				219	* list right away.
				220	*/
				221	list_move_tail(&rm->m_conn_item, &conn->c_retrans);
				222	}
				223
				224	spin_unlock_irqrestore(&conn->c_lock, flags);
				225
				226	if (rm == NULL) {
				227	was_empty = 1;
				228	break;
				229	}
				230
				231	/* Unfortunately, the way Infiniband deals with
				232	* RDMA to a bad MR key is by moving the entire
				233	* queue pair to error state. We cold possibly
				234	* recover from that, but right now we drop the
				235	* connection.
				236	* Therefore, we never retransmit messages with RDMA ops.
				237	*/
				238	if (rm->m_rdma_op
				239	&& test_bit(RDS_MSG_RETRANSMITTED, &rm->m_flags)) {
				240	spin_lock_irqsave(&conn->c_lock, flags);
				241	if (test_and_clear_bit(RDS_MSG_ON_CONN, &rm->m_flags))
				242	list_move(&rm->m_conn_item, &to_be_dropped);
				243	spin_unlock_irqrestore(&conn->c_lock, flags);
				244	rds_message_put(rm);
				245	continue;
				246	}
				247
				248	/* Require an ACK every once in a while */
				249	len = ntohl(rm->m_inc.i_hdr.h_len);
				250	if (conn->c_unacked_packets == 0
				251	\|\| conn->c_unacked_bytes < len) {
				252	__set_bit(RDS_MSG_ACK_REQUIRED, &rm->m_flags);
				253
				254	conn->c_unacked_packets = rds_sysctl_max_unacked_packets;
				255	conn->c_unacked_bytes = rds_sysctl_max_unacked_bytes;
				256	rds_stats_inc(s_send_ack_required);
				257	} else {
				258	conn->c_unacked_bytes -= len;
				259	conn->c_unacked_packets--;
				260	}
				261
				262	conn->c_xmit_rm = rm;
				263	}
				264
				265	/*
				266	* Try and send an rdma message. Let's see if we can
				267	* keep this simple and require that the transport either
				268	* send the whole rdma or none of it.
				269	*/
				270	if (rm->m_rdma_op && !conn->c_xmit_rdma_sent) {
				271	ret = conn->c_trans->xmit_rdma(conn, rm->m_rdma_op);
				272	if (ret)
				273	break;
				274	conn->c_xmit_rdma_sent = 1;
				275	/* The transport owns the mapped memory for now.
				276	* You can't unmap it while it's on the send queue */
				277	set_bit(RDS_MSG_MAPPED, &rm->m_flags);
				278	}
				279
				280	if (conn->c_xmit_hdr_off < sizeof(struct rds_header) \|\|
				281	conn->c_xmit_sg < rm->m_nents) {
				282	ret = conn->c_trans->xmit(conn, rm,
				283	conn->c_xmit_hdr_off,
				284	conn->c_xmit_sg,
				285	conn->c_xmit_data_off);
				286	if (ret <= 0)
				287	break;
				288
				289	if (conn->c_xmit_hdr_off < sizeof(struct rds_header)) {
				290	tmp = min_t(int, ret,
				291	sizeof(struct rds_header) -
				292	conn->c_xmit_hdr_off);
				293	conn->c_xmit_hdr_off += tmp;
				294	ret -= tmp;
				295	}
				296
				297	sg = &rm->m_sg[conn->c_xmit_sg];
				298	while (ret) {
				299	tmp = min_t(int, ret, sg->length -
				300	conn->c_xmit_data_off);
				301	conn->c_xmit_data_off += tmp;
				302	ret -= tmp;
				303	if (conn->c_xmit_data_off == sg->length) {
				304	conn->c_xmit_data_off = 0;
				305	sg++;
				306	conn->c_xmit_sg++;
				307	BUG_ON(ret != 0 &&
				308	conn->c_xmit_sg == rm->m_nents);
				309	}
				310	}
				311	}
				312	}
				313
				314	/* Nuke any messages we decided not to retransmit. */
				315	if (!list_empty(&to_be_dropped))
				316	rds_send_remove_from_sock(&to_be_dropped, RDS_RDMA_DROPPED);
				317
				318	if (conn->c_trans->xmit_complete)
				319	conn->c_trans->xmit_complete(conn);
				320
				321	/*
				322	* We might be racing with another sender who queued a message but
				323	* backed off on noticing that we held the c_send_lock. If we check
				324	* for queued messages after dropping the sem then either we'll
				325	* see the queued message or the queuer will get the sem. If we
				326	* notice the queued message then we trigger an immediate retry.
				327	*
				328	* We need to be careful only to do this when we stopped processing
				329	* the send queue because it was empty. It's the only way we
				330	* stop processing the loop when the transport hasn't taken
				331	* responsibility for forward progress.
				332	*/
				333	mutex_unlock(&conn->c_send_lock);
				334
				335	if (conn->c_map_bytes \|\| (send_quota == 0 && !was_empty)) {
				336	/* We exhausted the send quota, but there's work left to
				337	* do. Return and (re-)schedule the send worker.
				338	*/
				339	ret = -EAGAIN;
				340	}
				341
				342	if (ret == 0 && was_empty) {
				343	/* A simple bit test would be way faster than taking the
				344	* spin lock */
				345	spin_lock_irqsave(&conn->c_lock, flags);
				346	if (!list_empty(&conn->c_send_queue)) {
				347	rds_stats_inc(s_send_sem_queue_raced);
				348	ret = -EAGAIN;
				349	}
				350	spin_unlock_irqrestore(&conn->c_lock, flags);
				351	}
				352	out:
				353	return ret;
				354	}
				355
				356	static void rds_send_sndbuf_remove(struct rds_sock rs, struct rds_message rm)
				357	{
				358	u32 len = be32_to_cpu(rm->m_inc.i_hdr.h_len);
				359
				360	assert_spin_locked(&rs->rs_lock);
				361
				362	BUG_ON(rs->rs_snd_bytes < len);
				363	rs->rs_snd_bytes -= len;
				364
				365	if (rs->rs_snd_bytes == 0)
				366	rds_stats_inc(s_send_queue_empty);
				367	}
				368
				369	static inline int rds_send_is_acked(struct rds_message *rm, u64 ack,
				370	is_acked_func is_acked)
				371	{
				372	if (is_acked)
				373	return is_acked(rm, ack);
				374	return be64_to_cpu(rm->m_inc.i_hdr.h_sequence) <= ack;
				375	}
				376
				377	/*
				378	* Returns true if there are no messages on the send and retransmit queues
				379	* which have a sequence number greater than or equal to the given sequence
				380	* number.
				381	*/
				382	int rds_send_acked_before(struct rds_connection *conn, u64 seq)
				383	{
				384	struct rds_message rm, tmp;
				385	int ret = 1;
				386
				387	spin_lock(&conn->c_lock);
				388
				389	list_for_each_entry_safe(rm, tmp, &conn->c_retrans, m_conn_item) {
				390	if (be64_to_cpu(rm->m_inc.i_hdr.h_sequence) < seq)
				391	ret = 0;
				392	break;
				393	}
				394
				395	list_for_each_entry_safe(rm, tmp, &conn->c_send_queue, m_conn_item) {
				396	if (be64_to_cpu(rm->m_inc.i_hdr.h_sequence) < seq)
				397	ret = 0;
				398	break;
				399	}
				400
				401	spin_unlock(&conn->c_lock);
				402
				403	return ret;
				404	}
				405
				406	/*
				407	* This is pretty similar to what happens below in the ACK
				408	* handling code - except that we call here as soon as we get
				409	* the IB send completion on the RDMA op and the accompanying
				410	* message.
				411	*/
				412	void rds_rdma_send_complete(struct rds_message *rm, int status)
				413	{
				414	struct rds_sock *rs = NULL;
				415	struct rds_rdma_op *ro;
				416	struct rds_notifier *notifier;
				417
				418	spin_lock(&rm->m_rs_lock);
				419
				420	ro = rm->m_rdma_op;
				421	if (test_bit(RDS_MSG_ON_SOCK, &rm->m_flags)
				422	&& ro && ro->r_notify && ro->r_notifier) {
				423	notifier = ro->r_notifier;
				424	rs = rm->m_rs;
				425	sock_hold(rds_rs_to_sk(rs));
				426
				427	notifier->n_status = status;
				428	spin_lock(&rs->rs_lock);
				429	list_add_tail(&notifier->n_list, &rs->rs_notify_queue);
				430	spin_unlock(&rs->rs_lock);
				431
				432	ro->r_notifier = NULL;
				433	}
				434
				435	spin_unlock(&rm->m_rs_lock);
				436
				437	if (rs) {
				438	rds_wake_sk_sleep(rs);
				439	sock_put(rds_rs_to_sk(rs));
				440	}
				441	}
Andy Grover	616b757	2009-08-21 12:28:32 +0000	[diff] [blame]	442	EXPORT_SYMBOL_GPL(rds_rdma_send_complete);
Andy Grover	5c11559	2009-02-24 15:30:27 +0000	[diff] [blame]	443
				444	/*
				445	* This is the same as rds_rdma_send_complete except we
				446	* don't do any locking - we have all the ingredients (message,
				447	* socket, socket lock) and can just move the notifier.
				448	*/
				449	static inline void
				450	__rds_rdma_send_complete(struct rds_sock rs, struct rds_message rm, int status)
				451	{
				452	struct rds_rdma_op *ro;
				453
				454	ro = rm->m_rdma_op;
				455	if (ro && ro->r_notify && ro->r_notifier) {
				456	ro->r_notifier->n_status = status;
				457	list_add_tail(&ro->r_notifier->n_list, &rs->rs_notify_queue);
				458	ro->r_notifier = NULL;
				459	}
				460
				461	/* No need to wake the app - caller does this */
				462	}
				463
				464	/*
				465	* This is called from the IB send completion when we detect
				466	* a RDMA operation that failed with remote access error.
				467	* So speed is not an issue here.
				468	*/
				469	struct rds_message rds_send_get_message(struct rds_connection conn,
				470	struct rds_rdma_op *op)
				471	{
				472	struct rds_message rm, tmp, *found = NULL;
				473	unsigned long flags;
				474
				475	spin_lock_irqsave(&conn->c_lock, flags);
				476
				477	list_for_each_entry_safe(rm, tmp, &conn->c_retrans, m_conn_item) {
				478	if (rm->m_rdma_op == op) {
				479	atomic_inc(&rm->m_refcount);
				480	found = rm;
				481	goto out;
				482	}
				483	}
				484
				485	list_for_each_entry_safe(rm, tmp, &conn->c_send_queue, m_conn_item) {
				486	if (rm->m_rdma_op == op) {
				487	atomic_inc(&rm->m_refcount);
				488	found = rm;
				489	break;
				490	}
				491	}
				492
				493	out:
				494	spin_unlock_irqrestore(&conn->c_lock, flags);
				495
				496	return found;
				497	}
Andy Grover	616b757	2009-08-21 12:28:32 +0000	[diff] [blame]	498	EXPORT_SYMBOL_GPL(rds_send_get_message);
Andy Grover	5c11559	2009-02-24 15:30:27 +0000	[diff] [blame]	499
				500	/*
				501	* This removes messages from the socket's list if they're on it. The list
				502	* argument must be private to the caller, we must be able to modify it
				503	* without locks. The messages must have a reference held for their
				504	* position on the list. This function will drop that reference after
				505	* removing the messages from the 'messages' list regardless of if it found
				506	* the messages on the socket list or not.
				507	*/
				508	void rds_send_remove_from_sock(struct list_head *messages, int status)
				509	{
				510	unsigned long flags = 0; /* silence gcc :P */
				511	struct rds_sock *rs = NULL;
				512	struct rds_message *rm;
				513
				514	local_irq_save(flags);
				515	while (!list_empty(messages)) {
				516	rm = list_entry(messages->next, struct rds_message,
				517	m_conn_item);
				518	list_del_init(&rm->m_conn_item);
				519
				520	/*
				521	* If we see this flag cleared then we're sure that someone
				522	* else beat us to removing it from the sock. If we race
				523	* with their flag update we'll get the lock and then really
				524	* see that the flag has been cleared.
				525	*
				526	* The message spinlock makes sure nobody clears rm->m_rs
				527	* while we're messing with it. It does not prevent the
				528	* message from being removed from the socket, though.
				529	*/
				530	spin_lock(&rm->m_rs_lock);
				531	if (!test_bit(RDS_MSG_ON_SOCK, &rm->m_flags))
				532	goto unlock_and_drop;
				533
				534	if (rs != rm->m_rs) {
				535	if (rs) {
				536	spin_unlock(&rs->rs_lock);
				537	rds_wake_sk_sleep(rs);
				538	sock_put(rds_rs_to_sk(rs));
				539	}
				540	rs = rm->m_rs;
				541	spin_lock(&rs->rs_lock);
				542	sock_hold(rds_rs_to_sk(rs));
				543	}
				544
				545	if (test_and_clear_bit(RDS_MSG_ON_SOCK, &rm->m_flags)) {
				546	struct rds_rdma_op *ro = rm->m_rdma_op;
				547	struct rds_notifier *notifier;
				548
				549	list_del_init(&rm->m_sock_item);
				550	rds_send_sndbuf_remove(rs, rm);
				551
				552	if (ro && ro->r_notifier
				553	&& (status \|\| ro->r_notify)) {
				554	notifier = ro->r_notifier;
				555	list_add_tail(&notifier->n_list,
				556	&rs->rs_notify_queue);
				557	if (!notifier->n_status)
				558	notifier->n_status = status;
				559	rm->m_rdma_op->r_notifier = NULL;
				560	}
				561	rds_message_put(rm);
				562	rm->m_rs = NULL;
				563	}
				564
				565	unlock_and_drop:
				566	spin_unlock(&rm->m_rs_lock);
				567	rds_message_put(rm);
				568	}
				569
				570	if (rs) {
				571	spin_unlock(&rs->rs_lock);
				572	rds_wake_sk_sleep(rs);
				573	sock_put(rds_rs_to_sk(rs));
				574	}
				575	local_irq_restore(flags);
				576	}
				577
				578	/*
				579	* Transports call here when they've determined that the receiver queued
				580	* messages up to, and including, the given sequence number. Messages are
				581	* moved to the retrans queue when rds_send_xmit picks them off the send
				582	* queue. This means that in the TCP case, the message may not have been
				583	* assigned the m_ack_seq yet - but that's fine as long as tcp_is_acked
				584	* checks the RDS_MSG_HAS_ACK_SEQ bit.
				585	*
				586	* XXX It's not clear to me how this is safely serialized with socket
				587	* destruction. Maybe it should bail if it sees SOCK_DEAD.
				588	*/
				589	void rds_send_drop_acked(struct rds_connection *conn, u64 ack,
				590	is_acked_func is_acked)
				591	{
				592	struct rds_message rm, tmp;
				593	unsigned long flags;
				594	LIST_HEAD(list);
				595
				596	spin_lock_irqsave(&conn->c_lock, flags);
				597
				598	list_for_each_entry_safe(rm, tmp, &conn->c_retrans, m_conn_item) {
				599	if (!rds_send_is_acked(rm, ack, is_acked))
				600	break;
				601
				602	list_move(&rm->m_conn_item, &list);
				603	clear_bit(RDS_MSG_ON_CONN, &rm->m_flags);
				604	}
				605
				606	/* order flag updates with spin locks */
				607	if (!list_empty(&list))
				608	smp_mb__after_clear_bit();
				609
				610	spin_unlock_irqrestore(&conn->c_lock, flags);
				611
				612	/* now remove the messages from the sock list as needed */
				613	rds_send_remove_from_sock(&list, RDS_RDMA_SUCCESS);
				614	}
Andy Grover	616b757	2009-08-21 12:28:32 +0000	[diff] [blame]	615	EXPORT_SYMBOL_GPL(rds_send_drop_acked);
Andy Grover	5c11559	2009-02-24 15:30:27 +0000	[diff] [blame]	616
				617	void rds_send_drop_to(struct rds_sock rs, struct sockaddr_in dest)
				618	{
				619	struct rds_message rm, tmp;
				620	struct rds_connection *conn;
Andy Grover	f1cffcb	2009-04-01 08:20:18 +0000	[diff] [blame]	621	unsigned long flags, flags2;
Andy Grover	5c11559	2009-02-24 15:30:27 +0000	[diff] [blame]	622	LIST_HEAD(list);
				623	int wake = 0;
				624
				625	/* get all the messages we're dropping under the rs lock */
				626	spin_lock_irqsave(&rs->rs_lock, flags);
				627
				628	list_for_each_entry_safe(rm, tmp, &rs->rs_send_queue, m_sock_item) {
				629	if (dest && (dest->sin_addr.s_addr != rm->m_daddr \|\|
				630	dest->sin_port != rm->m_inc.i_hdr.h_dport))
				631	continue;
				632
				633	wake = 1;
				634	list_move(&rm->m_sock_item, &list);
				635	rds_send_sndbuf_remove(rs, rm);
				636	clear_bit(RDS_MSG_ON_SOCK, &rm->m_flags);
				637
				638	/* If this is a RDMA operation, notify the app. */
				639	__rds_rdma_send_complete(rs, rm, RDS_RDMA_CANCELED);
				640	}
				641
				642	/* order flag updates with the rs lock */
				643	if (wake)
				644	smp_mb__after_clear_bit();
				645
				646	spin_unlock_irqrestore(&rs->rs_lock, flags);
				647
				648	if (wake)
				649	rds_wake_sk_sleep(rs);
				650
				651	conn = NULL;
				652
				653	/* now remove the messages from the conn list as needed */
				654	list_for_each_entry(rm, &list, m_sock_item) {
				655	/* We do this here rather than in the loop above, so that
				656	* we don't have to nest m_rs_lock under rs->rs_lock */
Andy Grover	f1cffcb	2009-04-01 08:20:18 +0000	[diff] [blame]	657	spin_lock_irqsave(&rm->m_rs_lock, flags2);
Andy Grover	5c11559	2009-02-24 15:30:27 +0000	[diff] [blame]	658	rm->m_rs = NULL;
Andy Grover	f1cffcb	2009-04-01 08:20:18 +0000	[diff] [blame]	659	spin_unlock_irqrestore(&rm->m_rs_lock, flags2);
Andy Grover	5c11559	2009-02-24 15:30:27 +0000	[diff] [blame]	660
				661	/*
				662	* If we see this flag cleared then we're sure that someone
				663	* else beat us to removing it from the conn. If we race
				664	* with their flag update we'll get the lock and then really
				665	* see that the flag has been cleared.
				666	*/
				667	if (!test_bit(RDS_MSG_ON_CONN, &rm->m_flags))
				668	continue;
				669
				670	if (conn != rm->m_inc.i_conn) {
				671	if (conn)
				672	spin_unlock_irqrestore(&conn->c_lock, flags);
				673	conn = rm->m_inc.i_conn;
				674	spin_lock_irqsave(&conn->c_lock, flags);
				675	}
				676
				677	if (test_and_clear_bit(RDS_MSG_ON_CONN, &rm->m_flags)) {
				678	list_del_init(&rm->m_conn_item);
				679	rds_message_put(rm);
				680	}
				681	}
				682
				683	if (conn)
				684	spin_unlock_irqrestore(&conn->c_lock, flags);
				685
				686	while (!list_empty(&list)) {
				687	rm = list_entry(list.next, struct rds_message, m_sock_item);
				688	list_del_init(&rm->m_sock_item);
				689
				690	rds_message_wait(rm);
				691	rds_message_put(rm);
				692	}
				693	}
				694
				695	/*
				696	* we only want this to fire once so we use the callers 'queued'. It's
				697	* possible that another thread can race with us and remove the
				698	* message from the flow with RDS_CANCEL_SENT_TO.
				699	*/
				700	static int rds_send_queue_rm(struct rds_sock rs, struct rds_connection conn,
				701	struct rds_message *rm, __be16 sport,
				702	__be16 dport, int *queued)
				703	{
				704	unsigned long flags;
				705	u32 len;
				706
				707	if (*queued)
				708	goto out;
				709
				710	len = be32_to_cpu(rm->m_inc.i_hdr.h_len);
				711
				712	/* this is the only place which holds both the socket's rs_lock
				713	* and the connection's c_lock */
				714	spin_lock_irqsave(&rs->rs_lock, flags);
				715
				716	/*
				717	* If there is a little space in sndbuf, we don't queue anything,
				718	* and userspace gets -EAGAIN. But poll() indicates there's send
				719	* room. This can lead to bad behavior (spinning) if snd_bytes isn't
				720	* freed up by incoming acks. So we check the old value of
				721	* rs_snd_bytes here to allow the last msg to exceed the buffer,
				722	* and poll() now knows no more data can be sent.
				723	*/
				724	if (rs->rs_snd_bytes < rds_sk_sndbuf(rs)) {
				725	rs->rs_snd_bytes += len;
				726
				727	/* let recv side know we are close to send space exhaustion.
				728	* This is probably not the optimal way to do it, as this
				729	* means we set the flag on all messages as soon as our
				730	* throughput hits a certain threshold.
				731	*/
				732	if (rs->rs_snd_bytes >= rds_sk_sndbuf(rs) / 2)
				733	__set_bit(RDS_MSG_ACK_REQUIRED, &rm->m_flags);
				734
				735	list_add_tail(&rm->m_sock_item, &rs->rs_send_queue);
				736	set_bit(RDS_MSG_ON_SOCK, &rm->m_flags);
				737	rds_message_addref(rm);
				738	rm->m_rs = rs;
				739
				740	/* The code ordering is a little weird, but we're
				741	trying to minimize the time we hold c_lock */
				742	rds_message_populate_header(&rm->m_inc.i_hdr, sport, dport, 0);
				743	rm->m_inc.i_conn = conn;
				744	rds_message_addref(rm);
				745
				746	spin_lock(&conn->c_lock);
				747	rm->m_inc.i_hdr.h_sequence = cpu_to_be64(conn->c_next_tx_seq++);
				748	list_add_tail(&rm->m_conn_item, &conn->c_send_queue);
				749	set_bit(RDS_MSG_ON_CONN, &rm->m_flags);
				750	spin_unlock(&conn->c_lock);
				751
				752	rdsdebug("queued msg %p len %d, rs %p bytes %d seq %llu\n",
				753	rm, len, rs, rs->rs_snd_bytes,
				754	(unsigned long long)be64_to_cpu(rm->m_inc.i_hdr.h_sequence));
				755
				756	*queued = 1;
				757	}
				758
				759	spin_unlock_irqrestore(&rs->rs_lock, flags);
				760	out:
				761	return *queued;
				762	}
				763
				764	static int rds_cmsg_send(struct rds_sock rs, struct rds_message rm,
				765	struct msghdr msg, int allocated_mr)
				766	{
				767	struct cmsghdr *cmsg;
				768	int ret = 0;
				769
				770	for (cmsg = CMSG_FIRSTHDR(msg); cmsg; cmsg = CMSG_NXTHDR(msg, cmsg)) {
				771	if (!CMSG_OK(msg, cmsg))
				772	return -EINVAL;
				773
				774	if (cmsg->cmsg_level != SOL_RDS)
				775	continue;
				776
				777	/* As a side effect, RDMA_DEST and RDMA_MAP will set
				778	* rm->m_rdma_cookie and rm->m_rdma_mr.
				779	*/
				780	switch (cmsg->cmsg_type) {
				781	case RDS_CMSG_RDMA_ARGS:
				782	ret = rds_cmsg_rdma_args(rs, rm, cmsg);
				783	break;
				784
				785	case RDS_CMSG_RDMA_DEST:
				786	ret = rds_cmsg_rdma_dest(rs, rm, cmsg);
				787	break;
				788
				789	case RDS_CMSG_RDMA_MAP:
				790	ret = rds_cmsg_rdma_map(rs, rm, cmsg);
				791	if (!ret)
				792	*allocated_mr = 1;
				793	break;
				794
				795	default:
				796	return -EINVAL;
				797	}
				798
				799	if (ret)
				800	break;
				801	}
				802
				803	return ret;
				804	}
				805
				806	int rds_sendmsg(struct kiocb iocb, struct socket sock, struct msghdr *msg,
				807	size_t payload_len)
				808	{
				809	struct sock *sk = sock->sk;
				810	struct rds_sock *rs = rds_sk_to_rs(sk);
				811	struct sockaddr_in usin = (struct sockaddr_in )msg->msg_name;
				812	__be32 daddr;
				813	__be16 dport;
				814	struct rds_message *rm = NULL;
				815	struct rds_connection *conn;
				816	int ret = 0;
				817	int queued = 0, allocated_mr = 0;
				818	int nonblock = msg->msg_flags & MSG_DONTWAIT;
				819	long timeo = sock_rcvtimeo(sk, nonblock);
				820
				821	/* Mirror Linux UDP mirror of BSD error message compatibility */
				822	/* XXX: Perhaps MSG_MORE someday */
				823	if (msg->msg_flags & ~(MSG_DONTWAIT \| MSG_CMSG_COMPAT)) {
				824	printk(KERN_INFO "msg_flags 0x%08X\n", msg->msg_flags);
				825	ret = -EOPNOTSUPP;
				826	goto out;
				827	}
				828
				829	if (msg->msg_namelen) {
				830	/* XXX fail non-unicast destination IPs? */
				831	if (msg->msg_namelen < sizeof(*usin) \|\| usin->sin_family != AF_INET) {
				832	ret = -EINVAL;
				833	goto out;
				834	}
				835	daddr = usin->sin_addr.s_addr;
				836	dport = usin->sin_port;
				837	} else {
				838	/* We only care about consistency with ->connect() */
				839	lock_sock(sk);
				840	daddr = rs->rs_conn_addr;
				841	dport = rs->rs_conn_port;
				842	release_sock(sk);
				843	}
				844
				845	/* racing with another thread binding seems ok here */
				846	if (daddr == 0 \|\| rs->rs_bound_addr == 0) {
				847	ret = -ENOTCONN; /* XXX not a great errno */
				848	goto out;
				849	}
				850
				851	rm = rds_message_copy_from_user(msg->msg_iov, payload_len);
				852	if (IS_ERR(rm)) {
				853	ret = PTR_ERR(rm);
				854	rm = NULL;
				855	goto out;
				856	}
				857
				858	rm->m_daddr = daddr;
				859
Andy Grover	5c11559	2009-02-24 15:30:27 +0000	[diff] [blame]	860	/* rds_conn_create has a spinlock that runs with IRQ off.
				861	* Caching the conn in the socket helps a lot. */
				862	if (rs->rs_conn && rs->rs_conn->c_faddr == daddr)
				863	conn = rs->rs_conn;
				864	else {
				865	conn = rds_conn_create_outgoing(rs->rs_bound_addr, daddr,
				866	rs->rs_transport,
				867	sock->sk->sk_allocation);
				868	if (IS_ERR(conn)) {
				869	ret = PTR_ERR(conn);
				870	goto out;
				871	}
				872	rs->rs_conn = conn;
				873	}
				874
Andy Grover	49f6969	2009-04-09 14:09:41 +0000	[diff] [blame]	875	/* Parse any control messages the user may have included. */
				876	ret = rds_cmsg_send(rs, rm, msg, &allocated_mr);
				877	if (ret)
				878	goto out;
				879
Andy Grover	5c11559	2009-02-24 15:30:27 +0000	[diff] [blame]	880	if ((rm->m_rdma_cookie \|\| rm->m_rdma_op)
				881	&& conn->c_trans->xmit_rdma == NULL) {
				882	if (printk_ratelimit())
				883	printk(KERN_NOTICE "rdma_op %p conn xmit_rdma %p\n",
				884	rm->m_rdma_op, conn->c_trans->xmit_rdma);
				885	ret = -EOPNOTSUPP;
				886	goto out;
				887	}
				888
				889	/* If the connection is down, trigger a connect. We may
				890	* have scheduled a delayed reconnect however - in this case
				891	* we should not interfere.
				892	*/
				893	if (rds_conn_state(conn) == RDS_CONN_DOWN
				894	&& !test_and_set_bit(RDS_RECONNECT_PENDING, &conn->c_flags))
				895	queue_delayed_work(rds_wq, &conn->c_conn_w, 0);
				896
				897	ret = rds_cong_wait(conn->c_fcong, dport, nonblock, rs);
				898	if (ret)
				899	goto out;
				900
				901	while (!rds_send_queue_rm(rs, conn, rm, rs->rs_bound_port,
				902	dport, &queued)) {
				903	rds_stats_inc(s_send_queue_full);
				904	/* XXX make sure this is reasonable */
				905	if (payload_len > rds_sk_sndbuf(rs)) {
				906	ret = -EMSGSIZE;
				907	goto out;
				908	}
				909	if (nonblock) {
				910	ret = -EAGAIN;
				911	goto out;
				912	}
				913
				914	timeo = wait_event_interruptible_timeout(*sk->sk_sleep,
				915	rds_send_queue_rm(rs, conn, rm,
				916	rs->rs_bound_port,
				917	dport,
				918	&queued),
				919	timeo);
				920	rdsdebug("sendmsg woke queued %d timeo %ld\n", queued, timeo);
				921	if (timeo > 0 \|\| timeo == MAX_SCHEDULE_TIMEOUT)
				922	continue;
				923
				924	ret = timeo;
				925	if (ret == 0)
				926	ret = -ETIMEDOUT;
				927	goto out;
				928	}
				929
				930	/*
				931	* By now we've committed to the send. We reuse rds_send_worker()
				932	* to retry sends in the rds thread if the transport asks us to.
				933	*/
				934	rds_stats_inc(s_send_queued);
				935
				936	if (!test_bit(RDS_LL_SEND_FULL, &conn->c_flags))
				937	rds_send_worker(&conn->c_send_w.work);
				938
				939	rds_message_put(rm);
				940	return payload_len;
				941
				942	out:
				943	/* If the user included a RDMA_MAP cmsg, we allocated a MR on the fly.
				944	* If the sendmsg goes through, we keep the MR. If it fails with EAGAIN
				945	* or in any other way, we need to destroy the MR again */
				946	if (allocated_mr)
				947	rds_rdma_unuse(rs, rds_rdma_cookie_key(rm->m_rdma_cookie), 1);
				948
				949	if (rm)
				950	rds_message_put(rm);
				951	return ret;
				952	}
				953
				954	/*
				955	* Reply to a ping packet.
				956	*/
				957	int
				958	rds_send_pong(struct rds_connection *conn, __be16 dport)
				959	{
				960	struct rds_message *rm;
				961	unsigned long flags;
				962	int ret = 0;
				963
				964	rm = rds_message_alloc(0, GFP_ATOMIC);
				965	if (rm == NULL) {
				966	ret = -ENOMEM;
				967	goto out;
				968	}
				969
				970	rm->m_daddr = conn->c_faddr;
				971
				972	/* If the connection is down, trigger a connect. We may
				973	* have scheduled a delayed reconnect however - in this case
				974	* we should not interfere.
				975	*/
				976	if (rds_conn_state(conn) == RDS_CONN_DOWN
				977	&& !test_and_set_bit(RDS_RECONNECT_PENDING, &conn->c_flags))
				978	queue_delayed_work(rds_wq, &conn->c_conn_w, 0);
				979
				980	ret = rds_cong_wait(conn->c_fcong, dport, 1, NULL);
				981	if (ret)
				982	goto out;
				983
				984	spin_lock_irqsave(&conn->c_lock, flags);
				985	list_add_tail(&rm->m_conn_item, &conn->c_send_queue);
				986	set_bit(RDS_MSG_ON_CONN, &rm->m_flags);
				987	rds_message_addref(rm);
				988	rm->m_inc.i_conn = conn;
				989
				990	rds_message_populate_header(&rm->m_inc.i_hdr, 0, dport,
				991	conn->c_next_tx_seq);
				992	conn->c_next_tx_seq++;
				993	spin_unlock_irqrestore(&conn->c_lock, flags);
				994
				995	rds_stats_inc(s_send_queued);
				996	rds_stats_inc(s_send_pong);
				997
				998	queue_delayed_work(rds_wq, &conn->c_send_w, 0);
				999	rds_message_put(rm);
				1000	return 0;
				1001
				1002	out:
				1003	if (rm)
				1004	rds_message_put(rm);
				1005	return ret;
				1006	}