blob: 18eb3d17f174ad773c160d0b746d1df43c896293 [file] [log] [blame]
Philipp Reisnerb411b362009-09-25 16:07:19 -07001/*
2 drbd_req.c
3
4 This file is part of DRBD by Philipp Reisner and Lars Ellenberg.
5
6 Copyright (C) 2001-2008, LINBIT Information Technologies GmbH.
7 Copyright (C) 1999-2008, Philipp Reisner <philipp.reisner@linbit.com>.
8 Copyright (C) 2002-2008, Lars Ellenberg <lars.ellenberg@linbit.com>.
9
10 drbd is free software; you can redistribute it and/or modify
11 it under the terms of the GNU General Public License as published by
12 the Free Software Foundation; either version 2, or (at your option)
13 any later version.
14
15 drbd is distributed in the hope that it will be useful,
16 but WITHOUT ANY WARRANTY; without even the implied warranty of
17 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 GNU General Public License for more details.
19
20 You should have received a copy of the GNU General Public License
21 along with drbd; see the file COPYING. If not, write to
22 the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
23
24 */
25
Philipp Reisnerb411b362009-09-25 16:07:19 -070026#include <linux/module.h>
27
28#include <linux/slab.h>
29#include <linux/drbd.h>
30#include "drbd_int.h"
Philipp Reisnerb411b362009-09-25 16:07:19 -070031#include "drbd_req.h"
32
33
/* Update disk stats at start of I/O request.
 * Accounts one I/O and its sector count against the DRBD virtual disk
 * (mdev->vdisk->part0) and bumps the in-flight counter for direction @rw.
 * @req is currently unused here; kept for symmetry with _drbd_end_io_acct.
 * NOTE(review): callers presumably hold whatever lock serializes request
 * submission — confirm against drbd_make_request_common. */
static void _drbd_start_io_acct(struct drbd_conf *mdev, struct drbd_request *req, struct bio *bio)
{
	const int rw = bio_data_dir(bio);
	int cpu;
	/* per-cpu stat accounting: lock pins us to a cpu slot */
	cpu = part_stat_lock();
	part_stat_inc(cpu, &mdev->vdisk->part0, ios[rw]);
	part_stat_add(cpu, &mdev->vdisk->part0, sectors[rw], bio_sectors(bio));
	part_inc_in_flight(&mdev->vdisk->part0, rw);
	part_stat_unlock();
}
45
/* Update disk stats when completing request upwards.
 * Counterpart of _drbd_start_io_acct(): charges the request duration
 * (in jiffies, measured from req->start_time) to the ticks[] stat,
 * rounds the partition stats, and drops the in-flight counter. */
static void _drbd_end_io_acct(struct drbd_conf *mdev, struct drbd_request *req)
{
	int rw = bio_data_dir(req->master_bio);
	unsigned long duration = jiffies - req->start_time;
	int cpu;
	cpu = part_stat_lock();
	part_stat_add(cpu, &mdev->vdisk->part0, ticks[rw], duration);
	part_round_stats(cpu, &mdev->vdisk->part0);
	part_dec_in_flight(&mdev->vdisk->part0, rw);
	part_stat_unlock();
}
58
/* Allocate and initialize a drbd_request tracking @bio_src.
 * Returns NULL if the mempool allocation fails.  GFP_NOIO because we are
 * on the I/O submission path and must not recurse into writeback.
 * The request is freed again via drbd_req_free(). */
static struct drbd_request *drbd_req_new(struct drbd_conf *mdev,
					 struct bio *bio_src)
{
	struct drbd_request *req;

	req = mempool_alloc(drbd_request_mempool, GFP_NOIO);
	if (!req)
		return NULL;

	/* set up the private bio used for the local disk submission */
	drbd_req_make_private_bio(req, bio_src);
	req->rq_state = bio_data_dir(bio_src) == WRITE ? RQ_WRITE : 0;
	req->w.mdev = mdev;
	req->master_bio = bio_src;
	req->epoch = 0;

	/* interval tree node: mirrors the bio's sector range, used for
	 * conflict detection against peer requests */
	drbd_clear_interval(&req->i);
	req->i.sector = bio_src->bi_sector;
	req->i.size = bio_src->bi_size;
	req->i.local = true;
	req->i.waiting = false;

	INIT_LIST_HEAD(&req->tl_requests);
	INIT_LIST_HEAD(&req->w.list);

	return req;
}
85
/* Return a request allocated by drbd_req_new() to the mempool. */
static void drbd_req_free(struct drbd_request *req)
{
	mempool_free(req, drbd_request_mempool);
}
90
/* rw is bio_data_dir(), only READ or WRITE.
 * Final teardown of a request: unlink it from the transfer log, update the
 * sync state bits for writes, release the activity-log reference, and free
 * the request.  NOTE(review): runs from _req_may_be_done(), i.e. presumably
 * under the req_lock — confirm before moving work out of here. */
static void _req_is_done(struct drbd_conf *mdev, struct drbd_request *req, const int rw)
{
	const unsigned long s = req->rq_state;

	/* remove it from the transfer log.
	 * well, only if it had been there in the first
	 * place... if it had not (local only or conflicting
	 * and never sent), it should still be "empty" as
	 * initialized in drbd_req_new(), so we can list_del() it
	 * here unconditionally */
	list_del(&req->tl_requests);

	/* if it was a write, we may have to set the corresponding
	 * bit(s) out-of-sync first. If it had a local part, we need to
	 * release the reference to the activity log. */
	if (rw == WRITE) {
		/* Set out-of-sync unless both OK flags are set
		 * (local only or remote failed).
		 * Other places where we set out-of-sync:
		 * READ with local io-error */
		if (!(s & RQ_NET_OK) || !(s & RQ_LOCAL_OK))
			drbd_set_out_of_sync(mdev, req->i.sector, req->i.size);

		if ((s & RQ_NET_OK) && (s & RQ_LOCAL_OK) && (s & RQ_NET_SIS))
			drbd_set_in_sync(mdev, req->i.sector, req->i.size);

		/* one might be tempted to move the drbd_al_complete_io
		 * to the local io completion callback drbd_request_endio.
		 * but, if this was a mirror write, we may only
		 * drbd_al_complete_io after this is RQ_NET_DONE,
		 * otherwise the extent could be dropped from the al
		 * before it has actually been written on the peer.
		 * if we crash before our peer knows about the request,
		 * but after the extent has been dropped from the al,
		 * we would forget to resync the corresponding extent.
		 */
		if (s & RQ_LOCAL_MASK) {
			if (get_ldev_if_state(mdev, D_FAILED)) {
				if (s & RQ_IN_ACT_LOG)
					drbd_al_complete_io(mdev, req->i.sector);
				put_ldev(mdev);
			} else if (__ratelimit(&drbd_ratelimit_state)) {
				dev_warn(DEV, "Should have called drbd_al_complete_io(, %llu), "
				     "but my Disk seems to have failed :(\n",
				     (unsigned long long) req->i.sector);
			}
		}
	}

	drbd_req_free(req);
}
143
/* Queue the barrier work item that closes the current (newest) epoch.
 * Idempotent per epoch via the CREATE_BARRIER flag; must be called with
 * the req_lock held (see comment below). */
static void queue_barrier(struct drbd_conf *mdev)
{
	struct drbd_tl_epoch *b;

	/* We are within the req_lock. Once we queued the barrier for sending,
	 * we set the CREATE_BARRIER bit. It is cleared as soon as a new
	 * barrier/epoch object is added. This is the only place this bit is
	 * set. It indicates that the barrier for this epoch is already queued,
	 * and no new epoch has been created yet. */
	if (test_bit(CREATE_BARRIER, &mdev->flags))
		return;

	b = mdev->tconn->newest_tle;
	b->w.cb = w_send_barrier;
	b->w.mdev = mdev;
	/* inc_ap_pending done here, so we won't
	 * get imbalanced on connection loss.
	 * dec_ap_pending will be done in got_BarrierAck
	 * or (on connection loss) in tl_clear. */
	inc_ap_pending(mdev);
	drbd_queue_work(&mdev->tconn->data.work, &b->w);
	set_bit(CREATE_BARRIER, &mdev->flags);
}
167
/* Called just before a local write completes to the upper layers:
 * if the request was actually sent and belongs to the newest epoch,
 * close that epoch by queueing a barrier. */
static void _about_to_complete_local_write(struct drbd_conf *mdev,
	struct drbd_request *req)
{
	const unsigned long s = req->rq_state;

	/* Before we can signal completion to the upper layers,
	 * we may need to close the current epoch.
	 * We can skip this, if this request has not even been sent, because we
	 * did not have a fully established connection yet/anymore, during
	 * bitmap exchange, or while we are C_AHEAD due to congestion policy.
	 */
	if (mdev->state.conn >= C_CONNECTED &&
	    (s & RQ_NET_SENT) != 0 &&
	    req->epoch == mdev->tconn->newest_tle->br_number)
		queue_barrier(mdev);
}
184
/* Complete the upper-layer (master) bio with the error collected in @m,
 * then drop the application-pending-bio count taken at submission time. */
void complete_master_bio(struct drbd_conf *mdev,
		struct bio_and_error *m)
{
	bio_endio(m->bio, m->error);
	dec_ap_bio(mdev);
}
191
Andreas Gruenbacher53840642011-01-28 10:31:04 +0100192
/* Remove @req's interval from the conflict-detection tree @root and wake
 * any process that registered interest (i->waiting) in this interval,
 * e.g. a conflicting writer blocked on misc_wait. */
static void drbd_remove_request_interval(struct rb_root *root,
					 struct drbd_request *req)
{
	struct drbd_conf *mdev = req->w.mdev;
	struct drbd_interval *i = &req->i;

	drbd_remove_interval(root, i);

	/* Wake up any processes waiting for this request to complete. */
	if (i->waiting)
		wake_up(&mdev->misc_wait);
}
205
/* Helper for __req_mod().
 * Set m->bio to the master bio, if it is fit to be completed,
 * or leave it alone (it is initialized to NULL in __req_mod),
 * if it has already been completed, or cannot be completed yet.
 * If m->bio is set, the error status to be returned is placed in m->error.
 */
void _req_may_be_done(struct drbd_request *req, struct bio_and_error *m)
{
	const unsigned long s = req->rq_state;
	struct drbd_conf *mdev = req->w.mdev;
	/* only WRITES may end up here without a master bio (on barrier ack) */
	int rw = req->master_bio ? bio_data_dir(req->master_bio) : WRITE;

	/* we must not complete the master bio, while it is
	 *	still being processed by _drbd_send_zc_bio (drbd_send_dblock)
	 *	not yet acknowledged by the peer
	 *	not yet completed by the local io subsystem
	 * these flags may get cleared in any order by
	 *	the worker,
	 *	the receiver,
	 *	the bio_endio completion callbacks.
	 */
	if (s & RQ_NET_QUEUED)
		return;
	if (s & RQ_NET_PENDING)
		return;
	if (s & RQ_LOCAL_PENDING)
		return;

	if (req->master_bio) {
		/* this is DATA_RECEIVED (remote read)
		 * or protocol C P_WRITE_ACK
		 * or protocol B P_RECV_ACK
		 * or protocol A "HANDED_OVER_TO_NETWORK" (SendAck)
		 * or canceled or failed,
		 * or killed from the transfer log due to connection loss.
		 */

		/*
		 * figure out whether to report success or failure.
		 *
		 * report success when at least one of the operations succeeded.
		 * or, to put the other way,
		 * only report failure, when both operations failed.
		 *
		 * what to do about the failures is handled elsewhere.
		 * what we need to do here is just: complete the master_bio.
		 *
		 * local completion error, if any, has been stored as ERR_PTR
		 * in private_bio within drbd_request_endio.
		 */
		int ok = (s & RQ_LOCAL_OK) || (s & RQ_NET_OK);
		int error = PTR_ERR(req->private_bio);

		/* remove the request from the conflict detection
		 * respective block_id verification hash */
		if (!drbd_interval_empty(&req->i)) {
			struct rb_root *root;

			if (rw == WRITE)
				root = &mdev->write_requests;
			else
				root = &mdev->read_requests;
			drbd_remove_request_interval(root, req);
		} else
			D_ASSERT((s & (RQ_NET_MASK & ~RQ_NET_DONE)) == 0);

		/* for writes we need to do some extra housekeeping */
		if (rw == WRITE)
			_about_to_complete_local_write(mdev, req);

		/* Update disk stats */
		_drbd_end_io_acct(mdev, req);

		/* PTR_ERR yields 0 for a real bio pointer; -EIO covers that
		 * case when neither OK flag is set */
		m->error = ok ? 0 : (error ?: -EIO);
		m->bio = req->master_bio;
		req->master_bio = NULL;
	}

	if ((s & RQ_NET_MASK) == 0 || (s & RQ_NET_DONE)) {
		/* this is disconnected (local only) operation,
		 * or protocol C P_WRITE_ACK,
		 * or protocol A or B P_BARRIER_ACK,
		 * or killed from the transfer log due to connection loss. */
		_req_is_done(mdev, req, rw);
	}
	/* else: network part and not DONE yet. that is
	 * protocol A or B, barrier ack still pending... */
}
295
/* Like _req_may_be_done(), but a no-op while I/O is suspended:
 * during suspension requests must stay around for later resend. */
static void _req_may_be_done_not_susp(struct drbd_request *req, struct bio_and_error *m)
{
	struct drbd_conf *mdev = req->w.mdev;

	if (!is_susp(mdev->state))
		_req_may_be_done(req, m);
}
303
Philipp Reisnerb411b362009-09-25 16:07:19 -0700304/* obviously this could be coded as many single functions
305 * instead of one huge switch,
306 * or by putting the code directly in the respective locations
307 * (as it has been before).
308 *
309 * but having it this way
310 * enforces that it is all in this one place, where it is easier to audit,
311 * it makes it obvious that whatever "event" "happens" to a request should
312 * happen "atomically" within the req_lock,
313 * and it enforces that we have to think in a very structured manner
314 * about the "events" that may happen to a request during its life time ...
315 */
Philipp Reisner2a806992010-06-09 14:07:43 +0200316int __req_mod(struct drbd_request *req, enum drbd_req_event what,
Philipp Reisnerb411b362009-09-25 16:07:19 -0700317 struct bio_and_error *m)
318{
Philipp Reisnera21e9292011-02-08 15:08:49 +0100319 struct drbd_conf *mdev = req->w.mdev;
Philipp Reisner2a806992010-06-09 14:07:43 +0200320 int rv = 0;
Philipp Reisnerb411b362009-09-25 16:07:19 -0700321 m->bio = NULL;
322
Philipp Reisnerb411b362009-09-25 16:07:19 -0700323 switch (what) {
324 default:
325 dev_err(DEV, "LOGIC BUG in %s:%u\n", __FILE__ , __LINE__);
326 break;
327
328 /* does not happen...
329 * initialization done in drbd_req_new
Andreas Gruenbacher8554df12011-01-25 15:37:43 +0100330 case CREATED:
Philipp Reisnerb411b362009-09-25 16:07:19 -0700331 break;
332 */
333
Andreas Gruenbacher8554df12011-01-25 15:37:43 +0100334 case TO_BE_SENT: /* via network */
Philipp Reisnerb411b362009-09-25 16:07:19 -0700335 /* reached via drbd_make_request_common
336 * and from w_read_retry_remote */
337 D_ASSERT(!(req->rq_state & RQ_NET_MASK));
338 req->rq_state |= RQ_NET_PENDING;
339 inc_ap_pending(mdev);
340 break;
341
Andreas Gruenbacher8554df12011-01-25 15:37:43 +0100342 case TO_BE_SUBMITTED: /* locally */
Philipp Reisnerb411b362009-09-25 16:07:19 -0700343 /* reached via drbd_make_request_common */
344 D_ASSERT(!(req->rq_state & RQ_LOCAL_MASK));
345 req->rq_state |= RQ_LOCAL_PENDING;
346 break;
347
Andreas Gruenbacher8554df12011-01-25 15:37:43 +0100348 case COMPLETED_OK:
Philipp Reisnerb411b362009-09-25 16:07:19 -0700349 if (bio_data_dir(req->master_bio) == WRITE)
Andreas Gruenbacherace652a2011-01-03 17:09:58 +0100350 mdev->writ_cnt += req->i.size >> 9;
Philipp Reisnerb411b362009-09-25 16:07:19 -0700351 else
Andreas Gruenbacherace652a2011-01-03 17:09:58 +0100352 mdev->read_cnt += req->i.size >> 9;
Philipp Reisnerb411b362009-09-25 16:07:19 -0700353
354 req->rq_state |= (RQ_LOCAL_COMPLETED|RQ_LOCAL_OK);
355 req->rq_state &= ~RQ_LOCAL_PENDING;
356
Philipp Reisnercfa03412010-06-23 17:18:51 +0200357 _req_may_be_done_not_susp(req, m);
Philipp Reisnerb411b362009-09-25 16:07:19 -0700358 put_ldev(mdev);
359 break;
360
Andreas Gruenbacher8554df12011-01-25 15:37:43 +0100361 case WRITE_COMPLETED_WITH_ERROR:
Philipp Reisnerb411b362009-09-25 16:07:19 -0700362 req->rq_state |= RQ_LOCAL_COMPLETED;
363 req->rq_state &= ~RQ_LOCAL_PENDING;
364
Andreas Gruenbacher81e84652010-12-09 15:03:57 +0100365 __drbd_chk_io_error(mdev, false);
Philipp Reisnercfa03412010-06-23 17:18:51 +0200366 _req_may_be_done_not_susp(req, m);
Philipp Reisnerb411b362009-09-25 16:07:19 -0700367 put_ldev(mdev);
368 break;
369
Andreas Gruenbacher8554df12011-01-25 15:37:43 +0100370 case READ_AHEAD_COMPLETED_WITH_ERROR:
Philipp Reisnerb411b362009-09-25 16:07:19 -0700371 /* it is legal to fail READA */
372 req->rq_state |= RQ_LOCAL_COMPLETED;
373 req->rq_state &= ~RQ_LOCAL_PENDING;
Philipp Reisnercfa03412010-06-23 17:18:51 +0200374 _req_may_be_done_not_susp(req, m);
Philipp Reisnerb411b362009-09-25 16:07:19 -0700375 put_ldev(mdev);
376 break;
377
Andreas Gruenbacher8554df12011-01-25 15:37:43 +0100378 case READ_COMPLETED_WITH_ERROR:
Andreas Gruenbacherace652a2011-01-03 17:09:58 +0100379 drbd_set_out_of_sync(mdev, req->i.sector, req->i.size);
Philipp Reisnerb411b362009-09-25 16:07:19 -0700380
381 req->rq_state |= RQ_LOCAL_COMPLETED;
382 req->rq_state &= ~RQ_LOCAL_PENDING;
383
Philipp Reisnerb411b362009-09-25 16:07:19 -0700384 D_ASSERT(!(req->rq_state & RQ_NET_MASK));
Philipp Reisnerb411b362009-09-25 16:07:19 -0700385
Andreas Gruenbacher81e84652010-12-09 15:03:57 +0100386 __drbd_chk_io_error(mdev, false);
Philipp Reisnerb411b362009-09-25 16:07:19 -0700387 put_ldev(mdev);
Philipp Reisnerb411b362009-09-25 16:07:19 -0700388
Lars Ellenbergd255e5f2010-05-27 09:45:45 +0200389 /* no point in retrying if there is no good remote data,
390 * or we have no connection. */
391 if (mdev->state.pdsk != D_UP_TO_DATE) {
Philipp Reisnercfa03412010-06-23 17:18:51 +0200392 _req_may_be_done_not_susp(req, m);
Lars Ellenbergd255e5f2010-05-27 09:45:45 +0200393 break;
394 }
395
Andreas Gruenbacher8554df12011-01-25 15:37:43 +0100396 /* _req_mod(req,TO_BE_SENT); oops, recursion... */
Lars Ellenbergd255e5f2010-05-27 09:45:45 +0200397 req->rq_state |= RQ_NET_PENDING;
398 inc_ap_pending(mdev);
Andreas Gruenbacher8554df12011-01-25 15:37:43 +0100399 /* fall through: _req_mod(req,QUEUE_FOR_NET_READ); */
Philipp Reisnerb411b362009-09-25 16:07:19 -0700400
Andreas Gruenbacher8554df12011-01-25 15:37:43 +0100401 case QUEUE_FOR_NET_READ:
Philipp Reisnerb411b362009-09-25 16:07:19 -0700402 /* READ or READA, and
403 * no local disk,
404 * or target area marked as invalid,
405 * or just got an io-error. */
406 /* from drbd_make_request_common
407 * or from bio_endio during read io-error recovery */
408
409 /* so we can verify the handle in the answer packet
410 * corresponding hlist_del is in _req_may_be_done() */
Andreas Gruenbacherdac13892011-01-21 17:18:39 +0100411 drbd_insert_interval(&mdev->read_requests, &req->i);
Philipp Reisnerb411b362009-09-25 16:07:19 -0700412
Lars Ellenberg83c38832009-11-03 02:22:06 +0100413 set_bit(UNPLUG_REMOTE, &mdev->flags);
Philipp Reisnerb411b362009-09-25 16:07:19 -0700414
415 D_ASSERT(req->rq_state & RQ_NET_PENDING);
416 req->rq_state |= RQ_NET_QUEUED;
417 req->w.cb = (req->rq_state & RQ_LOCAL_MASK)
418 ? w_read_retry_remote
419 : w_send_read_req;
Philipp Reisnere42325a2011-01-19 13:55:45 +0100420 drbd_queue_work(&mdev->tconn->data.work, &req->w);
Philipp Reisnerb411b362009-09-25 16:07:19 -0700421 break;
422
Andreas Gruenbacher8554df12011-01-25 15:37:43 +0100423 case QUEUE_FOR_NET_WRITE:
Philipp Reisnerb411b362009-09-25 16:07:19 -0700424 /* assert something? */
425 /* from drbd_make_request_common only */
426
Philipp Reisnerb411b362009-09-25 16:07:19 -0700427 /* corresponding hlist_del is in _req_may_be_done() */
Andreas Gruenbacherde696712011-01-20 15:00:24 +0100428 drbd_insert_interval(&mdev->write_requests, &req->i);
Philipp Reisnerb411b362009-09-25 16:07:19 -0700429
430 /* NOTE
431 * In case the req ended up on the transfer log before being
432 * queued on the worker, it could lead to this request being
433 * missed during cleanup after connection loss.
434 * So we have to do both operations here,
435 * within the same lock that protects the transfer log.
436 *
437 * _req_add_to_epoch(req); this has to be after the
438 * _maybe_start_new_epoch(req); which happened in
439 * drbd_make_request_common, because we now may set the bit
440 * again ourselves to close the current epoch.
441 *
442 * Add req to the (now) current epoch (barrier). */
443
Lars Ellenberg83c38832009-11-03 02:22:06 +0100444 /* otherwise we may lose an unplug, which may cause some remote
445 * io-scheduler timeout to expire, increasing maximum latency,
446 * hurting performance. */
447 set_bit(UNPLUG_REMOTE, &mdev->flags);
448
Philipp Reisnerb411b362009-09-25 16:07:19 -0700449 /* see drbd_make_request_common,
450 * just after it grabs the req_lock */
451 D_ASSERT(test_bit(CREATE_BARRIER, &mdev->flags) == 0);
452
Philipp Reisner87eeee42011-01-19 14:16:30 +0100453 req->epoch = mdev->tconn->newest_tle->br_number;
Philipp Reisnerb411b362009-09-25 16:07:19 -0700454
455 /* increment size of current epoch */
Philipp Reisner87eeee42011-01-19 14:16:30 +0100456 mdev->tconn->newest_tle->n_writes++;
Philipp Reisnerb411b362009-09-25 16:07:19 -0700457
458 /* queue work item to send data */
459 D_ASSERT(req->rq_state & RQ_NET_PENDING);
460 req->rq_state |= RQ_NET_QUEUED;
461 req->w.cb = w_send_dblock;
Philipp Reisnere42325a2011-01-19 13:55:45 +0100462 drbd_queue_work(&mdev->tconn->data.work, &req->w);
Philipp Reisnerb411b362009-09-25 16:07:19 -0700463
464 /* close the epoch, in case it outgrew the limit */
Philipp Reisner87eeee42011-01-19 14:16:30 +0100465 if (mdev->tconn->newest_tle->n_writes >= mdev->tconn->net_conf->max_epoch_size)
Philipp Reisnerb411b362009-09-25 16:07:19 -0700466 queue_barrier(mdev);
467
468 break;
469
Andreas Gruenbacher8554df12011-01-25 15:37:43 +0100470 case QUEUE_FOR_SEND_OOS:
Philipp Reisner73a01a12010-10-27 14:33:00 +0200471 req->rq_state |= RQ_NET_QUEUED;
472 req->w.cb = w_send_oos;
Philipp Reisnere42325a2011-01-19 13:55:45 +0100473 drbd_queue_work(&mdev->tconn->data.work, &req->w);
Philipp Reisner73a01a12010-10-27 14:33:00 +0200474 break;
475
Andreas Gruenbacher8554df12011-01-25 15:37:43 +0100476 case OOS_HANDED_TO_NETWORK:
Philipp Reisner73a01a12010-10-27 14:33:00 +0200477 /* actually the same */
Andreas Gruenbacher8554df12011-01-25 15:37:43 +0100478 case SEND_CANCELED:
Philipp Reisnerb411b362009-09-25 16:07:19 -0700479 /* treat it the same */
Andreas Gruenbacher8554df12011-01-25 15:37:43 +0100480 case SEND_FAILED:
Philipp Reisnerb411b362009-09-25 16:07:19 -0700481 /* real cleanup will be done from tl_clear. just update flags
482 * so it is no longer marked as on the worker queue */
483 req->rq_state &= ~RQ_NET_QUEUED;
484 /* if we did it right, tl_clear should be scheduled only after
485 * this, so this should not be necessary! */
Philipp Reisnercfa03412010-06-23 17:18:51 +0200486 _req_may_be_done_not_susp(req, m);
Philipp Reisnerb411b362009-09-25 16:07:19 -0700487 break;
488
Andreas Gruenbacher8554df12011-01-25 15:37:43 +0100489 case HANDED_OVER_TO_NETWORK:
Philipp Reisnerb411b362009-09-25 16:07:19 -0700490 /* assert something? */
Philipp Reisner759fbdf2010-10-26 16:02:27 +0200491 if (bio_data_dir(req->master_bio) == WRITE)
Andreas Gruenbacherace652a2011-01-03 17:09:58 +0100492 atomic_add(req->i.size >> 9, &mdev->ap_in_flight);
Philipp Reisner759fbdf2010-10-26 16:02:27 +0200493
Philipp Reisnerb411b362009-09-25 16:07:19 -0700494 if (bio_data_dir(req->master_bio) == WRITE &&
Philipp Reisner89e58e72011-01-19 13:12:45 +0100495 mdev->tconn->net_conf->wire_protocol == DRBD_PROT_A) {
Philipp Reisnerb411b362009-09-25 16:07:19 -0700496 /* this is what is dangerous about protocol A:
497 * pretend it was successfully written on the peer. */
498 if (req->rq_state & RQ_NET_PENDING) {
499 dec_ap_pending(mdev);
500 req->rq_state &= ~RQ_NET_PENDING;
501 req->rq_state |= RQ_NET_OK;
502 } /* else: neg-ack was faster... */
503 /* it is still not yet RQ_NET_DONE until the
504 * corresponding epoch barrier got acked as well,
505 * so we know what to dirty on connection loss */
506 }
507 req->rq_state &= ~RQ_NET_QUEUED;
508 req->rq_state |= RQ_NET_SENT;
509 /* because _drbd_send_zc_bio could sleep, and may want to
Andreas Gruenbacher8554df12011-01-25 15:37:43 +0100510 * dereference the bio even after the "WRITE_ACKED_BY_PEER" and
511 * "COMPLETED_OK" events came in, once we return from
Philipp Reisnerb411b362009-09-25 16:07:19 -0700512 * _drbd_send_zc_bio (drbd_send_dblock), we have to check
513 * whether it is done already, and end it. */
Philipp Reisnercfa03412010-06-23 17:18:51 +0200514 _req_may_be_done_not_susp(req, m);
Philipp Reisnerb411b362009-09-25 16:07:19 -0700515 break;
516
Andreas Gruenbacher8554df12011-01-25 15:37:43 +0100517 case READ_RETRY_REMOTE_CANCELED:
Lars Ellenbergd255e5f2010-05-27 09:45:45 +0200518 req->rq_state &= ~RQ_NET_QUEUED;
519 /* fall through, in case we raced with drbd_disconnect */
Andreas Gruenbacher8554df12011-01-25 15:37:43 +0100520 case CONNECTION_LOST_WHILE_PENDING:
Philipp Reisnerb411b362009-09-25 16:07:19 -0700521 /* transfer log cleanup after connection loss */
522 /* assert something? */
523 if (req->rq_state & RQ_NET_PENDING)
524 dec_ap_pending(mdev);
525 req->rq_state &= ~(RQ_NET_OK|RQ_NET_PENDING);
526 req->rq_state |= RQ_NET_DONE;
Philipp Reisner759fbdf2010-10-26 16:02:27 +0200527 if (req->rq_state & RQ_NET_SENT && req->rq_state & RQ_WRITE)
Andreas Gruenbacherace652a2011-01-03 17:09:58 +0100528 atomic_sub(req->i.size >> 9, &mdev->ap_in_flight);
Philipp Reisner759fbdf2010-10-26 16:02:27 +0200529
Philipp Reisnerb411b362009-09-25 16:07:19 -0700530 /* if it is still queued, we may not complete it here.
531 * it will be canceled soon. */
532 if (!(req->rq_state & RQ_NET_QUEUED))
Philipp Reisnercfa03412010-06-23 17:18:51 +0200533 _req_may_be_done(req, m); /* Allowed while state.susp */
Philipp Reisnerb411b362009-09-25 16:07:19 -0700534 break;
535
Andreas Gruenbacher8554df12011-01-25 15:37:43 +0100536 case WRITE_ACKED_BY_PEER_AND_SIS:
Philipp Reisnerb411b362009-09-25 16:07:19 -0700537 req->rq_state |= RQ_NET_SIS;
Andreas Gruenbacher8554df12011-01-25 15:37:43 +0100538 case CONFLICT_DISCARDED_BY_PEER:
Philipp Reisnerb411b362009-09-25 16:07:19 -0700539 /* for discarded conflicting writes of multiple primaries,
540 * there is no need to keep anything in the tl, potential
541 * node crashes are covered by the activity log. */
Andreas Gruenbacher8554df12011-01-25 15:37:43 +0100542 if (what == CONFLICT_DISCARDED_BY_PEER)
Philipp Reisnerb411b362009-09-25 16:07:19 -0700543 dev_alert(DEV, "Got DiscardAck packet %llus +%u!"
544 " DRBD is not a random data generator!\n",
Andreas Gruenbacherace652a2011-01-03 17:09:58 +0100545 (unsigned long long)req->i.sector, req->i.size);
Philipp Reisnerb411b362009-09-25 16:07:19 -0700546 req->rq_state |= RQ_NET_DONE;
547 /* fall through */
Andreas Gruenbacher8554df12011-01-25 15:37:43 +0100548 case WRITE_ACKED_BY_PEER:
Philipp Reisnerb411b362009-09-25 16:07:19 -0700549 /* protocol C; successfully written on peer.
550 * Nothing to do here.
551 * We want to keep the tl in place for all protocols, to cater
552 * for volatile write-back caches on lower level devices.
553 *
554 * A barrier request is expected to have forced all prior
555 * requests onto stable storage, so completion of a barrier
556 * request could set NET_DONE right here, and not wait for the
557 * P_BARRIER_ACK, but that is an unnecessary optimization. */
558
559 /* this makes it effectively the same as for: */
Andreas Gruenbacher8554df12011-01-25 15:37:43 +0100560 case RECV_ACKED_BY_PEER:
Philipp Reisnerb411b362009-09-25 16:07:19 -0700561 /* protocol B; pretends to be successfully written on peer.
Andreas Gruenbacher8554df12011-01-25 15:37:43 +0100562 * see also notes above in HANDED_OVER_TO_NETWORK about
Philipp Reisnerb411b362009-09-25 16:07:19 -0700563 * protocol != C */
564 req->rq_state |= RQ_NET_OK;
565 D_ASSERT(req->rq_state & RQ_NET_PENDING);
566 dec_ap_pending(mdev);
Andreas Gruenbacherace652a2011-01-03 17:09:58 +0100567 atomic_sub(req->i.size >> 9, &mdev->ap_in_flight);
Philipp Reisnerb411b362009-09-25 16:07:19 -0700568 req->rq_state &= ~RQ_NET_PENDING;
Philipp Reisnercfa03412010-06-23 17:18:51 +0200569 _req_may_be_done_not_susp(req, m);
Philipp Reisnerb411b362009-09-25 16:07:19 -0700570 break;
571
Andreas Gruenbacher8554df12011-01-25 15:37:43 +0100572 case NEG_ACKED:
Philipp Reisnerb411b362009-09-25 16:07:19 -0700573 /* assert something? */
Philipp Reisner759fbdf2010-10-26 16:02:27 +0200574 if (req->rq_state & RQ_NET_PENDING) {
Philipp Reisnerb411b362009-09-25 16:07:19 -0700575 dec_ap_pending(mdev);
Andreas Gruenbacherace652a2011-01-03 17:09:58 +0100576 atomic_sub(req->i.size >> 9, &mdev->ap_in_flight);
Philipp Reisner759fbdf2010-10-26 16:02:27 +0200577 }
Philipp Reisnerb411b362009-09-25 16:07:19 -0700578 req->rq_state &= ~(RQ_NET_OK|RQ_NET_PENDING);
579
580 req->rq_state |= RQ_NET_DONE;
Philipp Reisnercfa03412010-06-23 17:18:51 +0200581 _req_may_be_done_not_susp(req, m);
Andreas Gruenbacher8554df12011-01-25 15:37:43 +0100582 /* else: done by HANDED_OVER_TO_NETWORK */
Philipp Reisnerb411b362009-09-25 16:07:19 -0700583 break;
584
Andreas Gruenbacher8554df12011-01-25 15:37:43 +0100585 case FAIL_FROZEN_DISK_IO:
Philipp Reisner265be2d2010-05-31 10:14:17 +0200586 if (!(req->rq_state & RQ_LOCAL_COMPLETED))
587 break;
588
Philipp Reisnercfa03412010-06-23 17:18:51 +0200589 _req_may_be_done(req, m); /* Allowed while state.susp */
Philipp Reisner265be2d2010-05-31 10:14:17 +0200590 break;
591
Andreas Gruenbacher8554df12011-01-25 15:37:43 +0100592 case RESTART_FROZEN_DISK_IO:
Philipp Reisner265be2d2010-05-31 10:14:17 +0200593 if (!(req->rq_state & RQ_LOCAL_COMPLETED))
594 break;
595
596 req->rq_state &= ~RQ_LOCAL_COMPLETED;
597
598 rv = MR_READ;
599 if (bio_data_dir(req->master_bio) == WRITE)
600 rv = MR_WRITE;
601
602 get_ldev(mdev);
603 req->w.cb = w_restart_disk_io;
Philipp Reisnere42325a2011-01-19 13:55:45 +0100604 drbd_queue_work(&mdev->tconn->data.work, &req->w);
Philipp Reisner265be2d2010-05-31 10:14:17 +0200605 break;
606
Andreas Gruenbacher8554df12011-01-25 15:37:43 +0100607 case RESEND:
Philipp Reisner11b58e72010-05-12 17:08:26 +0200608 /* If RQ_NET_OK is already set, we got a P_WRITE_ACK or P_RECV_ACK
Philipp Reisner47ff2d02010-06-18 13:56:57 +0200609 before the connection loss (B&C only); only P_BARRIER_ACK was missing.
Philipp Reisner11b58e72010-05-12 17:08:26 +0200610 Trowing them out of the TL here by pretending we got a BARRIER_ACK
Philipp Reisner481c6f52010-06-22 14:03:27 +0200611 We ensure that the peer was not rebooted */
Philipp Reisner11b58e72010-05-12 17:08:26 +0200612 if (!(req->rq_state & RQ_NET_OK)) {
613 if (req->w.cb) {
Philipp Reisnere42325a2011-01-19 13:55:45 +0100614 drbd_queue_work(&mdev->tconn->data.work, &req->w);
Philipp Reisner11b58e72010-05-12 17:08:26 +0200615 rv = req->rq_state & RQ_WRITE ? MR_WRITE : MR_READ;
616 }
617 break;
618 }
Andreas Gruenbacher8554df12011-01-25 15:37:43 +0100619 /* else, fall through to BARRIER_ACKED */
Philipp Reisner11b58e72010-05-12 17:08:26 +0200620
Andreas Gruenbacher8554df12011-01-25 15:37:43 +0100621 case BARRIER_ACKED:
Philipp Reisner288f4222010-05-27 15:07:43 +0200622 if (!(req->rq_state & RQ_WRITE))
623 break;
624
Philipp Reisnerb411b362009-09-25 16:07:19 -0700625 if (req->rq_state & RQ_NET_PENDING) {
626 /* barrier came in before all requests have been acked.
627 * this is bad, because if the connection is lost now,
628 * we won't be able to clean them up... */
Andreas Gruenbacher8554df12011-01-25 15:37:43 +0100629 dev_err(DEV, "FIXME (BARRIER_ACKED but pending)\n");
Philipp Reisner87eeee42011-01-19 14:16:30 +0100630 list_move(&req->tl_requests, &mdev->tconn->out_of_sequence_requests);
Philipp Reisnerb411b362009-09-25 16:07:19 -0700631 }
Lars Ellenberge636db52011-01-21 17:10:37 +0100632 if ((req->rq_state & RQ_NET_MASK) != 0) {
633 req->rq_state |= RQ_NET_DONE;
Philipp Reisner89e58e72011-01-19 13:12:45 +0100634 if (mdev->tconn->net_conf->wire_protocol == DRBD_PROT_A)
635 atomic_sub(req->i.size>>9, &mdev->ap_in_flight);
Lars Ellenberge636db52011-01-21 17:10:37 +0100636 }
Philipp Reisnercfa03412010-06-23 17:18:51 +0200637 _req_may_be_done(req, m); /* Allowed while state.susp */
Philipp Reisnerb411b362009-09-25 16:07:19 -0700638 break;
639
Andreas Gruenbacher8554df12011-01-25 15:37:43 +0100640 case DATA_RECEIVED:
Philipp Reisnerb411b362009-09-25 16:07:19 -0700641 D_ASSERT(req->rq_state & RQ_NET_PENDING);
642 dec_ap_pending(mdev);
643 req->rq_state &= ~RQ_NET_PENDING;
644 req->rq_state |= (RQ_NET_OK|RQ_NET_DONE);
Philipp Reisnercfa03412010-06-23 17:18:51 +0200645 _req_may_be_done_not_susp(req, m);
Philipp Reisnerb411b362009-09-25 16:07:19 -0700646 break;
647 };
Philipp Reisner2a806992010-06-09 14:07:43 +0200648
649 return rv;
Philipp Reisnerb411b362009-09-25 16:07:19 -0700650}
651
652/* we may do a local read if:
653 * - we are consistent (of course),
654 * - or we are generally inconsistent,
655 * BUT we are still/already IN SYNC for this area.
656 * since size may be bigger than BM_BLOCK_SIZE,
657 * we may need to check several bits.
658 */
659static int drbd_may_do_local_read(struct drbd_conf *mdev, sector_t sector, int size)
660{
661 unsigned long sbnr, ebnr;
662 sector_t esector, nr_sectors;
663
664 if (mdev->state.disk == D_UP_TO_DATE)
665 return 1;
666 if (mdev->state.disk >= D_OUTDATED)
667 return 0;
668 if (mdev->state.disk < D_INCONSISTENT)
669 return 0;
670 /* state.disk == D_INCONSISTENT We will have a look at the BitMap */
671 nr_sectors = drbd_get_capacity(mdev->this_bdev);
672 esector = sector + (size >> 9) - 1;
673
674 D_ASSERT(sector < nr_sectors);
675 D_ASSERT(esector < nr_sectors);
676
677 sbnr = BM_SECT_TO_BIT(sector);
678 ebnr = BM_SECT_TO_BIT(esector);
679
680 return 0 == drbd_bm_count_bits(mdev, sbnr, ebnr);
681}
682
/*
 * complete_conflicting_writes - wait for any conflicting write requests
 *
 * The write_requests tree contains all active write requests which we
 * currently know about. Wait for any requests to complete which conflict with
 * the new one.
 *
 * Must be entered with mdev->tconn->req_lock held (it is dropped around
 * the sleep and re-acquired before returning; see the unlock/lock pair
 * below).
 *
 * Returns 0 once no overlapping request remains, or -ERESTARTSYS if a
 * signal was pending after waking up (the lock is held in both cases).
 */
static int complete_conflicting_writes(struct drbd_conf *mdev,
				       sector_t sector, int size)
{
	for(;;) {
		DEFINE_WAIT(wait);
		struct drbd_interval *i;

		/* re-scan on every iteration: while we slept, the conflicting
		 * request may have completed and new ones may have appeared */
		i = drbd_find_overlap(&mdev->write_requests, sector, size);
		if (!i)
			return 0;
		/* flag the interval so its completion path knows someone is
		 * waiting on misc_wait (NOTE(review): inferred from the
		 * ->waiting flag; confirm against the completion side) */
		i->waiting = true;
		/* register on the wait queue *before* dropping the lock, so a
		 * wake-up between unlock and schedule() cannot be missed */
		prepare_to_wait(&mdev->misc_wait, &wait, TASK_INTERRUPTIBLE);
		spin_unlock_irq(&mdev->tconn->req_lock);
		schedule();
		finish_wait(&mdev->misc_wait, &wait);
		spin_lock_irq(&mdev->tconn->req_lock);
		if (signal_pending(current))
			return -ERESTARTSYS;
	}
}
710
/**
 * drbd_make_request_common() - route one bio to local disk and/or peer
 * @mdev:	DRBD device this bio is addressed to
 * @bio:	incoming bio; the caller guarantees it does not cross a
 *		hash-slot boundary (drbd_make_request() splits otherwise)
 * @start_time:	jiffies when the bio entered DRBD, kept in the request
 *		for the request timeout logic (request_timer_fn)
 *
 * Decides whether the request is served locally, remotely, or both,
 * allocates the drbd_request, possibly opens a new epoch (barrier),
 * and queues the request for network and/or local submission.
 *
 * Return: 0 if the bio was consumed (queued or completed with an error),
 * 1 if the device is suspended and generic_make_request()'s retry
 * mechanism should resubmit the bio (see the is_susp() branch below).
 */
static int drbd_make_request_common(struct drbd_conf *mdev, struct bio *bio, unsigned long start_time)
{
	const int rw = bio_rw(bio);
	const int size = bio->bi_size;
	const sector_t sector = bio->bi_sector;
	struct drbd_tl_epoch *b = NULL;
	struct drbd_request *req;
	int local, remote, send_oos = 0;
	int err;
	int ret = 0;

	/* allocate outside of all locks; */
	req = drbd_req_new(mdev, bio);
	if (!req) {
		dec_ap_bio(mdev);
		/* only pass the error to the upper layers.
		 * if user cannot handle io errors, that's not our business. */
		dev_err(DEV, "could not kmalloc() req\n");
		bio_endio(bio, -ENOMEM);
		return 0;
	}
	req->start_time = start_time;

	/* "local" means the local backing device is usable for this bio */
	local = get_ldev(mdev);
	if (!local) {
		bio_put(req->private_bio); /* or we get a bio leak */
		req->private_bio = NULL;
	}
	if (rw == WRITE) {
		remote = 1;
	} else {
		/* READ || READA */
		if (local) {
			if (!drbd_may_do_local_read(mdev, sector, size)) {
				/* we could kick the syncer to
				 * sync this extent asap, wait for
				 * it, then continue locally.
				 * Or just issue the request remotely.
				 */
				local = 0;
				bio_put(req->private_bio);
				req->private_bio = NULL;
				put_ldev(mdev);
			}
		}
		remote = !local && mdev->state.pdsk >= D_UP_TO_DATE;
	}

	/* If we have a disk, but a READA request is mapped to remote,
	 * we are R_PRIMARY, D_INCONSISTENT, SyncTarget.
	 * Just fail that READA request right here.
	 *
	 * THINK: maybe fail all READA when not local?
	 * or make this configurable...
	 * if network is slow, READA won't do any good.
	 */
	if (rw == READA && mdev->state.disk >= D_INCONSISTENT && !local) {
		err = -EWOULDBLOCK;
		goto fail_and_free_req;
	}

	/* For WRITES going to the local disk, grab a reference on the target
	 * extent. This waits for any resync activity in the corresponding
	 * resync extent to finish, and, if necessary, pulls in the target
	 * extent into the activity log, which involves further disk io because
	 * of transactional on-disk meta data updates. */
	if (rw == WRITE && local && !test_bit(AL_SUSPENDED, &mdev->flags)) {
		req->rq_state |= RQ_IN_ACT_LOG;
		drbd_al_begin_io(mdev, sector);
	}

	/* re-check against the current replication state: a write may have to
	 * go to the peer (remote) or merely be marked out-of-sync (send_oos);
	 * the two are mutually exclusive */
	remote = remote && drbd_should_do_remote(mdev->state);
	send_oos = rw == WRITE && drbd_should_send_oos(mdev->state);
	D_ASSERT(!(remote && send_oos));

	if (!(local || remote) && !is_susp(mdev->state)) {
		if (__ratelimit(&drbd_ratelimit_state))
			dev_err(DEV, "IO ERROR: neither local nor remote disk\n");
		err = -EIO;
		goto fail_free_complete;
	}

	/* For WRITE request, we have to make sure that we have an
	 * unused_spare_tle, in case we need to start a new epoch.
	 * I try to be smart and avoid to pre-allocate always "just in case",
	 * but there is a race between testing the bit and pointer outside the
	 * spinlock, and grabbing the spinlock.
	 * if we lost that race, we retry. */
	if (rw == WRITE && (remote || send_oos) &&
	    mdev->tconn->unused_spare_tle == NULL &&
	    test_bit(CREATE_BARRIER, &mdev->flags)) {
allocate_barrier:
		b = kmalloc(sizeof(struct drbd_tl_epoch), GFP_NOIO);
		if (!b) {
			dev_err(DEV, "Failed to alloc barrier.\n");
			err = -ENOMEM;
			goto fail_free_complete;
		}
	}

	/* GOOD, everything prepared, grab the spin_lock */
	spin_lock_irq(&mdev->tconn->req_lock);

	if (rw == WRITE) {
		/* may drop and re-acquire req_lock while sleeping */
		err = complete_conflicting_writes(mdev, sector, size);
		if (err) {
			spin_unlock_irq(&mdev->tconn->req_lock);
			goto fail_free_complete;
		}
	}

	if (is_susp(mdev->state)) {
		/* If we got suspended, use the retry mechanism of
		   generic_make_request() to restart processing of this
		   bio. In the next call to drbd_make_request
		   we sleep in inc_ap_bio() */
		ret = 1;
		spin_unlock_irq(&mdev->tconn->req_lock);
		goto fail_free_complete;
	}

	if (remote || send_oos) {
		/* re-evaluate under the lock: the connection state may have
		 * changed while we were preparing outside of it */
		remote = drbd_should_do_remote(mdev->state);
		send_oos = rw == WRITE && drbd_should_send_oos(mdev->state);
		D_ASSERT(!(remote && send_oos));

		if (!(remote || send_oos))
			dev_warn(DEV, "lost connection while grabbing the req_lock!\n");
		if (!(local || remote)) {
			dev_err(DEV, "IO ERROR: neither local nor remote disk\n");
			spin_unlock_irq(&mdev->tconn->req_lock);
			err = -EIO;
			goto fail_free_complete;
		}
	}

	/* stash our pre-allocated epoch object, if still needed */
	if (b && mdev->tconn->unused_spare_tle == NULL) {
		mdev->tconn->unused_spare_tle = b;
		b = NULL;
	}
	if (rw == WRITE && (remote || send_oos) &&
	    mdev->tconn->unused_spare_tle == NULL &&
	    test_bit(CREATE_BARRIER, &mdev->flags)) {
		/* someone closed the current epoch
		 * while we were grabbing the spinlock */
		spin_unlock_irq(&mdev->tconn->req_lock);
		goto allocate_barrier;
	}


	/* Update disk stats */
	_drbd_start_io_acct(mdev, req, bio);

	/* _maybe_start_new_epoch(mdev);
	 * If we need to generate a write barrier packet, we have to add the
	 * new epoch (barrier) object, and queue the barrier packet for sending,
	 * and queue the req's data after it _within the same lock_, otherwise
	 * we have race conditions were the reorder domains could be mixed up.
	 *
	 * Even read requests may start a new epoch and queue the corresponding
	 * barrier packet. To get the write ordering right, we only have to
	 * make sure that, if this is a write request and it triggered a
	 * barrier packet, this request is queued within the same spinlock. */
	if ((remote || send_oos) && mdev->tconn->unused_spare_tle &&
	    test_and_clear_bit(CREATE_BARRIER, &mdev->flags)) {
		_tl_add_barrier(mdev, mdev->tconn->unused_spare_tle);
		mdev->tconn->unused_spare_tle = NULL;
	} else {
		D_ASSERT(!(remote && rw == WRITE &&
			   test_bit(CREATE_BARRIER, &mdev->flags)));
	}

	/* NOTE
	 * Actually, 'local' may be wrong here already, since we may have failed
	 * to write to the meta data, and may become wrong anytime because of
	 * local io-error for some other request, which would lead to us
	 * "detaching" the local disk.
	 *
	 * 'remote' may become wrong any time because the network could fail.
	 *
	 * This is a harmless race condition, though, since it is handled
	 * correctly at the appropriate places; so it just defers the failure
	 * of the respective operation.
	 */

	/* mark them early for readability.
	 * this just sets some state flags. */
	if (remote)
		_req_mod(req, TO_BE_SENT);
	if (local)
		_req_mod(req, TO_BE_SUBMITTED);

	/* register the request in the current (newest) epoch */
	list_add_tail(&req->tl_requests, &mdev->tconn->newest_tle->requests);

	/* NOTE remote first: to get the concurrent write detection right,
	 * we must register the request before start of local IO. */
	if (remote) {
		/* either WRITE and C_CONNECTED,
		 * or READ, and no local disk,
		 * or READ, but not in sync.
		 */
		_req_mod(req, (rw == WRITE)
				? QUEUE_FOR_NET_WRITE
				: QUEUE_FOR_NET_READ);
	}
	if (send_oos && drbd_set_out_of_sync(mdev, sector, size))
		_req_mod(req, QUEUE_FOR_SEND_OOS);

	/* congestion handling: only for replicated writes, with a peer that
	 * speaks protocol >= 96 and a policy other than "block" */
	if (remote &&
	    mdev->tconn->net_conf->on_congestion != OC_BLOCK && mdev->tconn->agreed_pro_version >= 96) {
		int congested = 0;

		if (mdev->tconn->net_conf->cong_fill &&
		    atomic_read(&mdev->ap_in_flight) >= mdev->tconn->net_conf->cong_fill) {
			dev_info(DEV, "Congestion-fill threshold reached\n");
			congested = 1;
		}

		if (mdev->act_log->used >= mdev->tconn->net_conf->cong_extents) {
			dev_info(DEV, "Congestion-extents threshold reached\n");
			congested = 1;
		}

		if (congested) {
			queue_barrier(mdev); /* last barrier, after mirrored writes */

			if (mdev->tconn->net_conf->on_congestion == OC_PULL_AHEAD)
				_drbd_set_state(_NS(mdev, conn, C_AHEAD), 0, NULL);
			else  /*mdev->tconn->net_conf->on_congestion == OC_DISCONNECT */
				_drbd_set_state(_NS(mdev, conn, C_DISCONNECTING), 0, NULL);
		}
	}

	spin_unlock_irq(&mdev->tconn->req_lock);
	kfree(b); /* if someone else has beaten us to it... */

	if (local) {
		req->private_bio->bi_bdev = mdev->ldev->backing_bdev;

		/* State may have changed since we grabbed our reference on the
		 * mdev->ldev member. Double check, and short-circuit to endio.
		 * In case the last activity log transaction failed to get on
		 * stable storage, and this is a WRITE, we may not even submit
		 * this bio. */
		if (get_ldev(mdev)) {
			if (drbd_insert_fault(mdev, rw == WRITE ? DRBD_FAULT_DT_WR
						    : rw == READ ? DRBD_FAULT_DT_RD
						    :              DRBD_FAULT_DT_RA))
				bio_endio(req->private_bio, -EIO);
			else
				generic_make_request(req->private_bio);
			put_ldev(mdev);
		} else
			bio_endio(req->private_bio, -EIO);
	}

	return 0;

fail_free_complete:
	/* release the activity-log reference, if we took one above */
	if (req->rq_state & RQ_IN_ACT_LOG)
		drbd_al_complete_io(mdev, sector);
fail_and_free_req:
	if (local) {
		bio_put(req->private_bio);
		req->private_bio = NULL;
		put_ldev(mdev);
	}
	/* on the suspend/retry path (ret == 1) the bio must stay alive
	 * for generic_make_request() to resubmit it */
	if (!ret)
		bio_endio(bio, err);

	drbd_req_free(req);
	dec_ap_bio(mdev);
	kfree(b);

	return ret;
}
987
988/* helper function for drbd_make_request
989 * if we can determine just by the mdev (state) that this request will fail,
990 * return 1
991 * otherwise return 0
992 */
993static int drbd_fail_request_early(struct drbd_conf *mdev, int is_write)
994{
Philipp Reisnerb411b362009-09-25 16:07:19 -0700995 if (mdev->state.role != R_PRIMARY &&
996 (!allow_oos || is_write)) {
997 if (__ratelimit(&drbd_ratelimit_state)) {
998 dev_err(DEV, "Process %s[%u] tried to %s; "
999 "since we are not in Primary state, "
1000 "we cannot allow this\n",
1001 current->comm, current->pid,
1002 is_write ? "WRITE" : "READ");
1003 }
1004 return 1;
1005 }
1006
Philipp Reisnerb411b362009-09-25 16:07:19 -07001007 return 0;
1008}
1009
/**
 * drbd_make_request() - block layer entry point for DRBD devices
 * @q:		the request queue; q->queuedata is our struct drbd_conf
 * @bio:	the bio submitted by the upper layers
 *
 * Rejects requests that cannot succeed in the current role, then hands
 * the bio to drbd_make_request_common() — splitting it first (and
 * taking the required ap_bio references) if it crosses a hash-slot
 * boundary.  Always returns 0: the bio is either queued or completed
 * with an error.
 */
int drbd_make_request(struct request_queue *q, struct bio *bio)
{
	unsigned int s_enr, e_enr;
	struct drbd_conf *mdev = (struct drbd_conf *) q->queuedata;
	unsigned long start_time;

	if (drbd_fail_request_early(mdev, bio_data_dir(bio) & WRITE)) {
		bio_endio(bio, -EPERM);
		return 0;
	}

	/* timestamp for the request timeout logic (request_timer_fn) */
	start_time = jiffies;

	/*
	 * what we "blindly" assume:
	 */
	D_ASSERT(bio->bi_size > 0);
	D_ASSERT((bio->bi_size & 0x1ff) == 0);	/* whole 512-byte sectors */
	D_ASSERT(bio->bi_idx == 0);

	/* to make some things easier, force alignment of requests within the
	 * granularity of our hash tables */
	s_enr = bio->bi_sector >> HT_SHIFT;
	e_enr = (bio->bi_sector+(bio->bi_size>>9)-1) >> HT_SHIFT;

	/* common case: first and last sector fall into the same hash slot */
	if (likely(s_enr == e_enr)) {
		inc_ap_bio(mdev, 1);
		return drbd_make_request_common(mdev, bio, start_time);
	}

	/* can this bio be split generically?
	 * Maybe add our own split-arbitrary-bios function. */
	if (bio->bi_vcnt != 1 || bio->bi_idx != 0 || bio->bi_size > DRBD_MAX_BIO_SIZE) {
		/* rather error out here than BUG in bio_split */
		dev_err(DEV, "bio would need to, but cannot, be split: "
			"(vcnt=%u,idx=%u,size=%u,sector=%llu)\n",
			bio->bi_vcnt, bio->bi_idx, bio->bi_size,
			(unsigned long long)bio->bi_sector);
		bio_endio(bio, -EINVAL);
	} else {
		/* This bio crosses some boundary, so we have to split it. */
		struct bio_pair *bp;
		/* works for the "do not cross hash slot boundaries" case
		 * e.g. sector 262269, size 4096
		 * s_enr = 262269 >> 6 = 4097
		 * e_enr = (262269+8-1) >> 6 = 4098
		 * HT_SHIFT = 6
		 * sps = 64, mask = 63
		 * first_sectors = 64 - (262269 & 63) = 3
		 */
		const sector_t sect = bio->bi_sector;
		const int sps = 1 << HT_SHIFT; /* sectors per slot */
		const int mask = sps - 1;
		const sector_t first_sectors = sps - (sect & mask);
		bp = bio_split(bio, first_sectors);

		/* we need to get a "reference count" (ap_bio_cnt)
		 * to avoid races with the disconnect/reconnect/suspend code.
		 * In case we need to split the bio here, we need to get three references
		 * atomically, otherwise we might deadlock when trying to submit the
		 * second one! */
		inc_ap_bio(mdev, 3);

		D_ASSERT(e_enr == s_enr + 1);

		/* drbd_make_request_common() returns 1 while the device is
		 * suspended, dropping its reference; take a fresh one (this
		 * may sleep in inc_ap_bio, see the comment there) and retry */
		while (drbd_make_request_common(mdev, &bp->bio1, start_time))
			inc_ap_bio(mdev, 1);

		while (drbd_make_request_common(mdev, &bp->bio2, start_time))
			inc_ap_bio(mdev, 1);

		/* drop the third reference taken above */
		dec_ap_bio(mdev);

		bio_pair_release(bp);
	}
	return 0;
}
1087
1088/* This is called by bio_add_page(). With this function we reduce
Lars Ellenberg1816a2b2010-11-11 15:19:07 +01001089 * the number of BIOs that span over multiple DRBD_MAX_BIO_SIZEs
Philipp Reisnerb411b362009-09-25 16:07:19 -07001090 * units (was AL_EXTENTs).
1091 *
1092 * we do the calculation within the lower 32bit of the byte offsets,
1093 * since we don't care for actual offset, but only check whether it
1094 * would cross "activity log extent" boundaries.
1095 *
1096 * As long as the BIO is empty we have to allow at least one bvec,
1097 * regardless of size and offset. so the resulting bio may still
1098 * cross extent boundaries. those are dealt with (bio_split) in
Andreas Gruenbacher2f58dcf2010-12-13 17:48:19 +01001099 * drbd_make_request.
Philipp Reisnerb411b362009-09-25 16:07:19 -07001100 */
1101int drbd_merge_bvec(struct request_queue *q, struct bvec_merge_data *bvm, struct bio_vec *bvec)
1102{
1103 struct drbd_conf *mdev = (struct drbd_conf *) q->queuedata;
1104 unsigned int bio_offset =
1105 (unsigned int)bvm->bi_sector << 9; /* 32 bit */
1106 unsigned int bio_size = bvm->bi_size;
1107 int limit, backing_limit;
1108
Lars Ellenberg1816a2b2010-11-11 15:19:07 +01001109 limit = DRBD_MAX_BIO_SIZE
1110 - ((bio_offset & (DRBD_MAX_BIO_SIZE-1)) + bio_size);
Philipp Reisnerb411b362009-09-25 16:07:19 -07001111 if (limit < 0)
1112 limit = 0;
1113 if (bio_size == 0) {
1114 if (limit <= bvec->bv_len)
1115 limit = bvec->bv_len;
1116 } else if (limit && get_ldev(mdev)) {
1117 struct request_queue * const b =
1118 mdev->ldev->backing_bdev->bd_disk->queue;
Lars Ellenberga1c88d02010-05-14 19:16:41 +02001119 if (b->merge_bvec_fn) {
Philipp Reisnerb411b362009-09-25 16:07:19 -07001120 backing_limit = b->merge_bvec_fn(b, bvm, bvec);
1121 limit = min(limit, backing_limit);
1122 }
1123 put_ldev(mdev);
1124 }
1125 return limit;
1126}
Philipp Reisner7fde2be2011-03-01 11:08:28 +01001127
1128void request_timer_fn(unsigned long data)
1129{
1130 struct drbd_conf *mdev = (struct drbd_conf *) data;
1131 struct drbd_request *req; /* oldest request */
1132 struct list_head *le;
1133 unsigned long et = 0; /* effective timeout = ko_count * timeout */
1134
Philipp Reisnerb2fb6db2011-01-19 13:48:44 +01001135 if (get_net_conf(mdev->tconn)) {
Philipp Reisner89e58e72011-01-19 13:12:45 +01001136 et = mdev->tconn->net_conf->timeout*HZ/10 * mdev->tconn->net_conf->ko_count;
Philipp Reisnerb2fb6db2011-01-19 13:48:44 +01001137 put_net_conf(mdev->tconn);
Philipp Reisner7fde2be2011-03-01 11:08:28 +01001138 }
1139 if (!et || mdev->state.conn < C_WF_REPORT_PARAMS)
1140 return; /* Recurring timer stopped */
1141
Philipp Reisner87eeee42011-01-19 14:16:30 +01001142 spin_lock_irq(&mdev->tconn->req_lock);
1143 le = &mdev->tconn->oldest_tle->requests;
Philipp Reisner7fde2be2011-03-01 11:08:28 +01001144 if (list_empty(le)) {
Philipp Reisner87eeee42011-01-19 14:16:30 +01001145 spin_unlock_irq(&mdev->tconn->req_lock);
Philipp Reisner7fde2be2011-03-01 11:08:28 +01001146 mod_timer(&mdev->request_timer, jiffies + et);
1147 return;
1148 }
1149
1150 le = le->prev;
1151 req = list_entry(le, struct drbd_request, tl_requests);
1152 if (time_is_before_eq_jiffies(req->start_time + et)) {
1153 if (req->rq_state & RQ_NET_PENDING) {
1154 dev_warn(DEV, "Remote failed to finish a request within ko-count * timeout\n");
1155 _drbd_set_state(_NS(mdev, conn, C_TIMEOUT), CS_VERBOSE, NULL);
1156 } else {
1157 dev_warn(DEV, "Local backing block device frozen?\n");
1158 mod_timer(&mdev->request_timer, jiffies + et);
1159 }
1160 } else {
1161 mod_timer(&mdev->request_timer, req->start_time + et);
1162 }
1163
Philipp Reisner87eeee42011-01-19 14:16:30 +01001164 spin_unlock_irq(&mdev->tconn->req_lock);
Philipp Reisner7fde2be2011-03-01 11:08:28 +01001165}