1/*
2 drbd.c
3
4 This file is part of DRBD by Philipp Reisner and Lars Ellenberg.
5
6 Copyright (C) 2001-2008, LINBIT Information Technologies GmbH.
7 Copyright (C) 1999-2008, Philipp Reisner <philipp.reisner@linbit.com>.
8 Copyright (C) 2002-2008, Lars Ellenberg <lars.ellenberg@linbit.com>.
9
10 Thanks to Carter Burden, Bart Grantham and Gennadiy Nerubayev
11 from Logicworks, Inc. for making SDP replication support possible.
12
13 drbd is free software; you can redistribute it and/or modify
14 it under the terms of the GNU General Public License as published by
15 the Free Software Foundation; either version 2, or (at your option)
16 any later version.
17
18 drbd is distributed in the hope that it will be useful,
19 but WITHOUT ANY WARRANTY; without even the implied warranty of
20 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
21 GNU General Public License for more details.
22
23 You should have received a copy of the GNU General Public License
24 along with drbd; see the file COPYING. If not, write to
25 the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
26
27 */
28
29#include <linux/module.h>
30#include <linux/drbd.h>
31#include <asm/uaccess.h>
32#include <asm/types.h>
33#include <net/sock.h>
34#include <linux/ctype.h>
35#include <linux/smp_lock.h>
36#include <linux/fs.h>
37#include <linux/file.h>
38#include <linux/proc_fs.h>
39#include <linux/init.h>
40#include <linux/mm.h>
41#include <linux/memcontrol.h>
42#include <linux/mm_inline.h>
43#include <linux/slab.h>
44#include <linux/random.h>
45#include <linux/reboot.h>
46#include <linux/notifier.h>
47#include <linux/kthread.h>
48
49#define __KERNEL_SYSCALLS__
50#include <linux/unistd.h>
51#include <linux/vmalloc.h>
52
53#include <linux/drbd_limits.h>
54#include "drbd_int.h"
55#include "drbd_req.h" /* only for _req_mod in tl_release and tl_clear */
56
57#include "drbd_vli.h"
58
59struct after_state_chg_work {
60 struct drbd_work w;
61 union drbd_state os;
62 union drbd_state ns;
63 enum chg_state_flags flags;
64 struct completion *done;
65};
66
67int drbdd_init(struct drbd_thread *);
68int drbd_worker(struct drbd_thread *);
69int drbd_asender(struct drbd_thread *);
70
71int drbd_init(void);
72static int drbd_open(struct block_device *bdev, fmode_t mode);
73static int drbd_release(struct gendisk *gd, fmode_t mode);
74static int w_after_state_ch(struct drbd_conf *mdev, struct drbd_work *w, int unused);
75static void after_state_ch(struct drbd_conf *mdev, union drbd_state os,
76 union drbd_state ns, enum chg_state_flags flags);
77static int w_md_sync(struct drbd_conf *mdev, struct drbd_work *w, int unused);
78static void md_sync_timer_fn(unsigned long data);
79static int w_bitmap_io(struct drbd_conf *mdev, struct drbd_work *w, int unused);
80static int w_go_diskless(struct drbd_conf *mdev, struct drbd_work *w, int unused);
81
82MODULE_AUTHOR("Philipp Reisner <phil@linbit.com>, "
83 "Lars Ellenberg <lars@linbit.com>");
84MODULE_DESCRIPTION("drbd - Distributed Replicated Block Device v" REL_VERSION);
85MODULE_VERSION(REL_VERSION);
86MODULE_LICENSE("GPL");
87MODULE_PARM_DESC(minor_count, "Maximum number of drbd devices (1-255)");
88MODULE_ALIAS_BLOCKDEV_MAJOR(DRBD_MAJOR);
89
90#include <linux/moduleparam.h>
91/* allow_open_on_secondary */
92MODULE_PARM_DESC(allow_oos, "DONT USE!");
93/* thanks to these macros, if compiled into the kernel (not-module),
94 * this becomes the boot parameter drbd.minor_count */
95module_param(minor_count, uint, 0444);
96module_param(disable_sendpage, bool, 0644);
97module_param(allow_oos, bool, 0);
98module_param(cn_idx, uint, 0444);
99module_param(proc_details, int, 0644);
100
101#ifdef CONFIG_DRBD_FAULT_INJECTION
102int enable_faults;
103int fault_rate;
104static int fault_count;
105int fault_devs;
106/* bitmap of enabled faults */
107module_param(enable_faults, int, 0664);
108/* fault rate % value - applies to all enabled faults */
109module_param(fault_rate, int, 0664);
110/* count of faults inserted */
111module_param(fault_count, int, 0664);
112/* bitmap of devices to insert faults on */
113module_param(fault_devs, int, 0644);
114#endif
115
116/* module parameter, defined */
117unsigned int minor_count = 32;
118int disable_sendpage;
119int allow_oos;
120unsigned int cn_idx = CN_IDX_DRBD;
121int proc_details; /* Detail level in proc drbd*/
122
123/* Module parameter for setting the user mode helper program
124 * to run. Default is /sbin/drbdadm */
125char usermode_helper[80] = "/sbin/drbdadm";
126
127module_param_string(usermode_helper, usermode_helper, sizeof(usermode_helper), 0644);
128
129/* in 2.6.x, our device mapping and config info contains our virtual gendisks
130 * as member "struct gendisk *vdisk;"
131 */
132struct drbd_conf **minor_table;
133
134struct kmem_cache *drbd_request_cache;
135struct kmem_cache *drbd_ee_cache; /* epoch entries */
136struct kmem_cache *drbd_bm_ext_cache; /* bitmap extents */
137struct kmem_cache *drbd_al_ext_cache; /* activity log extents */
138mempool_t *drbd_request_mempool;
139mempool_t *drbd_ee_mempool;
140
141/* I do not use a standard mempool, because:
142 1) I want to hand out the pre-allocated objects first.
143 2) I want to be able to interrupt sleeping allocation with a signal.
144   Note: This is a singly linked list; the next pointer is the private
145 member of struct page.
146 */
147struct page *drbd_pp_pool;
148spinlock_t drbd_pp_lock;
149int drbd_pp_vacant;
150wait_queue_head_t drbd_pp_wait;
151
152DEFINE_RATELIMIT_STATE(drbd_ratelimit_state, 5 * HZ, 5);
153
154static const struct block_device_operations drbd_ops = {
155	.owner =   THIS_MODULE,
156 .open = drbd_open,
157 .release = drbd_release,
158};
159
160#define ARRY_SIZE(A) (sizeof(A)/sizeof(A[0]))
161
162#ifdef __CHECKER__
163/* When checking with sparse, and this is an inline function, sparse will
164   give tons of false positives. When this is a real function, sparse works.
165 */
166int _get_ldev_if_state(struct drbd_conf *mdev, enum drbd_disk_state mins)
167{
168 int io_allowed;
169
170 atomic_inc(&mdev->local_cnt);
171 io_allowed = (mdev->state.disk >= mins);
172 if (!io_allowed) {
173 if (atomic_dec_and_test(&mdev->local_cnt))
174 wake_up(&mdev->misc_wait);
175 }
176 return io_allowed;
177}
178
179#endif
180
181/**
182 * DOC: The transfer log
183 *
184 * The transfer log is a singly linked list of &struct drbd_tl_epoch objects.
185 * mdev->newest_tle points to the head, mdev->oldest_tle points to the tail
186 * of the list. There is always at least one &struct drbd_tl_epoch object.
187 *
188 * Each &struct drbd_tl_epoch has a circular double linked list of requests
189 * attached.
190 */
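/*
 * Illustrative sketch (added for this edit, not part of the driver): with
 * two epochs in flight the transfer log looks roughly like
 *
 *	mdev->oldest_tle --> [ epoch #4711 ] --> [ epoch #4712 ] <-- mdev->newest_tle
 *	                       |-> requests|       |-> requests|
 *
 * where each drbd_tl_epoch carries the circular list of write requests
 * issued between two barrier packets.
 */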
191static int tl_init(struct drbd_conf *mdev)
192{
193 struct drbd_tl_epoch *b;
194
195 /* during device minor initialization, we may well use GFP_KERNEL */
196 b = kmalloc(sizeof(struct drbd_tl_epoch), GFP_KERNEL);
197 if (!b)
198 return 0;
199 INIT_LIST_HEAD(&b->requests);
200 INIT_LIST_HEAD(&b->w.list);
201 b->next = NULL;
202 b->br_number = 4711;
203	b->n_writes = 0;
204	b->w.cb = NULL; /* if this is != NULL, we need to dec_ap_pending in tl_clear */
205
206 mdev->oldest_tle = b;
207 mdev->newest_tle = b;
208 INIT_LIST_HEAD(&mdev->out_of_sequence_requests);
209
210 mdev->tl_hash = NULL;
211 mdev->tl_hash_s = 0;
212
213 return 1;
214}
215
216static void tl_cleanup(struct drbd_conf *mdev)
217{
218 D_ASSERT(mdev->oldest_tle == mdev->newest_tle);
219 D_ASSERT(list_empty(&mdev->out_of_sequence_requests));
220 kfree(mdev->oldest_tle);
221 mdev->oldest_tle = NULL;
222 kfree(mdev->unused_spare_tle);
223 mdev->unused_spare_tle = NULL;
224 kfree(mdev->tl_hash);
225 mdev->tl_hash = NULL;
226 mdev->tl_hash_s = 0;
227}
228
229/**
230 * _tl_add_barrier() - Adds a barrier to the transfer log
231 * @mdev: DRBD device.
232 * @new: Barrier to be added before the current head of the TL.
233 *
234 * The caller must hold the req_lock.
235 */
236void _tl_add_barrier(struct drbd_conf *mdev, struct drbd_tl_epoch *new)
237{
238 struct drbd_tl_epoch *newest_before;
239
240 INIT_LIST_HEAD(&new->requests);
241 INIT_LIST_HEAD(&new->w.list);
242 new->w.cb = NULL; /* if this is != NULL, we need to dec_ap_pending in tl_clear */
243 new->next = NULL;
244	new->n_writes = 0;
245
246 newest_before = mdev->newest_tle;
247 /* never send a barrier number == 0, because that is special-cased
248 * when using TCQ for our write ordering code */
249 new->br_number = (newest_before->br_number+1) ?: 1;
250 if (mdev->newest_tle != new) {
251 mdev->newest_tle->next = new;
252 mdev->newest_tle = new;
253 }
254}
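/*
 * Usage sketch (illustrative only; the allocation flag is an assumption,
 * the real call sites live outside this excerpt): a caller opening a new
 * epoch allocates the object itself and adds it under the request lock:
 *
 *	struct drbd_tl_epoch *b = kmalloc(sizeof(*b), GFP_NOIO);
 *	if (b) {
 *		spin_lock_irq(&mdev->req_lock);
 *		_tl_add_barrier(mdev, b);
 *		spin_unlock_irq(&mdev->req_lock);
 *	}
 */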
255
256/**
257 * tl_release() - Free or recycle the oldest &struct drbd_tl_epoch object of the TL
258 * @mdev: DRBD device.
259 * @barrier_nr: Expected identifier of the DRBD write barrier packet.
260 * @set_size: Expected number of requests before that barrier.
261 *
262 * In case the passed barrier_nr or set_size does not match the oldest
263 * &struct drbd_tl_epoch objects this function will cause a termination
264 * of the connection.
265 */
266void tl_release(struct drbd_conf *mdev, unsigned int barrier_nr,
267 unsigned int set_size)
268{
269 struct drbd_tl_epoch *b, *nob; /* next old barrier */
270 struct list_head *le, *tle;
271 struct drbd_request *r;
272
273 spin_lock_irq(&mdev->req_lock);
274
275 b = mdev->oldest_tle;
276
277 /* first some paranoia code */
278 if (b == NULL) {
279 dev_err(DEV, "BAD! BarrierAck #%u received, but no epoch in tl!?\n",
280 barrier_nr);
281 goto bail;
282 }
283 if (b->br_number != barrier_nr) {
284 dev_err(DEV, "BAD! BarrierAck #%u received, expected #%u!\n",
285 barrier_nr, b->br_number);
286 goto bail;
287 }
288	if (b->n_writes != set_size) {
289		dev_err(DEV, "BAD! BarrierAck #%u received with n_writes=%u, expected n_writes=%u!\n",
290			barrier_nr, set_size, b->n_writes);
291		goto bail;
292 }
293
294 /* Clean up list of requests processed during current epoch */
295 list_for_each_safe(le, tle, &b->requests) {
296 r = list_entry(le, struct drbd_request, tl_requests);
297 _req_mod(r, barrier_acked);
298 }
299 /* There could be requests on the list waiting for completion
300 of the write to the local disk. To avoid corruptions of
301	   slab's data structures we have to remove the list's head.
302
303 Also there could have been a barrier ack out of sequence, overtaking
304 the write acks - which would be a bug and violating write ordering.
305 To not deadlock in case we lose connection while such requests are
306 still pending, we need some way to find them for the
307	   _req_mod(connection_lost_while_pending).
308
309 These have been list_move'd to the out_of_sequence_requests list in
310 _req_mod(, barrier_acked) above.
311 */
312 list_del_init(&b->requests);
313
314 nob = b->next;
315 if (test_and_clear_bit(CREATE_BARRIER, &mdev->flags)) {
316 _tl_add_barrier(mdev, b);
317 if (nob)
318 mdev->oldest_tle = nob;
319 /* if nob == NULL b was the only barrier, and becomes the new
320 barrier. Therefore mdev->oldest_tle points already to b */
321 } else {
322 D_ASSERT(nob != NULL);
323 mdev->oldest_tle = nob;
324 kfree(b);
325 }
326
327 spin_unlock_irq(&mdev->req_lock);
328 dec_ap_pending(mdev);
329
330 return;
331
332bail:
333 spin_unlock_irq(&mdev->req_lock);
334 drbd_force_state(mdev, NS(conn, C_PROTOCOL_ERROR));
335}
336
337/**
338 * _tl_restart() - Walks the transfer log, and applies an action to all requests
339 * @mdev: DRBD device.
340 * @what: The action/event to perform with all request objects
341 *
342 * @what might be one of connection_lost_while_pending, resend, fail_frozen_disk_io,
343 * restart_frozen_disk_io.
344 */
345static void _tl_restart(struct drbd_conf *mdev, enum drbd_req_event what)
346{
347 struct drbd_tl_epoch *b, *tmp, **pn;
348	struct list_head *le, *tle, carry_reads;
349	struct drbd_request *req;
350 int rv, n_writes, n_reads;
351
352 b = mdev->oldest_tle;
353 pn = &mdev->oldest_tle;
354 while (b) {
355 n_writes = 0;
356 n_reads = 0;
357		INIT_LIST_HEAD(&carry_reads);
358		list_for_each_safe(le, tle, &b->requests) {
359 req = list_entry(le, struct drbd_request, tl_requests);
360 rv = _req_mod(req, what);
361
362 n_writes += (rv & MR_WRITE) >> MR_WRITE_SHIFT;
363 n_reads += (rv & MR_READ) >> MR_READ_SHIFT;
364 }
365 tmp = b->next;
366
367		if (n_writes) {
368			if (what == resend) {
369 b->n_writes = n_writes;
370 if (b->w.cb == NULL) {
371 b->w.cb = w_send_barrier;
372 inc_ap_pending(mdev);
373 set_bit(CREATE_BARRIER, &mdev->flags);
374 }
375
376 drbd_queue_work(&mdev->data.work, &b->w);
377 }
378 pn = &b->next;
379 } else {
380			if (n_reads)
381				list_add(&carry_reads, &b->requests);
382			/* there could still be requests on that ring list,
383 * in case local io is still pending */
384 list_del(&b->requests);
385
386 /* dec_ap_pending corresponding to queue_barrier.
387 * the newest barrier may not have been queued yet,
388 * in which case w.cb is still NULL. */
389 if (b->w.cb != NULL)
390 dec_ap_pending(mdev);
391
392 if (b == mdev->newest_tle) {
393 /* recycle, but reinit! */
394 D_ASSERT(tmp == NULL);
395 INIT_LIST_HEAD(&b->requests);
396				list_splice(&carry_reads, &b->requests);
397				INIT_LIST_HEAD(&b->w.list);
398 b->w.cb = NULL;
399 b->br_number = net_random();
400 b->n_writes = 0;
401
402 *pn = b;
403 break;
404 }
405 *pn = tmp;
406 kfree(b);
407 }
408 b = tmp;
409		list_splice(&carry_reads, &b->requests);
410	}
411}
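/*
 * Note on the n_writes/n_reads bookkeeping above (sketch; MR_WRITE, MR_READ
 * and the *_SHIFT constants are defined elsewhere in drbd): _req_mod()
 * returns a bit mask, so for example
 *
 *	rv = MR_WRITE;					// request counted as a write
 *	n_writes += (rv & MR_WRITE) >> MR_WRITE_SHIFT;	// adds exactly 1
 *	n_reads  += (rv & MR_READ)  >> MR_READ_SHIFT;	// adds 0
 *
 * which is why a barrier is re-queued only for epochs that still carry
 * writes, while pure reads are carried over via the carry_reads list.
 */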
412
413
414/**
415 * tl_clear() - Clears all requests and &struct drbd_tl_epoch objects out of the TL
416 * @mdev: DRBD device.
417 *
418 * This is called after the connection to the peer was lost. The storage covered
419 * by the requests on the transfer gets marked as our of sync. Called from the
420 * receiver thread and the worker thread.
421 */
422void tl_clear(struct drbd_conf *mdev)
423{
424	struct list_head *le, *tle;
425	struct drbd_request *r;
426
427 spin_lock_irq(&mdev->req_lock);
428
429	_tl_restart(mdev, connection_lost_while_pending);
430
431 /* we expect this list to be empty. */
432 D_ASSERT(list_empty(&mdev->out_of_sequence_requests));
433
434 /* but just in case, clean it up anyways! */
435 list_for_each_safe(le, tle, &mdev->out_of_sequence_requests) {
436 r = list_entry(le, struct drbd_request, tl_requests);
437 /* It would be nice to complete outside of spinlock.
438 * But this is easier for now. */
439 _req_mod(r, connection_lost_while_pending);
440 }
441
442 /* ensure bit indicating barrier is required is clear */
443 clear_bit(CREATE_BARRIER, &mdev->flags);
444
445	memset(mdev->app_reads_hash, 0, APP_R_HSIZE*sizeof(void *));
446
447	spin_unlock_irq(&mdev->req_lock);
448}
449
450void tl_restart(struct drbd_conf *mdev, enum drbd_req_event what)
451{
452 spin_lock_irq(&mdev->req_lock);
453 _tl_restart(mdev, what);
454 spin_unlock_irq(&mdev->req_lock);
455}
456
457/**
458 * cl_wide_st_chg() - TRUE if the state change is a cluster wide one
459 * @mdev: DRBD device.
460 * @os: old (current) state.
461 * @ns: new (wanted) state.
462 */
463static int cl_wide_st_chg(struct drbd_conf *mdev,
464 union drbd_state os, union drbd_state ns)
465{
466 return (os.conn >= C_CONNECTED && ns.conn >= C_CONNECTED &&
467 ((os.role != R_PRIMARY && ns.role == R_PRIMARY) ||
468 (os.conn != C_STARTING_SYNC_T && ns.conn == C_STARTING_SYNC_T) ||
469 (os.conn != C_STARTING_SYNC_S && ns.conn == C_STARTING_SYNC_S) ||
470 (os.disk != D_DISKLESS && ns.disk == D_DISKLESS))) ||
471 (os.conn >= C_CONNECTED && ns.conn == C_DISCONNECTING) ||
472 (os.conn == C_CONNECTED && ns.conn == C_VERIFY_S);
473}
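/*
 * Example (sketch): promoting a connected Secondary is a cluster wide
 * change, promoting a StandAlone node is not:
 *
 *	os.conn = C_CONNECTED;  os.role = R_SECONDARY;
 *	ns = os;  ns.role = R_PRIMARY;
 *	cl_wide_st_chg(mdev, os, ns);	// true, the peer must be asked first
 *
 *	os.conn = C_STANDALONE;  ns = os;  ns.role = R_PRIMARY;
 *	cl_wide_st_chg(mdev, os, ns);	// false, purely local decision
 */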
474
475int drbd_change_state(struct drbd_conf *mdev, enum chg_state_flags f,
476 union drbd_state mask, union drbd_state val)
477{
478 unsigned long flags;
479 union drbd_state os, ns;
480 int rv;
481
482 spin_lock_irqsave(&mdev->req_lock, flags);
483 os = mdev->state;
484 ns.i = (os.i & ~mask.i) | val.i;
485 rv = _drbd_set_state(mdev, ns, f, NULL);
486 ns = mdev->state;
487 spin_unlock_irqrestore(&mdev->req_lock, flags);
488
489 return rv;
490}
491
492/**
493 * drbd_force_state() - Impose a change which happens outside our control on our state
494 * @mdev: DRBD device.
495 * @mask: mask of state bits to change.
496 * @val: value of new state bits.
497 */
498void drbd_force_state(struct drbd_conf *mdev,
499 union drbd_state mask, union drbd_state val)
500{
501 drbd_change_state(mdev, CS_HARD, mask, val);
502}
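/*
 * Example from this file: tl_release() reacts to a broken barrier ack with
 *
 *	drbd_force_state(mdev, NS(conn, C_PROTOCOL_ERROR));
 *
 * i.e. the mask selects only the conn field, val carries its new value, and
 * the implied CS_HARD flag makes __drbd_set_state() skip the
 * is_valid_state()/is_valid_state_transition() checks.
 */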
503
504static int is_valid_state(struct drbd_conf *mdev, union drbd_state ns);
505static int is_valid_state_transition(struct drbd_conf *,
506 union drbd_state, union drbd_state);
507static union drbd_state sanitize_state(struct drbd_conf *mdev, union drbd_state os,
508				       union drbd_state ns, const char **warn_sync_abort);
509int drbd_send_state_req(struct drbd_conf *,
510 union drbd_state, union drbd_state);
511
512static enum drbd_state_ret_codes _req_st_cond(struct drbd_conf *mdev,
513 union drbd_state mask, union drbd_state val)
514{
515 union drbd_state os, ns;
516 unsigned long flags;
517 int rv;
518
519 if (test_and_clear_bit(CL_ST_CHG_SUCCESS, &mdev->flags))
520 return SS_CW_SUCCESS;
521
522 if (test_and_clear_bit(CL_ST_CHG_FAIL, &mdev->flags))
523 return SS_CW_FAILED_BY_PEER;
524
525 rv = 0;
526 spin_lock_irqsave(&mdev->req_lock, flags);
527 os = mdev->state;
528 ns.i = (os.i & ~mask.i) | val.i;
529 ns = sanitize_state(mdev, os, ns, NULL);
530
531 if (!cl_wide_st_chg(mdev, os, ns))
532 rv = SS_CW_NO_NEED;
533 if (!rv) {
534 rv = is_valid_state(mdev, ns);
535 if (rv == SS_SUCCESS) {
536 rv = is_valid_state_transition(mdev, ns, os);
537 if (rv == SS_SUCCESS)
538 rv = 0; /* cont waiting, otherwise fail. */
539 }
540 }
541 spin_unlock_irqrestore(&mdev->req_lock, flags);
542
543 return rv;
544}
545
546/**
547 * drbd_req_state() - Perform a possibly cluster-wide state change
548 * @mdev: DRBD device.
549 * @mask: mask of state bits to change.
550 * @val: value of new state bits.
551 * @f: flags
552 *
553 * Should not be called directly, use drbd_request_state() or
554 * _drbd_request_state().
555 */
556static int drbd_req_state(struct drbd_conf *mdev,
557 union drbd_state mask, union drbd_state val,
558 enum chg_state_flags f)
559{
560 struct completion done;
561 unsigned long flags;
562 union drbd_state os, ns;
563 int rv;
564
565 init_completion(&done);
566
567 if (f & CS_SERIALIZE)
568 mutex_lock(&mdev->state_mutex);
569
570 spin_lock_irqsave(&mdev->req_lock, flags);
571 os = mdev->state;
572 ns.i = (os.i & ~mask.i) | val.i;
573 ns = sanitize_state(mdev, os, ns, NULL);
574
575 if (cl_wide_st_chg(mdev, os, ns)) {
576 rv = is_valid_state(mdev, ns);
577 if (rv == SS_SUCCESS)
578 rv = is_valid_state_transition(mdev, ns, os);
579 spin_unlock_irqrestore(&mdev->req_lock, flags);
580
581 if (rv < SS_SUCCESS) {
582 if (f & CS_VERBOSE)
583 print_st_err(mdev, os, ns, rv);
584 goto abort;
585 }
586
587 drbd_state_lock(mdev);
588 if (!drbd_send_state_req(mdev, mask, val)) {
589 drbd_state_unlock(mdev);
590 rv = SS_CW_FAILED_BY_PEER;
591 if (f & CS_VERBOSE)
592 print_st_err(mdev, os, ns, rv);
593 goto abort;
594 }
595
596 wait_event(mdev->state_wait,
597 (rv = _req_st_cond(mdev, mask, val)));
598
599 if (rv < SS_SUCCESS) {
600 drbd_state_unlock(mdev);
601 if (f & CS_VERBOSE)
602 print_st_err(mdev, os, ns, rv);
603 goto abort;
604 }
605 spin_lock_irqsave(&mdev->req_lock, flags);
606 os = mdev->state;
607 ns.i = (os.i & ~mask.i) | val.i;
608 rv = _drbd_set_state(mdev, ns, f, &done);
609 drbd_state_unlock(mdev);
610 } else {
611 rv = _drbd_set_state(mdev, ns, f, &done);
612 }
613
614 spin_unlock_irqrestore(&mdev->req_lock, flags);
615
616 if (f & CS_WAIT_COMPLETE && rv == SS_SUCCESS) {
617 D_ASSERT(current != mdev->worker.task);
618 wait_for_completion(&done);
619 }
620
621abort:
622 if (f & CS_SERIALIZE)
623 mutex_unlock(&mdev->state_mutex);
624
625 return rv;
626}
627
628/**
629 * _drbd_request_state() - Request a state change (with flags)
630 * @mdev: DRBD device.
631 * @mask: mask of state bits to change.
632 * @val: value of new state bits.
633 * @f: flags
634 *
635 * Cousin of drbd_request_state(), useful with the CS_WAIT_COMPLETE
636 * flag, or when logging of failed state change requests is not desired.
637 */
638int _drbd_request_state(struct drbd_conf *mdev, union drbd_state mask,
639 union drbd_state val, enum chg_state_flags f)
640{
641 int rv;
642
643 wait_event(mdev->state_wait,
644 (rv = drbd_req_state(mdev, mask, val, f)) != SS_IN_TRANSIENT_STATE);
645
646 return rv;
647}
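/*
 * Example from later in this file: abw_start_sync() requests a single field
 * change and only wants failures logged,
 *
 *	_drbd_request_state(mdev, NS(conn, C_WF_SYNC_UUID), CS_VERBOSE);
 *
 * the wait_event() above keeps retrying as long as drbd_req_state()
 * answers SS_IN_TRANSIENT_STATE.
 */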
648
649static void print_st(struct drbd_conf *mdev, char *name, union drbd_state ns)
650{
651 dev_err(DEV, " %s = { cs:%s ro:%s/%s ds:%s/%s %c%c%c%c }\n",
652 name,
653 drbd_conn_str(ns.conn),
654 drbd_role_str(ns.role),
655 drbd_role_str(ns.peer),
656 drbd_disk_str(ns.disk),
657 drbd_disk_str(ns.pdsk),
658	    is_susp(ns) ? 's' : 'r',
659	    ns.aftr_isp ? 'a' : '-',
660 ns.peer_isp ? 'p' : '-',
661 ns.user_isp ? 'u' : '-'
662 );
663}
664
665void print_st_err(struct drbd_conf *mdev,
666 union drbd_state os, union drbd_state ns, int err)
667{
668 if (err == SS_IN_TRANSIENT_STATE)
669 return;
670 dev_err(DEV, "State change failed: %s\n", drbd_set_st_err_str(err));
671 print_st(mdev, " state", os);
672 print_st(mdev, "wanted", ns);
673}
674
675
676#define drbd_peer_str drbd_role_str
677#define drbd_pdsk_str drbd_disk_str
678
679#define drbd_susp_str(A) ((A) ? "1" : "0")
680#define drbd_aftr_isp_str(A) ((A) ? "1" : "0")
681#define drbd_peer_isp_str(A) ((A) ? "1" : "0")
682#define drbd_user_isp_str(A) ((A) ? "1" : "0")
683
684#define PSC(A) \
685 ({ if (ns.A != os.A) { \
686 pbp += sprintf(pbp, #A "( %s -> %s ) ", \
687 drbd_##A##_str(os.A), \
688 drbd_##A##_str(ns.A)); \
689 } })
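/*
 * For reference (sketch of the preprocessor output): PSC(role) expands to
 * roughly
 *
 *	if (ns.role != os.role)
 *		pbp += sprintf(pbp, "role( %s -> %s ) ",
 *			       drbd_role_str(os.role),
 *			       drbd_role_str(ns.role));
 *
 * so __drbd_set_state() below logs only the fields that actually changed.
 */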
690
691/**
692 * is_valid_state() - Returns an SS_ error code if ns is not valid
693 * @mdev: DRBD device.
694 * @ns: State to consider.
695 */
696static int is_valid_state(struct drbd_conf *mdev, union drbd_state ns)
697{
698 /* See drbd_state_sw_errors in drbd_strings.c */
699
700 enum drbd_fencing_p fp;
701 int rv = SS_SUCCESS;
702
703 fp = FP_DONT_CARE;
704 if (get_ldev(mdev)) {
705 fp = mdev->ldev->dc.fencing;
706 put_ldev(mdev);
707 }
708
709 if (get_net_conf(mdev)) {
710 if (!mdev->net_conf->two_primaries &&
711 ns.role == R_PRIMARY && ns.peer == R_PRIMARY)
712 rv = SS_TWO_PRIMARIES;
713 put_net_conf(mdev);
714 }
715
716 if (rv <= 0)
717 /* already found a reason to abort */;
718 else if (ns.role == R_SECONDARY && mdev->open_cnt)
719 rv = SS_DEVICE_IN_USE;
720
721 else if (ns.role == R_PRIMARY && ns.conn < C_CONNECTED && ns.disk < D_UP_TO_DATE)
722 rv = SS_NO_UP_TO_DATE_DISK;
723
724 else if (fp >= FP_RESOURCE &&
725 ns.role == R_PRIMARY && ns.conn < C_CONNECTED && ns.pdsk >= D_UNKNOWN)
726 rv = SS_PRIMARY_NOP;
727
728 else if (ns.role == R_PRIMARY && ns.disk <= D_INCONSISTENT && ns.pdsk <= D_INCONSISTENT)
729 rv = SS_NO_UP_TO_DATE_DISK;
730
731 else if (ns.conn > C_CONNECTED && ns.disk < D_INCONSISTENT)
732 rv = SS_NO_LOCAL_DISK;
733
734 else if (ns.conn > C_CONNECTED && ns.pdsk < D_INCONSISTENT)
735 rv = SS_NO_REMOTE_DISK;
736
737	else if (ns.conn > C_CONNECTED && ns.disk < D_UP_TO_DATE && ns.pdsk < D_UP_TO_DATE)
738		rv = SS_NO_UP_TO_DATE_DISK;
739
740	else if ((ns.conn == C_CONNECTED ||
741 ns.conn == C_WF_BITMAP_S ||
742 ns.conn == C_SYNC_SOURCE ||
743 ns.conn == C_PAUSED_SYNC_S) &&
744 ns.disk == D_OUTDATED)
745 rv = SS_CONNECTED_OUTDATES;
746
747 else if ((ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T) &&
748 (mdev->sync_conf.verify_alg[0] == 0))
749 rv = SS_NO_VERIFY_ALG;
750
751 else if ((ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T) &&
752 mdev->agreed_pro_version < 88)
753 rv = SS_NOT_SUPPORTED;
754
755 return rv;
756}
757
758/**
759 * is_valid_state_transition() - Returns an SS_ error code if the state transition is not possible
760 * @mdev: DRBD device.
761 * @ns: new state.
762 * @os: old state.
763 */
764static int is_valid_state_transition(struct drbd_conf *mdev,
765 union drbd_state ns, union drbd_state os)
766{
767 int rv = SS_SUCCESS;
768
769 if ((ns.conn == C_STARTING_SYNC_T || ns.conn == C_STARTING_SYNC_S) &&
770 os.conn > C_CONNECTED)
771 rv = SS_RESYNC_RUNNING;
772
773 if (ns.conn == C_DISCONNECTING && os.conn == C_STANDALONE)
774 rv = SS_ALREADY_STANDALONE;
775
776 if (ns.disk > D_ATTACHING && os.disk == D_DISKLESS)
777 rv = SS_IS_DISKLESS;
778
779 if (ns.conn == C_WF_CONNECTION && os.conn < C_UNCONNECTED)
780 rv = SS_NO_NET_CONFIG;
781
782 if (ns.disk == D_OUTDATED && os.disk < D_OUTDATED && os.disk != D_ATTACHING)
783 rv = SS_LOWER_THAN_OUTDATED;
784
785 if (ns.conn == C_DISCONNECTING && os.conn == C_UNCONNECTED)
786 rv = SS_IN_TRANSIENT_STATE;
787
788 if (ns.conn == os.conn && ns.conn == C_WF_REPORT_PARAMS)
789 rv = SS_IN_TRANSIENT_STATE;
790
791 if ((ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T) && os.conn < C_CONNECTED)
792 rv = SS_NEED_CONNECTION;
793
794 if ((ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T) &&
795 ns.conn != os.conn && os.conn > C_CONNECTED)
796 rv = SS_RESYNC_RUNNING;
797
798 if ((ns.conn == C_STARTING_SYNC_S || ns.conn == C_STARTING_SYNC_T) &&
799 os.conn < C_CONNECTED)
800 rv = SS_NEED_CONNECTION;
801
802 return rv;
803}
804
805/**
806 * sanitize_state() - Resolves implicitly necessary additional changes to a state transition
807 * @mdev: DRBD device.
808 * @os: old state.
809 * @ns: new state.
810 * @warn_sync_abort:
811 *
812 * When we lose the connection, we have to set the state of the peer's disk (pdsk)
813 * to D_UNKNOWN. This rule and many more along those lines are in this function.
814 */
815static union drbd_state sanitize_state(struct drbd_conf *mdev, union drbd_state os,
816				       union drbd_state ns, const char **warn_sync_abort)
817{
818 enum drbd_fencing_p fp;
819
820 fp = FP_DONT_CARE;
821 if (get_ldev(mdev)) {
822 fp = mdev->ldev->dc.fencing;
823 put_ldev(mdev);
824 }
825
826 /* Disallow Network errors to configure a device's network part */
827 if ((ns.conn >= C_TIMEOUT && ns.conn <= C_TEAR_DOWN) &&
828 os.conn <= C_DISCONNECTING)
829 ns.conn = os.conn;
830
831	/* After a network error (+C_TEAR_DOWN) only C_UNCONNECTED or C_DISCONNECTING can follow.
832	 * If you try to go into some Sync* state, that shall fail (elsewhere). */
833	if (os.conn >= C_TIMEOUT && os.conn <= C_TEAR_DOWN &&
834	    ns.conn != C_UNCONNECTED && ns.conn != C_DISCONNECTING && ns.conn <= C_TEAR_DOWN)
835		ns.conn = os.conn;
836
837 /* After C_DISCONNECTING only C_STANDALONE may follow */
838 if (os.conn == C_DISCONNECTING && ns.conn != C_STANDALONE)
839 ns.conn = os.conn;
840
841 if (ns.conn < C_CONNECTED) {
842 ns.peer_isp = 0;
843 ns.peer = R_UNKNOWN;
844 if (ns.pdsk > D_UNKNOWN || ns.pdsk < D_INCONSISTENT)
845 ns.pdsk = D_UNKNOWN;
846 }
847
848 /* Clear the aftr_isp when becoming unconfigured */
849 if (ns.conn == C_STANDALONE && ns.disk == D_DISKLESS && ns.role == R_SECONDARY)
850 ns.aftr_isp = 0;
851
852	/* Abort resync if a disk fails/detaches */
853	if (os.conn > C_CONNECTED && ns.conn > C_CONNECTED &&
854	    (ns.disk <= D_FAILED || ns.pdsk <= D_FAILED)) {
855		if (warn_sync_abort)
856			*warn_sync_abort =
857				os.conn == C_VERIFY_S || os.conn == C_VERIFY_T ?
858				"Online-verify" : "Resync";
859		ns.conn = C_CONNECTED;
860 }
861
862 if (ns.conn >= C_CONNECTED &&
863 ((ns.disk == D_CONSISTENT || ns.disk == D_OUTDATED) ||
864 (ns.disk == D_NEGOTIATING && ns.conn == C_WF_BITMAP_T))) {
865 switch (ns.conn) {
866 case C_WF_BITMAP_T:
867 case C_PAUSED_SYNC_T:
868 ns.disk = D_OUTDATED;
869 break;
870 case C_CONNECTED:
871 case C_WF_BITMAP_S:
872 case C_SYNC_SOURCE:
873 case C_PAUSED_SYNC_S:
874 ns.disk = D_UP_TO_DATE;
875 break;
876 case C_SYNC_TARGET:
877 ns.disk = D_INCONSISTENT;
878 dev_warn(DEV, "Implicitly set disk state Inconsistent!\n");
879 break;
880 }
881 if (os.disk == D_OUTDATED && ns.disk == D_UP_TO_DATE)
882 dev_warn(DEV, "Implicitly set disk from Outdated to UpToDate\n");
883 }
884
885 if (ns.conn >= C_CONNECTED &&
886 (ns.pdsk == D_CONSISTENT || ns.pdsk == D_OUTDATED)) {
887 switch (ns.conn) {
888 case C_CONNECTED:
889 case C_WF_BITMAP_T:
890 case C_PAUSED_SYNC_T:
891 case C_SYNC_TARGET:
892 ns.pdsk = D_UP_TO_DATE;
893 break;
894 case C_WF_BITMAP_S:
895 case C_PAUSED_SYNC_S:
896			/* remap any consistent state to D_OUTDATED,
897			 * but disallow "upgrade" of not even consistent states.
898			 */
899			ns.pdsk =
900				(D_DISKLESS < os.pdsk && os.pdsk < D_OUTDATED)
901				? os.pdsk : D_OUTDATED;
902			break;
903 case C_SYNC_SOURCE:
904 ns.pdsk = D_INCONSISTENT;
905 dev_warn(DEV, "Implicitly set pdsk Inconsistent!\n");
906 break;
907 }
908 if (os.pdsk == D_OUTDATED && ns.pdsk == D_UP_TO_DATE)
909 dev_warn(DEV, "Implicitly set pdsk from Outdated to UpToDate\n");
910 }
911
912 /* Connection breaks down before we finished "Negotiating" */
913 if (ns.conn < C_CONNECTED && ns.disk == D_NEGOTIATING &&
914 get_ldev_if_state(mdev, D_NEGOTIATING)) {
915 if (mdev->ed_uuid == mdev->ldev->md.uuid[UI_CURRENT]) {
916 ns.disk = mdev->new_state_tmp.disk;
917 ns.pdsk = mdev->new_state_tmp.pdsk;
918 } else {
919 dev_alert(DEV, "Connection lost while negotiating, no data!\n");
920 ns.disk = D_DISKLESS;
921 ns.pdsk = D_UNKNOWN;
922 }
923 put_ldev(mdev);
924 }
925
926 if (fp == FP_STONITH &&
927	    (ns.role == R_PRIMARY && ns.conn < C_CONNECTED && ns.pdsk > D_OUTDATED) &&
928	    !(os.role == R_PRIMARY && os.conn < C_CONNECTED && os.pdsk > D_OUTDATED))
929		ns.susp_fen = 1; /* Suspend IO while fence-peer handler runs (peer lost) */
930
931	if (mdev->sync_conf.on_no_data == OND_SUSPEND_IO &&
932	    (ns.role == R_PRIMARY && ns.disk < D_UP_TO_DATE && ns.pdsk < D_UP_TO_DATE) &&
933	    !(os.role == R_PRIMARY && os.disk < D_UP_TO_DATE && os.pdsk < D_UP_TO_DATE))
934		ns.susp_nod = 1; /* Suspend IO while no data available (no accessible data available) */
935
936 if (ns.aftr_isp || ns.peer_isp || ns.user_isp) {
937 if (ns.conn == C_SYNC_SOURCE)
938 ns.conn = C_PAUSED_SYNC_S;
939 if (ns.conn == C_SYNC_TARGET)
940 ns.conn = C_PAUSED_SYNC_T;
941 } else {
942 if (ns.conn == C_PAUSED_SYNC_S)
943 ns.conn = C_SYNC_SOURCE;
944 if (ns.conn == C_PAUSED_SYNC_T)
945 ns.conn = C_SYNC_TARGET;
946 }
947
948 return ns;
949}
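/*
 * Example (sketch): losing the connection while connected.  Even if the
 * caller only asks for { conn = C_UNCONNECTED }, the ns.conn < C_CONNECTED
 * branch above additionally forces
 *
 *	ns.peer_isp = 0;
 *	ns.peer     = R_UNKNOWN;
 *	ns.pdsk     = D_UNKNOWN;	// unless pdsk already was in the
 *					// D_INCONSISTENT..D_UNKNOWN range
 *
 * because those fields describe a peer we can no longer see.
 */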
950
951/* helper for __drbd_set_state */
952static void set_ov_position(struct drbd_conf *mdev, enum drbd_conns cs)
953{
954 if (cs == C_VERIFY_T) {
955 /* starting online verify from an arbitrary position
956 * does not fit well into the existing protocol.
957 * on C_VERIFY_T, we initialize ov_left and friends
958 * implicitly in receive_DataRequest once the
959 * first P_OV_REQUEST is received */
960 mdev->ov_start_sector = ~(sector_t)0;
961 } else {
962 unsigned long bit = BM_SECT_TO_BIT(mdev->ov_start_sector);
963 if (bit >= mdev->rs_total)
964 mdev->ov_start_sector =
965 BM_BIT_TO_SECT(mdev->rs_total - 1);
966 mdev->ov_position = mdev->ov_start_sector;
967 }
968}
969
970static void drbd_resume_al(struct drbd_conf *mdev)
971{
972	if (test_and_clear_bit(AL_SUSPENDED, &mdev->flags))
973		dev_info(DEV, "Resumed AL updates\n");
974}
975
976/**
977 * __drbd_set_state() - Set a new DRBD state
978 * @mdev: DRBD device.
979 * @ns: new state.
980 * @flags: Flags
981 * @done: Optional completion, that will get completed after the after_state_ch() finished
982 *
983 * Caller needs to hold req_lock, and global_state_lock. Do not call directly.
984 */
985int __drbd_set_state(struct drbd_conf *mdev,
986 union drbd_state ns, enum chg_state_flags flags,
987 struct completion *done)
988{
989 union drbd_state os;
990 int rv = SS_SUCCESS;
991	const char *warn_sync_abort = NULL;
992	struct after_state_chg_work *ascw;
993
994 os = mdev->state;
995
996 ns = sanitize_state(mdev, os, ns, &warn_sync_abort);
997
998 if (ns.i == os.i)
999 return SS_NOTHING_TO_DO;
1000
1001 if (!(flags & CS_HARD)) {
1002 /* pre-state-change checks ; only look at ns */
1003 /* See drbd_state_sw_errors in drbd_strings.c */
1004
1005 rv = is_valid_state(mdev, ns);
1006 if (rv < SS_SUCCESS) {
1007 /* If the old state was illegal as well, then let
1008 this happen...*/
1009
1010			if (is_valid_state(mdev, os) == rv)
1011				rv = is_valid_state_transition(mdev, ns, os);
1012		} else
1013 rv = is_valid_state_transition(mdev, ns, os);
1014 }
1015
1016 if (rv < SS_SUCCESS) {
1017 if (flags & CS_VERBOSE)
1018 print_st_err(mdev, os, ns, rv);
1019 return rv;
1020 }
1021
1022 if (warn_sync_abort)
1023		dev_warn(DEV, "%s aborted.\n", warn_sync_abort);
1024
1025 {
1026 char *pbp, pb[300];
1027 pbp = pb;
1028 *pbp = 0;
1029 PSC(role);
1030 PSC(peer);
1031 PSC(conn);
1032 PSC(disk);
1033 PSC(pdsk);
1034		if (is_susp(ns) != is_susp(os))
1035			pbp += sprintf(pbp, "susp( %s -> %s ) ",
1036				       drbd_susp_str(is_susp(os)),
1037				       drbd_susp_str(is_susp(ns)));
1038		PSC(aftr_isp);
1039 PSC(peer_isp);
1040 PSC(user_isp);
1041 dev_info(DEV, "%s\n", pb);
1042 }
1043
1044 /* solve the race between becoming unconfigured,
1045 * worker doing the cleanup, and
1046 * admin reconfiguring us:
1047 * on (re)configure, first set CONFIG_PENDING,
1048 * then wait for a potentially exiting worker,
1049 * start the worker, and schedule one no_op.
1050 * then proceed with configuration.
1051 */
1052 if (ns.disk == D_DISKLESS &&
1053 ns.conn == C_STANDALONE &&
1054 ns.role == R_SECONDARY &&
1055 !test_and_set_bit(CONFIG_PENDING, &mdev->flags))
1056 set_bit(DEVICE_DYING, &mdev->flags);
1057
1058 mdev->state.i = ns.i;
1059 wake_up(&mdev->misc_wait);
1060 wake_up(&mdev->state_wait);
1061
1062	/* aborted verify run. log the last position */
1063 if ((os.conn == C_VERIFY_S || os.conn == C_VERIFY_T) &&
1064 ns.conn < C_CONNECTED) {
1065 mdev->ov_start_sector =
1066 BM_BIT_TO_SECT(mdev->rs_total - mdev->ov_left);
1067 dev_info(DEV, "Online Verify reached sector %llu\n",
1068 (unsigned long long)mdev->ov_start_sector);
1069 }
1070
1071 if ((os.conn == C_PAUSED_SYNC_T || os.conn == C_PAUSED_SYNC_S) &&
1072 (ns.conn == C_SYNC_TARGET || ns.conn == C_SYNC_SOURCE)) {
1073 dev_info(DEV, "Syncer continues.\n");
1074		mdev->rs_paused += (long)jiffies
1075				  -(long)mdev->rs_mark_time[mdev->rs_last_mark];
1076		if (ns.conn == C_SYNC_TARGET)
1077			mod_timer(&mdev->resync_timer, jiffies);
1078	}
1079
1080 if ((os.conn == C_SYNC_TARGET || os.conn == C_SYNC_SOURCE) &&
1081 (ns.conn == C_PAUSED_SYNC_T || ns.conn == C_PAUSED_SYNC_S)) {
1082 dev_info(DEV, "Resync suspended\n");
1083		mdev->rs_mark_time[mdev->rs_last_mark] = jiffies;
1084	}
1085
1086 if (os.conn == C_CONNECTED &&
1087 (ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T)) {
1088		unsigned long now = jiffies;
1089		int i;
1090
1091		mdev->ov_position = 0;
1092		mdev->rs_total = drbd_bm_bits(mdev);
1093		if (mdev->agreed_pro_version >= 90)
1094 set_ov_position(mdev, ns.conn);
1095 else
1096 mdev->ov_start_sector = 0;
1097 mdev->ov_left = mdev->rs_total
1098 - BM_SECT_TO_BIT(mdev->ov_position);
1099		mdev->rs_start = now;
1100		mdev->rs_last_events = 0;
1101		mdev->rs_last_sect_ev = 0;
1102		mdev->ov_last_oos_size = 0;
1103 mdev->ov_last_oos_start = 0;
1104
1105		for (i = 0; i < DRBD_SYNC_MARKS; i++) {
1106 mdev->rs_mark_left[i] = mdev->rs_total;
1107 mdev->rs_mark_time[i] = now;
1108 }
1109
1110		if (ns.conn == C_VERIFY_S) {
1111 dev_info(DEV, "Starting Online Verify from sector %llu\n",
1112 (unsigned long long)mdev->ov_position);
1113 mod_timer(&mdev->resync_timer, jiffies);
1114 }
1115 }
1116
1117 if (get_ldev(mdev)) {
1118 u32 mdf = mdev->ldev->md.flags & ~(MDF_CONSISTENT|MDF_PRIMARY_IND|
1119 MDF_CONNECTED_IND|MDF_WAS_UP_TO_DATE|
1120 MDF_PEER_OUT_DATED|MDF_CRASHED_PRIMARY);
1121
1122 if (test_bit(CRASHED_PRIMARY, &mdev->flags))
1123 mdf |= MDF_CRASHED_PRIMARY;
1124 if (mdev->state.role == R_PRIMARY ||
1125 (mdev->state.pdsk < D_INCONSISTENT && mdev->state.peer == R_PRIMARY))
1126 mdf |= MDF_PRIMARY_IND;
1127 if (mdev->state.conn > C_WF_REPORT_PARAMS)
1128 mdf |= MDF_CONNECTED_IND;
1129 if (mdev->state.disk > D_INCONSISTENT)
1130 mdf |= MDF_CONSISTENT;
1131 if (mdev->state.disk > D_OUTDATED)
1132 mdf |= MDF_WAS_UP_TO_DATE;
1133 if (mdev->state.pdsk <= D_OUTDATED && mdev->state.pdsk >= D_INCONSISTENT)
1134 mdf |= MDF_PEER_OUT_DATED;
1135 if (mdf != mdev->ldev->md.flags) {
1136 mdev->ldev->md.flags = mdf;
1137 drbd_md_mark_dirty(mdev);
1138 }
1139 if (os.disk < D_CONSISTENT && ns.disk >= D_CONSISTENT)
1140 drbd_set_ed_uuid(mdev, mdev->ldev->md.uuid[UI_CURRENT]);
1141 put_ldev(mdev);
1142 }
1143
1144 /* Peer was forced D_UP_TO_DATE & R_PRIMARY, consider to resync */
1145 if (os.disk == D_INCONSISTENT && os.pdsk == D_INCONSISTENT &&
1146 os.peer == R_SECONDARY && ns.peer == R_PRIMARY)
1147 set_bit(CONSIDER_RESYNC, &mdev->flags);
1148
1149 /* Receiver should clean up itself */
1150 if (os.conn != C_DISCONNECTING && ns.conn == C_DISCONNECTING)
1151 drbd_thread_stop_nowait(&mdev->receiver);
1152
1153 /* Now the receiver finished cleaning up itself, it should die */
1154 if (os.conn != C_STANDALONE && ns.conn == C_STANDALONE)
1155 drbd_thread_stop_nowait(&mdev->receiver);
1156
1157 /* Upon network failure, we need to restart the receiver. */
1158 if (os.conn > C_TEAR_DOWN &&
1159 ns.conn <= C_TEAR_DOWN && ns.conn >= C_TIMEOUT)
1160 drbd_thread_restart_nowait(&mdev->receiver);
1161
1162	/* Resume AL writing if we get a connection */
1163	if (os.conn < C_CONNECTED && ns.conn >= C_CONNECTED)
1164		drbd_resume_al(mdev);
1165
1166	ascw = kmalloc(sizeof(*ascw), GFP_ATOMIC);
1167 if (ascw) {
1168 ascw->os = os;
1169 ascw->ns = ns;
1170 ascw->flags = flags;
1171 ascw->w.cb = w_after_state_ch;
1172 ascw->done = done;
1173 drbd_queue_work(&mdev->data.work, &ascw->w);
1174 } else {
1175 dev_warn(DEV, "Could not kmalloc an ascw\n");
1176 }
1177
1178 return rv;
1179}
1180
1181static int w_after_state_ch(struct drbd_conf *mdev, struct drbd_work *w, int unused)
1182{
1183 struct after_state_chg_work *ascw =
1184 container_of(w, struct after_state_chg_work, w);
1185 after_state_ch(mdev, ascw->os, ascw->ns, ascw->flags);
1186 if (ascw->flags & CS_WAIT_COMPLETE) {
1187 D_ASSERT(ascw->done != NULL);
1188 complete(ascw->done);
1189 }
1190 kfree(ascw);
1191
1192 return 1;
1193}
1194
1195static void abw_start_sync(struct drbd_conf *mdev, int rv)
1196{
1197 if (rv) {
1198		dev_err(DEV, "Writing the bitmap failed, not starting resync.\n");
1199 _drbd_request_state(mdev, NS(conn, C_CONNECTED), CS_VERBOSE);
1200 return;
1201 }
1202
1203 switch (mdev->state.conn) {
1204 case C_STARTING_SYNC_T:
1205 _drbd_request_state(mdev, NS(conn, C_WF_SYNC_UUID), CS_VERBOSE);
1206 break;
1207 case C_STARTING_SYNC_S:
1208 drbd_start_resync(mdev, C_SYNC_SOURCE);
1209 break;
1210 }
1211}
1212
1213/**
1214 * after_state_ch() - Perform after state change actions that may sleep
1215 * @mdev: DRBD device.
1216 * @os: old state.
1217 * @ns: new state.
1218 * @flags: Flags
1219 */
1220static void after_state_ch(struct drbd_conf *mdev, union drbd_state os,
1221 union drbd_state ns, enum chg_state_flags flags)
1222{
1223 enum drbd_fencing_p fp;
1224	enum drbd_req_event what = nothing;
1225	union drbd_state nsm = (union drbd_state){ .i = -1 };
1226
1227 if (os.conn != C_CONNECTED && ns.conn == C_CONNECTED) {
1228 clear_bit(CRASHED_PRIMARY, &mdev->flags);
1229 if (mdev->p_uuid)
1230 mdev->p_uuid[UI_FLAGS] &= ~((u64)2);
1231 }
1232
1233 fp = FP_DONT_CARE;
1234 if (get_ldev(mdev)) {
1235 fp = mdev->ldev->dc.fencing;
1236 put_ldev(mdev);
1237 }
1238
1239 /* Inform userspace about the change... */
1240 drbd_bcast_state(mdev, ns);
1241
1242 if (!(os.role == R_PRIMARY && os.disk < D_UP_TO_DATE && os.pdsk < D_UP_TO_DATE) &&
1243 (ns.role == R_PRIMARY && ns.disk < D_UP_TO_DATE && ns.pdsk < D_UP_TO_DATE))
1244 drbd_khelper(mdev, "pri-on-incon-degr");
1245
1246 /* Here we have the actions that are performed after a
1247 state change. This function might sleep */
1248
1249	nsm.i = -1;
1250	if (ns.susp_nod) {
1251		if (os.conn < C_CONNECTED && ns.conn >= C_CONNECTED) {
1252			if (ns.conn == C_CONNECTED)
1253				what = resend, nsm.susp_nod = 0;
1254			else /* ns.conn > C_CONNECTED */
1255				dev_err(DEV, "Unexpected resync going on!\n");
1256		}
1257
1258		if (os.disk == D_ATTACHING && ns.disk > D_ATTACHING)
1259			what = restart_frozen_disk_io, nsm.susp_nod = 0;
1260
1261	}
1262
1263	if (ns.susp_fen) {
1264		/* case1: The outdate peer handler is successful: */
1265		if (os.pdsk > D_OUTDATED && ns.pdsk <= D_OUTDATED) {
1266			tl_clear(mdev);
1267			if (test_bit(NEW_CUR_UUID, &mdev->flags)) {
1268				drbd_uuid_new_current(mdev);
1269				clear_bit(NEW_CUR_UUID, &mdev->flags);
1270				drbd_md_sync(mdev);
1271			}
1272			spin_lock_irq(&mdev->req_lock);
1273			_drbd_set_state(_NS(mdev, susp_fen, 0), CS_VERBOSE, NULL);
1274			spin_unlock_irq(&mdev->req_lock);
1275		}
1276		/* case2: The connection was established again: */
1277		if (os.conn < C_CONNECTED && ns.conn >= C_CONNECTED) {
1278			clear_bit(NEW_CUR_UUID, &mdev->flags);
1279			what = resend;
1280			nsm.susp_fen = 0;
1281		}
1282	}
1283
1284	if (what != nothing) {
1285		spin_lock_irq(&mdev->req_lock);
1286		_tl_restart(mdev, what);
1287		nsm.i &= mdev->state.i;
1288		_drbd_set_state(mdev, nsm, CS_VERBOSE, NULL);
1289		spin_unlock_irq(&mdev->req_lock);
1290	}
1291
1292	/* Do not change the order of the if above and the two below... */
1293 if (os.pdsk == D_DISKLESS && ns.pdsk > D_DISKLESS) { /* attach on the peer */
1294 drbd_send_uuids(mdev);
1295 drbd_send_state(mdev);
1296 }
1297 if (os.conn != C_WF_BITMAP_S && ns.conn == C_WF_BITMAP_S)
1298 drbd_queue_bitmap_io(mdev, &drbd_send_bitmap, NULL, "send_bitmap (WFBitMapS)");
1299
1300 /* Lost contact to peer's copy of the data */
1301 if ((os.pdsk >= D_INCONSISTENT &&
1302 os.pdsk != D_UNKNOWN &&
1303 os.pdsk != D_OUTDATED)
1304 && (ns.pdsk < D_INCONSISTENT ||
1305 ns.pdsk == D_UNKNOWN ||
1306 ns.pdsk == D_OUTDATED)) {
1307		if (get_ldev(mdev)) {
1308			if ((ns.role == R_PRIMARY || ns.peer == R_PRIMARY) &&
1309			    mdev->ldev->md.uuid[UI_BITMAP] == 0 && ns.disk >= D_UP_TO_DATE) {
1310				if (is_susp(mdev->state)) {
1311					set_bit(NEW_CUR_UUID, &mdev->flags);
1312				} else {
1313					drbd_uuid_new_current(mdev);
1314					drbd_send_uuids(mdev);
1315				}
1316			}
1317			put_ldev(mdev);
1318		}
1319	}
1320
1321	if (ns.pdsk < D_INCONSISTENT && get_ldev(mdev)) {
1322		if (ns.peer == R_PRIMARY && mdev->ldev->md.uuid[UI_BITMAP] == 0) {
1323			drbd_uuid_new_current(mdev);
1324			drbd_send_uuids(mdev);
1325		}
1326
1327 /* D_DISKLESS Peer becomes secondary */
1328 if (os.peer == R_PRIMARY && ns.peer == R_SECONDARY)
1329 drbd_al_to_on_disk_bm(mdev);
1330 put_ldev(mdev);
1331 }
1332
1333 /* Last part of the attaching process ... */
1334 if (ns.conn >= C_CONNECTED &&
1335 os.disk == D_ATTACHING && ns.disk == D_NEGOTIATING) {
1336		drbd_send_sizes(mdev, 0, 0);  /* to start sync... */
1337		drbd_send_uuids(mdev);
1338 drbd_send_state(mdev);
1339 }
1340
1341 /* We want to pause/continue resync, tell peer. */
1342 if (ns.conn >= C_CONNECTED &&
1343 ((os.aftr_isp != ns.aftr_isp) ||
1344 (os.user_isp != ns.user_isp)))
1345 drbd_send_state(mdev);
1346
1347 /* In case one of the isp bits got set, suspend other devices. */
1348 if ((!os.aftr_isp && !os.peer_isp && !os.user_isp) &&
1349 (ns.aftr_isp || ns.peer_isp || ns.user_isp))
1350 suspend_other_sg(mdev);
1351
1352 /* Make sure the peer gets informed about eventual state
1353 changes (ISP bits) while we were in WFReportParams. */
1354 if (os.conn == C_WF_REPORT_PARAMS && ns.conn >= C_CONNECTED)
1355 drbd_send_state(mdev);
1356
1357 /* We are in the progress to start a full sync... */
1358 if ((os.conn != C_STARTING_SYNC_T && ns.conn == C_STARTING_SYNC_T) ||
1359 (os.conn != C_STARTING_SYNC_S && ns.conn == C_STARTING_SYNC_S))
1360 drbd_queue_bitmap_io(mdev, &drbd_bmio_set_n_write, &abw_start_sync, "set_n_write from StartingSync");
1361
1362	/* We are invalidating ourselves... */
1363 if (os.conn < C_CONNECTED && ns.conn < C_CONNECTED &&
1364 os.disk > D_INCONSISTENT && ns.disk == D_INCONSISTENT)
1365 drbd_queue_bitmap_io(mdev, &drbd_bmio_set_n_write, NULL, "set_n_write from invalidate");
1366
1367	/* first half of local IO error */
1368	if (os.disk > D_FAILED && ns.disk == D_FAILED) {
1369		enum drbd_io_error_p eh = EP_PASS_ON;
1370
1371		if (drbd_send_state(mdev))
1372 dev_warn(DEV, "Notified peer that my disk is broken.\n");
1373 else
1374 dev_err(DEV, "Sending state for drbd_io_error() failed\n");
1375
1376 drbd_rs_cancel_all(mdev);
1377
1378		if (get_ldev_if_state(mdev, D_FAILED)) {
1379 eh = mdev->ldev->dc.on_io_error;
1380 put_ldev(mdev);
1381 }
1382		if (eh == EP_CALL_HELPER)
1383 drbd_khelper(mdev, "local-io-error");
1384 }
1385
1386
1387 /* second half of local IO error handling,
1388 * after local_cnt references have reached zero: */
1389 if (os.disk == D_FAILED && ns.disk == D_DISKLESS) {
1390 mdev->rs_total = 0;
1391 mdev->rs_failed = 0;
1392 atomic_set(&mdev->rs_pending_cnt, 0);
1393 }
1394
1395	if (os.disk > D_DISKLESS && ns.disk == D_DISKLESS) {
1396		int c = atomic_read(&mdev->local_cnt);
1397
1398		if (drbd_send_state(mdev))
1399 dev_warn(DEV, "Notified peer that I detached my disk.\n");
1400 else
1401 dev_err(DEV, "Sending state for detach failed\n");
1402
1403 if (c != 0) {
1404 dev_err(DEV, "Logic bug, local_cnt=%d, but should be 0\n", c);
1405 wait_event(mdev->misc_wait, !atomic_read(&mdev->local_cnt));
1406		}
1407		lc_destroy(mdev->resync);
1408 mdev->resync = NULL;
1409 lc_destroy(mdev->act_log);
1410 mdev->act_log = NULL;
1411 __no_warn(local,
1412 drbd_free_bc(mdev->ldev);
1413 mdev->ldev = NULL;);
1414
1415		if (mdev->md_io_tmpp) {
1416			__free_page(mdev->md_io_tmpp);
1417			mdev->md_io_tmpp = NULL;
1418		}
1419	}
1420
1421 /* Disks got bigger while they were detached */
1422 if (ns.disk > D_NEGOTIATING && ns.pdsk > D_NEGOTIATING &&
1423 test_and_clear_bit(RESYNC_AFTER_NEG, &mdev->flags)) {
1424 if (ns.conn == C_CONNECTED)
1425 resync_after_online_grow(mdev);
1426 }
1427
1428 /* A resync finished or aborted, wake paused devices... */
1429 if ((os.conn > C_CONNECTED && ns.conn <= C_CONNECTED) ||
1430 (os.peer_isp && !ns.peer_isp) ||
1431 (os.user_isp && !ns.user_isp))
1432 resume_next_sg(mdev);
1433
1434	/* sync target done with resync.  Explicitly notify peer, even though
1435	 * it should (at least for non-empty resyncs) already know itself. */
1436	if (os.disk < D_UP_TO_DATE && os.conn >= C_SYNC_SOURCE && ns.conn == C_CONNECTED)
1437		drbd_send_state(mdev);
1438
1439	/* free tl_hash if we got thawed and are C_STANDALONE */
1440	if (ns.conn == C_STANDALONE && !is_susp(ns) && mdev->tl_hash)
1441		drbd_free_tl_hash(mdev);
1442
1443	/* Upon network connection, we need to start the receiver */
1444 if (os.conn == C_STANDALONE && ns.conn == C_UNCONNECTED)
1445 drbd_thread_start(&mdev->receiver);
1446
1447 /* Terminate worker thread if we are unconfigured - it will be
1448 restarted as needed... */
1449 if (ns.disk == D_DISKLESS &&
1450 ns.conn == C_STANDALONE &&
1451 ns.role == R_SECONDARY) {
1452 if (os.aftr_isp != ns.aftr_isp)
1453 resume_next_sg(mdev);
1454 /* set in __drbd_set_state, unless CONFIG_PENDING was set */
1455 if (test_bit(DEVICE_DYING, &mdev->flags))
1456 drbd_thread_stop_nowait(&mdev->worker);
1457 }
1458
1459 drbd_md_sync(mdev);
1460}
1461
1462
1463static int drbd_thread_setup(void *arg)
1464{
1465 struct drbd_thread *thi = (struct drbd_thread *) arg;
1466 struct drbd_conf *mdev = thi->mdev;
1467 unsigned long flags;
1468 int retval;
1469
1470restart:
1471 retval = thi->function(thi);
1472
1473 spin_lock_irqsave(&thi->t_lock, flags);
1474
1475 /* if the receiver has been "Exiting", the last thing it did
1476 * was set the conn state to "StandAlone",
1477 * if now a re-connect request comes in, conn state goes C_UNCONNECTED,
1478 * and receiver thread will be "started".
1479 * drbd_thread_start needs to set "Restarting" in that case.
1480 * t_state check and assignment needs to be within the same spinlock,
1481 * so either thread_start sees Exiting, and can remap to Restarting,
1482 * or thread_start see None, and can proceed as normal.
1483 */
1484
1485 if (thi->t_state == Restarting) {
1486 dev_info(DEV, "Restarting %s\n", current->comm);
1487 thi->t_state = Running;
1488 spin_unlock_irqrestore(&thi->t_lock, flags);
1489 goto restart;
1490 }
1491
1492 thi->task = NULL;
1493 thi->t_state = None;
1494 smp_mb();
1495 complete(&thi->stop);
1496 spin_unlock_irqrestore(&thi->t_lock, flags);
1497
1498 dev_info(DEV, "Terminating %s\n", current->comm);
1499
1500 /* Release mod reference taken when thread was started */
1501 module_put(THIS_MODULE);
1502 return retval;
1503}
1504
1505static void drbd_thread_init(struct drbd_conf *mdev, struct drbd_thread *thi,
1506 int (*func) (struct drbd_thread *))
1507{
1508 spin_lock_init(&thi->t_lock);
1509 thi->task = NULL;
1510 thi->t_state = None;
1511 thi->function = func;
1512 thi->mdev = mdev;
1513}
1514
1515int drbd_thread_start(struct drbd_thread *thi)
1516{
1517 struct drbd_conf *mdev = thi->mdev;
1518 struct task_struct *nt;
1519 unsigned long flags;
1520
1521 const char *me =
1522 thi == &mdev->receiver ? "receiver" :
1523 thi == &mdev->asender ? "asender" :
1524 thi == &mdev->worker ? "worker" : "NONSENSE";
1525
1526 /* is used from state engine doing drbd_thread_stop_nowait,
1527 * while holding the req lock irqsave */
1528 spin_lock_irqsave(&thi->t_lock, flags);
1529
1530 switch (thi->t_state) {
1531 case None:
1532 dev_info(DEV, "Starting %s thread (from %s [%d])\n",
1533 me, current->comm, current->pid);
1534
1535 /* Get ref on module for thread - this is released when thread exits */
1536 if (!try_module_get(THIS_MODULE)) {
1537 dev_err(DEV, "Failed to get module reference in drbd_thread_start\n");
1538 spin_unlock_irqrestore(&thi->t_lock, flags);
1539 return FALSE;
1540 }
1541
1542 init_completion(&thi->stop);
1543 D_ASSERT(thi->task == NULL);
1544 thi->reset_cpu_mask = 1;
1545 thi->t_state = Running;
1546 spin_unlock_irqrestore(&thi->t_lock, flags);
1547 flush_signals(current); /* otherw. may get -ERESTARTNOINTR */
1548
1549 nt = kthread_create(drbd_thread_setup, (void *) thi,
1550 "drbd%d_%s", mdev_to_minor(mdev), me);
1551
1552 if (IS_ERR(nt)) {
1553 dev_err(DEV, "Couldn't start thread\n");
1554
1555 module_put(THIS_MODULE);
1556 return FALSE;
1557 }
1558 spin_lock_irqsave(&thi->t_lock, flags);
1559 thi->task = nt;
1560 thi->t_state = Running;
1561 spin_unlock_irqrestore(&thi->t_lock, flags);
1562 wake_up_process(nt);
1563 break;
1564 case Exiting:
1565 thi->t_state = Restarting;
1566 dev_info(DEV, "Restarting %s thread (from %s [%d])\n",
1567 me, current->comm, current->pid);
1568 /* fall through */
1569 case Running:
1570 case Restarting:
1571 default:
1572 spin_unlock_irqrestore(&thi->t_lock, flags);
1573 break;
1574 }
1575
1576 return TRUE;
1577}
1578
1579
1580void _drbd_thread_stop(struct drbd_thread *thi, int restart, int wait)
1581{
1582 unsigned long flags;
1583
1584 enum drbd_thread_state ns = restart ? Restarting : Exiting;
1585
1586 /* may be called from state engine, holding the req lock irqsave */
1587 spin_lock_irqsave(&thi->t_lock, flags);
1588
1589 if (thi->t_state == None) {
1590 spin_unlock_irqrestore(&thi->t_lock, flags);
1591 if (restart)
1592 drbd_thread_start(thi);
1593 return;
1594 }
1595
1596 if (thi->t_state != ns) {
1597 if (thi->task == NULL) {
1598 spin_unlock_irqrestore(&thi->t_lock, flags);
1599 return;
1600 }
1601
1602 thi->t_state = ns;
1603 smp_mb();
1604 init_completion(&thi->stop);
1605 if (thi->task != current)
1606 force_sig(DRBD_SIGKILL, thi->task);
1607
1608 }
1609
1610 spin_unlock_irqrestore(&thi->t_lock, flags);
1611
1612 if (wait)
1613 wait_for_completion(&thi->stop);
1614}
1615
1616#ifdef CONFIG_SMP
1617/**
1618 * drbd_calc_cpu_mask() - Generate CPU masks, spread over all CPUs
1619 * @mdev: DRBD device.
1620 *
1621 * Forces all threads of a device onto the same CPU. This is beneficial for
1622 * DRBD's performance. May be overridden by the user's configuration.
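 *
 * Example: with four CPUs online and no user supplied cpu_mask, minor 0
 * is pinned to CPU 0, minor 1 to CPU 1, ..., and minor 5 wraps around to
 * CPU 1 again (5 % 4 == 1).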
1623 */
1624void drbd_calc_cpu_mask(struct drbd_conf *mdev)
1625{
1626 int ord, cpu;
1627
1628 /* user override. */
1629 if (cpumask_weight(mdev->cpu_mask))
1630 return;
1631
1632 ord = mdev_to_minor(mdev) % cpumask_weight(cpu_online_mask);
1633 for_each_online_cpu(cpu) {
1634 if (ord-- == 0) {
1635 cpumask_set_cpu(cpu, mdev->cpu_mask);
1636 return;
1637 }
1638 }
1639 /* should not be reached */
1640 cpumask_setall(mdev->cpu_mask);
1641}
1642
1643/**
1644 * drbd_thread_current_set_cpu() - modifies the cpu mask of the _current_ thread
1645 * @mdev: DRBD device.
1646 *
1647 * Call this in the "main loop" of _all_ threads; no mutex is needed, since
1648 * current won't die prematurely.
1649 */
1650void drbd_thread_current_set_cpu(struct drbd_conf *mdev)
1651{
1652 struct task_struct *p = current;
1653 struct drbd_thread *thi =
1654 p == mdev->asender.task ? &mdev->asender :
1655 p == mdev->receiver.task ? &mdev->receiver :
1656 p == mdev->worker.task ? &mdev->worker :
1657 NULL;
1658 ERR_IF(thi == NULL)
1659 return;
1660 if (!thi->reset_cpu_mask)
1661 return;
1662 thi->reset_cpu_mask = 0;
1663 set_cpus_allowed_ptr(p, mdev->cpu_mask);
1664}
1665#endif
1666
1667/* the appropriate socket mutex must be held already */
1668int _drbd_send_cmd(struct drbd_conf *mdev, struct socket *sock,
Philipp Reisner0b70a132010-08-20 13:36:10 +02001669 enum drbd_packets cmd, struct p_header80 *h,
Philipp Reisnerb411b362009-09-25 16:07:19 -07001670 size_t size, unsigned msg_flags)
1671{
1672 int sent, ok;
1673
1674 ERR_IF(!h) return FALSE;
1675 ERR_IF(!size) return FALSE;
1676
1677 h->magic = BE_DRBD_MAGIC;
1678 h->command = cpu_to_be16(cmd);
Philipp Reisner0b70a132010-08-20 13:36:10 +02001679 h->length = cpu_to_be16(size-sizeof(struct p_header80));
Philipp Reisnerb411b362009-09-25 16:07:19 -07001680
Philipp Reisnerb411b362009-09-25 16:07:19 -07001681 sent = drbd_send(mdev, sock, h, size, msg_flags);
1682
1683 ok = (sent == size);
1684 if (!ok)
1685 dev_err(DEV, "short sent %s size=%d sent=%d\n",
1686 cmdname(cmd), (int)size, sent);
1687 return ok;
1688}
1689
1690/* don't pass the socket. we may only look at it
1691 * when we hold the appropriate socket mutex.
1692 */
1693int drbd_send_cmd(struct drbd_conf *mdev, int use_data_socket,
Philipp Reisner0b70a132010-08-20 13:36:10 +02001694 enum drbd_packets cmd, struct p_header80 *h, size_t size)
Philipp Reisnerb411b362009-09-25 16:07:19 -07001695{
1696 int ok = 0;
1697 struct socket *sock;
1698
1699 if (use_data_socket) {
1700 mutex_lock(&mdev->data.mutex);
1701 sock = mdev->data.socket;
1702 } else {
1703 mutex_lock(&mdev->meta.mutex);
1704 sock = mdev->meta.socket;
1705 }
1706
1707 /* drbd_disconnect() could have called drbd_free_sock()
1708 * while we were waiting in down()... */
1709 if (likely(sock != NULL))
1710 ok = _drbd_send_cmd(mdev, sock, cmd, h, size, 0);
1711
1712 if (use_data_socket)
1713 mutex_unlock(&mdev->data.mutex);
1714 else
1715 mutex_unlock(&mdev->meta.mutex);
1716 return ok;
1717}
1718
1719int drbd_send_cmd2(struct drbd_conf *mdev, enum drbd_packets cmd, char *data,
1720 size_t size)
1721{
Philipp Reisner0b70a132010-08-20 13:36:10 +02001722 struct p_header80 h;
Philipp Reisnerb411b362009-09-25 16:07:19 -07001723 int ok;
1724
1725 h.magic = BE_DRBD_MAGIC;
1726 h.command = cpu_to_be16(cmd);
1727 h.length = cpu_to_be16(size);
1728
1729 if (!drbd_get_data_sock(mdev))
1730 return 0;
1731
Philipp Reisnerb411b362009-09-25 16:07:19 -07001732 ok = (sizeof(h) ==
1733 drbd_send(mdev, mdev->data.socket, &h, sizeof(h), 0));
1734 ok = ok && (size ==
1735 drbd_send(mdev, mdev->data.socket, data, size, 0));
1736
1737 drbd_put_data_sock(mdev);
1738
1739 return ok;
1740}
1741
1742int drbd_send_sync_param(struct drbd_conf *mdev, struct syncer_conf *sc)
1743{
Philipp Reisner8e26f9c2010-07-06 17:25:54 +02001744 struct p_rs_param_95 *p;
Philipp Reisnerb411b362009-09-25 16:07:19 -07001745 struct socket *sock;
1746 int size, rv;
1747 const int apv = mdev->agreed_pro_version;
1748
1749 size = apv <= 87 ? sizeof(struct p_rs_param)
1750 : apv == 88 ? sizeof(struct p_rs_param)
1751 + strlen(mdev->sync_conf.verify_alg) + 1
Philipp Reisner8e26f9c2010-07-06 17:25:54 +02001752 : apv <= 94 ? sizeof(struct p_rs_param_89)
1753 : /* apv >= 95 */ sizeof(struct p_rs_param_95);
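	/* e.g. an apv 88 peer gets the base p_rs_param plus just the
	 * verify_alg string, apv 89..94 peers get the fixed size
	 * p_rs_param_89 layout (verify_alg + csums_alg), and apv >= 95
	 * peers additionally receive the resync controller settings
	 * (c_plan_ahead, c_delay_target, c_fill_target, c_max_rate). */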
Philipp Reisnerb411b362009-09-25 16:07:19 -07001754
1755 /* used from admin command context and receiver/worker context.
1756 * To avoid kmalloc, grab the socket right here,
1757 * then use the pre-allocated send buffer (sbuf) there */
1758 mutex_lock(&mdev->data.mutex);
1759 sock = mdev->data.socket;
1760
1761 if (likely(sock != NULL)) {
1762 enum drbd_packets cmd = apv >= 89 ? P_SYNC_PARAM89 : P_SYNC_PARAM;
1763
Philipp Reisner8e26f9c2010-07-06 17:25:54 +02001764 p = &mdev->data.sbuf.rs_param_95;
Philipp Reisnerb411b362009-09-25 16:07:19 -07001765
1766 /* initialize verify_alg and csums_alg */
1767 memset(p->verify_alg, 0, 2 * SHARED_SECRET_MAX);
1768
1769 p->rate = cpu_to_be32(sc->rate);
Philipp Reisner8e26f9c2010-07-06 17:25:54 +02001770 p->c_plan_ahead = cpu_to_be32(sc->c_plan_ahead);
1771 p->c_delay_target = cpu_to_be32(sc->c_delay_target);
1772 p->c_fill_target = cpu_to_be32(sc->c_fill_target);
1773 p->c_max_rate = cpu_to_be32(sc->c_max_rate);
Philipp Reisnerb411b362009-09-25 16:07:19 -07001774
1775 if (apv >= 88)
1776 strcpy(p->verify_alg, mdev->sync_conf.verify_alg);
1777 if (apv >= 89)
1778 strcpy(p->csums_alg, mdev->sync_conf.csums_alg);
1779
1780 rv = _drbd_send_cmd(mdev, sock, cmd, &p->head, size, 0);
1781 } else
1782 rv = 0; /* not ok */
1783
1784 mutex_unlock(&mdev->data.mutex);
1785
1786 return rv;
1787}
1788
1789int drbd_send_protocol(struct drbd_conf *mdev)
1790{
1791 struct p_protocol *p;
Philipp Reisnercf14c2e2010-02-02 21:03:50 +01001792 int size, cf, rv;
Philipp Reisnerb411b362009-09-25 16:07:19 -07001793
1794 size = sizeof(struct p_protocol);
1795
1796 if (mdev->agreed_pro_version >= 87)
1797 size += strlen(mdev->net_conf->integrity_alg) + 1;
1798
1799 /* we must not recurse into our own queue,
1800 * as that is blocked during handshake */
1801 p = kmalloc(size, GFP_NOIO);
1802 if (p == NULL)
1803 return 0;
1804
1805 p->protocol = cpu_to_be32(mdev->net_conf->wire_protocol);
1806 p->after_sb_0p = cpu_to_be32(mdev->net_conf->after_sb_0p);
1807 p->after_sb_1p = cpu_to_be32(mdev->net_conf->after_sb_1p);
1808 p->after_sb_2p = cpu_to_be32(mdev->net_conf->after_sb_2p);
Philipp Reisnerb411b362009-09-25 16:07:19 -07001809 p->two_primaries = cpu_to_be32(mdev->net_conf->two_primaries);
1810
Philipp Reisnercf14c2e2010-02-02 21:03:50 +01001811 cf = 0;
1812 if (mdev->net_conf->want_lose)
1813 cf |= CF_WANT_LOSE;
1814 if (mdev->net_conf->dry_run) {
1815 if (mdev->agreed_pro_version >= 92)
1816 cf |= CF_DRY_RUN;
1817 else {
1818 dev_err(DEV, "--dry-run is not supported by peer");
Dan Carpenter7ac314c2010-04-22 14:27:23 +02001819 kfree(p);
Philipp Reisnercf14c2e2010-02-02 21:03:50 +01001820 return 0;
1821 }
1822 }
1823 p->conn_flags = cpu_to_be32(cf);
1824
Philipp Reisnerb411b362009-09-25 16:07:19 -07001825 if (mdev->agreed_pro_version >= 87)
1826 strcpy(p->integrity_alg, mdev->net_conf->integrity_alg);
1827
1828 rv = drbd_send_cmd(mdev, USE_DATA_SOCKET, P_PROTOCOL,
Philipp Reisner0b70a132010-08-20 13:36:10 +02001829 (struct p_header80 *)p, size);
Philipp Reisnerb411b362009-09-25 16:07:19 -07001830 kfree(p);
1831 return rv;
1832}
1833
1834int _drbd_send_uuids(struct drbd_conf *mdev, u64 uuid_flags)
1835{
1836 struct p_uuids p;
1837 int i;
1838
1839 if (!get_ldev_if_state(mdev, D_NEGOTIATING))
1840 return 1;
1841
1842 for (i = UI_CURRENT; i < UI_SIZE; i++)
1843 p.uuid[i] = mdev->ldev ? cpu_to_be64(mdev->ldev->md.uuid[i]) : 0;
1844
1845 mdev->comm_bm_set = drbd_bm_total_weight(mdev);
1846 p.uuid[UI_SIZE] = cpu_to_be64(mdev->comm_bm_set);
1847 uuid_flags |= mdev->net_conf->want_lose ? 1 : 0;
1848 uuid_flags |= test_bit(CRASHED_PRIMARY, &mdev->flags) ? 2 : 0;
1849 uuid_flags |= mdev->new_state_tmp.disk == D_INCONSISTENT ? 4 : 0;
1850 p.uuid[UI_FLAGS] = cpu_to_be64(uuid_flags);
1851
1852 put_ldev(mdev);
1853
1854 return drbd_send_cmd(mdev, USE_DATA_SOCKET, P_UUIDS,
Philipp Reisner0b70a132010-08-20 13:36:10 +02001855 (struct p_header80 *)&p, sizeof(p));
Philipp Reisnerb411b362009-09-25 16:07:19 -07001856}
1857
1858int drbd_send_uuids(struct drbd_conf *mdev)
1859{
1860 return _drbd_send_uuids(mdev, 0);
1861}
1862
1863int drbd_send_uuids_skip_initial_sync(struct drbd_conf *mdev)
1864{
1865 return _drbd_send_uuids(mdev, 8);
1866}
1867
1868
1869int drbd_send_sync_uuid(struct drbd_conf *mdev, u64 val)
1870{
1871 struct p_rs_uuid p;
1872
1873 p.uuid = cpu_to_be64(val);
1874
1875 return drbd_send_cmd(mdev, USE_DATA_SOCKET, P_SYNC_UUID,
Philipp Reisner0b70a132010-08-20 13:36:10 +02001876 (struct p_header80 *)&p, sizeof(p));
Philipp Reisnerb411b362009-09-25 16:07:19 -07001877}
1878
Philipp Reisnere89b5912010-03-24 17:11:33 +01001879int drbd_send_sizes(struct drbd_conf *mdev, int trigger_reply, enum dds_flags flags)
Philipp Reisnerb411b362009-09-25 16:07:19 -07001880{
1881 struct p_sizes p;
1882 sector_t d_size, u_size;
1883 int q_order_type;
1884 int ok;
1885
1886 if (get_ldev_if_state(mdev, D_NEGOTIATING)) {
1887 D_ASSERT(mdev->ldev->backing_bdev);
1888 d_size = drbd_get_max_capacity(mdev->ldev);
1889 u_size = mdev->ldev->dc.disk_size;
1890 q_order_type = drbd_queue_order_type(mdev);
Philipp Reisnerb411b362009-09-25 16:07:19 -07001891 put_ldev(mdev);
1892 } else {
1893 d_size = 0;
1894 u_size = 0;
1895 q_order_type = QUEUE_ORDERED_NONE;
1896 }
1897
1898 p.d_size = cpu_to_be64(d_size);
1899 p.u_size = cpu_to_be64(u_size);
1900 p.c_size = cpu_to_be64(trigger_reply ? 0 : drbd_get_capacity(mdev->this_bdev));
1901 p.max_segment_size = cpu_to_be32(queue_max_segment_size(mdev->rq_queue));
Philipp Reisnere89b5912010-03-24 17:11:33 +01001902 p.queue_order_type = cpu_to_be16(q_order_type);
1903 p.dds_flags = cpu_to_be16(flags);
Philipp Reisnerb411b362009-09-25 16:07:19 -07001904
1905 ok = drbd_send_cmd(mdev, USE_DATA_SOCKET, P_SIZES,
Philipp Reisner0b70a132010-08-20 13:36:10 +02001906 (struct p_header80 *)&p, sizeof(p));
Philipp Reisnerb411b362009-09-25 16:07:19 -07001907 return ok;
1908}
1909
1910/**
1911 * drbd_send_state() - Sends the drbd state to the peer
1912 * @mdev: DRBD device.
1913 */
1914int drbd_send_state(struct drbd_conf *mdev)
1915{
1916 struct socket *sock;
1917 struct p_state p;
1918 int ok = 0;
1919
1920 /* Grab state lock so we wont send state if we're in the middle
1921 * of a cluster wide state change on another thread */
1922 drbd_state_lock(mdev);
1923
1924 mutex_lock(&mdev->data.mutex);
1925
1926 p.state = cpu_to_be32(mdev->state.i); /* Within the send mutex */
1927 sock = mdev->data.socket;
1928
1929 if (likely(sock != NULL)) {
1930 ok = _drbd_send_cmd(mdev, sock, P_STATE,
Philipp Reisner0b70a132010-08-20 13:36:10 +02001931 (struct p_header80 *)&p, sizeof(p), 0);
Philipp Reisnerb411b362009-09-25 16:07:19 -07001932 }
1933
1934 mutex_unlock(&mdev->data.mutex);
1935
1936 drbd_state_unlock(mdev);
1937 return ok;
1938}
1939
1940int drbd_send_state_req(struct drbd_conf *mdev,
1941 union drbd_state mask, union drbd_state val)
1942{
1943 struct p_req_state p;
1944
1945 p.mask = cpu_to_be32(mask.i);
1946 p.val = cpu_to_be32(val.i);
1947
1948 return drbd_send_cmd(mdev, USE_DATA_SOCKET, P_STATE_CHG_REQ,
Philipp Reisner0b70a132010-08-20 13:36:10 +02001949 (struct p_header80 *)&p, sizeof(p));
Philipp Reisnerb411b362009-09-25 16:07:19 -07001950}
1951
1952int drbd_send_sr_reply(struct drbd_conf *mdev, int retcode)
1953{
1954 struct p_req_state_reply p;
1955
1956 p.retcode = cpu_to_be32(retcode);
1957
1958 return drbd_send_cmd(mdev, USE_META_SOCKET, P_STATE_CHG_REPLY,
Philipp Reisner0b70a132010-08-20 13:36:10 +02001959 (struct p_header80 *)&p, sizeof(p));
Philipp Reisnerb411b362009-09-25 16:07:19 -07001960}
1961
1962int fill_bitmap_rle_bits(struct drbd_conf *mdev,
1963 struct p_compressed_bm *p,
1964 struct bm_xfer_ctx *c)
1965{
1966 struct bitstream bs;
1967 unsigned long plain_bits;
1968 unsigned long tmp;
1969 unsigned long rl;
1970 unsigned len;
1971 unsigned toggle;
1972 int bits;
1973
1974 /* may we use this feature? */
1975 if ((mdev->sync_conf.use_rle == 0) ||
1976 (mdev->agreed_pro_version < 90))
1977 return 0;
1978
1979 if (c->bit_offset >= c->bm_bits)
1980 return 0; /* nothing to do. */
1981
1982 /* use at most this many bytes */
1983 bitstream_init(&bs, p->code, BM_PACKET_VLI_BYTES_MAX, 0);
1984 memset(p->code, 0, BM_PACKET_VLI_BYTES_MAX);
1985 /* plain bits covered in this code string */
1986 plain_bits = 0;
1987
1988 /* p->encoding & 0x80 stores whether the first run length is set.
1989 * bit offset is implicit.
1990 * start with toggle == 2 to be able to tell the first iteration */
1991 toggle = 2;
1992
1993 /* see how many plain bits we can stuff into one packet
1994 * using RLE and VLI. */
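	/* Illustrative example: if the region to transfer starts with
	 * 5 set bits, then 1000 clear bits, then 3 set bits, the first
	 * checked bit is set, so the start value is 1 and the run lengths
	 * 5, 1000, 3 are VLI-encoded back to back into p->code.  The
	 * receiver can rebuild the exact bit sequence from the start value
	 * and the run lengths alone. */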
1995 do {
1996 tmp = (toggle == 0) ? _drbd_bm_find_next_zero(mdev, c->bit_offset)
1997 : _drbd_bm_find_next(mdev, c->bit_offset);
1998 if (tmp == -1UL)
1999 tmp = c->bm_bits;
2000 rl = tmp - c->bit_offset;
2001
2002 if (toggle == 2) { /* first iteration */
2003 if (rl == 0) {
2004 /* the first checked bit was set,
2005 * store start value, */
2006 DCBP_set_start(p, 1);
2007 /* but skip encoding of zero run length */
2008 toggle = !toggle;
2009 continue;
2010 }
2011 DCBP_set_start(p, 0);
2012 }
2013
2014 /* paranoia: catch zero runlength.
2015 * can only happen if bitmap is modified while we scan it. */
2016 if (rl == 0) {
2017 dev_err(DEV, "unexpected zero runlength while encoding bitmap "
2018 "t:%u bo:%lu\n", toggle, c->bit_offset);
2019 return -1;
2020 }
2021
2022 bits = vli_encode_bits(&bs, rl);
2023 if (bits == -ENOBUFS) /* buffer full */
2024 break;
2025 if (bits <= 0) {
2026 dev_err(DEV, "error while encoding bitmap: %d\n", bits);
2027 return 0;
2028 }
2029
2030 toggle = !toggle;
2031 plain_bits += rl;
2032 c->bit_offset = tmp;
2033 } while (c->bit_offset < c->bm_bits);
2034
2035 len = bs.cur.b - p->code + !!bs.cur.bit;
2036
2037 if (plain_bits < (len << 3)) {
2038 /* incompressible with this method.
2039 * we need to rewind both word and bit position. */
2040 c->bit_offset -= plain_bits;
2041 bm_xfer_ctx_bit_to_word_offset(c);
2042 c->bit_offset = c->word_offset * BITS_PER_LONG;
2043 return 0;
2044 }
2045
2046 /* RLE + VLI was able to compress it just fine.
2047 * update c->word_offset. */
2048 bm_xfer_ctx_bit_to_word_offset(c);
2049
2050 /* store pad_bits */
2051 DCBP_set_pad_bits(p, (8 - bs.cur.bit) & 0x7);
2052
2053 return len;
2054}
2055
2056enum { OK, FAILED, DONE }
2057send_bitmap_rle_or_plain(struct drbd_conf *mdev,
Philipp Reisner0b70a132010-08-20 13:36:10 +02002058 struct p_header80 *h, struct bm_xfer_ctx *c)
Philipp Reisnerb411b362009-09-25 16:07:19 -07002059{
2060 struct p_compressed_bm *p = (void*)h;
2061 unsigned long num_words;
2062 int len;
2063 int ok;
2064
2065 len = fill_bitmap_rle_bits(mdev, p, c);
2066
2067 if (len < 0)
2068 return FAILED;
2069
2070 if (len) {
2071 DCBP_set_code(p, RLE_VLI_Bits);
2072 ok = _drbd_send_cmd(mdev, mdev->data.socket, P_COMPRESSED_BITMAP, h,
2073 sizeof(*p) + len, 0);
2074
2075 c->packets[0]++;
2076 c->bytes[0] += sizeof(*p) + len;
2077
2078 if (c->bit_offset >= c->bm_bits)
2079 len = 0; /* DONE */
2080 } else {
2081 /* was not compressible.
2082 * send a buffer full of plain text bits instead. */
2083 num_words = min_t(size_t, BM_PACKET_WORDS, c->bm_words - c->word_offset);
2084 len = num_words * sizeof(long);
2085 if (len)
2086 drbd_bm_get_lel(mdev, c->word_offset, num_words, (unsigned long*)h->payload);
2087 ok = _drbd_send_cmd(mdev, mdev->data.socket, P_BITMAP,
Philipp Reisner0b70a132010-08-20 13:36:10 +02002088 h, sizeof(struct p_header80) + len, 0);
Philipp Reisnerb411b362009-09-25 16:07:19 -07002089 c->word_offset += num_words;
2090 c->bit_offset = c->word_offset * BITS_PER_LONG;
2091
2092 c->packets[1]++;
Philipp Reisner0b70a132010-08-20 13:36:10 +02002093 c->bytes[1] += sizeof(struct p_header80) + len;
Philipp Reisnerb411b362009-09-25 16:07:19 -07002094
2095 if (c->bit_offset > c->bm_bits)
2096 c->bit_offset = c->bm_bits;
2097 }
2098 ok = ok ? ((len == 0) ? DONE : OK) : FAILED;
2099
2100 if (ok == DONE)
2101 INFO_bm_xfer_stats(mdev, "send", c);
2102 return ok;
2103}
2104
2105/* See the comment at receive_bitmap() */
2106int _drbd_send_bitmap(struct drbd_conf *mdev)
2107{
2108 struct bm_xfer_ctx c;
Philipp Reisner0b70a132010-08-20 13:36:10 +02002109 struct p_header80 *p;
Philipp Reisnerb411b362009-09-25 16:07:19 -07002110 int ret;
2111
2112 ERR_IF(!mdev->bitmap) return FALSE;
2113
2114 /* maybe we should use some per thread scratch page,
2115 * and allocate that during initial device creation? */
Philipp Reisner0b70a132010-08-20 13:36:10 +02002116 p = (struct p_header80 *) __get_free_page(GFP_NOIO);
Philipp Reisnerb411b362009-09-25 16:07:19 -07002117 if (!p) {
2118 dev_err(DEV, "failed to allocate one page buffer in %s\n", __func__);
2119 return FALSE;
2120 }
2121
2122 if (get_ldev(mdev)) {
2123 if (drbd_md_test_flag(mdev->ldev, MDF_FULL_SYNC)) {
2124 dev_info(DEV, "Writing the whole bitmap, MDF_FullSync was set.\n");
2125 drbd_bm_set_all(mdev);
2126 if (drbd_bm_write(mdev)) {
2127 /* writing the bitmap failed! Leave the full sync flag set in the meta data,
2128 * but otherwise proceed as normal - we need to tell the other
2129 * side that a full resync is required! */
2130 dev_err(DEV, "Failed to write bitmap to disk!\n");
2131 } else {
2132 drbd_md_clear_flag(mdev, MDF_FULL_SYNC);
2133 drbd_md_sync(mdev);
2134 }
2135 }
2136 put_ldev(mdev);
2137 }
2138
2139 c = (struct bm_xfer_ctx) {
2140 .bm_bits = drbd_bm_bits(mdev),
2141 .bm_words = drbd_bm_words(mdev),
2142 };
2143
2144 do {
2145 ret = send_bitmap_rle_or_plain(mdev, p, &c);
2146 } while (ret == OK);
2147
2148 free_page((unsigned long) p);
2149 return (ret == DONE);
2150}
2151
2152int drbd_send_bitmap(struct drbd_conf *mdev)
2153{
2154 int err;
2155
2156 if (!drbd_get_data_sock(mdev))
2157 return -1;
2158 err = !_drbd_send_bitmap(mdev);
2159 drbd_put_data_sock(mdev);
2160 return err;
2161}
2162
2163int drbd_send_b_ack(struct drbd_conf *mdev, u32 barrier_nr, u32 set_size)
2164{
2165 int ok;
2166 struct p_barrier_ack p;
2167
2168 p.barrier = barrier_nr;
2169 p.set_size = cpu_to_be32(set_size);
2170
2171 if (mdev->state.conn < C_CONNECTED)
2172 return FALSE;
2173 ok = drbd_send_cmd(mdev, USE_META_SOCKET, P_BARRIER_ACK,
Philipp Reisner0b70a132010-08-20 13:36:10 +02002174 (struct p_header80 *)&p, sizeof(p));
Philipp Reisnerb411b362009-09-25 16:07:19 -07002175 return ok;
2176}
2177
2178/**
2179 * _drbd_send_ack() - Sends an ack packet
2180 * @mdev: DRBD device.
2181 * @cmd: Packet command code.
2182 * @sector: sector, needs to be in big endian byte order
2183 * @blksize: size in byte, needs to be in big endian byte order
2184 * @block_id: Id, big endian byte order
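 *
 * Callers such as drbd_send_ack_rp() pass these values straight from the
 * peer's request packet (already big endian), while drbd_send_ack_ex()
 * converts native values with cpu_to_be64()/cpu_to_be32() first.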
2185 */
2186static int _drbd_send_ack(struct drbd_conf *mdev, enum drbd_packets cmd,
2187 u64 sector,
2188 u32 blksize,
2189 u64 block_id)
2190{
2191 int ok;
2192 struct p_block_ack p;
2193
2194 p.sector = sector;
2195 p.block_id = block_id;
2196 p.blksize = blksize;
2197 p.seq_num = cpu_to_be32(atomic_add_return(1, &mdev->packet_seq));
2198
2199 if (!mdev->meta.socket || mdev->state.conn < C_CONNECTED)
2200 return FALSE;
2201 ok = drbd_send_cmd(mdev, USE_META_SOCKET, cmd,
Philipp Reisner0b70a132010-08-20 13:36:10 +02002202 (struct p_header80 *)&p, sizeof(p));
Philipp Reisnerb411b362009-09-25 16:07:19 -07002203 return ok;
2204}
2205
Lars Ellenberg2b2bf212010-10-06 11:46:55 +02002206/* dp->sector and dp->block_id already/still in network byte order,
2207 * data_size is payload size according to dp->head,
2208 * and may need to be corrected for digest size. */
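/* For example, with protocol >= 87 and a 16 byte integrity digest in use
 * (16 is just an example digest size), a 4096 byte write arrives with
 * data_size == 4096 + 16 according to the header; the ack must report
 * 4096, so the digest size is subtracted again below. */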
Philipp Reisnerb411b362009-09-25 16:07:19 -07002209int drbd_send_ack_dp(struct drbd_conf *mdev, enum drbd_packets cmd,
Lars Ellenberg2b2bf212010-10-06 11:46:55 +02002210 struct p_data *dp, int data_size)
Philipp Reisnerb411b362009-09-25 16:07:19 -07002211{
Lars Ellenberg2b2bf212010-10-06 11:46:55 +02002212 data_size -= (mdev->agreed_pro_version >= 87 && mdev->integrity_r_tfm) ?
2213 crypto_hash_digestsize(mdev->integrity_r_tfm) : 0;
Philipp Reisnerb411b362009-09-25 16:07:19 -07002214 return _drbd_send_ack(mdev, cmd, dp->sector, cpu_to_be32(data_size),
2215 dp->block_id);
2216}
2217
2218int drbd_send_ack_rp(struct drbd_conf *mdev, enum drbd_packets cmd,
2219 struct p_block_req *rp)
2220{
2221 return _drbd_send_ack(mdev, cmd, rp->sector, rp->blksize, rp->block_id);
2222}
2223
2224/**
2225 * drbd_send_ack() - Sends an ack packet
2226 * @mdev: DRBD device.
2227 * @cmd: Packet command code.
2228 * @e: Epoch entry.
2229 */
2230int drbd_send_ack(struct drbd_conf *mdev,
2231 enum drbd_packets cmd, struct drbd_epoch_entry *e)
2232{
2233 return _drbd_send_ack(mdev, cmd,
2234 cpu_to_be64(e->sector),
2235 cpu_to_be32(e->size),
2236 e->block_id);
2237}
2238
2239/* This function misuses the block_id field to signal if the blocks
2240 * are in sync or not. */
2241int drbd_send_ack_ex(struct drbd_conf *mdev, enum drbd_packets cmd,
2242 sector_t sector, int blksize, u64 block_id)
2243{
2244 return _drbd_send_ack(mdev, cmd,
2245 cpu_to_be64(sector),
2246 cpu_to_be32(blksize),
2247 cpu_to_be64(block_id));
2248}
2249
2250int drbd_send_drequest(struct drbd_conf *mdev, int cmd,
2251 sector_t sector, int size, u64 block_id)
2252{
2253 int ok;
2254 struct p_block_req p;
2255
2256 p.sector = cpu_to_be64(sector);
2257 p.block_id = block_id;
2258 p.blksize = cpu_to_be32(size);
2259
2260 ok = drbd_send_cmd(mdev, USE_DATA_SOCKET, cmd,
Philipp Reisner0b70a132010-08-20 13:36:10 +02002261 (struct p_header80 *)&p, sizeof(p));
Philipp Reisnerb411b362009-09-25 16:07:19 -07002262 return ok;
2263}
2264
2265int drbd_send_drequest_csum(struct drbd_conf *mdev,
2266 sector_t sector, int size,
2267 void *digest, int digest_size,
2268 enum drbd_packets cmd)
2269{
2270 int ok;
2271 struct p_block_req p;
2272
2273 p.sector = cpu_to_be64(sector);
2274 p.block_id = BE_DRBD_MAGIC + 0xbeef;
2275 p.blksize = cpu_to_be32(size);
2276
2277 p.head.magic = BE_DRBD_MAGIC;
2278 p.head.command = cpu_to_be16(cmd);
Philipp Reisner0b70a132010-08-20 13:36:10 +02002279 p.head.length = cpu_to_be16(sizeof(p) - sizeof(struct p_header80) + digest_size);
Philipp Reisnerb411b362009-09-25 16:07:19 -07002280
2281 mutex_lock(&mdev->data.mutex);
2282
2283 ok = (sizeof(p) == drbd_send(mdev, mdev->data.socket, &p, sizeof(p), 0));
2284 ok = ok && (digest_size == drbd_send(mdev, mdev->data.socket, digest, digest_size, 0));
2285
2286 mutex_unlock(&mdev->data.mutex);
2287
2288 return ok;
2289}
2290
2291int drbd_send_ov_request(struct drbd_conf *mdev, sector_t sector, int size)
2292{
2293 int ok;
2294 struct p_block_req p;
2295
2296 p.sector = cpu_to_be64(sector);
2297 p.block_id = BE_DRBD_MAGIC + 0xbabe;
2298 p.blksize = cpu_to_be32(size);
2299
2300 ok = drbd_send_cmd(mdev, USE_DATA_SOCKET, P_OV_REQUEST,
Philipp Reisner0b70a132010-08-20 13:36:10 +02002301 (struct p_header80 *)&p, sizeof(p));
Philipp Reisnerb411b362009-09-25 16:07:19 -07002302 return ok;
2303}
2304
2305/* called on sndtimeo
2306 * returns FALSE if we should retry,
2307 * TRUE if we think connection is dead
2308 */
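/* Rough illustration (example values, not defaults): with
 * net_conf->ko_count == 7 and a 6 second send timeout, a peer that never
 * drains its socket is declared dead after about 7 * 6 = 42 seconds of
 * blocked sendmsg calls; each expired timeout before that also requests
 * a ping on the meta socket. */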
2309static int we_should_drop_the_connection(struct drbd_conf *mdev, struct socket *sock)
2310{
2311 int drop_it;
2312 /* long elapsed = (long)(jiffies - mdev->last_received); */
2313
2314 drop_it = mdev->meta.socket == sock
2315 || !mdev->asender.task
2316 || get_t_state(&mdev->asender) != Running
2317 || mdev->state.conn < C_CONNECTED;
2318
2319 if (drop_it)
2320 return TRUE;
2321
2322 drop_it = !--mdev->ko_count;
2323 if (!drop_it) {
2324 dev_err(DEV, "[%s/%d] sock_sendmsg time expired, ko = %u\n",
2325 current->comm, current->pid, mdev->ko_count);
2326 request_ping(mdev);
2327 }
2328
2329 return drop_it; /* && (mdev->state == R_PRIMARY) */;
2330}
2331
2332/* The idea of sendpage seems to be to put some kind of reference
2333 * to the page into the skb, and to hand it over to the NIC. In
2334 * this process get_page() gets called.
2335 *
2336 * As soon as the page was really sent over the network put_page()
2337 * gets called by some part of the network layer. [ NIC driver? ]
2338 *
2339 * [ get_page() / put_page() increment/decrement the count. If count
2340 * reaches 0 the page will be freed. ]
2341 *
2342 * This works nicely with pages from FSs.
2343 * But this means that in protocol A we might signal IO completion too early!
2344 *
2345 * In order not to corrupt data during a resync we must make sure
2346 * that we do not reuse our own buffer pages (EEs) too early, therefore
2347 * we have the net_ee list.
2348 *
2349 * XFS still seems to have problems: it submits pages with page_count == 0!
2350 * As a workaround, we disable sendpage on pages
2351 * with page_count == 0 or PageSlab.
2352 */
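/* Rough illustration of the problem: in protocol A a write is reported
 * as completed as soon as it has been handed to our TCP stack, while the
 * network layer may still hold a reference to the page.  For our own
 * resync buffers (EEs) we therefore park sent entries on the net_ee list
 * until it is safe to reuse their pages, instead of overwriting data
 * that might not have left the machine yet. */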
2353static int _drbd_no_send_page(struct drbd_conf *mdev, struct page *page,
Lars Ellenbergba11ad92010-05-25 16:26:16 +02002354 int offset, size_t size, unsigned msg_flags)
Philipp Reisnerb411b362009-09-25 16:07:19 -07002355{
Lars Ellenbergba11ad92010-05-25 16:26:16 +02002356 int sent = drbd_send(mdev, mdev->data.socket, kmap(page) + offset, size, msg_flags);
Philipp Reisnerb411b362009-09-25 16:07:19 -07002357 kunmap(page);
2358 if (sent == size)
2359 mdev->send_cnt += size>>9;
2360 return sent == size;
2361}
2362
2363static int _drbd_send_page(struct drbd_conf *mdev, struct page *page,
Lars Ellenbergba11ad92010-05-25 16:26:16 +02002364 int offset, size_t size, unsigned msg_flags)
Philipp Reisnerb411b362009-09-25 16:07:19 -07002365{
2366 mm_segment_t oldfs = get_fs();
2367 int sent, ok;
2368 int len = size;
2369
2370 /* e.g. XFS meta- & log-data is in slab pages, which have a
2371 * page_count of 0 and/or have PageSlab() set.
2372 * we cannot use send_page for those, as that does get_page();
2373 * put_page(); and would cause either a VM_BUG directly, or
2374 * __page_cache_release a page that would actually still be referenced
2375 * by someone, leading to some obscure delayed Oops somewhere else. */
2376 if (disable_sendpage || (page_count(page) < 1) || PageSlab(page))
Lars Ellenbergba11ad92010-05-25 16:26:16 +02002377 return _drbd_no_send_page(mdev, page, offset, size, msg_flags);
Philipp Reisnerb411b362009-09-25 16:07:19 -07002378
Lars Ellenbergba11ad92010-05-25 16:26:16 +02002379 msg_flags |= MSG_NOSIGNAL;
Philipp Reisnerb411b362009-09-25 16:07:19 -07002380 drbd_update_congested(mdev);
2381 set_fs(KERNEL_DS);
2382 do {
2383 sent = mdev->data.socket->ops->sendpage(mdev->data.socket, page,
2384 offset, len,
Lars Ellenbergba11ad92010-05-25 16:26:16 +02002385 msg_flags);
Philipp Reisnerb411b362009-09-25 16:07:19 -07002386 if (sent == -EAGAIN) {
2387 if (we_should_drop_the_connection(mdev,
2388 mdev->data.socket))
2389 break;
2390 else
2391 continue;
2392 }
2393 if (sent <= 0) {
2394 dev_warn(DEV, "%s: size=%d len=%d sent=%d\n",
2395 __func__, (int)size, len, sent);
2396 break;
2397 }
2398 len -= sent;
2399 offset += sent;
2400 } while (len > 0 /* THINK && mdev->cstate >= C_CONNECTED*/);
2401 set_fs(oldfs);
2402 clear_bit(NET_CONGESTED, &mdev->flags);
2403
2404 ok = (len == 0);
2405 if (likely(ok))
2406 mdev->send_cnt += size>>9;
2407 return ok;
2408}
2409
2410static int _drbd_send_bio(struct drbd_conf *mdev, struct bio *bio)
2411{
2412 struct bio_vec *bvec;
2413 int i;
Lars Ellenbergba11ad92010-05-25 16:26:16 +02002414 /* hint all but last page with MSG_MORE */
Philipp Reisnerb411b362009-09-25 16:07:19 -07002415 __bio_for_each_segment(bvec, bio, i, 0) {
2416 if (!_drbd_no_send_page(mdev, bvec->bv_page,
Lars Ellenbergba11ad92010-05-25 16:26:16 +02002417 bvec->bv_offset, bvec->bv_len,
2418 i == bio->bi_vcnt -1 ? 0 : MSG_MORE))
Philipp Reisnerb411b362009-09-25 16:07:19 -07002419 return 0;
2420 }
2421 return 1;
2422}
2423
2424static int _drbd_send_zc_bio(struct drbd_conf *mdev, struct bio *bio)
2425{
2426 struct bio_vec *bvec;
2427 int i;
Lars Ellenbergba11ad92010-05-25 16:26:16 +02002428 /* hint all but last page with MSG_MORE */
Philipp Reisnerb411b362009-09-25 16:07:19 -07002429 __bio_for_each_segment(bvec, bio, i, 0) {
2430 if (!_drbd_send_page(mdev, bvec->bv_page,
Lars Ellenbergba11ad92010-05-25 16:26:16 +02002431 bvec->bv_offset, bvec->bv_len,
2432 i == bio->bi_vcnt -1 ? 0 : MSG_MORE))
Philipp Reisnerb411b362009-09-25 16:07:19 -07002433 return 0;
2434 }
Philipp Reisnerb411b362009-09-25 16:07:19 -07002435 return 1;
2436}
2437
Lars Ellenberg45bb9122010-05-14 17:10:48 +02002438static int _drbd_send_zc_ee(struct drbd_conf *mdev, struct drbd_epoch_entry *e)
2439{
2440 struct page *page = e->pages;
2441 unsigned len = e->size;
Lars Ellenbergba11ad92010-05-25 16:26:16 +02002442 /* hint all but last page with MSG_MORE */
Lars Ellenberg45bb9122010-05-14 17:10:48 +02002443 page_chain_for_each(page) {
2444 unsigned l = min_t(unsigned, len, PAGE_SIZE);
Lars Ellenbergba11ad92010-05-25 16:26:16 +02002445 if (!_drbd_send_page(mdev, page, 0, l,
2446 page_chain_next(page) ? MSG_MORE : 0))
Lars Ellenberg45bb9122010-05-14 17:10:48 +02002447 return 0;
2448 len -= l;
2449 }
2450 return 1;
2451}
2452
Philipp Reisner76d2e7e2010-08-25 11:58:05 +02002453static u32 bio_flags_to_wire(struct drbd_conf *mdev, unsigned long bi_rw)
2454{
2455 if (mdev->agreed_pro_version >= 95)
2456 return (bi_rw & REQ_SYNC ? DP_RW_SYNC : 0) |
2457 (bi_rw & REQ_UNPLUG ? DP_UNPLUG : 0) |
2458 (bi_rw & REQ_FUA ? DP_FUA : 0) |
2459 (bi_rw & REQ_FLUSH ? DP_FLUSH : 0) |
2460 (bi_rw & REQ_DISCARD ? DP_DISCARD : 0);
2461 else
2462 return bi_rw & (REQ_SYNC | REQ_UNPLUG) ? DP_RW_SYNC : 0;
2463}
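/* Example: against an apv >= 95 peer a bio with REQ_SYNC | REQ_FUA set is
 * sent as DP_RW_SYNC | DP_FUA, while against an older peer the same bio
 * collapses to just DP_RW_SYNC, since the finer grained flags do not
 * exist on the wire before protocol 95. */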
2464
Philipp Reisnerb411b362009-09-25 16:07:19 -07002465/* Used to send write requests
2466 * R_PRIMARY -> Peer (P_DATA)
2467 */
2468int drbd_send_dblock(struct drbd_conf *mdev, struct drbd_request *req)
2469{
2470 int ok = 1;
2471 struct p_data p;
2472 unsigned int dp_flags = 0;
2473 void *dgb;
2474 int dgs;
2475
2476 if (!drbd_get_data_sock(mdev))
2477 return 0;
2478
2479 dgs = (mdev->agreed_pro_version >= 87 && mdev->integrity_w_tfm) ?
2480 crypto_hash_digestsize(mdev->integrity_w_tfm) : 0;
2481
Philipp Reisnerd5373382010-08-23 15:18:33 +02002482 if (req->size <= DRBD_MAX_SIZE_H80_PACKET) {
Philipp Reisner0b70a132010-08-20 13:36:10 +02002483 p.head.h80.magic = BE_DRBD_MAGIC;
2484 p.head.h80.command = cpu_to_be16(P_DATA);
2485 p.head.h80.length =
2486 cpu_to_be16(sizeof(p) - sizeof(union p_header) + dgs + req->size);
2487 } else {
2488 p.head.h95.magic = BE_DRBD_MAGIC_BIG;
2489 p.head.h95.command = cpu_to_be16(P_DATA);
2490 p.head.h95.length =
2491 cpu_to_be32(sizeof(p) - sizeof(union p_header) + dgs + req->size);
2492 }
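	/* Requests up to DRBD_MAX_SIZE_H80_PACKET use the old style h80
	 * header with its 16 bit length field; anything larger is sent
	 * with the h95 header, which carries the length in 32 bits. */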
Philipp Reisnerb411b362009-09-25 16:07:19 -07002493
2494 p.sector = cpu_to_be64(req->sector);
2495 p.block_id = (unsigned long)req;
2496 p.seq_num = cpu_to_be32(req->seq_num =
2497 atomic_add_return(1, &mdev->packet_seq));
Philipp Reisnerb411b362009-09-25 16:07:19 -07002498
Philipp Reisner76d2e7e2010-08-25 11:58:05 +02002499 dp_flags = bio_flags_to_wire(mdev, req->master_bio->bi_rw);
2500
Philipp Reisnerb411b362009-09-25 16:07:19 -07002501 if (mdev->state.conn >= C_SYNC_SOURCE &&
2502 mdev->state.conn <= C_PAUSED_SYNC_T)
2503 dp_flags |= DP_MAY_SET_IN_SYNC;
2504
2505 p.dp_flags = cpu_to_be32(dp_flags);
Philipp Reisnerb411b362009-09-25 16:07:19 -07002506 set_bit(UNPLUG_REMOTE, &mdev->flags);
2507 ok = (sizeof(p) ==
Lars Ellenbergba11ad92010-05-25 16:26:16 +02002508 drbd_send(mdev, mdev->data.socket, &p, sizeof(p), dgs ? MSG_MORE : 0));
Philipp Reisnerb411b362009-09-25 16:07:19 -07002509 if (ok && dgs) {
2510 dgb = mdev->int_dig_out;
Lars Ellenberg45bb9122010-05-14 17:10:48 +02002511 drbd_csum_bio(mdev, mdev->integrity_w_tfm, req->master_bio, dgb);
Lars Ellenbergba11ad92010-05-25 16:26:16 +02002512 ok = drbd_send(mdev, mdev->data.socket, dgb, dgs, 0);
Philipp Reisnerb411b362009-09-25 16:07:19 -07002513 }
2514 if (ok) {
2515 if (mdev->net_conf->wire_protocol == DRBD_PROT_A)
2516 ok = _drbd_send_bio(mdev, req->master_bio);
2517 else
2518 ok = _drbd_send_zc_bio(mdev, req->master_bio);
2519 }
2520
2521 drbd_put_data_sock(mdev);
Philipp Reisnerbd26bfc2010-05-04 12:33:58 +02002522
Philipp Reisnerb411b362009-09-25 16:07:19 -07002523 return ok;
2524}
2525
2526/* answer packet, used to send data back for read requests:
2527 * Peer -> (diskless) R_PRIMARY (P_DATA_REPLY)
2528 * C_SYNC_SOURCE -> C_SYNC_TARGET (P_RS_DATA_REPLY)
2529 */
2530int drbd_send_block(struct drbd_conf *mdev, enum drbd_packets cmd,
2531 struct drbd_epoch_entry *e)
2532{
2533 int ok;
2534 struct p_data p;
2535 void *dgb;
2536 int dgs;
2537
2538 dgs = (mdev->agreed_pro_version >= 87 && mdev->integrity_w_tfm) ?
2539 crypto_hash_digestsize(mdev->integrity_w_tfm) : 0;
2540
Philipp Reisnerd5373382010-08-23 15:18:33 +02002541 if (e->size <= DRBD_MAX_SIZE_H80_PACKET) {
Philipp Reisner0b70a132010-08-20 13:36:10 +02002542 p.head.h80.magic = BE_DRBD_MAGIC;
2543 p.head.h80.command = cpu_to_be16(cmd);
2544 p.head.h80.length =
2545 cpu_to_be16(sizeof(p) - sizeof(struct p_header80) + dgs + e->size);
2546 } else {
2547 p.head.h95.magic = BE_DRBD_MAGIC_BIG;
2548 p.head.h95.command = cpu_to_be16(cmd);
2549 p.head.h95.length =
2550 cpu_to_be32(sizeof(p) - sizeof(struct p_header80) + dgs + e->size);
2551 }
Philipp Reisnerb411b362009-09-25 16:07:19 -07002552
2553 p.sector = cpu_to_be64(e->sector);
2554 p.block_id = e->block_id;
2555 /* p.seq_num = 0; No sequence numbers here.. */
2556
2557 /* Only called by our kernel thread.
2558 * This one may be interrupted by DRBD_SIG and/or DRBD_SIGKILL
2559 * in response to admin command or module unload.
2560 */
2561 if (!drbd_get_data_sock(mdev))
2562 return 0;
2563
Philipp Reisner0b70a132010-08-20 13:36:10 +02002564 ok = sizeof(p) == drbd_send(mdev, mdev->data.socket, &p, sizeof(p), dgs ? MSG_MORE : 0);
Philipp Reisnerb411b362009-09-25 16:07:19 -07002565 if (ok && dgs) {
2566 dgb = mdev->int_dig_out;
Lars Ellenberg45bb9122010-05-14 17:10:48 +02002567 drbd_csum_ee(mdev, mdev->integrity_w_tfm, e, dgb);
Lars Ellenbergba11ad92010-05-25 16:26:16 +02002568 ok = drbd_send(mdev, mdev->data.socket, dgb, dgs, 0);
Philipp Reisnerb411b362009-09-25 16:07:19 -07002569 }
2570 if (ok)
Lars Ellenberg45bb9122010-05-14 17:10:48 +02002571 ok = _drbd_send_zc_ee(mdev, e);
Philipp Reisnerb411b362009-09-25 16:07:19 -07002572
2573 drbd_put_data_sock(mdev);
Philipp Reisnerbd26bfc2010-05-04 12:33:58 +02002574
Philipp Reisnerb411b362009-09-25 16:07:19 -07002575 return ok;
2576}
2577
2578/*
2579 drbd_send distinguishes two cases:
2580
2581 Packets sent via the data socket "sock"
2582 and packets sent via the meta data socket "msock"
2583
2584 sock msock
2585 -----------------+-------------------------+------------------------------
2586 timeout conf.timeout / 2 conf.timeout / 2
2587 timeout action send a ping via msock Abort communication
2588 and close all sockets
2589*/
2590
2591/*
2592 * you must have down()ed the appropriate [m]sock_mutex elsewhere!
2593 */
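/* Typical call pattern, as used by drbd_send_cmd() above:
 *
 *	mutex_lock(&mdev->data.mutex);
 *	sent = drbd_send(mdev, mdev->data.socket, buf, size, 0);
 *	mutex_unlock(&mdev->data.mutex);
 *
 * The return value is the number of bytes actually sent (possibly short
 * on error), or -1000 if no socket was passed in. */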
2594int drbd_send(struct drbd_conf *mdev, struct socket *sock,
2595 void *buf, size_t size, unsigned msg_flags)
2596{
2597 struct kvec iov;
2598 struct msghdr msg;
2599 int rv, sent = 0;
2600
2601 if (!sock)
2602 return -1000;
2603
2604 /* THINK if (signal_pending) return ... ? */
2605
2606 iov.iov_base = buf;
2607 iov.iov_len = size;
2608
2609 msg.msg_name = NULL;
2610 msg.msg_namelen = 0;
2611 msg.msg_control = NULL;
2612 msg.msg_controllen = 0;
2613 msg.msg_flags = msg_flags | MSG_NOSIGNAL;
2614
2615 if (sock == mdev->data.socket) {
2616 mdev->ko_count = mdev->net_conf->ko_count;
2617 drbd_update_congested(mdev);
2618 }
2619 do {
2620 /* STRANGE
2621 * tcp_sendmsg does _not_ use its size parameter at all ?
2622 *
2623 * -EAGAIN on timeout, -EINTR on signal.
2624 */
2625/* THINK
2626 * do we need to block DRBD_SIG if sock == &meta.socket ??
2627 * otherwise wake_asender() might interrupt some send_*Ack !
2628 */
2629 rv = kernel_sendmsg(sock, &msg, &iov, 1, size);
2630 if (rv == -EAGAIN) {
2631 if (we_should_drop_the_connection(mdev, sock))
2632 break;
2633 else
2634 continue;
2635 }
2636 D_ASSERT(rv != 0);
2637 if (rv == -EINTR) {
2638 flush_signals(current);
2639 rv = 0;
2640 }
2641 if (rv < 0)
2642 break;
2643 sent += rv;
2644 iov.iov_base += rv;
2645 iov.iov_len -= rv;
2646 } while (sent < size);
2647
2648 if (sock == mdev->data.socket)
2649 clear_bit(NET_CONGESTED, &mdev->flags);
2650
2651 if (rv <= 0) {
2652 if (rv != -EAGAIN) {
2653 dev_err(DEV, "%s_sendmsg returned %d\n",
2654 sock == mdev->meta.socket ? "msock" : "sock",
2655 rv);
2656 drbd_force_state(mdev, NS(conn, C_BROKEN_PIPE));
2657 } else
2658 drbd_force_state(mdev, NS(conn, C_TIMEOUT));
2659 }
2660
2661 return sent;
2662}
2663
2664static int drbd_open(struct block_device *bdev, fmode_t mode)
2665{
2666 struct drbd_conf *mdev = bdev->bd_disk->private_data;
2667 unsigned long flags;
2668 int rv = 0;
2669
Arnd Bergmann6e9624b2010-08-07 18:25:34 +02002670 lock_kernel();
Philipp Reisnerb411b362009-09-25 16:07:19 -07002671 spin_lock_irqsave(&mdev->req_lock, flags);
2672 /* to have a stable mdev->state.role
2673 * and no race with updating open_cnt */
2674
2675 if (mdev->state.role != R_PRIMARY) {
2676 if (mode & FMODE_WRITE)
2677 rv = -EROFS;
2678 else if (!allow_oos)
2679 rv = -EMEDIUMTYPE;
2680 }
2681
2682 if (!rv)
2683 mdev->open_cnt++;
2684 spin_unlock_irqrestore(&mdev->req_lock, flags);
Arnd Bergmann6e9624b2010-08-07 18:25:34 +02002685 unlock_kernel();
Philipp Reisnerb411b362009-09-25 16:07:19 -07002686
2687 return rv;
2688}
2689
2690static int drbd_release(struct gendisk *gd, fmode_t mode)
2691{
2692 struct drbd_conf *mdev = gd->private_data;
Arnd Bergmann6e9624b2010-08-07 18:25:34 +02002693 lock_kernel();
Philipp Reisnerb411b362009-09-25 16:07:19 -07002694 mdev->open_cnt--;
Arnd Bergmann6e9624b2010-08-07 18:25:34 +02002695 unlock_kernel();
Philipp Reisnerb411b362009-09-25 16:07:19 -07002696 return 0;
2697}
2698
2699static void drbd_unplug_fn(struct request_queue *q)
2700{
2701 struct drbd_conf *mdev = q->queuedata;
2702
Philipp Reisnerb411b362009-09-25 16:07:19 -07002703 /* unplug FIRST */
2704 spin_lock_irq(q->queue_lock);
2705 blk_remove_plug(q);
2706 spin_unlock_irq(q->queue_lock);
2707
2708 /* only if connected */
2709 spin_lock_irq(&mdev->req_lock);
2710 if (mdev->state.pdsk >= D_INCONSISTENT && mdev->state.conn >= C_CONNECTED) {
2711 D_ASSERT(mdev->state.role == R_PRIMARY);
2712 if (test_and_clear_bit(UNPLUG_REMOTE, &mdev->flags)) {
2713 /* add to the data.work queue,
2714 * unless already queued.
2715 * XXX this might be a good addition to drbd_queue_work
2716 * anyways, to detect "double queuing" ... */
2717 if (list_empty(&mdev->unplug_work.list))
2718 drbd_queue_work(&mdev->data.work,
2719 &mdev->unplug_work);
2720 }
2721 }
2722 spin_unlock_irq(&mdev->req_lock);
2723
2724 if (mdev->state.disk >= D_INCONSISTENT)
2725 drbd_kick_lo(mdev);
2726}
2727
2728static void drbd_set_defaults(struct drbd_conf *mdev)
2729{
Philipp Reisner85f4cc12010-06-29 17:35:34 +02002730 /* This way we get a compile error when sync_conf grows,
2731 and we forgot to initialize it here */
2732 mdev->sync_conf = (struct syncer_conf) {
2733 /* .rate = */ DRBD_RATE_DEF,
2734 /* .after = */ DRBD_AFTER_DEF,
2735 /* .al_extents = */ DRBD_AL_EXTENTS_DEF,
Philipp Reisner85f4cc12010-06-29 17:35:34 +02002736 /* .verify_alg = */ {}, 0,
2737 /* .cpu_mask = */ {}, 0,
2738 /* .csums_alg = */ {}, 0,
Philipp Reisnere7564142010-06-29 17:35:34 +02002739 /* .use_rle = */ 0,
Philipp Reisner9a31d712010-07-05 13:42:03 +02002740 /* .on_no_data = */ DRBD_ON_NO_DATA_DEF,
2741 /* .c_plan_ahead = */ DRBD_C_PLAN_AHEAD_DEF,
2742 /* .c_delay_target = */ DRBD_C_DELAY_TARGET_DEF,
2743 /* .c_fill_target = */ DRBD_C_FILL_TARGET_DEF,
Lars Ellenberg0f0601f2010-08-11 23:40:24 +02002744 /* .c_max_rate = */ DRBD_C_MAX_RATE_DEF,
2745 /* .c_min_rate = */ DRBD_C_MIN_RATE_DEF
Philipp Reisner85f4cc12010-06-29 17:35:34 +02002746 };
2747
2748 /* Have to use that way, because the layout differs between
2749 big endian and little endian */
Philipp Reisnerb411b362009-09-25 16:07:19 -07002750 mdev->state = (union drbd_state) {
2751 { .role = R_SECONDARY,
2752 .peer = R_UNKNOWN,
2753 .conn = C_STANDALONE,
2754 .disk = D_DISKLESS,
2755 .pdsk = D_UNKNOWN,
Philipp Reisnerfb22c402010-09-08 23:20:21 +02002756 .susp = 0,
2757 .susp_nod = 0,
2758 .susp_fen = 0
Philipp Reisnerb411b362009-09-25 16:07:19 -07002759 } };
2760}
2761
2762void drbd_init_set_defaults(struct drbd_conf *mdev)
2763{
2764 /* the memset(,0,) did most of this.
2765 * note: only assignments, no allocation in here */
2766
2767 drbd_set_defaults(mdev);
2768
2769 /* for now, we do NOT yet support it,
2770 * even though we start some framework
2771 * to eventually support barriers */
2772 set_bit(NO_BARRIER_SUPP, &mdev->flags);
2773
2774 atomic_set(&mdev->ap_bio_cnt, 0);
2775 atomic_set(&mdev->ap_pending_cnt, 0);
2776 atomic_set(&mdev->rs_pending_cnt, 0);
2777 atomic_set(&mdev->unacked_cnt, 0);
2778 atomic_set(&mdev->local_cnt, 0);
2779 atomic_set(&mdev->net_cnt, 0);
2780 atomic_set(&mdev->packet_seq, 0);
2781 atomic_set(&mdev->pp_in_use, 0);
Lars Ellenberg435f0742010-09-06 12:30:25 +02002782 atomic_set(&mdev->pp_in_use_by_net, 0);
Philipp Reisner778f2712010-07-06 11:14:00 +02002783 atomic_set(&mdev->rs_sect_in, 0);
Lars Ellenberg0f0601f2010-08-11 23:40:24 +02002784 atomic_set(&mdev->rs_sect_ev, 0);
Philipp Reisnerb411b362009-09-25 16:07:19 -07002785
2786 mutex_init(&mdev->md_io_mutex);
2787 mutex_init(&mdev->data.mutex);
2788 mutex_init(&mdev->meta.mutex);
2789 sema_init(&mdev->data.work.s, 0);
2790 sema_init(&mdev->meta.work.s, 0);
2791 mutex_init(&mdev->state_mutex);
2792
2793 spin_lock_init(&mdev->data.work.q_lock);
2794 spin_lock_init(&mdev->meta.work.q_lock);
2795
2796 spin_lock_init(&mdev->al_lock);
2797 spin_lock_init(&mdev->req_lock);
2798 spin_lock_init(&mdev->peer_seq_lock);
2799 spin_lock_init(&mdev->epoch_lock);
2800
2801 INIT_LIST_HEAD(&mdev->active_ee);
2802 INIT_LIST_HEAD(&mdev->sync_ee);
2803 INIT_LIST_HEAD(&mdev->done_ee);
2804 INIT_LIST_HEAD(&mdev->read_ee);
2805 INIT_LIST_HEAD(&mdev->net_ee);
2806 INIT_LIST_HEAD(&mdev->resync_reads);
2807 INIT_LIST_HEAD(&mdev->data.work.q);
2808 INIT_LIST_HEAD(&mdev->meta.work.q);
2809 INIT_LIST_HEAD(&mdev->resync_work.list);
2810 INIT_LIST_HEAD(&mdev->unplug_work.list);
Lars Ellenberge9e6f3e2010-09-14 20:26:27 +02002811 INIT_LIST_HEAD(&mdev->go_diskless.list);
Philipp Reisnerb411b362009-09-25 16:07:19 -07002812 INIT_LIST_HEAD(&mdev->md_sync_work.list);
2813 INIT_LIST_HEAD(&mdev->bm_io_work.w.list);
Philipp Reisner0ced55a2010-04-30 15:26:20 +02002814
Philipp Reisnerb411b362009-09-25 16:07:19 -07002815 mdev->resync_work.cb = w_resync_inactive;
2816 mdev->unplug_work.cb = w_send_write_hint;
Lars Ellenberge9e6f3e2010-09-14 20:26:27 +02002817 mdev->go_diskless.cb = w_go_diskless;
Philipp Reisnerb411b362009-09-25 16:07:19 -07002818 mdev->md_sync_work.cb = w_md_sync;
2819 mdev->bm_io_work.w.cb = w_bitmap_io;
2820 init_timer(&mdev->resync_timer);
2821 init_timer(&mdev->md_sync_timer);
2822 mdev->resync_timer.function = resync_timer_fn;
2823 mdev->resync_timer.data = (unsigned long) mdev;
2824 mdev->md_sync_timer.function = md_sync_timer_fn;
2825 mdev->md_sync_timer.data = (unsigned long) mdev;
2826
2827 init_waitqueue_head(&mdev->misc_wait);
2828 init_waitqueue_head(&mdev->state_wait);
Philipp Reisner84dfb9f2010-06-23 11:20:05 +02002829 init_waitqueue_head(&mdev->net_cnt_wait);
Philipp Reisnerb411b362009-09-25 16:07:19 -07002830 init_waitqueue_head(&mdev->ee_wait);
2831 init_waitqueue_head(&mdev->al_wait);
2832 init_waitqueue_head(&mdev->seq_wait);
2833
2834 drbd_thread_init(mdev, &mdev->receiver, drbdd_init);
2835 drbd_thread_init(mdev, &mdev->worker, drbd_worker);
2836 drbd_thread_init(mdev, &mdev->asender, drbd_asender);
2837
2838 mdev->agreed_pro_version = PRO_VERSION_MAX;
2839 mdev->write_ordering = WO_bio_barrier;
2840 mdev->resync_wenr = LC_FREE;
2841}
2842
2843void drbd_mdev_cleanup(struct drbd_conf *mdev)
2844{
Lars Ellenberg1d7734a2010-08-11 21:21:50 +02002845 int i;
Philipp Reisnerb411b362009-09-25 16:07:19 -07002846 if (mdev->receiver.t_state != None)
2847 dev_err(DEV, "ASSERT FAILED: receiver t_state == %d expected 0.\n",
2848 mdev->receiver.t_state);
2849
2850 /* no need to lock it, I'm the only thread alive */
2851 if (atomic_read(&mdev->current_epoch->epoch_size) != 0)
2852 dev_err(DEV, "epoch_size:%d\n", atomic_read(&mdev->current_epoch->epoch_size));
2853 mdev->al_writ_cnt =
2854 mdev->bm_writ_cnt =
2855 mdev->read_cnt =
2856 mdev->recv_cnt =
2857 mdev->send_cnt =
2858 mdev->writ_cnt =
2859 mdev->p_size =
2860 mdev->rs_start =
2861 mdev->rs_total =
Lars Ellenberg1d7734a2010-08-11 21:21:50 +02002862 mdev->rs_failed = 0;
2863 mdev->rs_last_events = 0;
Lars Ellenberg0f0601f2010-08-11 23:40:24 +02002864 mdev->rs_last_sect_ev = 0;
Lars Ellenberg1d7734a2010-08-11 21:21:50 +02002865 for (i = 0; i < DRBD_SYNC_MARKS; i++) {
2866 mdev->rs_mark_left[i] = 0;
2867 mdev->rs_mark_time[i] = 0;
2868 }
Philipp Reisnerb411b362009-09-25 16:07:19 -07002869 D_ASSERT(mdev->net_conf == NULL);
2870
2871 drbd_set_my_capacity(mdev, 0);
2872 if (mdev->bitmap) {
2873 /* maybe never allocated. */
Philipp Reisner02d9a942010-03-24 16:23:03 +01002874 drbd_bm_resize(mdev, 0, 1);
Philipp Reisnerb411b362009-09-25 16:07:19 -07002875 drbd_bm_cleanup(mdev);
2876 }
2877
2878 drbd_free_resources(mdev);
Philipp Reisner07782862010-08-31 12:00:50 +02002879 clear_bit(AL_SUSPENDED, &mdev->flags);
Philipp Reisnerb411b362009-09-25 16:07:19 -07002880
2881 /*
2882 * currently we drbd_init_ee only on module load, so
2883 * we may do drbd_release_ee only on module unload!
2884 */
2885 D_ASSERT(list_empty(&mdev->active_ee));
2886 D_ASSERT(list_empty(&mdev->sync_ee));
2887 D_ASSERT(list_empty(&mdev->done_ee));
2888 D_ASSERT(list_empty(&mdev->read_ee));
2889 D_ASSERT(list_empty(&mdev->net_ee));
2890 D_ASSERT(list_empty(&mdev->resync_reads));
2891 D_ASSERT(list_empty(&mdev->data.work.q));
2892 D_ASSERT(list_empty(&mdev->meta.work.q));
2893 D_ASSERT(list_empty(&mdev->resync_work.list));
2894 D_ASSERT(list_empty(&mdev->unplug_work.list));
Lars Ellenberge9e6f3e2010-09-14 20:26:27 +02002895 D_ASSERT(list_empty(&mdev->go_diskless.list));
Philipp Reisnerb411b362009-09-25 16:07:19 -07002896
2897}
2898
2899
2900static void drbd_destroy_mempools(void)
2901{
2902 struct page *page;
2903
2904 while (drbd_pp_pool) {
2905 page = drbd_pp_pool;
2906 drbd_pp_pool = (struct page *)page_private(page);
2907 __free_page(page);
2908 drbd_pp_vacant--;
2909 }
2910
2911 /* D_ASSERT(atomic_read(&drbd_pp_vacant)==0); */
2912
2913 if (drbd_ee_mempool)
2914 mempool_destroy(drbd_ee_mempool);
2915 if (drbd_request_mempool)
2916 mempool_destroy(drbd_request_mempool);
2917 if (drbd_ee_cache)
2918 kmem_cache_destroy(drbd_ee_cache);
2919 if (drbd_request_cache)
2920 kmem_cache_destroy(drbd_request_cache);
2921 if (drbd_bm_ext_cache)
2922 kmem_cache_destroy(drbd_bm_ext_cache);
2923 if (drbd_al_ext_cache)
2924 kmem_cache_destroy(drbd_al_ext_cache);
2925
2926 drbd_ee_mempool = NULL;
2927 drbd_request_mempool = NULL;
2928 drbd_ee_cache = NULL;
2929 drbd_request_cache = NULL;
2930 drbd_bm_ext_cache = NULL;
2931 drbd_al_ext_cache = NULL;
2932
2933 return;
2934}
2935
2936static int drbd_create_mempools(void)
2937{
2938 struct page *page;
2939 const int number = (DRBD_MAX_SEGMENT_SIZE/PAGE_SIZE) * minor_count;
2940 int i;
2941
2942 /* prepare our caches and mempools */
2943 drbd_request_mempool = NULL;
2944 drbd_ee_cache = NULL;
2945 drbd_request_cache = NULL;
2946 drbd_bm_ext_cache = NULL;
2947 drbd_al_ext_cache = NULL;
2948 drbd_pp_pool = NULL;
2949
2950 /* caches */
2951 drbd_request_cache = kmem_cache_create(
2952 "drbd_req", sizeof(struct drbd_request), 0, 0, NULL);
2953 if (drbd_request_cache == NULL)
2954 goto Enomem;
2955
2956 drbd_ee_cache = kmem_cache_create(
2957 "drbd_ee", sizeof(struct drbd_epoch_entry), 0, 0, NULL);
2958 if (drbd_ee_cache == NULL)
2959 goto Enomem;
2960
2961 drbd_bm_ext_cache = kmem_cache_create(
2962 "drbd_bm", sizeof(struct bm_extent), 0, 0, NULL);
2963 if (drbd_bm_ext_cache == NULL)
2964 goto Enomem;
2965
2966 drbd_al_ext_cache = kmem_cache_create(
2967 "drbd_al", sizeof(struct lc_element), 0, 0, NULL);
2968 if (drbd_al_ext_cache == NULL)
2969 goto Enomem;
2970
2971 /* mempools */
2972 drbd_request_mempool = mempool_create(number,
2973 mempool_alloc_slab, mempool_free_slab, drbd_request_cache);
2974 if (drbd_request_mempool == NULL)
2975 goto Enomem;
2976
2977 drbd_ee_mempool = mempool_create(number,
2978 mempool_alloc_slab, mempool_free_slab, drbd_ee_cache);
2979 if (drbd_ee_mempool == NULL)
2980 goto Enomem;
2981
2982 /* drbd's page pool */
2983 spin_lock_init(&drbd_pp_lock);
2984
2985 for (i = 0; i < number; i++) {
2986 page = alloc_page(GFP_HIGHUSER);
2987 if (!page)
2988 goto Enomem;
2989 set_page_private(page, (unsigned long)drbd_pp_pool);
2990 drbd_pp_pool = page;
2991 }
2992 drbd_pp_vacant = number;
2993
2994 return 0;
2995
2996Enomem:
2997 drbd_destroy_mempools(); /* in case we allocated some */
2998 return -ENOMEM;
2999}
3000
3001static int drbd_notify_sys(struct notifier_block *this, unsigned long code,
3002 void *unused)
3003{
3004 /* just so we have it. you never know what interesting things we
3005 * might want to do here some day...
3006 */
3007
3008 return NOTIFY_DONE;
3009}
3010
3011static struct notifier_block drbd_notifier = {
3012 .notifier_call = drbd_notify_sys,
3013};
3014
3015static void drbd_release_ee_lists(struct drbd_conf *mdev)
3016{
3017 int rr;
3018
3019 rr = drbd_release_ee(mdev, &mdev->active_ee);
3020 if (rr)
3021 dev_err(DEV, "%d EEs in active list found!\n", rr);
3022
3023 rr = drbd_release_ee(mdev, &mdev->sync_ee);
3024 if (rr)
3025 dev_err(DEV, "%d EEs in sync list found!\n", rr);
3026
3027 rr = drbd_release_ee(mdev, &mdev->read_ee);
3028 if (rr)
3029 dev_err(DEV, "%d EEs in read list found!\n", rr);
3030
3031 rr = drbd_release_ee(mdev, &mdev->done_ee);
3032 if (rr)
3033 dev_err(DEV, "%d EEs in done list found!\n", rr);
3034
3035 rr = drbd_release_ee(mdev, &mdev->net_ee);
3036 if (rr)
3037 dev_err(DEV, "%d EEs in net list found!\n", rr);
3038}
3039
3040/* caution. no locking.
3041 * currently only used from module cleanup code. */
3042static void drbd_delete_device(unsigned int minor)
3043{
3044 struct drbd_conf *mdev = minor_to_mdev(minor);
3045
3046 if (!mdev)
3047 return;
3048
3049 /* paranoia asserts */
3050 if (mdev->open_cnt != 0)
3051 dev_err(DEV, "open_cnt = %d in %s:%u", mdev->open_cnt,
3052 __FILE__ , __LINE__);
3053
3054 ERR_IF (!list_empty(&mdev->data.work.q)) {
3055 struct list_head *lp;
3056 list_for_each(lp, &mdev->data.work.q) {
3057 dev_err(DEV, "lp = %p\n", lp);
3058 }
3059 };
3060 /* end paranoia asserts */
3061
3062 del_gendisk(mdev->vdisk);
3063
3064 /* cleanup stuff that may have been allocated during
3065 * device (re-)configuration or state changes */
3066
3067 if (mdev->this_bdev)
3068 bdput(mdev->this_bdev);
3069
3070 drbd_free_resources(mdev);
3071
3072 drbd_release_ee_lists(mdev);
3073
3074 /* should be free'd on disconnect? */
3075 kfree(mdev->ee_hash);
3076 /*
3077 mdev->ee_hash_s = 0;
3078 mdev->ee_hash = NULL;
3079 */
3080
3081 lc_destroy(mdev->act_log);
3082 lc_destroy(mdev->resync);
3083
3084 kfree(mdev->p_uuid);
3085 /* mdev->p_uuid = NULL; */
3086
3087 kfree(mdev->int_dig_out);
3088 kfree(mdev->int_dig_in);
3089 kfree(mdev->int_dig_vv);
3090
3091 /* cleanup the rest that has been
3092 * allocated from drbd_new_device
3093 * and actually free the mdev itself */
3094 drbd_free_mdev(mdev);
3095}
3096
3097static void drbd_cleanup(void)
3098{
3099 unsigned int i;
3100
3101 unregister_reboot_notifier(&drbd_notifier);
3102
3103 drbd_nl_cleanup();
3104
3105 if (minor_table) {
3106 if (drbd_proc)
3107 remove_proc_entry("drbd", NULL);
3108 i = minor_count;
3109 while (i--)
3110 drbd_delete_device(i);
3111 drbd_destroy_mempools();
3112 }
3113
3114 kfree(minor_table);
3115
3116 unregister_blkdev(DRBD_MAJOR, "drbd");
3117
3118 printk(KERN_INFO "drbd: module cleanup done.\n");
3119}
3120
3121/**
3122 * drbd_congested() - Callback for pdflush
3123 * @congested_data: User data
3124 * @bdi_bits: Bits pdflush is currently interested in
3125 *
3126 * Returns 1<<BDI_async_congested and/or 1<<BDI_sync_congested if we are congested.
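 *
 * The character left in mdev->congestion_reason records why: 'd' DRBD
 * itself has frozen IO, 'b' the backing device is congested, 'n' the
 * network is congested, 'a' both backing device and network are, and
 * '-' means not congested at all.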
3127 */
3128static int drbd_congested(void *congested_data, int bdi_bits)
3129{
3130 struct drbd_conf *mdev = congested_data;
3131 struct request_queue *q;
3132 char reason = '-';
3133 int r = 0;
3134
3135 if (!__inc_ap_bio_cond(mdev)) {
3136 /* DRBD has frozen IO */
3137 r = bdi_bits;
3138 reason = 'd';
3139 goto out;
3140 }
3141
3142 if (get_ldev(mdev)) {
3143 q = bdev_get_queue(mdev->ldev->backing_bdev);
3144 r = bdi_congested(&q->backing_dev_info, bdi_bits);
3145 put_ldev(mdev);
3146 if (r)
3147 reason = 'b';
3148 }
3149
3150 if (bdi_bits & (1 << BDI_async_congested) && test_bit(NET_CONGESTED, &mdev->flags)) {
3151 r |= (1 << BDI_async_congested);
3152 reason = reason == 'b' ? 'a' : 'n';
3153 }
3154
3155out:
3156 mdev->congestion_reason = reason;
3157 return r;
3158}
3159
struct drbd_conf *drbd_new_device(unsigned int minor)
{
	struct drbd_conf *mdev;
	struct gendisk *disk;
	struct request_queue *q;

	/* GFP_KERNEL, we are outside of all write-out paths */
	mdev = kzalloc(sizeof(struct drbd_conf), GFP_KERNEL);
	if (!mdev)
		return NULL;
	if (!zalloc_cpumask_var(&mdev->cpu_mask, GFP_KERNEL))
		goto out_no_cpumask;

	mdev->minor = minor;

	drbd_init_set_defaults(mdev);

	q = blk_alloc_queue(GFP_KERNEL);
	if (!q)
		goto out_no_q;
	mdev->rq_queue = q;
	q->queuedata = mdev;

	disk = alloc_disk(1);
	if (!disk)
		goto out_no_disk;
	mdev->vdisk = disk;

	set_disk_ro(disk, TRUE);

	disk->queue = q;
	disk->major = DRBD_MAJOR;
	disk->first_minor = minor;
	disk->fops = &drbd_ops;
	sprintf(disk->disk_name, "drbd%d", minor);
	disk->private_data = mdev;

	mdev->this_bdev = bdget(MKDEV(DRBD_MAJOR, minor));
	/* we have no partitions. we contain only ourselves. */
	mdev->this_bdev->bd_contains = mdev->this_bdev;

	q->backing_dev_info.congested_fn = drbd_congested;
	q->backing_dev_info.congested_data = mdev;

	blk_queue_make_request(q, drbd_make_request_26);
	blk_queue_max_segment_size(q, DRBD_MAX_SEGMENT_SIZE);
	blk_queue_bounce_limit(q, BLK_BOUNCE_ANY);
	blk_queue_merge_bvec(q, drbd_merge_bvec);
	/* needed since we use plugging on a queue that actually has no requests! */
	q->queue_lock = &mdev->req_lock;
	q->unplug_fn = drbd_unplug_fn;

	mdev->md_io_page = alloc_page(GFP_KERNEL);
	if (!mdev->md_io_page)
		goto out_no_io_page;

	if (drbd_bm_init(mdev))
		goto out_no_bitmap;
	/* no need to lock access, we are still initializing this minor device. */
	if (!tl_init(mdev))
		goto out_no_tl;

	mdev->app_reads_hash = kzalloc(APP_R_HSIZE*sizeof(void *), GFP_KERNEL);
	if (!mdev->app_reads_hash)
		goto out_no_app_reads;

	mdev->current_epoch = kzalloc(sizeof(struct drbd_epoch), GFP_KERNEL);
	if (!mdev->current_epoch)
		goto out_no_epoch;

	INIT_LIST_HEAD(&mdev->current_epoch->list);
	mdev->epochs = 1;

	return mdev;

/* out_whatever_else:
	kfree(mdev->current_epoch); */
out_no_epoch:
	kfree(mdev->app_reads_hash);
out_no_app_reads:
	tl_cleanup(mdev);
out_no_tl:
	drbd_bm_cleanup(mdev);
out_no_bitmap:
	__free_page(mdev->md_io_page);
out_no_io_page:
	put_disk(disk);
out_no_disk:
	blk_cleanup_queue(q);
out_no_q:
	free_cpumask_var(mdev->cpu_mask);
out_no_cpumask:
	kfree(mdev);
	return NULL;
}

/* counterpart of drbd_new_device.
 * last part of drbd_delete_device. */
void drbd_free_mdev(struct drbd_conf *mdev)
{
	kfree(mdev->current_epoch);
	kfree(mdev->app_reads_hash);
	tl_cleanup(mdev);
	if (mdev->bitmap) /* should no longer be there. */
		drbd_bm_cleanup(mdev);
	__free_page(mdev->md_io_page);
	put_disk(mdev->vdisk);
	blk_cleanup_queue(mdev->rq_queue);
	free_cpumask_var(mdev->cpu_mask);
	kfree(mdev);
}

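/*
 * Illustrative pairing (not part of the driver): whoever creates a minor with
 * drbd_new_device() is expected to release it with drbd_free_mdev() once the
 * device is fully torn down, roughly like this (names and error handling are
 * only a sketch; the real call sites live in the netlink configuration code,
 * with drbd_delete_device() mentioned above):
 *
 *	struct drbd_conf *mdev = drbd_new_device(minor);
 *
 *	if (!mdev)
 *		return -ENOMEM;
 *	...
 *	drbd_free_mdev(mdev);
 */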

int __init drbd_init(void)
{
	int err;

	if (sizeof(struct p_handshake) != 80) {
		printk(KERN_ERR
		       "drbd: never change the size or layout "
		       "of the HandShake packet.\n");
		return -EINVAL;
	}

	if (1 > minor_count || minor_count > 255) {
		printk(KERN_ERR
		       "drbd: invalid minor_count (%d)\n", minor_count);
#ifdef MODULE
		return -EINVAL;
#else
		minor_count = 8;
#endif
	}

	err = drbd_nl_init();
	if (err)
		return err;

	err = register_blkdev(DRBD_MAJOR, "drbd");
	if (err) {
		printk(KERN_ERR
		       "drbd: unable to register block device major %d\n",
		       DRBD_MAJOR);
		return err;
	}

	register_reboot_notifier(&drbd_notifier);

	/*
	 * allocate all necessary structs
	 */
	err = -ENOMEM;

	init_waitqueue_head(&drbd_pp_wait);

	drbd_proc = NULL; /* play safe for drbd_cleanup */
	minor_table = kzalloc(sizeof(struct drbd_conf *)*minor_count,
			      GFP_KERNEL);
	if (!minor_table)
		goto Enomem;

	err = drbd_create_mempools();
	if (err)
		goto Enomem;

	drbd_proc = proc_create_data("drbd", S_IFREG | S_IRUGO, NULL, &drbd_proc_fops, NULL);
	if (!drbd_proc) {
		printk(KERN_ERR "drbd: unable to register proc file\n");
		goto Enomem;
	}

	rwlock_init(&global_state_lock);

	printk(KERN_INFO "drbd: initialized. "
	       "Version: " REL_VERSION " (api:%d/proto:%d-%d)\n",
	       API_VERSION, PRO_VERSION_MIN, PRO_VERSION_MAX);
	printk(KERN_INFO "drbd: %s\n", drbd_buildtag());
	printk(KERN_INFO "drbd: registered as block device major %d\n",
	       DRBD_MAJOR);
	printk(KERN_INFO "drbd: minor_table @ 0x%p\n", minor_table);

	return 0; /* Success! */

Enomem:
	drbd_cleanup();
	if (err == -ENOMEM)
		/* currently always the case */
		printk(KERN_ERR "drbd: ran out of memory\n");
	else
		printk(KERN_ERR "drbd: initialization failure\n");
	return err;
}

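/*
 * Usage note (illustrative, not part of the driver): minor_count is a module
 * parameter and, as the check in drbd_init() shows, must stay within 1..255
 * when drbd is loaded as a module; a built-in driver logs the error and falls
 * back to 8.  A typical invocation might look like
 *
 *	modprobe drbd minor_count=16
 *
 * which makes drbd_init() allocate a minor_table with 16 slots.
 */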
void drbd_free_bc(struct drbd_backing_dev *ldev)
{
	if (ldev == NULL)
		return;

	bd_release(ldev->backing_bdev);
	bd_release(ldev->md_bdev);

	fput(ldev->lo_file);
	fput(ldev->md_file);

	kfree(ldev);
}

void drbd_free_sock(struct drbd_conf *mdev)
{
	if (mdev->data.socket) {
		mutex_lock(&mdev->data.mutex);
		kernel_sock_shutdown(mdev->data.socket, SHUT_RDWR);
		sock_release(mdev->data.socket);
		mdev->data.socket = NULL;
		mutex_unlock(&mdev->data.mutex);
	}
	if (mdev->meta.socket) {
		mutex_lock(&mdev->meta.mutex);
		kernel_sock_shutdown(mdev->meta.socket, SHUT_RDWR);
		sock_release(mdev->meta.socket);
		mdev->meta.socket = NULL;
		mutex_unlock(&mdev->meta.mutex);
	}
}


void drbd_free_resources(struct drbd_conf *mdev)
{
	crypto_free_hash(mdev->csums_tfm);
	mdev->csums_tfm = NULL;
	crypto_free_hash(mdev->verify_tfm);
	mdev->verify_tfm = NULL;
	crypto_free_hash(mdev->cram_hmac_tfm);
	mdev->cram_hmac_tfm = NULL;
	crypto_free_hash(mdev->integrity_w_tfm);
	mdev->integrity_w_tfm = NULL;
	crypto_free_hash(mdev->integrity_r_tfm);
	mdev->integrity_r_tfm = NULL;

	drbd_free_sock(mdev);

	__no_warn(local,
		  drbd_free_bc(mdev->ldev);
		  mdev->ldev = NULL;);
}

/* meta data management */

struct meta_data_on_disk {
	u64 la_size;           /* last agreed size. */
	u64 uuid[UI_SIZE];     /* UUIDs. */
	u64 device_uuid;
	u64 reserved_u64_1;
	u32 flags;             /* MDF */
	u32 magic;
	u32 md_size_sect;
	u32 al_offset;         /* offset to this block */
	u32 al_nr_extents;     /* important for restoring the AL */
			       /* `-- act_log->nr_elements <-- sync_conf.al_extents */
	u32 bm_offset;         /* offset to the bitmap, from here */
	u32 bm_bytes_per_bit;  /* BM_BLOCK_SIZE */
	u32 reserved_u32[4];

} __packed;

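/*
 * Layout note (editorial, derived from the code below): every multi-byte
 * field of struct meta_data_on_disk is stored big-endian on disk.
 * drbd_md_sync() converts with cpu_to_be32()/cpu_to_be64() before writing,
 * and drbd_md_read() converts back with be32_to_cpu()/be64_to_cpu(), e.g.:
 *
 *	buffer->magic = cpu_to_be32(DRBD_MD_MAGIC);
 *	...
 *	if (be32_to_cpu(buffer->magic) != DRBD_MD_MAGIC)
 *		rv = ERR_MD_INVALID;
 *
 * so the on-disk format is independent of the host's byte order.
 */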
/**
 * drbd_md_sync() - Writes the meta data super block if the MD_DIRTY flag bit is set
 * @mdev:	DRBD device.
 */
void drbd_md_sync(struct drbd_conf *mdev)
{
	struct meta_data_on_disk *buffer;
	sector_t sector;
	int i;

	del_timer(&mdev->md_sync_timer);
	/* timer may be rearmed by drbd_md_mark_dirty() now. */
	if (!test_and_clear_bit(MD_DIRTY, &mdev->flags))
		return;

	/* We use D_FAILED here, and not D_ATTACHING, because we try to write
	 * metadata even if we detach due to a disk failure! */
	if (!get_ldev_if_state(mdev, D_FAILED))
		return;

	mutex_lock(&mdev->md_io_mutex);
	buffer = (struct meta_data_on_disk *)page_address(mdev->md_io_page);
	memset(buffer, 0, 512);

	buffer->la_size = cpu_to_be64(drbd_get_capacity(mdev->this_bdev));
	for (i = UI_CURRENT; i < UI_SIZE; i++)
		buffer->uuid[i] = cpu_to_be64(mdev->ldev->md.uuid[i]);
	buffer->flags = cpu_to_be32(mdev->ldev->md.flags);
	buffer->magic = cpu_to_be32(DRBD_MD_MAGIC);

	buffer->md_size_sect = cpu_to_be32(mdev->ldev->md.md_size_sect);
	buffer->al_offset = cpu_to_be32(mdev->ldev->md.al_offset);
	buffer->al_nr_extents = cpu_to_be32(mdev->act_log->nr_elements);
	buffer->bm_bytes_per_bit = cpu_to_be32(BM_BLOCK_SIZE);
	buffer->device_uuid = cpu_to_be64(mdev->ldev->md.device_uuid);

	buffer->bm_offset = cpu_to_be32(mdev->ldev->md.bm_offset);

	D_ASSERT(drbd_md_ss__(mdev, mdev->ldev) == mdev->ldev->md.md_offset);
	sector = mdev->ldev->md.md_offset;

	if (!drbd_md_sync_page_io(mdev, mdev->ldev, sector, WRITE)) {
		/* this was only an attempt anyway ... */
		dev_err(DEV, "meta data update failed!\n");
		drbd_chk_io_error(mdev, 1, TRUE);
	}

	/* Update mdev->ldev->md.la_size_sect,
	 * since we just wrote it to the meta data. */
	mdev->ldev->md.la_size_sect = drbd_get_capacity(mdev->this_bdev);

	mutex_unlock(&mdev->md_io_mutex);
	put_ldev(mdev);
}

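/*
 * Typical flow (editorial sketch, not a new API): code that changes anything
 * stored in the super block marks the meta data dirty and later flushes it:
 *
 *	drbd_md_set_flag(mdev, MDF_FULL_SYNC);	marks MD_DIRTY internally
 *	...
 *	drbd_md_sync(mdev);			no-op unless MD_DIRTY is set
 *
 * drbd_bmio_set_n_write() below uses exactly this pattern; drbd_md_sync() is
 * cheap when nothing is dirty, so calling it defensively after a batch of
 * changes is the common idiom in this file.
 */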
/**
 * drbd_md_read() - Reads in the meta data super block
 * @mdev:	DRBD device.
 * @bdev:	Device from which the meta data should be read in.
 *
 * Return 0 (NO_ERROR) on success, and an enum drbd_ret_codes in case
 * something goes wrong. Currently only: ERR_IO_MD_DISK, ERR_MD_INVALID.
 */
int drbd_md_read(struct drbd_conf *mdev, struct drbd_backing_dev *bdev)
{
	struct meta_data_on_disk *buffer;
	int i, rv = NO_ERROR;

	if (!get_ldev_if_state(mdev, D_ATTACHING))
		return ERR_IO_MD_DISK;

	mutex_lock(&mdev->md_io_mutex);
	buffer = (struct meta_data_on_disk *)page_address(mdev->md_io_page);

	if (!drbd_md_sync_page_io(mdev, bdev, bdev->md.md_offset, READ)) {
		/* NOTE: can't do normal error processing here as this is
		   called BEFORE disk is attached */
		dev_err(DEV, "Error while reading metadata.\n");
		rv = ERR_IO_MD_DISK;
		goto err;
	}

	if (be32_to_cpu(buffer->magic) != DRBD_MD_MAGIC) {
		dev_err(DEV, "Error while reading metadata, magic not found.\n");
		rv = ERR_MD_INVALID;
		goto err;
	}
	if (be32_to_cpu(buffer->al_offset) != bdev->md.al_offset) {
		dev_err(DEV, "unexpected al_offset: %d (expected %d)\n",
		    be32_to_cpu(buffer->al_offset), bdev->md.al_offset);
		rv = ERR_MD_INVALID;
		goto err;
	}
	if (be32_to_cpu(buffer->bm_offset) != bdev->md.bm_offset) {
		dev_err(DEV, "unexpected bm_offset: %d (expected %d)\n",
		    be32_to_cpu(buffer->bm_offset), bdev->md.bm_offset);
		rv = ERR_MD_INVALID;
		goto err;
	}
	if (be32_to_cpu(buffer->md_size_sect) != bdev->md.md_size_sect) {
		dev_err(DEV, "unexpected md_size: %u (expected %u)\n",
		    be32_to_cpu(buffer->md_size_sect), bdev->md.md_size_sect);
		rv = ERR_MD_INVALID;
		goto err;
	}

	if (be32_to_cpu(buffer->bm_bytes_per_bit) != BM_BLOCK_SIZE) {
		dev_err(DEV, "unexpected bm_bytes_per_bit: %u (expected %u)\n",
		    be32_to_cpu(buffer->bm_bytes_per_bit), BM_BLOCK_SIZE);
		rv = ERR_MD_INVALID;
		goto err;
	}

	bdev->md.la_size_sect = be64_to_cpu(buffer->la_size);
	for (i = UI_CURRENT; i < UI_SIZE; i++)
		bdev->md.uuid[i] = be64_to_cpu(buffer->uuid[i]);
	bdev->md.flags = be32_to_cpu(buffer->flags);
	mdev->sync_conf.al_extents = be32_to_cpu(buffer->al_nr_extents);
	bdev->md.device_uuid = be64_to_cpu(buffer->device_uuid);

	if (mdev->sync_conf.al_extents < 7)
		mdev->sync_conf.al_extents = 127;

 err:
	mutex_unlock(&mdev->md_io_mutex);
	put_ldev(mdev);

	return rv;
}

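/*
 * Caller sketch (illustrative; the real caller sits in the attach path of the
 * netlink code, outside this file, and the variable names here are made up):
 *
 *	int retcode = drbd_md_read(mdev, bdev);
 *
 *	if (retcode != NO_ERROR)
 *		goto fail;	ERR_IO_MD_DISK or ERR_MD_INVALID
 *
 * The checks above compare the on-disk values against the offsets already
 * stored in bdev->md, so a mismatched or foreign meta data block is rejected
 * before the disk goes into service.
 */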
/**
 * drbd_md_mark_dirty() - Mark meta data super block as dirty
 * @mdev:	DRBD device.
 *
 * Call this function if you change anything that should be written to
 * the meta-data super block. This function sets MD_DIRTY and arms the
 * md_sync_timer; if nobody calls drbd_md_sync() before the timer expires,
 * the worker does it itself (with a warning) via w_md_sync().
 */
#ifdef DRBD_DEBUG_MD_SYNC
void drbd_md_mark_dirty_(struct drbd_conf *mdev, unsigned int line, const char *func)
{
	if (!test_and_set_bit(MD_DIRTY, &mdev->flags)) {
		mod_timer(&mdev->md_sync_timer, jiffies + HZ);
		mdev->last_md_mark_dirty.line = line;
		mdev->last_md_mark_dirty.func = func;
	}
}
#else
void drbd_md_mark_dirty(struct drbd_conf *mdev)
{
	if (!test_and_set_bit(MD_DIRTY, &mdev->flags))
		mod_timer(&mdev->md_sync_timer, jiffies + HZ);
}
#endif

static void drbd_uuid_move_history(struct drbd_conf *mdev) __must_hold(local)
{
	int i;

	for (i = UI_HISTORY_START; i < UI_HISTORY_END; i++)
		mdev->ldev->md.uuid[i+1] = mdev->ldev->md.uuid[i];
}

void _drbd_uuid_set(struct drbd_conf *mdev, int idx, u64 val) __must_hold(local)
{
	if (idx == UI_CURRENT) {
		if (mdev->state.role == R_PRIMARY)
			val |= 1;
		else
			val &= ~((u64)1);

		drbd_set_ed_uuid(mdev, val);
	}

	mdev->ldev->md.uuid[idx] = val;
	drbd_md_mark_dirty(mdev);
}


void drbd_uuid_set(struct drbd_conf *mdev, int idx, u64 val) __must_hold(local)
{
	if (mdev->ldev->md.uuid[idx]) {
		drbd_uuid_move_history(mdev);
		mdev->ldev->md.uuid[UI_HISTORY_START] = mdev->ldev->md.uuid[idx];
	}
	_drbd_uuid_set(mdev, idx, val);
}

/**
 * drbd_uuid_new_current() - Creates a new current UUID
 * @mdev:	DRBD device.
 *
 * Creates a new current UUID, and rotates the old current UUID into
 * the bitmap slot. Causes an incremental resync upon next connect.
 */
void drbd_uuid_new_current(struct drbd_conf *mdev) __must_hold(local)
{
	u64 val;

	dev_info(DEV, "Creating new current UUID\n");
	D_ASSERT(mdev->ldev->md.uuid[UI_BITMAP] == 0);
	mdev->ldev->md.uuid[UI_BITMAP] = mdev->ldev->md.uuid[UI_CURRENT];

	get_random_bytes(&val, sizeof(u64));
	_drbd_uuid_set(mdev, UI_CURRENT, val);
}

void drbd_uuid_set_bm(struct drbd_conf *mdev, u64 val) __must_hold(local)
{
	if (mdev->ldev->md.uuid[UI_BITMAP] == 0 && val == 0)
		return;

	if (val == 0) {
		drbd_uuid_move_history(mdev);
		mdev->ldev->md.uuid[UI_HISTORY_START] = mdev->ldev->md.uuid[UI_BITMAP];
		mdev->ldev->md.uuid[UI_BITMAP] = 0;
	} else {
		if (mdev->ldev->md.uuid[UI_BITMAP])
			dev_warn(DEV, "bm UUID already set");

		mdev->ldev->md.uuid[UI_BITMAP] = val;
		mdev->ldev->md.uuid[UI_BITMAP] &= ~((u64)1);

	}
	drbd_md_mark_dirty(mdev);
}

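/*
 * UUID rotation at a glance (editorial summary of the helpers above, with
 * illustrative calls):
 *
 *	drbd_uuid_set(mdev, UI_CURRENT, val)	pushes the previous current
 *						UUID into the history slots,
 *						then installs val;
 *	drbd_uuid_new_current(mdev)		copies the current UUID into
 *						the bitmap slot and generates
 *						a fresh random current UUID.
 *
 * All of these only mark the meta data dirty; the new UUIDs reach the disk
 * with the next drbd_md_sync().
 */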
/**
 * drbd_bmio_set_n_write() - io_fn for drbd_queue_bitmap_io() or drbd_bitmap_io()
 * @mdev:	DRBD device.
 *
 * Sets all bits in the bitmap and writes the whole bitmap to stable storage.
 */
int drbd_bmio_set_n_write(struct drbd_conf *mdev)
{
	int rv = -EIO;

	if (get_ldev_if_state(mdev, D_ATTACHING)) {
		drbd_md_set_flag(mdev, MDF_FULL_SYNC);
		drbd_md_sync(mdev);
		drbd_bm_set_all(mdev);

		rv = drbd_bm_write(mdev);

		if (!rv) {
			drbd_md_clear_flag(mdev, MDF_FULL_SYNC);
			drbd_md_sync(mdev);
		}

		put_ldev(mdev);
	}

	return rv;
}

/**
 * drbd_bmio_clear_n_write() - io_fn for drbd_queue_bitmap_io() or drbd_bitmap_io()
 * @mdev:	DRBD device.
 *
 * Clears all bits in the bitmap and writes the whole bitmap to stable storage.
 */
int drbd_bmio_clear_n_write(struct drbd_conf *mdev)
{
	int rv = -EIO;

	drbd_resume_al(mdev);
	if (get_ldev_if_state(mdev, D_ATTACHING)) {
		drbd_bm_clear_all(mdev);
		rv = drbd_bm_write(mdev);
		put_ldev(mdev);
	}

	return rv;
}

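/*
 * Illustrative use as io_fn (the descriptive string is made up for this
 * example): from a context other than the worker the synchronous wrapper is
 * used, from the worker the queued variant:
 *
 *	rv = drbd_bitmap_io(mdev, &drbd_bmio_set_n_write,
 *			    "set all bits before full sync");
 *
 * In both paths the bitmap lock is taken around io_fn (see drbd_bitmap_io()
 * and w_bitmap_io() below), so the callbacks above do not lock the bitmap
 * themselves.
 */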
static int w_bitmap_io(struct drbd_conf *mdev, struct drbd_work *w, int unused)
{
	struct bm_io_work *work = container_of(w, struct bm_io_work, w);
	int rv;

	D_ASSERT(atomic_read(&mdev->ap_bio_cnt) == 0);

	drbd_bm_lock(mdev, work->why);
	rv = work->io_fn(mdev);
	drbd_bm_unlock(mdev);

	clear_bit(BITMAP_IO, &mdev->flags);
	wake_up(&mdev->misc_wait);

	if (work->done)
		work->done(mdev, rv);

	clear_bit(BITMAP_IO_QUEUED, &mdev->flags);
	work->why = NULL;

	return 1;
}

static int w_go_diskless(struct drbd_conf *mdev, struct drbd_work *w, int unused)
{
	D_ASSERT(mdev->state.disk == D_FAILED);
	D_ASSERT(atomic_read(&mdev->local_cnt) == 0);

	drbd_force_state(mdev, NS(disk, D_DISKLESS));

	clear_bit(GO_DISKLESS, &mdev->flags);
	return 1;
}

void drbd_go_diskless(struct drbd_conf *mdev)
{
	D_ASSERT(mdev->state.disk == D_FAILED);
	if (!test_and_set_bit(GO_DISKLESS, &mdev->flags))
		drbd_queue_work_front(&mdev->data.work, &mdev->go_diskless);
}

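/*
 * Editorial note with a minimal sketch: drbd_go_diskless() shows the
 * "queue exactly once" idiom used throughout this file.  A flag bit guards
 * the work item, so repeated triggers collapse into a single execution:
 *
 *	if (!test_and_set_bit(GO_DISKLESS, &mdev->flags))
 *		drbd_queue_work_front(&mdev->data.work, &mdev->go_diskless);
 *
 * w_go_diskless() clears the bit again once the forced state change is done,
 * which re-arms the mechanism.
 */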
/**
 * drbd_queue_bitmap_io() - Queues an IO operation on the whole bitmap
 * @mdev:	DRBD device.
 * @io_fn:	IO callback to be called when bitmap IO is possible
 * @done:	callback to be called after the bitmap IO was performed
 * @why:	Descriptive text of the reason for doing the IO
 *
 * While IO on the bitmap happens we freeze application IO, thus ensuring
 * that drbd_set_out_of_sync() cannot be called. This function MAY ONLY be
 * called from worker context. It MUST NOT be used while a previous such
 * work is still pending!
 */
void drbd_queue_bitmap_io(struct drbd_conf *mdev,
			  int (*io_fn)(struct drbd_conf *),
			  void (*done)(struct drbd_conf *, int),
			  char *why)
{
	D_ASSERT(current == mdev->worker.task);

	D_ASSERT(!test_bit(BITMAP_IO_QUEUED, &mdev->flags));
	D_ASSERT(!test_bit(BITMAP_IO, &mdev->flags));
	D_ASSERT(list_empty(&mdev->bm_io_work.w.list));
	if (mdev->bm_io_work.why)
		dev_err(DEV, "FIXME going to queue '%s' but '%s' still pending?\n",
			why, mdev->bm_io_work.why);

	mdev->bm_io_work.io_fn = io_fn;
	mdev->bm_io_work.done = done;
	mdev->bm_io_work.why = why;

	set_bit(BITMAP_IO, &mdev->flags);
	if (atomic_read(&mdev->ap_bio_cnt) == 0) {
		if (list_empty(&mdev->bm_io_work.w.list)) {
			set_bit(BITMAP_IO_QUEUED, &mdev->flags);
			drbd_queue_work(&mdev->data.work, &mdev->bm_io_work.w);
		} else
			dev_err(DEV, "FIXME avoided double queuing bm_io_work\n");
	}
}

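/*
 * Illustrative call from worker context (the callback name and the reason
 * string are hypothetical, not existing driver symbols):
 *
 *	static void after_full_sync_done(struct drbd_conf *mdev, int rv)
 *	{
 *		if (rv)
 *			dev_err(DEV, "writing the bitmap failed\n");
 *	}
 *
 *	drbd_queue_bitmap_io(mdev, &drbd_bmio_set_n_write,
 *			     after_full_sync_done, "set_n_write example");
 *
 * The done callback runs in w_bitmap_io() after io_fn has finished.
 */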
/**
 * drbd_bitmap_io() - Does an IO operation on the whole bitmap
 * @mdev:	DRBD device.
 * @io_fn:	IO callback to be called when bitmap IO is possible
 * @why:	Descriptive text of the reason for doing the IO
 *
 * Freezes application IO while the actual IO operation runs. This
 * function MAY NOT be called from worker context.
 */
int drbd_bitmap_io(struct drbd_conf *mdev, int (*io_fn)(struct drbd_conf *), char *why)
{
	int rv;

	D_ASSERT(current != mdev->worker.task);

	drbd_suspend_io(mdev);

	drbd_bm_lock(mdev, why);
	rv = io_fn(mdev);
	drbd_bm_unlock(mdev);

	drbd_resume_io(mdev);

	return rv;
}

void drbd_md_set_flag(struct drbd_conf *mdev, int flag) __must_hold(local)
{
	if ((mdev->ldev->md.flags & flag) != flag) {
		drbd_md_mark_dirty(mdev);
		mdev->ldev->md.flags |= flag;
	}
}

void drbd_md_clear_flag(struct drbd_conf *mdev, int flag) __must_hold(local)
{
	if ((mdev->ldev->md.flags & flag) != 0) {
		drbd_md_mark_dirty(mdev);
		mdev->ldev->md.flags &= ~flag;
	}
}

int drbd_md_test_flag(struct drbd_backing_dev *bdev, int flag)
{
	return (bdev->md.flags & flag) != 0;
}

static void md_sync_timer_fn(unsigned long data)
{
	struct drbd_conf *mdev = (struct drbd_conf *) data;

	drbd_queue_work_front(&mdev->data.work, &mdev->md_sync_work);
}

static int w_md_sync(struct drbd_conf *mdev, struct drbd_work *w, int unused)
{
	dev_warn(DEV, "md_sync_timer expired! Worker calls drbd_md_sync().\n");
#ifdef DEBUG
	dev_warn(DEV, "last md_mark_dirty: %s:%u\n",
		 mdev->last_md_mark_dirty.func, mdev->last_md_mark_dirty.line);
#endif
	drbd_md_sync(mdev);
	return 1;
}

#ifdef CONFIG_DRBD_FAULT_INJECTION
/* Fault insertion support including random number generator shamelessly
 * stolen from kernel/rcutorture.c */
struct fault_random_state {
	unsigned long state;
	unsigned long count;
};

#define FAULT_RANDOM_MULT 39916801  /* prime */
#define FAULT_RANDOM_ADD 479001701  /* prime */
#define FAULT_RANDOM_REFRESH 10000

/*
 * Crude but fast random-number generator. Uses a linear congruential
 * generator, with occasional help from get_random_bytes().
 */
static unsigned long
_drbd_fault_random(struct fault_random_state *rsp)
{
	long refresh;

	if (!rsp->count--) {
		get_random_bytes(&refresh, sizeof(refresh));
		rsp->state += refresh;
		rsp->count = FAULT_RANDOM_REFRESH;
	}
	rsp->state = rsp->state * FAULT_RANDOM_MULT + FAULT_RANDOM_ADD;
	return swahw32(rsp->state);
}

static char *
_drbd_fault_str(unsigned int type) {
	static char *_faults[] = {
		[DRBD_FAULT_MD_WR] = "Meta-data write",
		[DRBD_FAULT_MD_RD] = "Meta-data read",
		[DRBD_FAULT_RS_WR] = "Resync write",
		[DRBD_FAULT_RS_RD] = "Resync read",
		[DRBD_FAULT_DT_WR] = "Data write",
		[DRBD_FAULT_DT_RD] = "Data read",
		[DRBD_FAULT_DT_RA] = "Data read ahead",
		[DRBD_FAULT_BM_ALLOC] = "BM allocation",
		[DRBD_FAULT_AL_EE] = "EE allocation",
		[DRBD_FAULT_RECEIVE] = "receive data corruption",
	};

	return (type < DRBD_FAULT_MAX) ? _faults[type] : "**Unknown**";
}

unsigned int
_drbd_insert_fault(struct drbd_conf *mdev, unsigned int type)
{
	static struct fault_random_state rrs = {0, 0};

	unsigned int ret = (
		(fault_devs == 0 ||
			((1 << mdev_to_minor(mdev)) & fault_devs) != 0) &&
		(((_drbd_fault_random(&rrs) % 100) + 1) <= fault_rate));

	if (ret) {
		fault_count++;

		if (__ratelimit(&drbd_ratelimit_state))
			dev_warn(DEV, "***Simulating %s failure\n",
				_drbd_fault_str(type));
	}

	return ret;
}
#endif

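/*
 * How the fault-injection knobs fit together (editorial summary with an
 * illustrative check; the real call sites are elsewhere in the driver and the
 * error-completion line below is only a sketch):
 *
 *	fault_rate   - rough percentage of eligible requests to fail (0 disables),
 *	fault_devs   - bitmask of minors to target (0 means all devices),
 *	fault_count  - running total of injected faults.
 *
 * A submission path would typically ask something like
 *
 *	if (_drbd_insert_fault(mdev, DRBD_FAULT_MD_WR))
 *		bio_endio(bio, -EIO);	pretend the write failed
 *
 * and only when CONFIG_DRBD_FAULT_INJECTION is enabled.
 */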
const char *drbd_buildtag(void)
{
	/* When DRBD is built from external sources, this holds a reference
	 * to the git hash of the source code. */

	static char buildtag[38] = "\0uilt-in";

	if (buildtag[0] == 0) {
#ifdef CONFIG_MODULES
		if (THIS_MODULE != NULL)
			sprintf(buildtag, "srcversion: %-24s", THIS_MODULE->srcversion);
		else
#endif
			buildtag[0] = 'b';
	}

	return buildtag;
}

module_init(drbd_init)
module_exit(drbd_cleanup)

EXPORT_SYMBOL(drbd_conn_str);
EXPORT_SYMBOL(drbd_role_str);
EXPORT_SYMBOL(drbd_disk_str);
EXPORT_SYMBOL(drbd_set_st_err_str);