1/*
2 drbd.c
3
4 This file is part of DRBD by Philipp Reisner and Lars Ellenberg.
5
6 Copyright (C) 2001-2008, LINBIT Information Technologies GmbH.
7 Copyright (C) 1999-2008, Philipp Reisner <philipp.reisner@linbit.com>.
8 Copyright (C) 2002-2008, Lars Ellenberg <lars.ellenberg@linbit.com>.
9
10 Thanks to Carter Burden, Bart Grantham and Gennadiy Nerubayev
11 from Logicworks, Inc. for making SDP replication support possible.
12
13 drbd is free software; you can redistribute it and/or modify
14 it under the terms of the GNU General Public License as published by
15 the Free Software Foundation; either version 2, or (at your option)
16 any later version.
17
18 drbd is distributed in the hope that it will be useful,
19 but WITHOUT ANY WARRANTY; without even the implied warranty of
20 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
21 GNU General Public License for more details.
22
23 You should have received a copy of the GNU General Public License
24 along with drbd; see the file COPYING. If not, write to
25 the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
26
27 */
28
29#include <linux/module.h>
30#include <linux/drbd.h>
31#include <asm/uaccess.h>
32#include <asm/types.h>
33#include <net/sock.h>
34#include <linux/ctype.h>
35#include <linux/smp_lock.h>
36#include <linux/fs.h>
37#include <linux/file.h>
38#include <linux/proc_fs.h>
39#include <linux/init.h>
40#include <linux/mm.h>
41#include <linux/memcontrol.h>
42#include <linux/mm_inline.h>
43#include <linux/slab.h>
44#include <linux/random.h>
45#include <linux/reboot.h>
46#include <linux/notifier.h>
47#include <linux/kthread.h>
48
49#define __KERNEL_SYSCALLS__
50#include <linux/unistd.h>
51#include <linux/vmalloc.h>
52
53#include <linux/drbd_limits.h>
54#include "drbd_int.h"
55#include "drbd_req.h" /* only for _req_mod in tl_release and tl_clear */
56
57#include "drbd_vli.h"
58
59struct after_state_chg_work {
60 struct drbd_work w;
61 union drbd_state os;
62 union drbd_state ns;
63 enum chg_state_flags flags;
64 struct completion *done;
65};
66
67int drbdd_init(struct drbd_thread *);
68int drbd_worker(struct drbd_thread *);
69int drbd_asender(struct drbd_thread *);
70
71int drbd_init(void);
72static int drbd_open(struct block_device *bdev, fmode_t mode);
73static int drbd_release(struct gendisk *gd, fmode_t mode);
74static int w_after_state_ch(struct drbd_conf *mdev, struct drbd_work *w, int unused);
75static void after_state_ch(struct drbd_conf *mdev, union drbd_state os,
76 union drbd_state ns, enum chg_state_flags flags);
77static int w_md_sync(struct drbd_conf *mdev, struct drbd_work *w, int unused);
78static void md_sync_timer_fn(unsigned long data);
79static int w_bitmap_io(struct drbd_conf *mdev, struct drbd_work *w, int unused);
80
81MODULE_AUTHOR("Philipp Reisner <phil@linbit.com>, "
82 "Lars Ellenberg <lars@linbit.com>");
83MODULE_DESCRIPTION("drbd - Distributed Replicated Block Device v" REL_VERSION);
84MODULE_VERSION(REL_VERSION);
85MODULE_LICENSE("GPL");
86MODULE_PARM_DESC(minor_count, "Maximum number of drbd devices (1-255)");
87MODULE_ALIAS_BLOCKDEV_MAJOR(DRBD_MAJOR);
88
89#include <linux/moduleparam.h>
90/* allow_open_on_secondary */
91MODULE_PARM_DESC(allow_oos, "DONT USE!");
92/* thanks to these macros, if compiled into the kernel (not-module),
93 * this becomes the boot parameter drbd.minor_count */
94module_param(minor_count, uint, 0444);
95module_param(disable_sendpage, bool, 0644);
96module_param(allow_oos, bool, 0);
97module_param(cn_idx, uint, 0444);
98module_param(proc_details, int, 0644);
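/*
 * Illustrative usage (a sketch only, not used by the driver code): with the
 * module_param() declarations above, these can be set at load time, e.g.
 *
 *	modprobe drbd minor_count=16 allow_oos=1
 *
 * or, when DRBD is built into the kernel, via the boot parameter
 * drbd.minor_count=16.  Parameters declared with mode 0444/0644 also appear
 * under /sys/module/drbd/parameters/, the 0644 ones being writable at runtime.
 */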
99
100#ifdef CONFIG_DRBD_FAULT_INJECTION
101int enable_faults;
102int fault_rate;
103static int fault_count;
104int fault_devs;
105/* bitmap of enabled faults */
106module_param(enable_faults, int, 0664);
107/* fault rate % value - applies to all enabled faults */
108module_param(fault_rate, int, 0664);
109/* count of faults inserted */
110module_param(fault_count, int, 0664);
111/* bitmap of devices to insert faults on */
112module_param(fault_devs, int, 0644);
113#endif
114
115/* module parameter, defined */
116unsigned int minor_count = 32;
117int disable_sendpage;
118int allow_oos;
119unsigned int cn_idx = CN_IDX_DRBD;
120int proc_details;       /* Detail level in /proc/drbd */
121
122/* Module parameter for setting the user mode helper program
123 * to run. Default is /sbin/drbdadm */
124char usermode_helper[80] = "/sbin/drbdadm";
125
126module_param_string(usermode_helper, usermode_helper, sizeof(usermode_helper), 0644);
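/*
 * Illustrative example (the path is made up): the helper can be overridden
 * at load time,
 *
 *	modprobe drbd usermode_helper=/usr/local/sbin/drbdadm
 *
 * and, since the parameter mode is 0644, later at runtime through
 * /sys/module/drbd/parameters/usermode_helper.
 */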
127
128/* in 2.6.x, our device mapping and config info contains our virtual gendisks
129 * as member "struct gendisk *vdisk;"
130 */
131struct drbd_conf **minor_table;
132
133struct kmem_cache *drbd_request_cache;
134struct kmem_cache *drbd_ee_cache; /* epoch entries */
135struct kmem_cache *drbd_bm_ext_cache; /* bitmap extents */
136struct kmem_cache *drbd_al_ext_cache; /* activity log extents */
137mempool_t *drbd_request_mempool;
138mempool_t *drbd_ee_mempool;
139
140/* I do not use a standard mempool, because:
141 1) I want to hand out the pre-allocated objects first.
142 2) I want to be able to interrupt sleeping allocation with a signal.
 143   Note: This is a singly linked list, the next pointer is the private
144 member of struct page.
145 */
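/*
 * Sketch (illustration only) of how such a single linked page list is
 * handled, using the page's private member as the "next" pointer as
 * described above; all of this happens under drbd_pp_lock:
 *
 *	set_page_private(page, (unsigned long)drbd_pp_pool);	 push
 *	drbd_pp_pool = page;
 *
 *	page = drbd_pp_pool;					 pop
 *	drbd_pp_pool = (struct page *)page_private(page);
 */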
146struct page *drbd_pp_pool;
147spinlock_t drbd_pp_lock;
148int drbd_pp_vacant;
149wait_queue_head_t drbd_pp_wait;
150
151DEFINE_RATELIMIT_STATE(drbd_ratelimit_state, 5 * HZ, 5);
152
153static const struct block_device_operations drbd_ops = {
154	.owner =   THIS_MODULE,
155 .open = drbd_open,
156 .release = drbd_release,
157};
158
159#define ARRY_SIZE(A) (sizeof(A)/sizeof(A[0]))
160
161#ifdef __CHECKER__
162/* When checking with sparse, and this is an inline function, sparse will
 163   give tons of false positives. When this is a real function, sparse works.
164 */
165int _get_ldev_if_state(struct drbd_conf *mdev, enum drbd_disk_state mins)
166{
167 int io_allowed;
168
169 atomic_inc(&mdev->local_cnt);
170 io_allowed = (mdev->state.disk >= mins);
171 if (!io_allowed) {
172 if (atomic_dec_and_test(&mdev->local_cnt))
173 wake_up(&mdev->misc_wait);
174 }
175 return io_allowed;
176}
177
178#endif
179
180/**
181 * DOC: The transfer log
182 *
 183 * The transfer log is a singly linked list of &struct drbd_tl_epoch objects.
184 * mdev->newest_tle points to the head, mdev->oldest_tle points to the tail
185 * of the list. There is always at least one &struct drbd_tl_epoch object.
186 *
187 * Each &struct drbd_tl_epoch has a circular double linked list of requests
188 * attached.
189 */
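/*
 * Illustrative sketch of the layout described above (not a definition used
 * by the code).  Following the ->next pointers:
 *
 *	mdev->oldest_tle -> epoch -> epoch -> ... -> mdev->newest_tle -> NULL
 *
 * and each &struct drbd_tl_epoch keeps its own circular double linked list
 * of &struct drbd_request objects on ->requests.
 */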
190static int tl_init(struct drbd_conf *mdev)
191{
192 struct drbd_tl_epoch *b;
193
194 /* during device minor initialization, we may well use GFP_KERNEL */
195 b = kmalloc(sizeof(struct drbd_tl_epoch), GFP_KERNEL);
196 if (!b)
197 return 0;
198 INIT_LIST_HEAD(&b->requests);
199 INIT_LIST_HEAD(&b->w.list);
200 b->next = NULL;
201 b->br_number = 4711;
202 b->n_req = 0;
203 b->w.cb = NULL; /* if this is != NULL, we need to dec_ap_pending in tl_clear */
204
205 mdev->oldest_tle = b;
206 mdev->newest_tle = b;
207 INIT_LIST_HEAD(&mdev->out_of_sequence_requests);
208
209 mdev->tl_hash = NULL;
210 mdev->tl_hash_s = 0;
211
212 return 1;
213}
214
215static void tl_cleanup(struct drbd_conf *mdev)
216{
217 D_ASSERT(mdev->oldest_tle == mdev->newest_tle);
218 D_ASSERT(list_empty(&mdev->out_of_sequence_requests));
219 kfree(mdev->oldest_tle);
220 mdev->oldest_tle = NULL;
221 kfree(mdev->unused_spare_tle);
222 mdev->unused_spare_tle = NULL;
223 kfree(mdev->tl_hash);
224 mdev->tl_hash = NULL;
225 mdev->tl_hash_s = 0;
226}
227
228/**
229 * _tl_add_barrier() - Adds a barrier to the transfer log
230 * @mdev: DRBD device.
231 * @new: Barrier to be added before the current head of the TL.
232 *
233 * The caller must hold the req_lock.
234 */
235void _tl_add_barrier(struct drbd_conf *mdev, struct drbd_tl_epoch *new)
236{
237 struct drbd_tl_epoch *newest_before;
238
239 INIT_LIST_HEAD(&new->requests);
240 INIT_LIST_HEAD(&new->w.list);
241 new->w.cb = NULL; /* if this is != NULL, we need to dec_ap_pending in tl_clear */
242 new->next = NULL;
243 new->n_req = 0;
244
245 newest_before = mdev->newest_tle;
246 /* never send a barrier number == 0, because that is special-cased
247 * when using TCQ for our write ordering code */
248 new->br_number = (newest_before->br_number+1) ?: 1;
249 if (mdev->newest_tle != new) {
250 mdev->newest_tle->next = new;
251 mdev->newest_tle = new;
252 }
253}
254
255/**
256 * tl_release() - Free or recycle the oldest &struct drbd_tl_epoch object of the TL
257 * @mdev: DRBD device.
258 * @barrier_nr: Expected identifier of the DRBD write barrier packet.
259 * @set_size: Expected number of requests before that barrier.
260 *
261 * In case the passed barrier_nr or set_size does not match the oldest
262 * &struct drbd_tl_epoch objects this function will cause a termination
263 * of the connection.
264 */
265void tl_release(struct drbd_conf *mdev, unsigned int barrier_nr,
266 unsigned int set_size)
267{
268 struct drbd_tl_epoch *b, *nob; /* next old barrier */
269 struct list_head *le, *tle;
270 struct drbd_request *r;
271
272 spin_lock_irq(&mdev->req_lock);
273
274 b = mdev->oldest_tle;
275
276 /* first some paranoia code */
277 if (b == NULL) {
278 dev_err(DEV, "BAD! BarrierAck #%u received, but no epoch in tl!?\n",
279 barrier_nr);
280 goto bail;
281 }
282 if (b->br_number != barrier_nr) {
283 dev_err(DEV, "BAD! BarrierAck #%u received, expected #%u!\n",
284 barrier_nr, b->br_number);
285 goto bail;
286 }
287 if (b->n_req != set_size) {
288 dev_err(DEV, "BAD! BarrierAck #%u received with n_req=%u, expected n_req=%u!\n",
289 barrier_nr, set_size, b->n_req);
290 goto bail;
291 }
292
293 /* Clean up list of requests processed during current epoch */
294 list_for_each_safe(le, tle, &b->requests) {
295 r = list_entry(le, struct drbd_request, tl_requests);
296 _req_mod(r, barrier_acked);
297 }
298 /* There could be requests on the list waiting for completion
299 of the write to the local disk. To avoid corruptions of
 300	   slab's data structures we have to remove the list's head.
301
302 Also there could have been a barrier ack out of sequence, overtaking
303 the write acks - which would be a bug and violating write ordering.
304 To not deadlock in case we lose connection while such requests are
305 still pending, we need some way to find them for the
 306	   _req_mod(connection_lost_while_pending).
307
308 These have been list_move'd to the out_of_sequence_requests list in
309 _req_mod(, barrier_acked) above.
310 */
311 list_del_init(&b->requests);
312
313 nob = b->next;
314 if (test_and_clear_bit(CREATE_BARRIER, &mdev->flags)) {
315 _tl_add_barrier(mdev, b);
316 if (nob)
317 mdev->oldest_tle = nob;
318 /* if nob == NULL b was the only barrier, and becomes the new
319 barrier. Therefore mdev->oldest_tle points already to b */
320 } else {
321 D_ASSERT(nob != NULL);
322 mdev->oldest_tle = nob;
323 kfree(b);
324 }
325
326 spin_unlock_irq(&mdev->req_lock);
327 dec_ap_pending(mdev);
328
329 return;
330
331bail:
332 spin_unlock_irq(&mdev->req_lock);
333 drbd_force_state(mdev, NS(conn, C_PROTOCOL_ERROR));
334}
335
336
337/**
338 * tl_clear() - Clears all requests and &struct drbd_tl_epoch objects out of the TL
339 * @mdev: DRBD device.
340 *
341 * This is called after the connection to the peer was lost. The storage covered
 342 * by the requests on the transfer log gets marked as out of sync. Called from the
343 * receiver thread and the worker thread.
344 */
345void tl_clear(struct drbd_conf *mdev)
346{
347 struct drbd_tl_epoch *b, *tmp;
348 struct list_head *le, *tle;
349 struct drbd_request *r;
350 int new_initial_bnr = net_random();
351
352 spin_lock_irq(&mdev->req_lock);
353
354 b = mdev->oldest_tle;
355 while (b) {
356 list_for_each_safe(le, tle, &b->requests) {
357 r = list_entry(le, struct drbd_request, tl_requests);
358 /* It would be nice to complete outside of spinlock.
359 * But this is easier for now. */
360 _req_mod(r, connection_lost_while_pending);
361 }
362 tmp = b->next;
363
364 /* there could still be requests on that ring list,
365 * in case local io is still pending */
366 list_del(&b->requests);
367
368 /* dec_ap_pending corresponding to queue_barrier.
369 * the newest barrier may not have been queued yet,
370 * in which case w.cb is still NULL. */
371 if (b->w.cb != NULL)
372 dec_ap_pending(mdev);
373
374 if (b == mdev->newest_tle) {
375 /* recycle, but reinit! */
376 D_ASSERT(tmp == NULL);
377 INIT_LIST_HEAD(&b->requests);
378 INIT_LIST_HEAD(&b->w.list);
379 b->w.cb = NULL;
380 b->br_number = new_initial_bnr;
381 b->n_req = 0;
382
383 mdev->oldest_tle = b;
384 break;
385 }
386 kfree(b);
387 b = tmp;
388 }
389
390 /* we expect this list to be empty. */
391 D_ASSERT(list_empty(&mdev->out_of_sequence_requests));
392
393 /* but just in case, clean it up anyways! */
394 list_for_each_safe(le, tle, &mdev->out_of_sequence_requests) {
395 r = list_entry(le, struct drbd_request, tl_requests);
396 /* It would be nice to complete outside of spinlock.
397 * But this is easier for now. */
398 _req_mod(r, connection_lost_while_pending);
399 }
400
401 /* ensure bit indicating barrier is required is clear */
402 clear_bit(CREATE_BARRIER, &mdev->flags);
403
404 spin_unlock_irq(&mdev->req_lock);
405}
406
407/**
408 * cl_wide_st_chg() - TRUE if the state change is a cluster wide one
409 * @mdev: DRBD device.
410 * @os: old (current) state.
411 * @ns: new (wanted) state.
412 */
413static int cl_wide_st_chg(struct drbd_conf *mdev,
414 union drbd_state os, union drbd_state ns)
415{
416 return (os.conn >= C_CONNECTED && ns.conn >= C_CONNECTED &&
417 ((os.role != R_PRIMARY && ns.role == R_PRIMARY) ||
418 (os.conn != C_STARTING_SYNC_T && ns.conn == C_STARTING_SYNC_T) ||
419 (os.conn != C_STARTING_SYNC_S && ns.conn == C_STARTING_SYNC_S) ||
420 (os.disk != D_DISKLESS && ns.disk == D_DISKLESS))) ||
421 (os.conn >= C_CONNECTED && ns.conn == C_DISCONNECTING) ||
422 (os.conn == C_CONNECTED && ns.conn == C_VERIFY_S);
423}
424
425int drbd_change_state(struct drbd_conf *mdev, enum chg_state_flags f,
426 union drbd_state mask, union drbd_state val)
427{
428 unsigned long flags;
429 union drbd_state os, ns;
430 int rv;
431
432 spin_lock_irqsave(&mdev->req_lock, flags);
433 os = mdev->state;
434 ns.i = (os.i & ~mask.i) | val.i;
435 rv = _drbd_set_state(mdev, ns, f, NULL);
436 ns = mdev->state;
437 spin_unlock_irqrestore(&mdev->req_lock, flags);
438
439 return rv;
440}
441
442/**
443 * drbd_force_state() - Impose a change which happens outside our control on our state
444 * @mdev: DRBD device.
445 * @mask: mask of state bits to change.
446 * @val: value of new state bits.
447 */
448void drbd_force_state(struct drbd_conf *mdev,
449 union drbd_state mask, union drbd_state val)
450{
451 drbd_change_state(mdev, CS_HARD, mask, val);
452}
453
454static int is_valid_state(struct drbd_conf *mdev, union drbd_state ns);
455static int is_valid_state_transition(struct drbd_conf *,
456 union drbd_state, union drbd_state);
457static union drbd_state sanitize_state(struct drbd_conf *mdev, union drbd_state os,
458 union drbd_state ns, int *warn_sync_abort);
459int drbd_send_state_req(struct drbd_conf *,
460 union drbd_state, union drbd_state);
461
462static enum drbd_state_ret_codes _req_st_cond(struct drbd_conf *mdev,
463 union drbd_state mask, union drbd_state val)
464{
465 union drbd_state os, ns;
466 unsigned long flags;
467 int rv;
468
469 if (test_and_clear_bit(CL_ST_CHG_SUCCESS, &mdev->flags))
470 return SS_CW_SUCCESS;
471
472 if (test_and_clear_bit(CL_ST_CHG_FAIL, &mdev->flags))
473 return SS_CW_FAILED_BY_PEER;
474
475 rv = 0;
476 spin_lock_irqsave(&mdev->req_lock, flags);
477 os = mdev->state;
478 ns.i = (os.i & ~mask.i) | val.i;
479 ns = sanitize_state(mdev, os, ns, NULL);
480
481 if (!cl_wide_st_chg(mdev, os, ns))
482 rv = SS_CW_NO_NEED;
483 if (!rv) {
484 rv = is_valid_state(mdev, ns);
485 if (rv == SS_SUCCESS) {
486 rv = is_valid_state_transition(mdev, ns, os);
487 if (rv == SS_SUCCESS)
488 rv = 0; /* cont waiting, otherwise fail. */
489 }
490 }
491 spin_unlock_irqrestore(&mdev->req_lock, flags);
492
493 return rv;
494}
495
496/**
 497 * drbd_req_state() - Perform a possibly cluster wide state change
498 * @mdev: DRBD device.
499 * @mask: mask of state bits to change.
500 * @val: value of new state bits.
501 * @f: flags
502 *
503 * Should not be called directly, use drbd_request_state() or
504 * _drbd_request_state().
505 */
506static int drbd_req_state(struct drbd_conf *mdev,
507 union drbd_state mask, union drbd_state val,
508 enum chg_state_flags f)
509{
510 struct completion done;
511 unsigned long flags;
512 union drbd_state os, ns;
513 int rv;
514
515 init_completion(&done);
516
517 if (f & CS_SERIALIZE)
518 mutex_lock(&mdev->state_mutex);
519
520 spin_lock_irqsave(&mdev->req_lock, flags);
521 os = mdev->state;
522 ns.i = (os.i & ~mask.i) | val.i;
523 ns = sanitize_state(mdev, os, ns, NULL);
524
525 if (cl_wide_st_chg(mdev, os, ns)) {
526 rv = is_valid_state(mdev, ns);
527 if (rv == SS_SUCCESS)
528 rv = is_valid_state_transition(mdev, ns, os);
529 spin_unlock_irqrestore(&mdev->req_lock, flags);
530
531 if (rv < SS_SUCCESS) {
532 if (f & CS_VERBOSE)
533 print_st_err(mdev, os, ns, rv);
534 goto abort;
535 }
536
537 drbd_state_lock(mdev);
538 if (!drbd_send_state_req(mdev, mask, val)) {
539 drbd_state_unlock(mdev);
540 rv = SS_CW_FAILED_BY_PEER;
541 if (f & CS_VERBOSE)
542 print_st_err(mdev, os, ns, rv);
543 goto abort;
544 }
545
546 wait_event(mdev->state_wait,
547 (rv = _req_st_cond(mdev, mask, val)));
548
549 if (rv < SS_SUCCESS) {
550 drbd_state_unlock(mdev);
551 if (f & CS_VERBOSE)
552 print_st_err(mdev, os, ns, rv);
553 goto abort;
554 }
555 spin_lock_irqsave(&mdev->req_lock, flags);
556 os = mdev->state;
557 ns.i = (os.i & ~mask.i) | val.i;
558 rv = _drbd_set_state(mdev, ns, f, &done);
559 drbd_state_unlock(mdev);
560 } else {
561 rv = _drbd_set_state(mdev, ns, f, &done);
562 }
563
564 spin_unlock_irqrestore(&mdev->req_lock, flags);
565
566 if (f & CS_WAIT_COMPLETE && rv == SS_SUCCESS) {
567 D_ASSERT(current != mdev->worker.task);
568 wait_for_completion(&done);
569 }
570
571abort:
572 if (f & CS_SERIALIZE)
573 mutex_unlock(&mdev->state_mutex);
574
575 return rv;
576}
577
578/**
579 * _drbd_request_state() - Request a state change (with flags)
580 * @mdev: DRBD device.
581 * @mask: mask of state bits to change.
582 * @val: value of new state bits.
583 * @f: flags
584 *
585 * Cousin of drbd_request_state(), useful with the CS_WAIT_COMPLETE
586 * flag, or when logging of failed state change requests is not desired.
587 */
588int _drbd_request_state(struct drbd_conf *mdev, union drbd_state mask,
589 union drbd_state val, enum chg_state_flags f)
590{
591 int rv;
592
593 wait_event(mdev->state_wait,
594 (rv = drbd_req_state(mdev, mask, val, f)) != SS_IN_TRANSIENT_STATE);
595
596 return rv;
597}
598
599static void print_st(struct drbd_conf *mdev, char *name, union drbd_state ns)
600{
601 dev_err(DEV, " %s = { cs:%s ro:%s/%s ds:%s/%s %c%c%c%c }\n",
602 name,
603 drbd_conn_str(ns.conn),
604 drbd_role_str(ns.role),
605 drbd_role_str(ns.peer),
606 drbd_disk_str(ns.disk),
607 drbd_disk_str(ns.pdsk),
608 ns.susp ? 's' : 'r',
609 ns.aftr_isp ? 'a' : '-',
610 ns.peer_isp ? 'p' : '-',
611 ns.user_isp ? 'u' : '-'
612 );
613}
614
615void print_st_err(struct drbd_conf *mdev,
616 union drbd_state os, union drbd_state ns, int err)
617{
618 if (err == SS_IN_TRANSIENT_STATE)
619 return;
620 dev_err(DEV, "State change failed: %s\n", drbd_set_st_err_str(err));
621 print_st(mdev, " state", os);
622 print_st(mdev, "wanted", ns);
623}
624
625
626#define drbd_peer_str drbd_role_str
627#define drbd_pdsk_str drbd_disk_str
628
629#define drbd_susp_str(A) ((A) ? "1" : "0")
630#define drbd_aftr_isp_str(A) ((A) ? "1" : "0")
631#define drbd_peer_isp_str(A) ((A) ? "1" : "0")
632#define drbd_user_isp_str(A) ((A) ? "1" : "0")
633
634#define PSC(A) \
635 ({ if (ns.A != os.A) { \
636 pbp += sprintf(pbp, #A "( %s -> %s ) ", \
637 drbd_##A##_str(os.A), \
638 drbd_##A##_str(ns.A)); \
639 } })
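/*
 * Illustrative expansion (for reading convenience only): PSC(role) becomes
 *
 *	({ if (ns.role != os.role) {
 *		pbp += sprintf(pbp, "role" "( %s -> %s ) ",
 *			       drbd_role_str(os.role),
 *			       drbd_role_str(ns.role));
 *	} });
 *
 * which is how __drbd_set_state() assembles its one-line report of all
 * state fields that actually changed.
 */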
640
641/**
642 * is_valid_state() - Returns an SS_ error code if ns is not valid
643 * @mdev: DRBD device.
644 * @ns: State to consider.
645 */
646static int is_valid_state(struct drbd_conf *mdev, union drbd_state ns)
647{
648 /* See drbd_state_sw_errors in drbd_strings.c */
649
650 enum drbd_fencing_p fp;
651 int rv = SS_SUCCESS;
652
653 fp = FP_DONT_CARE;
654 if (get_ldev(mdev)) {
655 fp = mdev->ldev->dc.fencing;
656 put_ldev(mdev);
657 }
658
659 if (get_net_conf(mdev)) {
660 if (!mdev->net_conf->two_primaries &&
661 ns.role == R_PRIMARY && ns.peer == R_PRIMARY)
662 rv = SS_TWO_PRIMARIES;
663 put_net_conf(mdev);
664 }
665
666 if (rv <= 0)
667 /* already found a reason to abort */;
668 else if (ns.role == R_SECONDARY && mdev->open_cnt)
669 rv = SS_DEVICE_IN_USE;
670
671 else if (ns.role == R_PRIMARY && ns.conn < C_CONNECTED && ns.disk < D_UP_TO_DATE)
672 rv = SS_NO_UP_TO_DATE_DISK;
673
674 else if (fp >= FP_RESOURCE &&
675 ns.role == R_PRIMARY && ns.conn < C_CONNECTED && ns.pdsk >= D_UNKNOWN)
676 rv = SS_PRIMARY_NOP;
677
678 else if (ns.role == R_PRIMARY && ns.disk <= D_INCONSISTENT && ns.pdsk <= D_INCONSISTENT)
679 rv = SS_NO_UP_TO_DATE_DISK;
680
681 else if (ns.conn > C_CONNECTED && ns.disk < D_INCONSISTENT)
682 rv = SS_NO_LOCAL_DISK;
683
684 else if (ns.conn > C_CONNECTED && ns.pdsk < D_INCONSISTENT)
685 rv = SS_NO_REMOTE_DISK;
686
687 else if ((ns.conn == C_CONNECTED ||
688 ns.conn == C_WF_BITMAP_S ||
689 ns.conn == C_SYNC_SOURCE ||
690 ns.conn == C_PAUSED_SYNC_S) &&
691 ns.disk == D_OUTDATED)
692 rv = SS_CONNECTED_OUTDATES;
693
694 else if ((ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T) &&
695 (mdev->sync_conf.verify_alg[0] == 0))
696 rv = SS_NO_VERIFY_ALG;
697
698 else if ((ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T) &&
699 mdev->agreed_pro_version < 88)
700 rv = SS_NOT_SUPPORTED;
701
702 return rv;
703}
704
705/**
706 * is_valid_state_transition() - Returns an SS_ error code if the state transition is not possible
707 * @mdev: DRBD device.
708 * @ns: new state.
709 * @os: old state.
710 */
711static int is_valid_state_transition(struct drbd_conf *mdev,
712 union drbd_state ns, union drbd_state os)
713{
714 int rv = SS_SUCCESS;
715
716 if ((ns.conn == C_STARTING_SYNC_T || ns.conn == C_STARTING_SYNC_S) &&
717 os.conn > C_CONNECTED)
718 rv = SS_RESYNC_RUNNING;
719
720 if (ns.conn == C_DISCONNECTING && os.conn == C_STANDALONE)
721 rv = SS_ALREADY_STANDALONE;
722
723 if (ns.disk > D_ATTACHING && os.disk == D_DISKLESS)
724 rv = SS_IS_DISKLESS;
725
726 if (ns.conn == C_WF_CONNECTION && os.conn < C_UNCONNECTED)
727 rv = SS_NO_NET_CONFIG;
728
729 if (ns.disk == D_OUTDATED && os.disk < D_OUTDATED && os.disk != D_ATTACHING)
730 rv = SS_LOWER_THAN_OUTDATED;
731
732 if (ns.conn == C_DISCONNECTING && os.conn == C_UNCONNECTED)
733 rv = SS_IN_TRANSIENT_STATE;
734
735 if (ns.conn == os.conn && ns.conn == C_WF_REPORT_PARAMS)
736 rv = SS_IN_TRANSIENT_STATE;
737
738 if ((ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T) && os.conn < C_CONNECTED)
739 rv = SS_NEED_CONNECTION;
740
741 if ((ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T) &&
742 ns.conn != os.conn && os.conn > C_CONNECTED)
743 rv = SS_RESYNC_RUNNING;
744
745 if ((ns.conn == C_STARTING_SYNC_S || ns.conn == C_STARTING_SYNC_T) &&
746 os.conn < C_CONNECTED)
747 rv = SS_NEED_CONNECTION;
748
749 return rv;
750}
751
752/**
753 * sanitize_state() - Resolves implicitly necessary additional changes to a state transition
754 * @mdev: DRBD device.
755 * @os: old state.
756 * @ns: new state.
 757 * @warn_sync_abort: if not NULL, gets set when the transition implicitly aborts a running resync.
758 *
 759 * When we lose the connection, we have to set the state of the peer's disk (pdsk)
760 * to D_UNKNOWN. This rule and many more along those lines are in this function.
761 */
762static union drbd_state sanitize_state(struct drbd_conf *mdev, union drbd_state os,
763 union drbd_state ns, int *warn_sync_abort)
764{
765 enum drbd_fencing_p fp;
766
767 fp = FP_DONT_CARE;
768 if (get_ldev(mdev)) {
769 fp = mdev->ldev->dc.fencing;
770 put_ldev(mdev);
771 }
772
773 /* Disallow Network errors to configure a device's network part */
774 if ((ns.conn >= C_TIMEOUT && ns.conn <= C_TEAR_DOWN) &&
775 os.conn <= C_DISCONNECTING)
776 ns.conn = os.conn;
777
778 /* After a network error (+C_TEAR_DOWN) only C_UNCONNECTED or C_DISCONNECTING can follow */
779 if (os.conn >= C_TIMEOUT && os.conn <= C_TEAR_DOWN &&
780 ns.conn != C_UNCONNECTED && ns.conn != C_DISCONNECTING)
781 ns.conn = os.conn;
782
783 /* After C_DISCONNECTING only C_STANDALONE may follow */
784 if (os.conn == C_DISCONNECTING && ns.conn != C_STANDALONE)
785 ns.conn = os.conn;
786
787 if (ns.conn < C_CONNECTED) {
788 ns.peer_isp = 0;
789 ns.peer = R_UNKNOWN;
790 if (ns.pdsk > D_UNKNOWN || ns.pdsk < D_INCONSISTENT)
791 ns.pdsk = D_UNKNOWN;
792 }
793
794 /* Clear the aftr_isp when becoming unconfigured */
795 if (ns.conn == C_STANDALONE && ns.disk == D_DISKLESS && ns.role == R_SECONDARY)
796 ns.aftr_isp = 0;
797
798 if (ns.conn <= C_DISCONNECTING && ns.disk == D_DISKLESS)
799 ns.pdsk = D_UNKNOWN;
800
801 /* Abort resync if a disk fails/detaches */
802 if (os.conn > C_CONNECTED && ns.conn > C_CONNECTED &&
803 (ns.disk <= D_FAILED || ns.pdsk <= D_FAILED)) {
804 if (warn_sync_abort)
805 *warn_sync_abort = 1;
806 ns.conn = C_CONNECTED;
807 }
808
809 if (ns.conn >= C_CONNECTED &&
810 ((ns.disk == D_CONSISTENT || ns.disk == D_OUTDATED) ||
811 (ns.disk == D_NEGOTIATING && ns.conn == C_WF_BITMAP_T))) {
812 switch (ns.conn) {
813 case C_WF_BITMAP_T:
814 case C_PAUSED_SYNC_T:
815 ns.disk = D_OUTDATED;
816 break;
817 case C_CONNECTED:
818 case C_WF_BITMAP_S:
819 case C_SYNC_SOURCE:
820 case C_PAUSED_SYNC_S:
821 ns.disk = D_UP_TO_DATE;
822 break;
823 case C_SYNC_TARGET:
824 ns.disk = D_INCONSISTENT;
825 dev_warn(DEV, "Implicitly set disk state Inconsistent!\n");
826 break;
827 }
828 if (os.disk == D_OUTDATED && ns.disk == D_UP_TO_DATE)
829 dev_warn(DEV, "Implicitly set disk from Outdated to UpToDate\n");
830 }
831
832 if (ns.conn >= C_CONNECTED &&
833 (ns.pdsk == D_CONSISTENT || ns.pdsk == D_OUTDATED)) {
834 switch (ns.conn) {
835 case C_CONNECTED:
836 case C_WF_BITMAP_T:
837 case C_PAUSED_SYNC_T:
838 case C_SYNC_TARGET:
839 ns.pdsk = D_UP_TO_DATE;
840 break;
841 case C_WF_BITMAP_S:
842 case C_PAUSED_SYNC_S:
 843		/* remap any consistent state to D_OUTDATED,
844 * but disallow "upgrade" of not even consistent states.
845 */
846 ns.pdsk =
847 (D_DISKLESS < os.pdsk && os.pdsk < D_OUTDATED)
848 ? os.pdsk : D_OUTDATED;
 849		break;
850 case C_SYNC_SOURCE:
851 ns.pdsk = D_INCONSISTENT;
852 dev_warn(DEV, "Implicitly set pdsk Inconsistent!\n");
853 break;
854 }
855 if (os.pdsk == D_OUTDATED && ns.pdsk == D_UP_TO_DATE)
856 dev_warn(DEV, "Implicitly set pdsk from Outdated to UpToDate\n");
857 }
858
859 /* Connection breaks down before we finished "Negotiating" */
860 if (ns.conn < C_CONNECTED && ns.disk == D_NEGOTIATING &&
861 get_ldev_if_state(mdev, D_NEGOTIATING)) {
862 if (mdev->ed_uuid == mdev->ldev->md.uuid[UI_CURRENT]) {
863 ns.disk = mdev->new_state_tmp.disk;
864 ns.pdsk = mdev->new_state_tmp.pdsk;
865 } else {
866 dev_alert(DEV, "Connection lost while negotiating, no data!\n");
867 ns.disk = D_DISKLESS;
868 ns.pdsk = D_UNKNOWN;
869 }
870 put_ldev(mdev);
871 }
872
873 if (fp == FP_STONITH &&
 874	    (ns.role == R_PRIMARY && ns.conn < C_CONNECTED && ns.pdsk > D_OUTDATED) &&
875 !(os.role == R_PRIMARY && os.conn < C_CONNECTED && os.pdsk > D_OUTDATED))
876 ns.susp = 1;
 877
878 if (ns.aftr_isp || ns.peer_isp || ns.user_isp) {
879 if (ns.conn == C_SYNC_SOURCE)
880 ns.conn = C_PAUSED_SYNC_S;
881 if (ns.conn == C_SYNC_TARGET)
882 ns.conn = C_PAUSED_SYNC_T;
883 } else {
884 if (ns.conn == C_PAUSED_SYNC_S)
885 ns.conn = C_SYNC_SOURCE;
886 if (ns.conn == C_PAUSED_SYNC_T)
887 ns.conn = C_SYNC_TARGET;
888 }
889
890 return ns;
891}
892
893/* helper for __drbd_set_state */
894static void set_ov_position(struct drbd_conf *mdev, enum drbd_conns cs)
895{
896 if (cs == C_VERIFY_T) {
897 /* starting online verify from an arbitrary position
898 * does not fit well into the existing protocol.
899 * on C_VERIFY_T, we initialize ov_left and friends
900 * implicitly in receive_DataRequest once the
901 * first P_OV_REQUEST is received */
902 mdev->ov_start_sector = ~(sector_t)0;
903 } else {
904 unsigned long bit = BM_SECT_TO_BIT(mdev->ov_start_sector);
905 if (bit >= mdev->rs_total)
906 mdev->ov_start_sector =
907 BM_BIT_TO_SECT(mdev->rs_total - 1);
908 mdev->ov_position = mdev->ov_start_sector;
909 }
910}
911
912/**
913 * __drbd_set_state() - Set a new DRBD state
914 * @mdev: DRBD device.
915 * @ns: new state.
916 * @flags: Flags
917 * @done: Optional completion, that will get completed after the after_state_ch() finished
918 *
919 * Caller needs to hold req_lock, and global_state_lock. Do not call directly.
920 */
921int __drbd_set_state(struct drbd_conf *mdev,
922 union drbd_state ns, enum chg_state_flags flags,
923 struct completion *done)
924{
925 union drbd_state os;
926 int rv = SS_SUCCESS;
927 int warn_sync_abort = 0;
928 struct after_state_chg_work *ascw;
929
930 os = mdev->state;
931
932 ns = sanitize_state(mdev, os, ns, &warn_sync_abort);
933
934 if (ns.i == os.i)
935 return SS_NOTHING_TO_DO;
936
937 if (!(flags & CS_HARD)) {
938 /* pre-state-change checks ; only look at ns */
939 /* See drbd_state_sw_errors in drbd_strings.c */
940
941 rv = is_valid_state(mdev, ns);
942 if (rv < SS_SUCCESS) {
943 /* If the old state was illegal as well, then let
944 this happen...*/
945
946 if (is_valid_state(mdev, os) == rv) {
947 dev_err(DEV, "Considering state change from bad state. "
948 "Error would be: '%s'\n",
949 drbd_set_st_err_str(rv));
950 print_st(mdev, "old", os);
951 print_st(mdev, "new", ns);
952 rv = is_valid_state_transition(mdev, ns, os);
953 }
954 } else
955 rv = is_valid_state_transition(mdev, ns, os);
956 }
957
958 if (rv < SS_SUCCESS) {
959 if (flags & CS_VERBOSE)
960 print_st_err(mdev, os, ns, rv);
961 return rv;
962 }
963
964 if (warn_sync_abort)
965 dev_warn(DEV, "Resync aborted.\n");
966
967 {
968 char *pbp, pb[300];
969 pbp = pb;
970 *pbp = 0;
971 PSC(role);
972 PSC(peer);
973 PSC(conn);
974 PSC(disk);
975 PSC(pdsk);
976 PSC(susp);
977 PSC(aftr_isp);
978 PSC(peer_isp);
979 PSC(user_isp);
980 dev_info(DEV, "%s\n", pb);
981 }
982
983 /* solve the race between becoming unconfigured,
984 * worker doing the cleanup, and
985 * admin reconfiguring us:
986 * on (re)configure, first set CONFIG_PENDING,
987 * then wait for a potentially exiting worker,
988 * start the worker, and schedule one no_op.
989 * then proceed with configuration.
990 */
991 if (ns.disk == D_DISKLESS &&
992 ns.conn == C_STANDALONE &&
993 ns.role == R_SECONDARY &&
994 !test_and_set_bit(CONFIG_PENDING, &mdev->flags))
995 set_bit(DEVICE_DYING, &mdev->flags);
996
997 mdev->state.i = ns.i;
998 wake_up(&mdev->misc_wait);
999 wake_up(&mdev->state_wait);
1000
1001 /* post-state-change actions */
1002 if (os.conn >= C_SYNC_SOURCE && ns.conn <= C_CONNECTED) {
1003 set_bit(STOP_SYNC_TIMER, &mdev->flags);
1004 mod_timer(&mdev->resync_timer, jiffies);
1005 }
1006
1007 /* aborted verify run. log the last position */
1008 if ((os.conn == C_VERIFY_S || os.conn == C_VERIFY_T) &&
1009 ns.conn < C_CONNECTED) {
1010 mdev->ov_start_sector =
1011 BM_BIT_TO_SECT(mdev->rs_total - mdev->ov_left);
1012 dev_info(DEV, "Online Verify reached sector %llu\n",
1013 (unsigned long long)mdev->ov_start_sector);
1014 }
1015
1016 if ((os.conn == C_PAUSED_SYNC_T || os.conn == C_PAUSED_SYNC_S) &&
1017 (ns.conn == C_SYNC_TARGET || ns.conn == C_SYNC_SOURCE)) {
1018 dev_info(DEV, "Syncer continues.\n");
1019 mdev->rs_paused += (long)jiffies-(long)mdev->rs_mark_time;
1020 if (ns.conn == C_SYNC_TARGET) {
1021 if (!test_and_clear_bit(STOP_SYNC_TIMER, &mdev->flags))
1022 mod_timer(&mdev->resync_timer, jiffies);
1023 /* This if (!test_bit) is only needed for the case
1024		   that a device that has ceased to use its timer,
1025		   i.e. it is already in drbd_resync_finished(), gets
1026 paused and resumed. */
1027 }
1028 }
1029
1030 if ((os.conn == C_SYNC_TARGET || os.conn == C_SYNC_SOURCE) &&
1031 (ns.conn == C_PAUSED_SYNC_T || ns.conn == C_PAUSED_SYNC_S)) {
1032 dev_info(DEV, "Resync suspended\n");
1033 mdev->rs_mark_time = jiffies;
1034 if (ns.conn == C_PAUSED_SYNC_T)
1035 set_bit(STOP_SYNC_TIMER, &mdev->flags);
1036 }
1037
1038 if (os.conn == C_CONNECTED &&
1039 (ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T)) {
1040 mdev->ov_position = 0;
1041 mdev->rs_total =
1042 mdev->rs_mark_left = drbd_bm_bits(mdev);
1043 if (mdev->agreed_pro_version >= 90)
1044 set_ov_position(mdev, ns.conn);
1045 else
1046 mdev->ov_start_sector = 0;
1047 mdev->ov_left = mdev->rs_total
1048 - BM_SECT_TO_BIT(mdev->ov_position);
1049 mdev->rs_start =
1050 mdev->rs_mark_time = jiffies;
1051 mdev->ov_last_oos_size = 0;
1052 mdev->ov_last_oos_start = 0;
1053
1054 if (ns.conn == C_VERIFY_S) {
1055 dev_info(DEV, "Starting Online Verify from sector %llu\n",
1056 (unsigned long long)mdev->ov_position);
1057 mod_timer(&mdev->resync_timer, jiffies);
1058 }
1059 }
1060
1061 if (get_ldev(mdev)) {
1062 u32 mdf = mdev->ldev->md.flags & ~(MDF_CONSISTENT|MDF_PRIMARY_IND|
1063 MDF_CONNECTED_IND|MDF_WAS_UP_TO_DATE|
1064 MDF_PEER_OUT_DATED|MDF_CRASHED_PRIMARY);
1065
1066 if (test_bit(CRASHED_PRIMARY, &mdev->flags))
1067 mdf |= MDF_CRASHED_PRIMARY;
1068 if (mdev->state.role == R_PRIMARY ||
1069 (mdev->state.pdsk < D_INCONSISTENT && mdev->state.peer == R_PRIMARY))
1070 mdf |= MDF_PRIMARY_IND;
1071 if (mdev->state.conn > C_WF_REPORT_PARAMS)
1072 mdf |= MDF_CONNECTED_IND;
1073 if (mdev->state.disk > D_INCONSISTENT)
1074 mdf |= MDF_CONSISTENT;
1075 if (mdev->state.disk > D_OUTDATED)
1076 mdf |= MDF_WAS_UP_TO_DATE;
1077 if (mdev->state.pdsk <= D_OUTDATED && mdev->state.pdsk >= D_INCONSISTENT)
1078 mdf |= MDF_PEER_OUT_DATED;
1079 if (mdf != mdev->ldev->md.flags) {
1080 mdev->ldev->md.flags = mdf;
1081 drbd_md_mark_dirty(mdev);
1082 }
1083 if (os.disk < D_CONSISTENT && ns.disk >= D_CONSISTENT)
1084 drbd_set_ed_uuid(mdev, mdev->ldev->md.uuid[UI_CURRENT]);
1085 put_ldev(mdev);
1086 }
1087
1088 /* Peer was forced D_UP_TO_DATE & R_PRIMARY, consider to resync */
1089 if (os.disk == D_INCONSISTENT && os.pdsk == D_INCONSISTENT &&
1090 os.peer == R_SECONDARY && ns.peer == R_PRIMARY)
1091 set_bit(CONSIDER_RESYNC, &mdev->flags);
1092
1093 /* Receiver should clean up itself */
1094 if (os.conn != C_DISCONNECTING && ns.conn == C_DISCONNECTING)
1095 drbd_thread_stop_nowait(&mdev->receiver);
1096
1097 /* Now the receiver finished cleaning up itself, it should die */
1098 if (os.conn != C_STANDALONE && ns.conn == C_STANDALONE)
1099 drbd_thread_stop_nowait(&mdev->receiver);
1100
1101 /* Upon network failure, we need to restart the receiver. */
1102 if (os.conn > C_TEAR_DOWN &&
1103 ns.conn <= C_TEAR_DOWN && ns.conn >= C_TIMEOUT)
1104 drbd_thread_restart_nowait(&mdev->receiver);
1105
1106 ascw = kmalloc(sizeof(*ascw), GFP_ATOMIC);
1107 if (ascw) {
1108 ascw->os = os;
1109 ascw->ns = ns;
1110 ascw->flags = flags;
1111 ascw->w.cb = w_after_state_ch;
1112 ascw->done = done;
1113 drbd_queue_work(&mdev->data.work, &ascw->w);
1114 } else {
1115 dev_warn(DEV, "Could not kmalloc an ascw\n");
1116 }
1117
1118 return rv;
1119}
1120
1121static int w_after_state_ch(struct drbd_conf *mdev, struct drbd_work *w, int unused)
1122{
1123 struct after_state_chg_work *ascw =
1124 container_of(w, struct after_state_chg_work, w);
1125 after_state_ch(mdev, ascw->os, ascw->ns, ascw->flags);
1126 if (ascw->flags & CS_WAIT_COMPLETE) {
1127 D_ASSERT(ascw->done != NULL);
1128 complete(ascw->done);
1129 }
1130 kfree(ascw);
1131
1132 return 1;
1133}
1134
1135static void abw_start_sync(struct drbd_conf *mdev, int rv)
1136{
1137 if (rv) {
1138 dev_err(DEV, "Writing the bitmap failed not starting resync.\n");
1139 _drbd_request_state(mdev, NS(conn, C_CONNECTED), CS_VERBOSE);
1140 return;
1141 }
1142
1143 switch (mdev->state.conn) {
1144 case C_STARTING_SYNC_T:
1145 _drbd_request_state(mdev, NS(conn, C_WF_SYNC_UUID), CS_VERBOSE);
1146 break;
1147 case C_STARTING_SYNC_S:
1148 drbd_start_resync(mdev, C_SYNC_SOURCE);
1149 break;
1150 }
1151}
1152
1153/**
1154 * after_state_ch() - Perform after state change actions that may sleep
1155 * @mdev: DRBD device.
1156 * @os: old state.
1157 * @ns: new state.
1158 * @flags: Flags
1159 */
1160static void after_state_ch(struct drbd_conf *mdev, union drbd_state os,
1161 union drbd_state ns, enum chg_state_flags flags)
1162{
1163 enum drbd_fencing_p fp;
1164
1165 if (os.conn != C_CONNECTED && ns.conn == C_CONNECTED) {
1166 clear_bit(CRASHED_PRIMARY, &mdev->flags);
1167 if (mdev->p_uuid)
1168 mdev->p_uuid[UI_FLAGS] &= ~((u64)2);
1169 }
1170
1171 fp = FP_DONT_CARE;
1172 if (get_ldev(mdev)) {
1173 fp = mdev->ldev->dc.fencing;
1174 put_ldev(mdev);
1175 }
1176
1177 /* Inform userspace about the change... */
1178 drbd_bcast_state(mdev, ns);
1179
1180 if (!(os.role == R_PRIMARY && os.disk < D_UP_TO_DATE && os.pdsk < D_UP_TO_DATE) &&
1181 (ns.role == R_PRIMARY && ns.disk < D_UP_TO_DATE && ns.pdsk < D_UP_TO_DATE))
1182 drbd_khelper(mdev, "pri-on-incon-degr");
1183
1184 /* Here we have the actions that are performed after a
1185 state change. This function might sleep */
1186
1187 if (fp == FP_STONITH && ns.susp) {
1188 /* case1: The outdate peer handler is successful:
1189 * case2: The connection was established again: */
1190 if ((os.pdsk > D_OUTDATED && ns.pdsk <= D_OUTDATED) ||
1191 (os.conn < C_CONNECTED && ns.conn >= C_CONNECTED)) {
1192 tl_clear(mdev);
1193 spin_lock_irq(&mdev->req_lock);
1194 _drbd_set_state(_NS(mdev, susp, 0), CS_VERBOSE, NULL);
1195 spin_unlock_irq(&mdev->req_lock);
1196 }
1197 }
1198 /* Do not change the order of the if above and the two below... */
1199 if (os.pdsk == D_DISKLESS && ns.pdsk > D_DISKLESS) { /* attach on the peer */
1200 drbd_send_uuids(mdev);
1201 drbd_send_state(mdev);
1202 }
1203 if (os.conn != C_WF_BITMAP_S && ns.conn == C_WF_BITMAP_S)
1204 drbd_queue_bitmap_io(mdev, &drbd_send_bitmap, NULL, "send_bitmap (WFBitMapS)");
1205
1206 /* Lost contact to peer's copy of the data */
1207 if ((os.pdsk >= D_INCONSISTENT &&
1208 os.pdsk != D_UNKNOWN &&
1209 os.pdsk != D_OUTDATED)
1210 && (ns.pdsk < D_INCONSISTENT ||
1211 ns.pdsk == D_UNKNOWN ||
1212 ns.pdsk == D_OUTDATED)) {
1213 kfree(mdev->p_uuid);
1214 mdev->p_uuid = NULL;
1215 if (get_ldev(mdev)) {
1216 if ((ns.role == R_PRIMARY || ns.peer == R_PRIMARY) &&
1217 mdev->ldev->md.uuid[UI_BITMAP] == 0 && ns.disk >= D_UP_TO_DATE) {
1218 drbd_uuid_new_current(mdev);
1219 drbd_send_uuids(mdev);
1220 }
1221 put_ldev(mdev);
1222 }
1223 }
1224
1225 if (ns.pdsk < D_INCONSISTENT && get_ldev(mdev)) {
1226 if (ns.peer == R_PRIMARY && mdev->ldev->md.uuid[UI_BITMAP] == 0)
1227 drbd_uuid_new_current(mdev);
1228
1229 /* D_DISKLESS Peer becomes secondary */
1230 if (os.peer == R_PRIMARY && ns.peer == R_SECONDARY)
1231 drbd_al_to_on_disk_bm(mdev);
1232 put_ldev(mdev);
1233 }
1234
1235 /* Last part of the attaching process ... */
1236 if (ns.conn >= C_CONNECTED &&
1237 os.disk == D_ATTACHING && ns.disk == D_NEGOTIATING) {
1238 kfree(mdev->p_uuid); /* We expect to receive up-to-date UUIDs soon. */
1239 mdev->p_uuid = NULL; /* ...to not use the old ones in the mean time */
1240 drbd_send_sizes(mdev, 0); /* to start sync... */
1241 drbd_send_uuids(mdev);
1242 drbd_send_state(mdev);
1243 }
1244
1245 /* We want to pause/continue resync, tell peer. */
1246 if (ns.conn >= C_CONNECTED &&
1247 ((os.aftr_isp != ns.aftr_isp) ||
1248 (os.user_isp != ns.user_isp)))
1249 drbd_send_state(mdev);
1250
1251 /* In case one of the isp bits got set, suspend other devices. */
1252 if ((!os.aftr_isp && !os.peer_isp && !os.user_isp) &&
1253 (ns.aftr_isp || ns.peer_isp || ns.user_isp))
1254 suspend_other_sg(mdev);
1255
1256 /* Make sure the peer gets informed about eventual state
1257 changes (ISP bits) while we were in WFReportParams. */
1258 if (os.conn == C_WF_REPORT_PARAMS && ns.conn >= C_CONNECTED)
1259 drbd_send_state(mdev);
1260
1261	/* We are in the process of starting a full sync... */
1262 if ((os.conn != C_STARTING_SYNC_T && ns.conn == C_STARTING_SYNC_T) ||
1263 (os.conn != C_STARTING_SYNC_S && ns.conn == C_STARTING_SYNC_S))
1264 drbd_queue_bitmap_io(mdev, &drbd_bmio_set_n_write, &abw_start_sync, "set_n_write from StartingSync");
1265
1266	/* We are invalidating ourselves... */
1267 if (os.conn < C_CONNECTED && ns.conn < C_CONNECTED &&
1268 os.disk > D_INCONSISTENT && ns.disk == D_INCONSISTENT)
1269 drbd_queue_bitmap_io(mdev, &drbd_bmio_set_n_write, NULL, "set_n_write from invalidate");
1270
1271 if (os.disk > D_FAILED && ns.disk == D_FAILED) {
1272 enum drbd_io_error_p eh;
1273
1274 eh = EP_PASS_ON;
1275 if (get_ldev_if_state(mdev, D_FAILED)) {
1276 eh = mdev->ldev->dc.on_io_error;
1277 put_ldev(mdev);
1278 }
1279
1280 drbd_rs_cancel_all(mdev);
1281 /* since get_ldev() only works as long as disk>=D_INCONSISTENT,
1282 and it is D_DISKLESS here, local_cnt can only go down, it can
1283 not increase... It will reach zero */
1284 wait_event(mdev->misc_wait, !atomic_read(&mdev->local_cnt));
1285 mdev->rs_total = 0;
1286 mdev->rs_failed = 0;
1287 atomic_set(&mdev->rs_pending_cnt, 0);
1288
1289 spin_lock_irq(&mdev->req_lock);
1290 _drbd_set_state(_NS(mdev, disk, D_DISKLESS), CS_HARD, NULL);
1291 spin_unlock_irq(&mdev->req_lock);
1292
1293 if (eh == EP_CALL_HELPER)
1294 drbd_khelper(mdev, "local-io-error");
1295 }
1296
1297 if (os.disk > D_DISKLESS && ns.disk == D_DISKLESS) {
1298
1299 if (os.disk == D_FAILED) /* && ns.disk == D_DISKLESS*/ {
1300 if (drbd_send_state(mdev))
1301 dev_warn(DEV, "Notified peer that my disk is broken.\n");
1302 else
1303 dev_err(DEV, "Sending state in drbd_io_error() failed\n");
1304 }
1305
1306		wait_event(mdev->misc_wait, !atomic_read(&mdev->local_cnt));
1307		lc_destroy(mdev->resync);
1308 mdev->resync = NULL;
1309 lc_destroy(mdev->act_log);
1310 mdev->act_log = NULL;
1311 __no_warn(local,
1312 drbd_free_bc(mdev->ldev);
1313 mdev->ldev = NULL;);
1314
1315 if (mdev->md_io_tmpp)
1316 __free_page(mdev->md_io_tmpp);
1317 }
1318
1319 /* Disks got bigger while they were detached */
1320 if (ns.disk > D_NEGOTIATING && ns.pdsk > D_NEGOTIATING &&
1321 test_and_clear_bit(RESYNC_AFTER_NEG, &mdev->flags)) {
1322 if (ns.conn == C_CONNECTED)
1323 resync_after_online_grow(mdev);
1324 }
1325
1326 /* A resync finished or aborted, wake paused devices... */
1327 if ((os.conn > C_CONNECTED && ns.conn <= C_CONNECTED) ||
1328 (os.peer_isp && !ns.peer_isp) ||
1329 (os.user_isp && !ns.user_isp))
1330 resume_next_sg(mdev);
1331
1332 /* Upon network connection, we need to start the receiver */
1333 if (os.conn == C_STANDALONE && ns.conn == C_UNCONNECTED)
1334 drbd_thread_start(&mdev->receiver);
1335
1336 /* Terminate worker thread if we are unconfigured - it will be
1337 restarted as needed... */
1338 if (ns.disk == D_DISKLESS &&
1339 ns.conn == C_STANDALONE &&
1340 ns.role == R_SECONDARY) {
1341 if (os.aftr_isp != ns.aftr_isp)
1342 resume_next_sg(mdev);
1343 /* set in __drbd_set_state, unless CONFIG_PENDING was set */
1344 if (test_bit(DEVICE_DYING, &mdev->flags))
1345 drbd_thread_stop_nowait(&mdev->worker);
1346 }
1347
1348 drbd_md_sync(mdev);
1349}
1350
1351
1352static int drbd_thread_setup(void *arg)
1353{
1354 struct drbd_thread *thi = (struct drbd_thread *) arg;
1355 struct drbd_conf *mdev = thi->mdev;
1356 unsigned long flags;
1357 int retval;
1358
1359restart:
1360 retval = thi->function(thi);
1361
1362 spin_lock_irqsave(&thi->t_lock, flags);
1363
1364 /* if the receiver has been "Exiting", the last thing it did
1365 * was set the conn state to "StandAlone",
1366 * if now a re-connect request comes in, conn state goes C_UNCONNECTED,
1367 * and receiver thread will be "started".
1368 * drbd_thread_start needs to set "Restarting" in that case.
1369 * t_state check and assignment needs to be within the same spinlock,
1370 * so either thread_start sees Exiting, and can remap to Restarting,
1371	 * or thread_start sees None, and can proceed as normal.
1372 */
1373
1374 if (thi->t_state == Restarting) {
1375 dev_info(DEV, "Restarting %s\n", current->comm);
1376 thi->t_state = Running;
1377 spin_unlock_irqrestore(&thi->t_lock, flags);
1378 goto restart;
1379 }
1380
1381 thi->task = NULL;
1382 thi->t_state = None;
1383 smp_mb();
1384 complete(&thi->stop);
1385 spin_unlock_irqrestore(&thi->t_lock, flags);
1386
1387 dev_info(DEV, "Terminating %s\n", current->comm);
1388
1389 /* Release mod reference taken when thread was started */
1390 module_put(THIS_MODULE);
1391 return retval;
1392}
1393
1394static void drbd_thread_init(struct drbd_conf *mdev, struct drbd_thread *thi,
1395 int (*func) (struct drbd_thread *))
1396{
1397 spin_lock_init(&thi->t_lock);
1398 thi->task = NULL;
1399 thi->t_state = None;
1400 thi->function = func;
1401 thi->mdev = mdev;
1402}
1403
1404int drbd_thread_start(struct drbd_thread *thi)
1405{
1406 struct drbd_conf *mdev = thi->mdev;
1407 struct task_struct *nt;
1408 unsigned long flags;
1409
1410 const char *me =
1411 thi == &mdev->receiver ? "receiver" :
1412 thi == &mdev->asender ? "asender" :
1413 thi == &mdev->worker ? "worker" : "NONSENSE";
1414
1415 /* is used from state engine doing drbd_thread_stop_nowait,
1416 * while holding the req lock irqsave */
1417 spin_lock_irqsave(&thi->t_lock, flags);
1418
1419 switch (thi->t_state) {
1420 case None:
1421 dev_info(DEV, "Starting %s thread (from %s [%d])\n",
1422 me, current->comm, current->pid);
1423
1424 /* Get ref on module for thread - this is released when thread exits */
1425 if (!try_module_get(THIS_MODULE)) {
1426 dev_err(DEV, "Failed to get module reference in drbd_thread_start\n");
1427 spin_unlock_irqrestore(&thi->t_lock, flags);
1428 return FALSE;
1429 }
1430
1431 init_completion(&thi->stop);
1432 D_ASSERT(thi->task == NULL);
1433 thi->reset_cpu_mask = 1;
1434 thi->t_state = Running;
1435 spin_unlock_irqrestore(&thi->t_lock, flags);
1436 flush_signals(current); /* otherw. may get -ERESTARTNOINTR */
1437
1438 nt = kthread_create(drbd_thread_setup, (void *) thi,
1439 "drbd%d_%s", mdev_to_minor(mdev), me);
1440
1441 if (IS_ERR(nt)) {
1442 dev_err(DEV, "Couldn't start thread\n");
1443
1444 module_put(THIS_MODULE);
1445 return FALSE;
1446 }
1447 spin_lock_irqsave(&thi->t_lock, flags);
1448 thi->task = nt;
1449 thi->t_state = Running;
1450 spin_unlock_irqrestore(&thi->t_lock, flags);
1451 wake_up_process(nt);
1452 break;
1453 case Exiting:
1454 thi->t_state = Restarting;
1455 dev_info(DEV, "Restarting %s thread (from %s [%d])\n",
1456 me, current->comm, current->pid);
1457 /* fall through */
1458 case Running:
1459 case Restarting:
1460 default:
1461 spin_unlock_irqrestore(&thi->t_lock, flags);
1462 break;
1463 }
1464
1465 return TRUE;
1466}
1467
1468
1469void _drbd_thread_stop(struct drbd_thread *thi, int restart, int wait)
1470{
1471 unsigned long flags;
1472
1473 enum drbd_thread_state ns = restart ? Restarting : Exiting;
1474
1475 /* may be called from state engine, holding the req lock irqsave */
1476 spin_lock_irqsave(&thi->t_lock, flags);
1477
1478 if (thi->t_state == None) {
1479 spin_unlock_irqrestore(&thi->t_lock, flags);
1480 if (restart)
1481 drbd_thread_start(thi);
1482 return;
1483 }
1484
1485 if (thi->t_state != ns) {
1486 if (thi->task == NULL) {
1487 spin_unlock_irqrestore(&thi->t_lock, flags);
1488 return;
1489 }
1490
1491 thi->t_state = ns;
1492 smp_mb();
1493 init_completion(&thi->stop);
1494 if (thi->task != current)
1495 force_sig(DRBD_SIGKILL, thi->task);
1496
1497 }
1498
1499 spin_unlock_irqrestore(&thi->t_lock, flags);
1500
1501 if (wait)
1502 wait_for_completion(&thi->stop);
1503}
1504
1505#ifdef CONFIG_SMP
1506/**
1507 * drbd_calc_cpu_mask() - Generate CPU masks, spread over all CPUs
1508 * @mdev: DRBD device.
1509 *
1510 * Forces all threads of a device onto the same CPU. This is beneficial for
1511 * DRBD's performance. May be overridden by the user's configuration.
1512 */
1513void drbd_calc_cpu_mask(struct drbd_conf *mdev)
1514{
1515 int ord, cpu;
1516
1517 /* user override. */
1518 if (cpumask_weight(mdev->cpu_mask))
1519 return;
1520
1521 ord = mdev_to_minor(mdev) % cpumask_weight(cpu_online_mask);
1522 for_each_online_cpu(cpu) {
1523 if (ord-- == 0) {
1524 cpumask_set_cpu(cpu, mdev->cpu_mask);
1525 return;
1526 }
1527 }
1528 /* should not be reached */
1529 cpumask_setall(mdev->cpu_mask);
1530}
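/*
 * Example (illustration only, assuming CPUs 0-7 are online): minor 0 is
 * pinned to CPU 0, minor 1 to CPU 1, ..., minor 9 again to CPU 1 (9 % 8),
 * so the devices' threads spread round-robin over the online CPUs unless
 * the user configured an explicit cpu_mask.
 */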
1531
1532/**
1533 * drbd_thread_current_set_cpu() - modifies the cpu mask of the _current_ thread
1534 * @mdev: DRBD device.
1535 *
1536 * call in the "main loop" of _all_ threads, no need for any mutex, current won't die
1537 * prematurely.
1538 */
1539void drbd_thread_current_set_cpu(struct drbd_conf *mdev)
1540{
1541 struct task_struct *p = current;
1542 struct drbd_thread *thi =
1543 p == mdev->asender.task ? &mdev->asender :
1544 p == mdev->receiver.task ? &mdev->receiver :
1545 p == mdev->worker.task ? &mdev->worker :
1546 NULL;
1547 ERR_IF(thi == NULL)
1548 return;
1549 if (!thi->reset_cpu_mask)
1550 return;
1551 thi->reset_cpu_mask = 0;
1552 set_cpus_allowed_ptr(p, mdev->cpu_mask);
1553}
1554#endif
1555
1556/* the appropriate socket mutex must be held already */
1557int _drbd_send_cmd(struct drbd_conf *mdev, struct socket *sock,
1558 enum drbd_packets cmd, struct p_header *h,
1559 size_t size, unsigned msg_flags)
1560{
1561 int sent, ok;
1562
1563 ERR_IF(!h) return FALSE;
1564 ERR_IF(!size) return FALSE;
1565
1566 h->magic = BE_DRBD_MAGIC;
1567 h->command = cpu_to_be16(cmd);
1568 h->length = cpu_to_be16(size-sizeof(struct p_header));
1569
1570	sent = drbd_send(mdev, sock, h, size, msg_flags);
1571
1572 ok = (sent == size);
1573 if (!ok)
1574 dev_err(DEV, "short sent %s size=%d sent=%d\n",
1575 cmdname(cmd), (int)size, sent);
1576 return ok;
1577}
1578
1579/* don't pass the socket. we may only look at it
1580 * when we hold the appropriate socket mutex.
1581 */
1582int drbd_send_cmd(struct drbd_conf *mdev, int use_data_socket,
1583 enum drbd_packets cmd, struct p_header *h, size_t size)
1584{
1585 int ok = 0;
1586 struct socket *sock;
1587
1588 if (use_data_socket) {
1589 mutex_lock(&mdev->data.mutex);
1590 sock = mdev->data.socket;
1591 } else {
1592 mutex_lock(&mdev->meta.mutex);
1593 sock = mdev->meta.socket;
1594 }
1595
1596 /* drbd_disconnect() could have called drbd_free_sock()
1597 * while we were waiting in down()... */
1598 if (likely(sock != NULL))
1599 ok = _drbd_send_cmd(mdev, sock, cmd, h, size, 0);
1600
1601 if (use_data_socket)
1602 mutex_unlock(&mdev->data.mutex);
1603 else
1604 mutex_unlock(&mdev->meta.mutex);
1605 return ok;
1606}
1607
1608int drbd_send_cmd2(struct drbd_conf *mdev, enum drbd_packets cmd, char *data,
1609 size_t size)
1610{
1611 struct p_header h;
1612 int ok;
1613
1614 h.magic = BE_DRBD_MAGIC;
1615 h.command = cpu_to_be16(cmd);
1616 h.length = cpu_to_be16(size);
1617
1618 if (!drbd_get_data_sock(mdev))
1619 return 0;
1620
1621	ok = (sizeof(h) ==
1622 drbd_send(mdev, mdev->data.socket, &h, sizeof(h), 0));
1623 ok = ok && (size ==
1624 drbd_send(mdev, mdev->data.socket, data, size, 0));
1625
1626 drbd_put_data_sock(mdev);
1627
1628 return ok;
1629}
1630
1631int drbd_send_sync_param(struct drbd_conf *mdev, struct syncer_conf *sc)
1632{
1633 struct p_rs_param_89 *p;
1634 struct socket *sock;
1635 int size, rv;
1636 const int apv = mdev->agreed_pro_version;
1637
1638 size = apv <= 87 ? sizeof(struct p_rs_param)
1639 : apv == 88 ? sizeof(struct p_rs_param)
1640 + strlen(mdev->sync_conf.verify_alg) + 1
1641 : /* 89 */ sizeof(struct p_rs_param_89);
1642
1643 /* used from admin command context and receiver/worker context.
1644 * to avoid kmalloc, grab the socket right here,
1645 * then use the pre-allocated sbuf there */
1646 mutex_lock(&mdev->data.mutex);
1647 sock = mdev->data.socket;
1648
1649 if (likely(sock != NULL)) {
1650 enum drbd_packets cmd = apv >= 89 ? P_SYNC_PARAM89 : P_SYNC_PARAM;
1651
1652 p = &mdev->data.sbuf.rs_param_89;
1653
1654 /* initialize verify_alg and csums_alg */
1655 memset(p->verify_alg, 0, 2 * SHARED_SECRET_MAX);
1656
1657 p->rate = cpu_to_be32(sc->rate);
1658
1659 if (apv >= 88)
1660 strcpy(p->verify_alg, mdev->sync_conf.verify_alg);
1661 if (apv >= 89)
1662 strcpy(p->csums_alg, mdev->sync_conf.csums_alg);
1663
1664 rv = _drbd_send_cmd(mdev, sock, cmd, &p->head, size, 0);
1665 } else
1666 rv = 0; /* not ok */
1667
1668 mutex_unlock(&mdev->data.mutex);
1669
1670 return rv;
1671}
1672
1673int drbd_send_protocol(struct drbd_conf *mdev)
1674{
1675 struct p_protocol *p;
1676	int size, cf, rv;
1677
1678 size = sizeof(struct p_protocol);
1679
1680 if (mdev->agreed_pro_version >= 87)
1681 size += strlen(mdev->net_conf->integrity_alg) + 1;
1682
1683 /* we must not recurse into our own queue,
1684 * as that is blocked during handshake */
1685 p = kmalloc(size, GFP_NOIO);
1686 if (p == NULL)
1687 return 0;
1688
1689 p->protocol = cpu_to_be32(mdev->net_conf->wire_protocol);
1690 p->after_sb_0p = cpu_to_be32(mdev->net_conf->after_sb_0p);
1691 p->after_sb_1p = cpu_to_be32(mdev->net_conf->after_sb_1p);
1692 p->after_sb_2p = cpu_to_be32(mdev->net_conf->after_sb_2p);
1693	p->two_primaries = cpu_to_be32(mdev->net_conf->two_primaries);
1694
1695	cf = 0;
1696 if (mdev->net_conf->want_lose)
1697 cf |= CF_WANT_LOSE;
1698 if (mdev->net_conf->dry_run) {
1699 if (mdev->agreed_pro_version >= 92)
1700 cf |= CF_DRY_RUN;
1701 else {
1702 dev_err(DEV, "--dry-run is not supported by peer");
1703			kfree(p);
1704			return 0;
1705 }
1706 }
1707 p->conn_flags = cpu_to_be32(cf);
1708
1709	if (mdev->agreed_pro_version >= 87)
1710 strcpy(p->integrity_alg, mdev->net_conf->integrity_alg);
1711
1712 rv = drbd_send_cmd(mdev, USE_DATA_SOCKET, P_PROTOCOL,
1713 (struct p_header *)p, size);
1714 kfree(p);
1715 return rv;
1716}
1717
1718int _drbd_send_uuids(struct drbd_conf *mdev, u64 uuid_flags)
1719{
1720 struct p_uuids p;
1721 int i;
1722
1723 if (!get_ldev_if_state(mdev, D_NEGOTIATING))
1724 return 1;
1725
1726 for (i = UI_CURRENT; i < UI_SIZE; i++)
1727 p.uuid[i] = mdev->ldev ? cpu_to_be64(mdev->ldev->md.uuid[i]) : 0;
1728
1729 mdev->comm_bm_set = drbd_bm_total_weight(mdev);
1730 p.uuid[UI_SIZE] = cpu_to_be64(mdev->comm_bm_set);
1731 uuid_flags |= mdev->net_conf->want_lose ? 1 : 0;
1732 uuid_flags |= test_bit(CRASHED_PRIMARY, &mdev->flags) ? 2 : 0;
1733 uuid_flags |= mdev->new_state_tmp.disk == D_INCONSISTENT ? 4 : 0;
1734 p.uuid[UI_FLAGS] = cpu_to_be64(uuid_flags);
1735
1736 put_ldev(mdev);
1737
1738 return drbd_send_cmd(mdev, USE_DATA_SOCKET, P_UUIDS,
1739 (struct p_header *)&p, sizeof(p));
1740}
1741
1742int drbd_send_uuids(struct drbd_conf *mdev)
1743{
1744 return _drbd_send_uuids(mdev, 0);
1745}
1746
1747int drbd_send_uuids_skip_initial_sync(struct drbd_conf *mdev)
1748{
1749 return _drbd_send_uuids(mdev, 8);
1750}
1751
1752
1753int drbd_send_sync_uuid(struct drbd_conf *mdev, u64 val)
1754{
1755 struct p_rs_uuid p;
1756
1757 p.uuid = cpu_to_be64(val);
1758
1759 return drbd_send_cmd(mdev, USE_DATA_SOCKET, P_SYNC_UUID,
1760 (struct p_header *)&p, sizeof(p));
1761}
1762
1763int drbd_send_sizes(struct drbd_conf *mdev, int trigger_reply)
1764{
1765 struct p_sizes p;
1766 sector_t d_size, u_size;
1767 int q_order_type;
1768 int ok;
1769
1770 if (get_ldev_if_state(mdev, D_NEGOTIATING)) {
1771 D_ASSERT(mdev->ldev->backing_bdev);
1772 d_size = drbd_get_max_capacity(mdev->ldev);
1773 u_size = mdev->ldev->dc.disk_size;
1774 q_order_type = drbd_queue_order_type(mdev);
1775 p.queue_order_type = cpu_to_be32(drbd_queue_order_type(mdev));
1776 put_ldev(mdev);
1777 } else {
1778 d_size = 0;
1779 u_size = 0;
1780 q_order_type = QUEUE_ORDERED_NONE;
1781 }
1782
1783 p.d_size = cpu_to_be64(d_size);
1784 p.u_size = cpu_to_be64(u_size);
1785 p.c_size = cpu_to_be64(trigger_reply ? 0 : drbd_get_capacity(mdev->this_bdev));
1786 p.max_segment_size = cpu_to_be32(queue_max_segment_size(mdev->rq_queue));
1787 p.queue_order_type = cpu_to_be32(q_order_type);
1788
1789 ok = drbd_send_cmd(mdev, USE_DATA_SOCKET, P_SIZES,
1790 (struct p_header *)&p, sizeof(p));
1791 return ok;
1792}
1793
1794/**
1795 * drbd_send_state() - Sends the drbd state to the peer
1796 * @mdev: DRBD device.
1797 */
1798int drbd_send_state(struct drbd_conf *mdev)
1799{
1800 struct socket *sock;
1801 struct p_state p;
1802 int ok = 0;
1803
1804 /* Grab state lock so we won't send state if we're in the middle
1805 * of a cluster wide state change on another thread */
1806 drbd_state_lock(mdev);
1807
1808 mutex_lock(&mdev->data.mutex);
1809
1810 p.state = cpu_to_be32(mdev->state.i); /* Within the send mutex */
1811 sock = mdev->data.socket;
1812
1813 if (likely(sock != NULL)) {
1814 ok = _drbd_send_cmd(mdev, sock, P_STATE,
1815 (struct p_header *)&p, sizeof(p), 0);
1816 }
1817
1818 mutex_unlock(&mdev->data.mutex);
1819
1820 drbd_state_unlock(mdev);
1821 return ok;
1822}
1823
1824int drbd_send_state_req(struct drbd_conf *mdev,
1825 union drbd_state mask, union drbd_state val)
1826{
1827 struct p_req_state p;
1828
1829 p.mask = cpu_to_be32(mask.i);
1830 p.val = cpu_to_be32(val.i);
1831
1832 return drbd_send_cmd(mdev, USE_DATA_SOCKET, P_STATE_CHG_REQ,
1833 (struct p_header *)&p, sizeof(p));
1834}
1835
1836int drbd_send_sr_reply(struct drbd_conf *mdev, int retcode)
1837{
1838 struct p_req_state_reply p;
1839
1840 p.retcode = cpu_to_be32(retcode);
1841
1842 return drbd_send_cmd(mdev, USE_META_SOCKET, P_STATE_CHG_REPLY,
1843 (struct p_header *)&p, sizeof(p));
1844}
1845
1846int fill_bitmap_rle_bits(struct drbd_conf *mdev,
1847 struct p_compressed_bm *p,
1848 struct bm_xfer_ctx *c)
1849{
1850 struct bitstream bs;
1851 unsigned long plain_bits;
1852 unsigned long tmp;
1853 unsigned long rl;
1854 unsigned len;
1855 unsigned toggle;
1856 int bits;
1857
1858 /* may we use this feature? */
1859 if ((mdev->sync_conf.use_rle == 0) ||
1860 (mdev->agreed_pro_version < 90))
1861 return 0;
1862
1863 if (c->bit_offset >= c->bm_bits)
1864 return 0; /* nothing to do. */
1865
1866 /* use at most thus many bytes */
1867 bitstream_init(&bs, p->code, BM_PACKET_VLI_BYTES_MAX, 0);
1868 memset(p->code, 0, BM_PACKET_VLI_BYTES_MAX);
1869 /* plain bits covered in this code string */
1870 plain_bits = 0;
1871
1872 /* p->encoding & 0x80 stores whether the first run length is set.
1873 * bit offset is implicit.
1874 * start with toggle == 2 to be able to tell the first iteration */
1875 toggle = 2;
1876
1877 /* see how many plain bits we can stuff into one packet
1878 * using RLE and VLI. */
1879 do {
1880 tmp = (toggle == 0) ? _drbd_bm_find_next_zero(mdev, c->bit_offset)
1881 : _drbd_bm_find_next(mdev, c->bit_offset);
1882 if (tmp == -1UL)
1883 tmp = c->bm_bits;
1884 rl = tmp - c->bit_offset;
1885
1886 if (toggle == 2) { /* first iteration */
1887 if (rl == 0) {
1888 /* the first checked bit was set,
1889 * store start value, */
1890 DCBP_set_start(p, 1);
1891 /* but skip encoding of zero run length */
1892 toggle = !toggle;
1893 continue;
1894 }
1895 DCBP_set_start(p, 0);
1896 }
1897
1898 /* paranoia: catch zero runlength.
1899 * can only happen if bitmap is modified while we scan it. */
1900 if (rl == 0) {
1901 dev_err(DEV, "unexpected zero runlength while encoding bitmap "
1902 "t:%u bo:%lu\n", toggle, c->bit_offset);
1903 return -1;
1904 }
1905
1906 bits = vli_encode_bits(&bs, rl);
1907 if (bits == -ENOBUFS) /* buffer full */
1908 break;
1909 if (bits <= 0) {
1910 dev_err(DEV, "error while encoding bitmap: %d\n", bits);
1911 return 0;
1912 }
1913
1914 toggle = !toggle;
1915 plain_bits += rl;
1916 c->bit_offset = tmp;
1917 } while (c->bit_offset < c->bm_bits);
1918
1919 len = bs.cur.b - p->code + !!bs.cur.bit;
1920
1921 if (plain_bits < (len << 3)) {
1922 /* incompressible with this method.
1923 * we need to rewind both word and bit position. */
1924 c->bit_offset -= plain_bits;
1925 bm_xfer_ctx_bit_to_word_offset(c);
1926 c->bit_offset = c->word_offset * BITS_PER_LONG;
1927 return 0;
1928 }
1929
1930 /* RLE + VLI was able to compress it just fine.
1931 * update c->word_offset. */
1932 bm_xfer_ctx_bit_to_word_offset(c);
1933
1934 /* store pad_bits */
1935 DCBP_set_pad_bits(p, (8 - bs.cur.bit) & 0x7);
1936
1937 return len;
1938}
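/*
 * Worked example for fill_bitmap_rle_bits(), for illustration only:
 * suppose the region to transfer starts with three clear bits followed
 * by five set bits.  The loop above stores DCBP_set_start(p, 0) and then
 * VLI-encodes the run lengths 3, 5, ..., alternating between clear and
 * set runs; had the very first bit been set, DCBP_set_start(p, 1) would
 * be stored and the leading zero-length "clear" run skipped.  The result
 * is only kept if it covers at least len * 8 plain bits -- i.e. if it is
 * no larger than the uncompressed representation -- otherwise the
 * context is rewound and 0 ("not compressible") is returned.
 */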
1939
1940enum { OK, FAILED, DONE }
1941send_bitmap_rle_or_plain(struct drbd_conf *mdev,
1942 struct p_header *h, struct bm_xfer_ctx *c)
1943{
1944 struct p_compressed_bm *p = (void*)h;
1945 unsigned long num_words;
1946 int len;
1947 int ok;
1948
1949 len = fill_bitmap_rle_bits(mdev, p, c);
1950
1951 if (len < 0)
1952 return FAILED;
1953
1954 if (len) {
1955 DCBP_set_code(p, RLE_VLI_Bits);
1956 ok = _drbd_send_cmd(mdev, mdev->data.socket, P_COMPRESSED_BITMAP, h,
1957 sizeof(*p) + len, 0);
1958
1959 c->packets[0]++;
1960 c->bytes[0] += sizeof(*p) + len;
1961
1962 if (c->bit_offset >= c->bm_bits)
1963 len = 0; /* DONE */
1964 } else {
1965 /* was not compressible.
1966 * send a buffer full of plain text bits instead. */
1967 num_words = min_t(size_t, BM_PACKET_WORDS, c->bm_words - c->word_offset);
1968 len = num_words * sizeof(long);
1969 if (len)
1970 drbd_bm_get_lel(mdev, c->word_offset, num_words, (unsigned long*)h->payload);
1971 ok = _drbd_send_cmd(mdev, mdev->data.socket, P_BITMAP,
1972 h, sizeof(struct p_header) + len, 0);
1973 c->word_offset += num_words;
1974 c->bit_offset = c->word_offset * BITS_PER_LONG;
1975
1976 c->packets[1]++;
1977 c->bytes[1] += sizeof(struct p_header) + len;
1978
1979 if (c->bit_offset > c->bm_bits)
1980 c->bit_offset = c->bm_bits;
1981 }
1982 ok = ok ? ((len == 0) ? DONE : OK) : FAILED;
1983
1984 if (ok == DONE)
1985 INFO_bm_xfer_stats(mdev, "send", c);
1986 return ok;
1987}
1988
1989/* See the comment at receive_bitmap() */
1990int _drbd_send_bitmap(struct drbd_conf *mdev)
1991{
1992 struct bm_xfer_ctx c;
1993 struct p_header *p;
1994 int ret;
1995
1996 ERR_IF(!mdev->bitmap) return FALSE;
1997
1998 /* maybe we should use some per thread scratch page,
1999 * and allocate that during initial device creation? */
2000 p = (struct p_header *) __get_free_page(GFP_NOIO);
2001 if (!p) {
2002 dev_err(DEV, "failed to allocate one page buffer in %s\n", __func__);
2003 return FALSE;
2004 }
2005
2006 if (get_ldev(mdev)) {
2007 if (drbd_md_test_flag(mdev->ldev, MDF_FULL_SYNC)) {
2008 dev_info(DEV, "Writing the whole bitmap, MDF_FullSync was set.\n");
2009 drbd_bm_set_all(mdev);
2010 if (drbd_bm_write(mdev)) {
2011 /* write_bm did fail! Leave full sync flag set in Meta P_DATA
2012 * but otherwise process as per normal - need to tell other
2013 * side that a full resync is required! */
2014 dev_err(DEV, "Failed to write bitmap to disk!\n");
2015 } else {
2016 drbd_md_clear_flag(mdev, MDF_FULL_SYNC);
2017 drbd_md_sync(mdev);
2018 }
2019 }
2020 put_ldev(mdev);
2021 }
2022
2023 c = (struct bm_xfer_ctx) {
2024 .bm_bits = drbd_bm_bits(mdev),
2025 .bm_words = drbd_bm_words(mdev),
2026 };
2027
2028 do {
2029 ret = send_bitmap_rle_or_plain(mdev, p, &c);
2030 } while (ret == OK);
2031
2032 free_page((unsigned long) p);
2033 return (ret == DONE);
2034}
2035
2036int drbd_send_bitmap(struct drbd_conf *mdev)
2037{
2038 int err;
2039
2040 if (!drbd_get_data_sock(mdev))
2041 return -1;
2042 err = !_drbd_send_bitmap(mdev);
2043 drbd_put_data_sock(mdev);
2044 return err;
2045}
2046
2047int drbd_send_b_ack(struct drbd_conf *mdev, u32 barrier_nr, u32 set_size)
2048{
2049 int ok;
2050 struct p_barrier_ack p;
2051
2052 p.barrier = barrier_nr;
2053 p.set_size = cpu_to_be32(set_size);
2054
2055 if (mdev->state.conn < C_CONNECTED)
2056 return FALSE;
2057 ok = drbd_send_cmd(mdev, USE_META_SOCKET, P_BARRIER_ACK,
2058 (struct p_header *)&p, sizeof(p));
2059 return ok;
2060}
2061
2062/**
2063 * _drbd_send_ack() - Sends an ack packet
2064 * @mdev: DRBD device.
2065 * @cmd: Packet command code.
2066 * @sector: sector, needs to be in big endian byte order
2067 * @blksize: size in byte, needs to be in big endian byte order
2068 * @block_id: Id, big endian byte order
2069 */
2070static int _drbd_send_ack(struct drbd_conf *mdev, enum drbd_packets cmd,
2071 u64 sector,
2072 u32 blksize,
2073 u64 block_id)
2074{
2075 int ok;
2076 struct p_block_ack p;
2077
2078 p.sector = sector;
2079 p.block_id = block_id;
2080 p.blksize = blksize;
2081 p.seq_num = cpu_to_be32(atomic_add_return(1, &mdev->packet_seq));
2082
2083 if (!mdev->meta.socket || mdev->state.conn < C_CONNECTED)
2084 return FALSE;
2085 ok = drbd_send_cmd(mdev, USE_META_SOCKET, cmd,
2086 (struct p_header *)&p, sizeof(p));
2087 return ok;
2088}
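/*
 * Byte order reminder for _drbd_send_ack(): sector, blksize and block_id
 * must already be big endian here.  drbd_send_ack_rp() below forwards
 * the peer-provided (still big endian) request fields unchanged,
 * drbd_send_ack_dp() recomputes the size and converts it with
 * cpu_to_be32(), and drbd_send_ack()/drbd_send_ack_ex() convert native
 * values with cpu_to_be64()/cpu_to_be32() first.
 */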
2089
2090int drbd_send_ack_dp(struct drbd_conf *mdev, enum drbd_packets cmd,
2091 struct p_data *dp)
2092{
2093 const int header_size = sizeof(struct p_data)
2094 - sizeof(struct p_header);
2095 int data_size = ((struct p_header *)dp)->length - header_size;
2096
2097 return _drbd_send_ack(mdev, cmd, dp->sector, cpu_to_be32(data_size),
2098 dp->block_id);
2099}
2100
2101int drbd_send_ack_rp(struct drbd_conf *mdev, enum drbd_packets cmd,
2102 struct p_block_req *rp)
2103{
2104 return _drbd_send_ack(mdev, cmd, rp->sector, rp->blksize, rp->block_id);
2105}
2106
2107/**
2108 * drbd_send_ack() - Sends an ack packet
2109 * @mdev: DRBD device.
2110 * @cmd: Packet command code.
2111 * @e: Epoch entry.
2112 */
2113int drbd_send_ack(struct drbd_conf *mdev,
2114 enum drbd_packets cmd, struct drbd_epoch_entry *e)
2115{
2116 return _drbd_send_ack(mdev, cmd,
2117 cpu_to_be64(e->sector),
2118 cpu_to_be32(e->size),
2119 e->block_id);
2120}
2121
2122/* This function misuses the block_id field to signal if the blocks
2123 * are in sync or not. */
2124int drbd_send_ack_ex(struct drbd_conf *mdev, enum drbd_packets cmd,
2125 sector_t sector, int blksize, u64 block_id)
2126{
2127 return _drbd_send_ack(mdev, cmd,
2128 cpu_to_be64(sector),
2129 cpu_to_be32(blksize),
2130 cpu_to_be64(block_id));
2131}
2132
2133int drbd_send_drequest(struct drbd_conf *mdev, int cmd,
2134 sector_t sector, int size, u64 block_id)
2135{
2136 int ok;
2137 struct p_block_req p;
2138
2139 p.sector = cpu_to_be64(sector);
2140 p.block_id = block_id;
2141 p.blksize = cpu_to_be32(size);
2142
2143 ok = drbd_send_cmd(mdev, USE_DATA_SOCKET, cmd,
2144 (struct p_header *)&p, sizeof(p));
2145 return ok;
2146}
2147
2148int drbd_send_drequest_csum(struct drbd_conf *mdev,
2149 sector_t sector, int size,
2150 void *digest, int digest_size,
2151 enum drbd_packets cmd)
2152{
2153 int ok;
2154 struct p_block_req p;
2155
2156 p.sector = cpu_to_be64(sector);
2157 p.block_id = BE_DRBD_MAGIC + 0xbeef;
2158 p.blksize = cpu_to_be32(size);
2159
2160 p.head.magic = BE_DRBD_MAGIC;
2161 p.head.command = cpu_to_be16(cmd);
2162 p.head.length = cpu_to_be16(sizeof(p) - sizeof(struct p_header) + digest_size);
2163
2164 mutex_lock(&mdev->data.mutex);
2165
2166 ok = (sizeof(p) == drbd_send(mdev, mdev->data.socket, &p, sizeof(p), 0));
2167 ok = ok && (digest_size == drbd_send(mdev, mdev->data.socket, digest, digest_size, 0));
2168
2169 mutex_unlock(&mdev->data.mutex);
2170
2171 return ok;
2172}
2173
2174int drbd_send_ov_request(struct drbd_conf *mdev, sector_t sector, int size)
2175{
2176 int ok;
2177 struct p_block_req p;
2178
2179 p.sector = cpu_to_be64(sector);
2180 p.block_id = BE_DRBD_MAGIC + 0xbabe;
2181 p.blksize = cpu_to_be32(size);
2182
2183 ok = drbd_send_cmd(mdev, USE_DATA_SOCKET, P_OV_REQUEST,
2184 (struct p_header *)&p, sizeof(p));
2185 return ok;
2186}
2187
2188/* called on sndtimeo
2189 * returns FALSE if we should retry,
2190 * TRUE if we think connection is dead
2191 */
2192static int we_should_drop_the_connection(struct drbd_conf *mdev, struct socket *sock)
2193{
2194 int drop_it;
2195 /* long elapsed = (long)(jiffies - mdev->last_received); */
2196
2197 drop_it = mdev->meta.socket == sock
2198 || !mdev->asender.task
2199 || get_t_state(&mdev->asender) != Running
2200 || mdev->state.conn < C_CONNECTED;
2201
2202 if (drop_it)
2203 return TRUE;
2204
2205 drop_it = !--mdev->ko_count;
2206 if (!drop_it) {
2207 dev_err(DEV, "[%s/%d] sock_sendmsg time expired, ko = %u\n",
2208 current->comm, current->pid, mdev->ko_count);
2209 request_ping(mdev);
2210 }
2211
2212 return drop_it; /* && (mdev->state == R_PRIMARY) */;
2213}
2214
2215/* The idea of sendpage seems to be to put some kind of reference
2216 * to the page into the skb, and to hand it over to the NIC. In
2217 * this process get_page() gets called.
2218 *
2219 * As soon as the page was really sent over the network put_page()
2220 * gets called by some part of the network layer. [ NIC driver? ]
2221 *
2222 * [ get_page() / put_page() increment/decrement the count. If count
2223 * reaches 0 the page will be freed. ]
2224 *
2225 * This works nicely with pages from FSs.
2226 * But this means that in protocol A we might signal IO completion too early!
2227 *
2228 * In order not to corrupt data during a resync we must make sure
2229 * that we do not reuse our own buffer pages (EEs) to early, therefore
2230 * we have the net_ee list.
2231 *
2232 * XFS seems to have problems, still, it submits pages with page_count == 0!
2233 * As a workaround, we disable sendpage on pages
2234 * with page_count == 0 or PageSlab.
2235 */
2236static int _drbd_no_send_page(struct drbd_conf *mdev, struct page *page,
2237 int offset, size_t size)
2238{
2239 int sent = drbd_send(mdev, mdev->data.socket, kmap(page) + offset, size, 0);
2240 kunmap(page);
2241 if (sent == size)
2242 mdev->send_cnt += size>>9;
2243 return sent == size;
2244}
2245
2246static int _drbd_send_page(struct drbd_conf *mdev, struct page *page,
2247 int offset, size_t size)
2248{
2249 mm_segment_t oldfs = get_fs();
2250 int sent, ok;
2251 int len = size;
2252
2253 /* e.g. XFS meta- & log-data is in slab pages, which have a
2254 * page_count of 0 and/or have PageSlab() set.
2255 * we cannot use send_page for those, as that does get_page();
2256 * put_page(); and would cause either a VM_BUG directly, or
2257 * __page_cache_release a page that would actually still be referenced
2258 * by someone, leading to some obscure delayed Oops somewhere else. */
2259 if (disable_sendpage || (page_count(page) < 1) || PageSlab(page))
2260 return _drbd_no_send_page(mdev, page, offset, size);
2261
2262 drbd_update_congested(mdev);
2263 set_fs(KERNEL_DS);
2264 do {
2265 sent = mdev->data.socket->ops->sendpage(mdev->data.socket, page,
2266 offset, len,
2267 MSG_NOSIGNAL);
2268 if (sent == -EAGAIN) {
2269 if (we_should_drop_the_connection(mdev,
2270 mdev->data.socket))
2271 break;
2272 else
2273 continue;
2274 }
2275 if (sent <= 0) {
2276 dev_warn(DEV, "%s: size=%d len=%d sent=%d\n",
2277 __func__, (int)size, len, sent);
2278 break;
2279 }
2280 len -= sent;
2281 offset += sent;
2282 } while (len > 0 /* THINK && mdev->cstate >= C_CONNECTED*/);
2283 set_fs(oldfs);
2284 clear_bit(NET_CONGESTED, &mdev->flags);
2285
2286 ok = (len == 0);
2287 if (likely(ok))
2288 mdev->send_cnt += size>>9;
2289 return ok;
2290}
2291
2292static int _drbd_send_bio(struct drbd_conf *mdev, struct bio *bio)
2293{
2294 struct bio_vec *bvec;
2295 int i;
2296 __bio_for_each_segment(bvec, bio, i, 0) {
2297 if (!_drbd_no_send_page(mdev, bvec->bv_page,
2298 bvec->bv_offset, bvec->bv_len))
2299 return 0;
2300 }
2301 return 1;
2302}
2303
2304static int _drbd_send_zc_bio(struct drbd_conf *mdev, struct bio *bio)
2305{
2306 struct bio_vec *bvec;
2307 int i;
2308 __bio_for_each_segment(bvec, bio, i, 0) {
2309 if (!_drbd_send_page(mdev, bvec->bv_page,
2310 bvec->bv_offset, bvec->bv_len))
2311 return 0;
2312 }
2313
2314 return 1;
2315}
2316
2317/* Used to send write requests
2318 * R_PRIMARY -> Peer (P_DATA)
2319 */
2320int drbd_send_dblock(struct drbd_conf *mdev, struct drbd_request *req)
2321{
2322 int ok = 1;
2323 struct p_data p;
2324 unsigned int dp_flags = 0;
2325 void *dgb;
2326 int dgs;
2327
2328 if (!drbd_get_data_sock(mdev))
2329 return 0;
2330
2331 dgs = (mdev->agreed_pro_version >= 87 && mdev->integrity_w_tfm) ?
2332 crypto_hash_digestsize(mdev->integrity_w_tfm) : 0;
2333
2334 p.head.magic = BE_DRBD_MAGIC;
2335 p.head.command = cpu_to_be16(P_DATA);
2336 p.head.length =
2337 cpu_to_be16(sizeof(p) - sizeof(struct p_header) + dgs + req->size);
2338
2339 p.sector = cpu_to_be64(req->sector);
2340 p.block_id = (unsigned long)req;
2341 p.seq_num = cpu_to_be32(req->seq_num =
2342 atomic_add_return(1, &mdev->packet_seq));
2343 dp_flags = 0;
2344
2345 /* NOTE: no need to check if barriers supported here as we would
2346 * not pass the test in make_request_common in that case
2347 */
2348 if (bio_rw_flagged(req->master_bio, BIO_RW_BARRIER)) {
2349 dev_err(DEV, "ASSERT FAILED would have set DP_HARDBARRIER\n");
2350 /* dp_flags |= DP_HARDBARRIER; */
2351 }
2352 if (bio_rw_flagged(req->master_bio, BIO_RW_SYNCIO))
2353 dp_flags |= DP_RW_SYNC;
2354 /* for now handle SYNCIO and UNPLUG
2355 * as if they still were one and the same flag */
2356 if (bio_rw_flagged(req->master_bio, BIO_RW_UNPLUG))
2357 dp_flags |= DP_RW_SYNC;
2358 if (mdev->state.conn >= C_SYNC_SOURCE &&
2359 mdev->state.conn <= C_PAUSED_SYNC_T)
2360 dp_flags |= DP_MAY_SET_IN_SYNC;
2361
2362 p.dp_flags = cpu_to_be32(dp_flags);
2363 set_bit(UNPLUG_REMOTE, &mdev->flags);
2364 ok = (sizeof(p) ==
2365 drbd_send(mdev, mdev->data.socket, &p, sizeof(p), MSG_MORE));
2366 if (ok && dgs) {
2367 dgb = mdev->int_dig_out;
2368 drbd_csum(mdev, mdev->integrity_w_tfm, req->master_bio, dgb);
2369 ok = drbd_send(mdev, mdev->data.socket, dgb, dgs, MSG_MORE);
2370 }
2371 if (ok) {
2372 if (mdev->net_conf->wire_protocol == DRBD_PROT_A)
2373 ok = _drbd_send_bio(mdev, req->master_bio);
2374 else
2375 ok = _drbd_send_zc_bio(mdev, req->master_bio);
2376 }
2377
2378 drbd_put_data_sock(mdev);
2379 return ok;
2380}
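/*
 * Wire layout of a P_DATA packet as implied by the length calculation in
 * drbd_send_dblock() (sketch, not a formal specification):
 *
 *   [ struct p_data (starts with its p_header) ][ dgs bytes of digest ]
 *   [ req->size bytes of bio payload ]
 *
 * dgs is zero unless agreed_pro_version >= 87 and integrity_w_tfm is
 * configured.  The payload is copied via _drbd_send_bio() for protocol A
 * and sent zero-copy via _drbd_send_zc_bio() otherwise.
 */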
2381
2382/* answer packet, used to send data back for read requests:
2383 * Peer -> (diskless) R_PRIMARY (P_DATA_REPLY)
2384 * C_SYNC_SOURCE -> C_SYNC_TARGET (P_RS_DATA_REPLY)
2385 */
2386int drbd_send_block(struct drbd_conf *mdev, enum drbd_packets cmd,
2387 struct drbd_epoch_entry *e)
2388{
2389 int ok;
2390 struct p_data p;
2391 void *dgb;
2392 int dgs;
2393
2394 dgs = (mdev->agreed_pro_version >= 87 && mdev->integrity_w_tfm) ?
2395 crypto_hash_digestsize(mdev->integrity_w_tfm) : 0;
2396
2397 p.head.magic = BE_DRBD_MAGIC;
2398 p.head.command = cpu_to_be16(cmd);
2399 p.head.length =
2400 cpu_to_be16(sizeof(p) - sizeof(struct p_header) + dgs + e->size);
2401
2402 p.sector = cpu_to_be64(e->sector);
2403 p.block_id = e->block_id;
2404 /* p.seq_num = 0; No sequence numbers here.. */
2405
2406 /* Only called by our kernel thread.
2407 * This one may be interrupted by DRBD_SIG and/or DRBD_SIGKILL
2408 * in response to admin command or module unload.
2409 */
2410 if (!drbd_get_data_sock(mdev))
2411 return 0;
2412
2413 ok = sizeof(p) == drbd_send(mdev, mdev->data.socket, &p,
2414 sizeof(p), MSG_MORE);
2415 if (ok && dgs) {
2416 dgb = mdev->int_dig_out;
2417 drbd_csum(mdev, mdev->integrity_w_tfm, e->private_bio, dgb);
2418 ok = drbd_send(mdev, mdev->data.socket, dgb, dgs, MSG_MORE);
2419 }
2420 if (ok)
2421 ok = _drbd_send_zc_bio(mdev, e->private_bio);
2422
2423 drbd_put_data_sock(mdev);
2424 return ok;
2425}
2426
2427/*
2428 drbd_send distinguishes two cases:
2429
2430 Packets sent via the data socket "sock"
2431 and packets sent via the meta data socket "msock"
2432
2433 sock msock
2434 -----------------+-------------------------+------------------------------
2435 timeout conf.timeout / 2 conf.timeout / 2
2436 timeout action send a ping via msock Abort communication
2437 and close all sockets
2438*/
2439
2440/*
2441 * you must have down()ed the appropriate [m]sock_mutex elsewhere!
2442 */
2443int drbd_send(struct drbd_conf *mdev, struct socket *sock,
2444 void *buf, size_t size, unsigned msg_flags)
2445{
2446 struct kvec iov;
2447 struct msghdr msg;
2448 int rv, sent = 0;
2449
2450 if (!sock)
2451 return -1000;
2452
2453 /* THINK if (signal_pending) return ... ? */
2454
2455 iov.iov_base = buf;
2456 iov.iov_len = size;
2457
2458 msg.msg_name = NULL;
2459 msg.msg_namelen = 0;
2460 msg.msg_control = NULL;
2461 msg.msg_controllen = 0;
2462 msg.msg_flags = msg_flags | MSG_NOSIGNAL;
2463
2464 if (sock == mdev->data.socket) {
2465 mdev->ko_count = mdev->net_conf->ko_count;
2466 drbd_update_congested(mdev);
2467 }
2468 do {
2469 /* STRANGE
2470 * tcp_sendmsg does _not_ use its size parameter at all ?
2471 *
2472 * -EAGAIN on timeout, -EINTR on signal.
2473 */
2474/* THINK
2475 * do we need to block DRBD_SIG if sock == &meta.socket ??
2476 * otherwise wake_asender() might interrupt some send_*Ack !
2477 */
2478 rv = kernel_sendmsg(sock, &msg, &iov, 1, size);
2479 if (rv == -EAGAIN) {
2480 if (we_should_drop_the_connection(mdev, sock))
2481 break;
2482 else
2483 continue;
2484 }
2485 D_ASSERT(rv != 0);
2486 if (rv == -EINTR) {
2487 flush_signals(current);
2488 rv = 0;
2489 }
2490 if (rv < 0)
2491 break;
2492 sent += rv;
2493 iov.iov_base += rv;
2494 iov.iov_len -= rv;
2495 } while (sent < size);
2496
2497 if (sock == mdev->data.socket)
2498 clear_bit(NET_CONGESTED, &mdev->flags);
2499
2500 if (rv <= 0) {
2501 if (rv != -EAGAIN) {
2502 dev_err(DEV, "%s_sendmsg returned %d\n",
2503 sock == mdev->meta.socket ? "msock" : "sock",
2504 rv);
2505 drbd_force_state(mdev, NS(conn, C_BROKEN_PIPE));
2506 } else
2507 drbd_force_state(mdev, NS(conn, C_TIMEOUT));
2508 }
2509
2510 return sent;
2511}
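/*
 * Usage sketch for drbd_send() (illustration only; buf/len/ok are
 * placeholder names): the caller holds the mutex that guards the chosen
 * socket, exactly as drbd_send_drequest_csum() above does.
 *
 *	mutex_lock(&mdev->data.mutex);
 *	ok = (len == drbd_send(mdev, mdev->data.socket, buf, len, 0));
 *	mutex_unlock(&mdev->data.mutex);
 */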
2512
2513static int drbd_open(struct block_device *bdev, fmode_t mode)
2514{
2515 struct drbd_conf *mdev = bdev->bd_disk->private_data;
2516 unsigned long flags;
2517 int rv = 0;
2518
2519 spin_lock_irqsave(&mdev->req_lock, flags);
2520 /* to have a stable mdev->state.role
2521 * and no race with updating open_cnt */
2522
2523 if (mdev->state.role != R_PRIMARY) {
2524 if (mode & FMODE_WRITE)
2525 rv = -EROFS;
2526 else if (!allow_oos)
2527 rv = -EMEDIUMTYPE;
2528 }
2529
2530 if (!rv)
2531 mdev->open_cnt++;
2532 spin_unlock_irqrestore(&mdev->req_lock, flags);
2533
2534 return rv;
2535}
2536
2537static int drbd_release(struct gendisk *gd, fmode_t mode)
2538{
2539 struct drbd_conf *mdev = gd->private_data;
2540 mdev->open_cnt--;
2541 return 0;
2542}
2543
2544static void drbd_unplug_fn(struct request_queue *q)
2545{
2546 struct drbd_conf *mdev = q->queuedata;
2547
2548 /* unplug FIRST */
2549 spin_lock_irq(q->queue_lock);
2550 blk_remove_plug(q);
2551 spin_unlock_irq(q->queue_lock);
2552
2553 /* only if connected */
2554 spin_lock_irq(&mdev->req_lock);
2555 if (mdev->state.pdsk >= D_INCONSISTENT && mdev->state.conn >= C_CONNECTED) {
2556 D_ASSERT(mdev->state.role == R_PRIMARY);
2557 if (test_and_clear_bit(UNPLUG_REMOTE, &mdev->flags)) {
2558 /* add to the data.work queue,
2559 * unless already queued.
2560 * XXX this might be a good addition to drbd_queue_work
2561 * anyways, to detect "double queuing" ... */
2562 if (list_empty(&mdev->unplug_work.list))
2563 drbd_queue_work(&mdev->data.work,
2564 &mdev->unplug_work);
2565 }
2566 }
2567 spin_unlock_irq(&mdev->req_lock);
2568
2569 if (mdev->state.disk >= D_INCONSISTENT)
2570 drbd_kick_lo(mdev);
2571}
2572
2573static void drbd_set_defaults(struct drbd_conf *mdev)
2574{
2575 mdev->sync_conf.after = DRBD_AFTER_DEF;
2576 mdev->sync_conf.rate = DRBD_RATE_DEF;
2577 mdev->sync_conf.al_extents = DRBD_AL_EXTENTS_DEF;
2578 mdev->state = (union drbd_state) {
2579 { .role = R_SECONDARY,
2580 .peer = R_UNKNOWN,
2581 .conn = C_STANDALONE,
2582 .disk = D_DISKLESS,
2583 .pdsk = D_UNKNOWN,
2584 .susp = 0
2585 } };
2586}
2587
2588void drbd_init_set_defaults(struct drbd_conf *mdev)
2589{
2590 /* the memset(,0,) did most of this.
2591 * note: only assignments, no allocation in here */
2592
2593 drbd_set_defaults(mdev);
2594
2595 /* for now, we do NOT yet support it,
2596 * even though we start some framework
2597 * to eventually support barriers */
2598 set_bit(NO_BARRIER_SUPP, &mdev->flags);
2599
2600 atomic_set(&mdev->ap_bio_cnt, 0);
2601 atomic_set(&mdev->ap_pending_cnt, 0);
2602 atomic_set(&mdev->rs_pending_cnt, 0);
2603 atomic_set(&mdev->unacked_cnt, 0);
2604 atomic_set(&mdev->local_cnt, 0);
2605 atomic_set(&mdev->net_cnt, 0);
2606 atomic_set(&mdev->packet_seq, 0);
2607 atomic_set(&mdev->pp_in_use, 0);
2608
2609 mutex_init(&mdev->md_io_mutex);
2610 mutex_init(&mdev->data.mutex);
2611 mutex_init(&mdev->meta.mutex);
2612 sema_init(&mdev->data.work.s, 0);
2613 sema_init(&mdev->meta.work.s, 0);
2614 mutex_init(&mdev->state_mutex);
2615
2616 spin_lock_init(&mdev->data.work.q_lock);
2617 spin_lock_init(&mdev->meta.work.q_lock);
2618
2619 spin_lock_init(&mdev->al_lock);
2620 spin_lock_init(&mdev->req_lock);
2621 spin_lock_init(&mdev->peer_seq_lock);
2622 spin_lock_init(&mdev->epoch_lock);
2623
2624 INIT_LIST_HEAD(&mdev->active_ee);
2625 INIT_LIST_HEAD(&mdev->sync_ee);
2626 INIT_LIST_HEAD(&mdev->done_ee);
2627 INIT_LIST_HEAD(&mdev->read_ee);
2628 INIT_LIST_HEAD(&mdev->net_ee);
2629 INIT_LIST_HEAD(&mdev->resync_reads);
2630 INIT_LIST_HEAD(&mdev->data.work.q);
2631 INIT_LIST_HEAD(&mdev->meta.work.q);
2632 INIT_LIST_HEAD(&mdev->resync_work.list);
2633 INIT_LIST_HEAD(&mdev->unplug_work.list);
2634 INIT_LIST_HEAD(&mdev->md_sync_work.list);
2635 INIT_LIST_HEAD(&mdev->bm_io_work.w.list);
2636 mdev->resync_work.cb = w_resync_inactive;
2637 mdev->unplug_work.cb = w_send_write_hint;
2638 mdev->md_sync_work.cb = w_md_sync;
2639 mdev->bm_io_work.w.cb = w_bitmap_io;
2640 init_timer(&mdev->resync_timer);
2641 init_timer(&mdev->md_sync_timer);
2642 mdev->resync_timer.function = resync_timer_fn;
2643 mdev->resync_timer.data = (unsigned long) mdev;
2644 mdev->md_sync_timer.function = md_sync_timer_fn;
2645 mdev->md_sync_timer.data = (unsigned long) mdev;
2646
2647 init_waitqueue_head(&mdev->misc_wait);
2648 init_waitqueue_head(&mdev->state_wait);
2649 init_waitqueue_head(&mdev->ee_wait);
2650 init_waitqueue_head(&mdev->al_wait);
2651 init_waitqueue_head(&mdev->seq_wait);
2652
2653 drbd_thread_init(mdev, &mdev->receiver, drbdd_init);
2654 drbd_thread_init(mdev, &mdev->worker, drbd_worker);
2655 drbd_thread_init(mdev, &mdev->asender, drbd_asender);
2656
2657 mdev->agreed_pro_version = PRO_VERSION_MAX;
2658 mdev->write_ordering = WO_bio_barrier;
2659 mdev->resync_wenr = LC_FREE;
2660}
2661
2662void drbd_mdev_cleanup(struct drbd_conf *mdev)
2663{
2664 if (mdev->receiver.t_state != None)
2665 dev_err(DEV, "ASSERT FAILED: receiver t_state == %d expected 0.\n",
2666 mdev->receiver.t_state);
2667
2668 /* no need to lock it, I'm the only thread alive */
2669 if (atomic_read(&mdev->current_epoch->epoch_size) != 0)
2670 dev_err(DEV, "epoch_size:%d\n", atomic_read(&mdev->current_epoch->epoch_size));
2671 mdev->al_writ_cnt =
2672 mdev->bm_writ_cnt =
2673 mdev->read_cnt =
2674 mdev->recv_cnt =
2675 mdev->send_cnt =
2676 mdev->writ_cnt =
2677 mdev->p_size =
2678 mdev->rs_start =
2679 mdev->rs_total =
2680 mdev->rs_failed =
2681 mdev->rs_mark_left =
2682 mdev->rs_mark_time = 0;
2683 D_ASSERT(mdev->net_conf == NULL);
2684
2685 drbd_set_my_capacity(mdev, 0);
2686 if (mdev->bitmap) {
2687 /* maybe never allocated. */
2688 drbd_bm_resize(mdev, 0);
2689 drbd_bm_cleanup(mdev);
2690 }
2691
2692 drbd_free_resources(mdev);
2693
2694 /*
2695 * currently we call drbd_init_ee only on module load, so
2696 * we may call drbd_release_ee only on module unload!
2697 */
2698 D_ASSERT(list_empty(&mdev->active_ee));
2699 D_ASSERT(list_empty(&mdev->sync_ee));
2700 D_ASSERT(list_empty(&mdev->done_ee));
2701 D_ASSERT(list_empty(&mdev->read_ee));
2702 D_ASSERT(list_empty(&mdev->net_ee));
2703 D_ASSERT(list_empty(&mdev->resync_reads));
2704 D_ASSERT(list_empty(&mdev->data.work.q));
2705 D_ASSERT(list_empty(&mdev->meta.work.q));
2706 D_ASSERT(list_empty(&mdev->resync_work.list));
2707 D_ASSERT(list_empty(&mdev->unplug_work.list));
2708
2709}
2710
2711
2712static void drbd_destroy_mempools(void)
2713{
2714 struct page *page;
2715
2716 while (drbd_pp_pool) {
2717 page = drbd_pp_pool;
2718 drbd_pp_pool = (struct page *)page_private(page);
2719 __free_page(page);
2720 drbd_pp_vacant--;
2721 }
2722
2723 /* D_ASSERT(atomic_read(&drbd_pp_vacant)==0); */
2724
2725 if (drbd_ee_mempool)
2726 mempool_destroy(drbd_ee_mempool);
2727 if (drbd_request_mempool)
2728 mempool_destroy(drbd_request_mempool);
2729 if (drbd_ee_cache)
2730 kmem_cache_destroy(drbd_ee_cache);
2731 if (drbd_request_cache)
2732 kmem_cache_destroy(drbd_request_cache);
2733 if (drbd_bm_ext_cache)
2734 kmem_cache_destroy(drbd_bm_ext_cache);
2735 if (drbd_al_ext_cache)
2736 kmem_cache_destroy(drbd_al_ext_cache);
2737
2738 drbd_ee_mempool = NULL;
2739 drbd_request_mempool = NULL;
2740 drbd_ee_cache = NULL;
2741 drbd_request_cache = NULL;
2742 drbd_bm_ext_cache = NULL;
2743 drbd_al_ext_cache = NULL;
2744
2745 return;
2746}
2747
2748static int drbd_create_mempools(void)
2749{
2750 struct page *page;
2751 const int number = (DRBD_MAX_SEGMENT_SIZE/PAGE_SIZE) * minor_count;
2752 int i;
2753
2754 /* prepare our caches and mempools */
2755 drbd_request_mempool = NULL;
2756 drbd_ee_cache = NULL;
2757 drbd_request_cache = NULL;
2758 drbd_bm_ext_cache = NULL;
2759 drbd_al_ext_cache = NULL;
2760 drbd_pp_pool = NULL;
2761
2762 /* caches */
2763 drbd_request_cache = kmem_cache_create(
2764 "drbd_req", sizeof(struct drbd_request), 0, 0, NULL);
2765 if (drbd_request_cache == NULL)
2766 goto Enomem;
2767
2768 drbd_ee_cache = kmem_cache_create(
2769 "drbd_ee", sizeof(struct drbd_epoch_entry), 0, 0, NULL);
2770 if (drbd_ee_cache == NULL)
2771 goto Enomem;
2772
2773 drbd_bm_ext_cache = kmem_cache_create(
2774 "drbd_bm", sizeof(struct bm_extent), 0, 0, NULL);
2775 if (drbd_bm_ext_cache == NULL)
2776 goto Enomem;
2777
2778 drbd_al_ext_cache = kmem_cache_create(
2779 "drbd_al", sizeof(struct lc_element), 0, 0, NULL);
2780 if (drbd_al_ext_cache == NULL)
2781 goto Enomem;
2782
2783 /* mempools */
2784 drbd_request_mempool = mempool_create(number,
2785 mempool_alloc_slab, mempool_free_slab, drbd_request_cache);
2786 if (drbd_request_mempool == NULL)
2787 goto Enomem;
2788
2789 drbd_ee_mempool = mempool_create(number,
2790 mempool_alloc_slab, mempool_free_slab, drbd_ee_cache);
2791 if (drbd_request_mempool == NULL)
2792 goto Enomem;
2793
2794 /* drbd's page pool */
2795 spin_lock_init(&drbd_pp_lock);
2796
2797 for (i = 0; i < number; i++) {
2798 page = alloc_page(GFP_HIGHUSER);
2799 if (!page)
2800 goto Enomem;
2801 set_page_private(page, (unsigned long)drbd_pp_pool);
2802 drbd_pp_pool = page;
2803 }
2804 drbd_pp_vacant = number;
2805
2806 return 0;
2807
2808Enomem:
2809 drbd_destroy_mempools(); /* in case we allocated some */
2810 return -ENOMEM;
2811}
2812
2813static int drbd_notify_sys(struct notifier_block *this, unsigned long code,
2814 void *unused)
2815{
2816 /* just so we have it. you never know what interesting things we
2817 * might want to do here some day...
2818 */
2819
2820 return NOTIFY_DONE;
2821}
2822
2823static struct notifier_block drbd_notifier = {
2824 .notifier_call = drbd_notify_sys,
2825};
2826
2827static void drbd_release_ee_lists(struct drbd_conf *mdev)
2828{
2829 int rr;
2830
2831 rr = drbd_release_ee(mdev, &mdev->active_ee);
2832 if (rr)
2833 dev_err(DEV, "%d EEs in active list found!\n", rr);
2834
2835 rr = drbd_release_ee(mdev, &mdev->sync_ee);
2836 if (rr)
2837 dev_err(DEV, "%d EEs in sync list found!\n", rr);
2838
2839 rr = drbd_release_ee(mdev, &mdev->read_ee);
2840 if (rr)
2841 dev_err(DEV, "%d EEs in read list found!\n", rr);
2842
2843 rr = drbd_release_ee(mdev, &mdev->done_ee);
2844 if (rr)
2845 dev_err(DEV, "%d EEs in done list found!\n", rr);
2846
2847 rr = drbd_release_ee(mdev, &mdev->net_ee);
2848 if (rr)
2849 dev_err(DEV, "%d EEs in net list found!\n", rr);
2850}
2851
2852/* caution. no locking.
2853 * currently only used from module cleanup code. */
2854static void drbd_delete_device(unsigned int minor)
2855{
2856 struct drbd_conf *mdev = minor_to_mdev(minor);
2857
2858 if (!mdev)
2859 return;
2860
2861 /* paranoia asserts */
2862 if (mdev->open_cnt != 0)
2863 dev_err(DEV, "open_cnt = %d in %s:%u", mdev->open_cnt,
2864 __FILE__ , __LINE__);
2865
2866 ERR_IF (!list_empty(&mdev->data.work.q)) {
2867 struct list_head *lp;
2868 list_for_each(lp, &mdev->data.work.q) {
2869 dev_err(DEV, "lp = %p\n", lp);
2870 }
2871 };
2872 /* end paranoia asserts */
2873
2874 del_gendisk(mdev->vdisk);
2875
2876 /* cleanup stuff that may have been allocated during
2877 * device (re-)configuration or state changes */
2878
2879 if (mdev->this_bdev)
2880 bdput(mdev->this_bdev);
2881
2882 drbd_free_resources(mdev);
2883
2884 drbd_release_ee_lists(mdev);
2885
2886 /* should be free'd on disconnect? */
2887 kfree(mdev->ee_hash);
2888 /*
2889 mdev->ee_hash_s = 0;
2890 mdev->ee_hash = NULL;
2891 */
2892
2893 lc_destroy(mdev->act_log);
2894 lc_destroy(mdev->resync);
2895
2896 kfree(mdev->p_uuid);
2897 /* mdev->p_uuid = NULL; */
2898
2899 kfree(mdev->int_dig_out);
2900 kfree(mdev->int_dig_in);
2901 kfree(mdev->int_dig_vv);
2902
2903 /* cleanup the rest that has been
2904 * allocated from drbd_new_device
2905 * and actually free the mdev itself */
2906 drbd_free_mdev(mdev);
2907}
2908
2909static void drbd_cleanup(void)
2910{
2911 unsigned int i;
2912
2913 unregister_reboot_notifier(&drbd_notifier);
2914
2915 drbd_nl_cleanup();
2916
2917 if (minor_table) {
2918 if (drbd_proc)
2919 remove_proc_entry("drbd", NULL);
2920 i = minor_count;
2921 while (i--)
2922 drbd_delete_device(i);
2923 drbd_destroy_mempools();
2924 }
2925
2926 kfree(minor_table);
2927
2928 unregister_blkdev(DRBD_MAJOR, "drbd");
2929
2930 printk(KERN_INFO "drbd: module cleanup done.\n");
2931}
2932
2933/**
2934 * drbd_congested() - Callback for pdflush
2935 * @congested_data: User data
2936 * @bdi_bits: Bits pdflush is currently interested in
2937 *
2938 * Returns 1<<BDI_async_congested and/or 1<<BDI_sync_congested if we are congested.
2939 */
2940static int drbd_congested(void *congested_data, int bdi_bits)
2941{
2942 struct drbd_conf *mdev = congested_data;
2943 struct request_queue *q;
2944 char reason = '-';
2945 int r = 0;
2946
2947 if (!__inc_ap_bio_cond(mdev)) {
2948 /* DRBD has frozen IO */
2949 r = bdi_bits;
2950 reason = 'd';
2951 goto out;
2952 }
2953
2954 if (get_ldev(mdev)) {
2955 q = bdev_get_queue(mdev->ldev->backing_bdev);
2956 r = bdi_congested(&q->backing_dev_info, bdi_bits);
2957 put_ldev(mdev);
2958 if (r)
2959 reason = 'b';
2960 }
2961
2962 if (bdi_bits & (1 << BDI_async_congested) && test_bit(NET_CONGESTED, &mdev->flags)) {
2963 r |= (1 << BDI_async_congested);
2964 reason = reason == 'b' ? 'a' : 'n';
2965 }
2966
2967out:
2968 mdev->congestion_reason = reason;
2969 return r;
2970}
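/*
 * Decoding of mdev->congestion_reason as set above:
 *   '-'  not congested
 *   'd'  DRBD itself has frozen application IO
 *   'b'  the local backing device is congested
 *   'a'  backing device and network send buffer are both congested
 *   'n'  only the network send buffer is congested
 */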
2971
2972struct drbd_conf *drbd_new_device(unsigned int minor)
2973{
2974 struct drbd_conf *mdev;
2975 struct gendisk *disk;
2976 struct request_queue *q;
2977
2978 /* GFP_KERNEL, we are outside of all write-out paths */
2979 mdev = kzalloc(sizeof(struct drbd_conf), GFP_KERNEL);
2980 if (!mdev)
2981 return NULL;
2982 if (!zalloc_cpumask_var(&mdev->cpu_mask, GFP_KERNEL))
2983 goto out_no_cpumask;
2984
2985 mdev->minor = minor;
2986
2987 drbd_init_set_defaults(mdev);
2988
2989 q = blk_alloc_queue(GFP_KERNEL);
2990 if (!q)
2991 goto out_no_q;
2992 mdev->rq_queue = q;
2993 q->queuedata = mdev;
2994
2995 disk = alloc_disk(1);
2996 if (!disk)
2997 goto out_no_disk;
2998 mdev->vdisk = disk;
2999
3000 set_disk_ro(disk, TRUE);
3001
3002 disk->queue = q;
3003 disk->major = DRBD_MAJOR;
3004 disk->first_minor = minor;
3005 disk->fops = &drbd_ops;
3006 sprintf(disk->disk_name, "drbd%d", minor);
3007 disk->private_data = mdev;
3008
3009 mdev->this_bdev = bdget(MKDEV(DRBD_MAJOR, minor));
3010 /* we have no partitions. we contain only ourselves. */
3011 mdev->this_bdev->bd_contains = mdev->this_bdev;
3012
3013 q->backing_dev_info.congested_fn = drbd_congested;
3014 q->backing_dev_info.congested_data = mdev;
3015
3016 blk_queue_make_request(q, drbd_make_request_26);
3017 blk_queue_max_segment_size(q, DRBD_MAX_SEGMENT_SIZE);
3018 blk_queue_bounce_limit(q, BLK_BOUNCE_ANY);
3019 blk_queue_merge_bvec(q, drbd_merge_bvec);
3020 q->queue_lock = &mdev->req_lock; /* needed since we use */
3021 /* plugging on a queue, that actually has no requests! */
3022 q->unplug_fn = drbd_unplug_fn;
3023
3024 mdev->md_io_page = alloc_page(GFP_KERNEL);
3025 if (!mdev->md_io_page)
3026 goto out_no_io_page;
3027
3028 if (drbd_bm_init(mdev))
3029 goto out_no_bitmap;
3030 /* no need to lock access, we are still initializing this minor device. */
3031 if (!tl_init(mdev))
3032 goto out_no_tl;
3033
3034 mdev->app_reads_hash = kzalloc(APP_R_HSIZE*sizeof(void *), GFP_KERNEL);
3035 if (!mdev->app_reads_hash)
3036 goto out_no_app_reads;
3037
3038 mdev->current_epoch = kzalloc(sizeof(struct drbd_epoch), GFP_KERNEL);
3039 if (!mdev->current_epoch)
3040 goto out_no_epoch;
3041
3042 INIT_LIST_HEAD(&mdev->current_epoch->list);
3043 mdev->epochs = 1;
3044
3045 return mdev;
3046
3047/* out_whatever_else:
3048 kfree(mdev->current_epoch); */
3049out_no_epoch:
3050 kfree(mdev->app_reads_hash);
3051out_no_app_reads:
3052 tl_cleanup(mdev);
3053out_no_tl:
3054 drbd_bm_cleanup(mdev);
3055out_no_bitmap:
3056 __free_page(mdev->md_io_page);
3057out_no_io_page:
3058 put_disk(disk);
3059out_no_disk:
3060 blk_cleanup_queue(q);
3061out_no_q:
3062 free_cpumask_var(mdev->cpu_mask);
3063out_no_cpumask:
3064 kfree(mdev);
3065 return NULL;
3066}
3067
3068/* counterpart of drbd_new_device.
3069 * last part of drbd_delete_device. */
3070void drbd_free_mdev(struct drbd_conf *mdev)
3071{
3072 kfree(mdev->current_epoch);
3073 kfree(mdev->app_reads_hash);
3074 tl_cleanup(mdev);
3075 if (mdev->bitmap) /* should no longer be there. */
3076 drbd_bm_cleanup(mdev);
3077 __free_page(mdev->md_io_page);
3078 put_disk(mdev->vdisk);
3079 blk_cleanup_queue(mdev->rq_queue);
3080 free_cpumask_var(mdev->cpu_mask);
3081 kfree(mdev);
3082}
3083
3084
3085int __init drbd_init(void)
3086{
3087 int err;
3088
3089 if (sizeof(struct p_handshake) != 80) {
3090 printk(KERN_ERR
3091 "drbd: never change the size or layout "
3092 "of the HandShake packet.\n");
3093 return -EINVAL;
3094 }
3095
3096 if (1 > minor_count || minor_count > 255) {
3097 printk(KERN_ERR
3098 "drbd: invalid minor_count (%d)\n", minor_count);
3099#ifdef MODULE
3100 return -EINVAL;
3101#else
3102 minor_count = 8;
3103#endif
3104 }
3105
3106 err = drbd_nl_init();
3107 if (err)
3108 return err;
3109
3110 err = register_blkdev(DRBD_MAJOR, "drbd");
3111 if (err) {
3112 printk(KERN_ERR
3113 "drbd: unable to register block device major %d\n",
3114 DRBD_MAJOR);
3115 return err;
3116 }
3117
3118 register_reboot_notifier(&drbd_notifier);
3119
3120 /*
3121 * allocate all necessary structs
3122 */
3123 err = -ENOMEM;
3124
3125 init_waitqueue_head(&drbd_pp_wait);
3126
3127 drbd_proc = NULL; /* play safe for drbd_cleanup */
3128 minor_table = kzalloc(sizeof(struct drbd_conf *)*minor_count,
3129 GFP_KERNEL);
3130 if (!minor_table)
3131 goto Enomem;
3132
3133 err = drbd_create_mempools();
3134 if (err)
3135 goto Enomem;
3136
3137 drbd_proc = proc_create_data("drbd", S_IFREG | S_IRUGO , NULL, &drbd_proc_fops, NULL);
3138 if (!drbd_proc) {
3139 printk(KERN_ERR "drbd: unable to register proc file\n");
3140 goto Enomem;
3141 }
3142
3143 rwlock_init(&global_state_lock);
3144
3145 printk(KERN_INFO "drbd: initialized. "
3146 "Version: " REL_VERSION " (api:%d/proto:%d-%d)\n",
3147 API_VERSION, PRO_VERSION_MIN, PRO_VERSION_MAX);
3148 printk(KERN_INFO "drbd: %s\n", drbd_buildtag());
3149 printk(KERN_INFO "drbd: registered as block device major %d\n",
3150 DRBD_MAJOR);
3151 printk(KERN_INFO "drbd: minor_table @ 0x%p\n", minor_table);
3152
3153 return 0; /* Success! */
3154
3155Enomem:
3156 drbd_cleanup();
3157 if (err == -ENOMEM)
3158 /* currently always the case */
3159 printk(KERN_ERR "drbd: ran out of memory\n");
3160 else
3161 printk(KERN_ERR "drbd: initialization failure\n");
3162 return err;
3163}
3164
3165void drbd_free_bc(struct drbd_backing_dev *ldev)
3166{
3167 if (ldev == NULL)
3168 return;
3169
3170 bd_release(ldev->backing_bdev);
3171 bd_release(ldev->md_bdev);
3172
3173 fput(ldev->lo_file);
3174 fput(ldev->md_file);
3175
3176 kfree(ldev);
3177}
3178
3179void drbd_free_sock(struct drbd_conf *mdev)
3180{
3181 if (mdev->data.socket) {
3182 mutex_lock(&mdev->data.mutex);
3183 kernel_sock_shutdown(mdev->data.socket, SHUT_RDWR);
3184 sock_release(mdev->data.socket);
3185 mdev->data.socket = NULL;
3186 mutex_unlock(&mdev->data.mutex);
3187 }
3188 if (mdev->meta.socket) {
3189 mutex_lock(&mdev->meta.mutex);
3190 kernel_sock_shutdown(mdev->meta.socket, SHUT_RDWR);
3191 sock_release(mdev->meta.socket);
3192 mdev->meta.socket = NULL;
3193 mutex_unlock(&mdev->meta.mutex);
3194 }
3195}
3196
3197
3198void drbd_free_resources(struct drbd_conf *mdev)
3199{
3200 crypto_free_hash(mdev->csums_tfm);
3201 mdev->csums_tfm = NULL;
3202 crypto_free_hash(mdev->verify_tfm);
3203 mdev->verify_tfm = NULL;
3204 crypto_free_hash(mdev->cram_hmac_tfm);
3205 mdev->cram_hmac_tfm = NULL;
3206 crypto_free_hash(mdev->integrity_w_tfm);
3207 mdev->integrity_w_tfm = NULL;
3208 crypto_free_hash(mdev->integrity_r_tfm);
3209 mdev->integrity_r_tfm = NULL;
3210
3211 drbd_free_sock(mdev);
3212
3213 __no_warn(local,
3214 drbd_free_bc(mdev->ldev);
3215 mdev->ldev = NULL;);
3216}
3217
3218/* meta data management */
3219
3220struct meta_data_on_disk {
3221 u64 la_size; /* last agreed size. */
3222 u64 uuid[UI_SIZE]; /* UUIDs. */
3223 u64 device_uuid;
3224 u64 reserved_u64_1;
3225 u32 flags; /* MDF */
3226 u32 magic;
3227 u32 md_size_sect;
3228 u32 al_offset; /* offset to this block */
3229 u32 al_nr_extents; /* important for restoring the AL */
3230 /* `-- act_log->nr_elements <-- sync_conf.al_extents */
3231 u32 bm_offset; /* offset to the bitmap, from here */
3232 u32 bm_bytes_per_bit; /* BM_BLOCK_SIZE */
3233 u32 reserved_u32[4];
3234
3235} __packed;
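/*
 * Note: drbd_md_sync() below zeroes 512 bytes of the md_io_page buffer
 * and stores every multi-byte field in big endian (cpu_to_be32()/
 * cpu_to_be64()), so this superblock appears to occupy a single 512 byte
 * on-disk block and to be portable between hosts of different
 * endianness.
 */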
3236
3237/**
3238 * drbd_md_sync() - Writes the meta data super block if the MD_DIRTY flag bit is set
3239 * @mdev: DRBD device.
3240 */
3241void drbd_md_sync(struct drbd_conf *mdev)
3242{
3243 struct meta_data_on_disk *buffer;
3244 sector_t sector;
3245 int i;
3246
3247 if (!test_and_clear_bit(MD_DIRTY, &mdev->flags))
3248 return;
3249 del_timer(&mdev->md_sync_timer);
3250
3251 /* We use here D_FAILED and not D_ATTACHING because we try to write
3252 * metadata even if we detach due to a disk failure! */
3253 if (!get_ldev_if_state(mdev, D_FAILED))
3254 return;
3255
3256 mutex_lock(&mdev->md_io_mutex);
3257 buffer = (struct meta_data_on_disk *)page_address(mdev->md_io_page);
3258 memset(buffer, 0, 512);
3259
3260 buffer->la_size = cpu_to_be64(drbd_get_capacity(mdev->this_bdev));
3261 for (i = UI_CURRENT; i < UI_SIZE; i++)
3262 buffer->uuid[i] = cpu_to_be64(mdev->ldev->md.uuid[i]);
3263 buffer->flags = cpu_to_be32(mdev->ldev->md.flags);
3264 buffer->magic = cpu_to_be32(DRBD_MD_MAGIC);
3265
3266 buffer->md_size_sect = cpu_to_be32(mdev->ldev->md.md_size_sect);
3267 buffer->al_offset = cpu_to_be32(mdev->ldev->md.al_offset);
3268 buffer->al_nr_extents = cpu_to_be32(mdev->act_log->nr_elements);
3269 buffer->bm_bytes_per_bit = cpu_to_be32(BM_BLOCK_SIZE);
3270 buffer->device_uuid = cpu_to_be64(mdev->ldev->md.device_uuid);
3271
3272 buffer->bm_offset = cpu_to_be32(mdev->ldev->md.bm_offset);
3273
3274 D_ASSERT(drbd_md_ss__(mdev, mdev->ldev) == mdev->ldev->md.md_offset);
3275 sector = mdev->ldev->md.md_offset;
3276
3277 if (drbd_md_sync_page_io(mdev, mdev->ldev, sector, WRITE)) {
3278 clear_bit(MD_DIRTY, &mdev->flags);
3279 } else {
3280 /* this was a try anyways ... */
3281 dev_err(DEV, "meta data update failed!\n");
3282
3283 drbd_chk_io_error(mdev, 1, TRUE);
3284 }
3285
3286 /* Update mdev->ldev->md.la_size_sect,
3287 * since we updated it on metadata. */
3288 mdev->ldev->md.la_size_sect = drbd_get_capacity(mdev->this_bdev);
3289
3290 mutex_unlock(&mdev->md_io_mutex);
3291 put_ldev(mdev);
3292}
3293
3294/**
3295 * drbd_md_read() - Reads in the meta data super block
3296 * @mdev: DRBD device.
3297 * @bdev: Device from which the meta data should be read in.
3298 *
3299 * Return 0 (NO_ERROR) on success, and an enum drbd_ret_codes in case
3300 * something goes wrong. Currently only: ERR_IO_MD_DISK, ERR_MD_INVALID.
3301 */
3302int drbd_md_read(struct drbd_conf *mdev, struct drbd_backing_dev *bdev)
3303{
3304 struct meta_data_on_disk *buffer;
3305 int i, rv = NO_ERROR;
3306
3307 if (!get_ldev_if_state(mdev, D_ATTACHING))
3308 return ERR_IO_MD_DISK;
3309
3310 mutex_lock(&mdev->md_io_mutex);
3311 buffer = (struct meta_data_on_disk *)page_address(mdev->md_io_page);
3312
3313 if (!drbd_md_sync_page_io(mdev, bdev, bdev->md.md_offset, READ)) {
3314 /* NOTE: can't do normal error processing here as this is
3315 called BEFORE disk is attached */
3316 dev_err(DEV, "Error while reading metadata.\n");
3317 rv = ERR_IO_MD_DISK;
3318 goto err;
3319 }
3320
3321 if (be32_to_cpu(buffer->magic) != DRBD_MD_MAGIC) {
3322 dev_err(DEV, "Error while reading metadata, magic not found.\n");
3323 rv = ERR_MD_INVALID;
3324 goto err;
3325 }
3326 if (be32_to_cpu(buffer->al_offset) != bdev->md.al_offset) {
3327 dev_err(DEV, "unexpected al_offset: %d (expected %d)\n",
3328 be32_to_cpu(buffer->al_offset), bdev->md.al_offset);
3329 rv = ERR_MD_INVALID;
3330 goto err;
3331 }
3332 if (be32_to_cpu(buffer->bm_offset) != bdev->md.bm_offset) {
3333 dev_err(DEV, "unexpected bm_offset: %d (expected %d)\n",
3334 be32_to_cpu(buffer->bm_offset), bdev->md.bm_offset);
3335 rv = ERR_MD_INVALID;
3336 goto err;
3337 }
3338 if (be32_to_cpu(buffer->md_size_sect) != bdev->md.md_size_sect) {
3339 dev_err(DEV, "unexpected md_size: %u (expected %u)\n",
3340 be32_to_cpu(buffer->md_size_sect), bdev->md.md_size_sect);
3341 rv = ERR_MD_INVALID;
3342 goto err;
3343 }
3344
3345 if (be32_to_cpu(buffer->bm_bytes_per_bit) != BM_BLOCK_SIZE) {
3346 dev_err(DEV, "unexpected bm_bytes_per_bit: %u (expected %u)\n",
3347 be32_to_cpu(buffer->bm_bytes_per_bit), BM_BLOCK_SIZE);
3348 rv = ERR_MD_INVALID;
3349 goto err;
3350 }
3351
3352 bdev->md.la_size_sect = be64_to_cpu(buffer->la_size);
3353 for (i = UI_CURRENT; i < UI_SIZE; i++)
3354 bdev->md.uuid[i] = be64_to_cpu(buffer->uuid[i]);
3355 bdev->md.flags = be32_to_cpu(buffer->flags);
3356 mdev->sync_conf.al_extents = be32_to_cpu(buffer->al_nr_extents);
3357 bdev->md.device_uuid = be64_to_cpu(buffer->device_uuid);
3358
3359 if (mdev->sync_conf.al_extents < 7)
3360 mdev->sync_conf.al_extents = 127;
3361
3362 err:
3363 mutex_unlock(&mdev->md_io_mutex);
3364 put_ldev(mdev);
3365
3366 return rv;
3367}
3368
3369/**
3370 * drbd_md_mark_dirty() - Mark meta data super block as dirty
3371 * @mdev: DRBD device.
3372 *
3373 * Call this function if you change anything that should be written to
3374 * the meta-data super block. This function sets MD_DIRTY, and starts a
3375 * timer that ensures that within five seconds you have to call drbd_md_sync().
3376 */
3377void drbd_md_mark_dirty(struct drbd_conf *mdev)
3378{
3379 set_bit(MD_DIRTY, &mdev->flags);
3380 mod_timer(&mdev->md_sync_timer, jiffies + 5*HZ);
3381}
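/*
 * Typical flow in this file (sketch):
 *
 *	mdev->ldev->md.uuid[idx] = val;
 *	drbd_md_mark_dirty(mdev);	(sets MD_DIRTY, arms md_sync_timer)
 *
 * If nothing calls drbd_md_sync() within those five seconds,
 * md_sync_timer_fn() queues w_md_sync(), which runs drbd_md_sync() from
 * worker context (see the bottom of this file).
 */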
3382
3383
3384static void drbd_uuid_move_history(struct drbd_conf *mdev) __must_hold(local)
3385{
3386 int i;
3387
3388 for (i = UI_HISTORY_START; i < UI_HISTORY_END; i++)
3389 mdev->ldev->md.uuid[i+1] = mdev->ldev->md.uuid[i];
3390}
3391
3392void _drbd_uuid_set(struct drbd_conf *mdev, int idx, u64 val) __must_hold(local)
3393{
3394 if (idx == UI_CURRENT) {
3395 if (mdev->state.role == R_PRIMARY)
3396 val |= 1;
3397 else
3398 val &= ~((u64)1);
3399
3400 drbd_set_ed_uuid(mdev, val);
3401 }
3402
3403 mdev->ldev->md.uuid[idx] = val;
3404 drbd_md_mark_dirty(mdev);
3405}
3406
3407
3408void drbd_uuid_set(struct drbd_conf *mdev, int idx, u64 val) __must_hold(local)
3409{
3410 if (mdev->ldev->md.uuid[idx]) {
3411 drbd_uuid_move_history(mdev);
3412 mdev->ldev->md.uuid[UI_HISTORY_START] = mdev->ldev->md.uuid[idx];
3413 }
3414 _drbd_uuid_set(mdev, idx, val);
3415}
3416
3417/**
3418 * drbd_uuid_new_current() - Creates a new current UUID
3419 * @mdev: DRBD device.
3420 *
3421 * Creates a new current UUID, and rotates the old current UUID into
3422 * the bitmap slot. Causes an incremental resync upon next connect.
3423 */
3424void drbd_uuid_new_current(struct drbd_conf *mdev) __must_hold(local)
3425{
3426 u64 val;
3427
3428 dev_info(DEV, "Creating new current UUID\n");
3429 D_ASSERT(mdev->ldev->md.uuid[UI_BITMAP] == 0);
3430 mdev->ldev->md.uuid[UI_BITMAP] = mdev->ldev->md.uuid[UI_CURRENT];
3431
3432 get_random_bytes(&val, sizeof(u64));
3433 _drbd_uuid_set(mdev, UI_CURRENT, val);
3434}
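/*
 * Example of the rotation performed above (illustration only):
 *   before:  CURRENT = C,            BITMAP = 0, HISTORY unchanged
 *   after:   CURRENT = <new random>, BITMAP = C, HISTORY unchanged
 * Keeping the old current UUID in the bitmap slot is what later lets the
 * peer see that only a bitmap-based (incremental) resync is required.
 */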
3435
3436void drbd_uuid_set_bm(struct drbd_conf *mdev, u64 val) __must_hold(local)
3437{
3438 if (mdev->ldev->md.uuid[UI_BITMAP] == 0 && val == 0)
3439 return;
3440
3441 if (val == 0) {
3442 drbd_uuid_move_history(mdev);
3443 mdev->ldev->md.uuid[UI_HISTORY_START] = mdev->ldev->md.uuid[UI_BITMAP];
3444 mdev->ldev->md.uuid[UI_BITMAP] = 0;
3445 } else {
3446 if (mdev->ldev->md.uuid[UI_BITMAP])
3447 dev_warn(DEV, "bm UUID already set");
3448
3449 mdev->ldev->md.uuid[UI_BITMAP] = val;
3450 mdev->ldev->md.uuid[UI_BITMAP] &= ~((u64)1);
3451
3452 }
3453 drbd_md_mark_dirty(mdev);
3454}
3455
3456/**
3457 * drbd_bmio_set_n_write() - io_fn for drbd_queue_bitmap_io() or drbd_bitmap_io()
3458 * @mdev: DRBD device.
3459 *
3460 * Sets all bits in the bitmap and writes the whole bitmap to stable storage.
3461 */
3462int drbd_bmio_set_n_write(struct drbd_conf *mdev)
3463{
3464 int rv = -EIO;
3465
3466 if (get_ldev_if_state(mdev, D_ATTACHING)) {
3467 drbd_md_set_flag(mdev, MDF_FULL_SYNC);
3468 drbd_md_sync(mdev);
3469 drbd_bm_set_all(mdev);
3470
3471 rv = drbd_bm_write(mdev);
3472
3473 if (!rv) {
3474 drbd_md_clear_flag(mdev, MDF_FULL_SYNC);
3475 drbd_md_sync(mdev);
3476 }
3477
3478 put_ldev(mdev);
3479 }
3480
3481 return rv;
3482}
3483
3484/**
3485 * drbd_bmio_clear_n_write() - io_fn for drbd_queue_bitmap_io() or drbd_bitmap_io()
3486 * @mdev: DRBD device.
3487 *
3488 * Clears all bits in the bitmap and writes the whole bitmap to stable storage.
3489 */
3490int drbd_bmio_clear_n_write(struct drbd_conf *mdev)
3491{
3492 int rv = -EIO;
3493
3494 if (get_ldev_if_state(mdev, D_ATTACHING)) {
3495 drbd_bm_clear_all(mdev);
3496 rv = drbd_bm_write(mdev);
3497 put_ldev(mdev);
3498 }
3499
3500 return rv;
3501}
3502
3503static int w_bitmap_io(struct drbd_conf *mdev, struct drbd_work *w, int unused)
3504{
3505 struct bm_io_work *work = container_of(w, struct bm_io_work, w);
3506 int rv;
3507
3508 D_ASSERT(atomic_read(&mdev->ap_bio_cnt) == 0);
3509
3510 drbd_bm_lock(mdev, work->why);
3511 rv = work->io_fn(mdev);
3512 drbd_bm_unlock(mdev);
3513
3514 clear_bit(BITMAP_IO, &mdev->flags);
3515 wake_up(&mdev->misc_wait);
3516
3517 if (work->done)
3518 work->done(mdev, rv);
3519
3520 clear_bit(BITMAP_IO_QUEUED, &mdev->flags);
3521 work->why = NULL;
3522
3523 return 1;
3524}
3525
3526/**
3527 * drbd_queue_bitmap_io() - Queues an IO operation on the whole bitmap
3528 * @mdev: DRBD device.
3529 * @io_fn: IO callback to be called when bitmap IO is possible
3530 * @done: callback to be called after the bitmap IO was performed
3531 * @why: Descriptive text of the reason for doing the IO
3532 *
3533 * While IO on the bitmap happens we freeze application IO, thus ensuring
3534 * that drbd_set_out_of_sync() cannot be called. This function MAY ONLY be
3535 * called from worker context. It MUST NOT be used while a previous such
3536 * work is still pending!
3537 */
3538void drbd_queue_bitmap_io(struct drbd_conf *mdev,
3539 int (*io_fn)(struct drbd_conf *),
3540 void (*done)(struct drbd_conf *, int),
3541 char *why)
3542{
3543 D_ASSERT(current == mdev->worker.task);
3544
3545 D_ASSERT(!test_bit(BITMAP_IO_QUEUED, &mdev->flags));
3546 D_ASSERT(!test_bit(BITMAP_IO, &mdev->flags));
3547 D_ASSERT(list_empty(&mdev->bm_io_work.w.list));
3548 if (mdev->bm_io_work.why)
3549 dev_err(DEV, "FIXME going to queue '%s' but '%s' still pending?\n",
3550 why, mdev->bm_io_work.why);
3551
3552 mdev->bm_io_work.io_fn = io_fn;
3553 mdev->bm_io_work.done = done;
3554 mdev->bm_io_work.why = why;
3555
3556 set_bit(BITMAP_IO, &mdev->flags);
3557 if (atomic_read(&mdev->ap_bio_cnt) == 0) {
3558 if (list_empty(&mdev->bm_io_work.w.list)) {
3559 set_bit(BITMAP_IO_QUEUED, &mdev->flags);
3560 drbd_queue_work(&mdev->data.work, &mdev->bm_io_work.w);
3561 } else
3562 dev_err(DEV, "FIXME avoided double queuing bm_io_work\n");
3563 }
3564}
3565
3566/**
3567 * drbd_bitmap_io() - Does an IO operation on the whole bitmap
3568 * @mdev: DRBD device.
3569 * @io_fn: IO callback to be called when bitmap IO is possible
3570 * @why: Descriptive text of the reason for doing the IO
3571 *
3572 * freezes application IO while the actual IO operation runs. This
3573 * function MAY NOT be called from worker context.
3574 */
3575int drbd_bitmap_io(struct drbd_conf *mdev, int (*io_fn)(struct drbd_conf *), char *why)
3576{
3577 int rv;
3578
3579 D_ASSERT(current != mdev->worker.task);
3580
3581 drbd_suspend_io(mdev);
3582
3583 drbd_bm_lock(mdev, why);
3584 rv = io_fn(mdev);
3585 drbd_bm_unlock(mdev);
3586
3587 drbd_resume_io(mdev);
3588
3589 return rv;
3590}
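/*
 * Usage sketch (the "why" string below is a made-up example; real
 * callers pass their own description):
 *
 *	rv = drbd_bitmap_io(mdev, &drbd_bmio_set_n_write,
 *			    "example: set_n_write");
 *
 * From worker context use drbd_queue_bitmap_io() instead, with an
 * optional completion callback.
 */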
3591
3592void drbd_md_set_flag(struct drbd_conf *mdev, int flag) __must_hold(local)
3593{
3594 if ((mdev->ldev->md.flags & flag) != flag) {
3595 drbd_md_mark_dirty(mdev);
3596 mdev->ldev->md.flags |= flag;
3597 }
3598}
3599
3600void drbd_md_clear_flag(struct drbd_conf *mdev, int flag) __must_hold(local)
3601{
3602 if ((mdev->ldev->md.flags & flag) != 0) {
3603 drbd_md_mark_dirty(mdev);
3604 mdev->ldev->md.flags &= ~flag;
3605 }
3606}
3607int drbd_md_test_flag(struct drbd_backing_dev *bdev, int flag)
3608{
3609 return (bdev->md.flags & flag) != 0;
3610}
3611
3612static void md_sync_timer_fn(unsigned long data)
3613{
3614 struct drbd_conf *mdev = (struct drbd_conf *) data;
3615
3616 drbd_queue_work_front(&mdev->data.work, &mdev->md_sync_work);
3617}
3618
3619static int w_md_sync(struct drbd_conf *mdev, struct drbd_work *w, int unused)
3620{
3621 dev_warn(DEV, "md_sync_timer expired! Worker calls drbd_md_sync().\n");
3622 drbd_md_sync(mdev);
3623
3624 return 1;
3625}
3626
3627#ifdef CONFIG_DRBD_FAULT_INJECTION
3628/* Fault insertion support including random number generator shamelessly
3629 * stolen from kernel/rcutorture.c */
3630struct fault_random_state {
3631 unsigned long state;
3632 unsigned long count;
3633};
3634
3635#define FAULT_RANDOM_MULT 39916801 /* prime */
3636#define FAULT_RANDOM_ADD 479001701 /* prime */
3637#define FAULT_RANDOM_REFRESH 10000
3638
3639/*
3640 * Crude but fast random-number generator. Uses a linear congruential
3641 * generator, with occasional help from get_random_bytes().
3642 */
3643static unsigned long
3644_drbd_fault_random(struct fault_random_state *rsp)
3645{
3646 long refresh;
3647
3648 if (!rsp->count--) {
3649 get_random_bytes(&refresh, sizeof(refresh));
3650 rsp->state += refresh;
3651 rsp->count = FAULT_RANDOM_REFRESH;
3652 }
3653 rsp->state = rsp->state * FAULT_RANDOM_MULT + FAULT_RANDOM_ADD;
3654 return swahw32(rsp->state);
3655}
3656
3657static char *
3658_drbd_fault_str(unsigned int type) {
3659 static char *_faults[] = {
3660 [DRBD_FAULT_MD_WR] = "Meta-data write",
3661 [DRBD_FAULT_MD_RD] = "Meta-data read",
3662 [DRBD_FAULT_RS_WR] = "Resync write",
3663 [DRBD_FAULT_RS_RD] = "Resync read",
3664 [DRBD_FAULT_DT_WR] = "Data write",
3665 [DRBD_FAULT_DT_RD] = "Data read",
3666 [DRBD_FAULT_DT_RA] = "Data read ahead",
3667 [DRBD_FAULT_BM_ALLOC] = "BM allocation",
3668 [DRBD_FAULT_AL_EE] = "EE allocation"
3669 };
3670
3671 return (type < DRBD_FAULT_MAX) ? _faults[type] : "**Unknown**";
3672}
3673
3674unsigned int
3675_drbd_insert_fault(struct drbd_conf *mdev, unsigned int type)
3676{
3677 static struct fault_random_state rrs = {0, 0};
3678
3679 unsigned int ret = (
3680 (fault_devs == 0 ||
3681 ((1 << mdev_to_minor(mdev)) & fault_devs) != 0) &&
3682 (((_drbd_fault_random(&rrs) % 100) + 1) <= fault_rate));
3683
3684 if (ret) {
3685 fault_count++;
3686
3687 if (printk_ratelimit())
3688 dev_warn(DEV, "***Simulating %s failure\n",
3689 _drbd_fault_str(type));
3690 }
3691
3692 return ret;
3693}
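/*
 * As the expression above suggests, fault injection is controlled by the
 * fault_rate and fault_devs module parameters: fault_rate is a
 * percentage, and fault_devs, when non-zero, is a bit mask of minor
 * numbers (bit N selects minor N).  For example fault_rate=10 with
 * fault_devs=0x1 should fail roughly every tenth eligible IO on minor 0.
 */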
3694#endif
3695
3696const char *drbd_buildtag(void)
3697{
3698 /* DRBD built from external sources has here a reference to the
3699 git hash of the source code. */
3700
3701 static char buildtag[38] = "\0uilt-in";
3702
3703 if (buildtag[0] == 0) {
3704#ifdef CONFIG_MODULES
3705 if (THIS_MODULE != NULL)
3706 sprintf(buildtag, "srcversion: %-24s", THIS_MODULE->srcversion);
3707 else
3708#endif
3709 buildtag[0] = 'b';
3710 }
3711
3712 return buildtag;
3713}
3714
3715module_init(drbd_init)
3716module_exit(drbd_cleanup)
3717
3718EXPORT_SYMBOL(drbd_conn_str);
3719EXPORT_SYMBOL(drbd_role_str);
3720EXPORT_SYMBOL(drbd_disk_str);
3721EXPORT_SYMBOL(drbd_set_st_err_str);