/* -*- mode: c; c-basic-offset: 8; -*-
 * vim: noexpandtab sw=8 ts=8 sts=0:
 *
 * Copyright (C) 2004, 2005 Oracle. All rights reserved.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public
 * License as published by the Free Software Foundation; either
 * version 2 of the License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * General Public License for more details.
 *
 * You should have received a copy of the GNU General Public
 * License along with this program; if not, write to the
 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
 * Boston, MA 021110-1307, USA.
 */

#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/jiffies.h>
#include <linux/module.h>
#include <linux/fs.h>
#include <linux/bio.h>
#include <linux/blkdev.h>
#include <linux/delay.h>
#include <linux/file.h>
#include <linux/kthread.h>
#include <linux/configfs.h>
#include <linux/random.h>
#include <linux/crc32.h>
#include <linux/time.h>
#include <linux/debugfs.h>
#include <linux/slab.h>

#include "heartbeat.h"
#include "tcp.h"
#include "nodemanager.h"
#include "quorum.h"

#include "masklog.h"


/*
 * The first heartbeat pass had one global thread that would serialize all hb
 * callback calls. This global serializing sem should only be removed once
 * we've made sure that all callees can deal with being called concurrently
 * from multiple hb region threads.
 */
static DECLARE_RWSEM(o2hb_callback_sem);

/*
 * multiple hb threads are watching multiple regions. A node is live
 * whenever any of the threads sees activity from the node in its region.
 */
static DEFINE_SPINLOCK(o2hb_live_lock);
static struct list_head o2hb_live_slots[O2NM_MAX_NODES];
static unsigned long o2hb_live_node_bitmap[BITS_TO_LONGS(O2NM_MAX_NODES)];
static LIST_HEAD(o2hb_node_events);
static DECLARE_WAIT_QUEUE_HEAD(o2hb_steady_queue);

/*
 * In global heartbeat, we maintain a series of region bitmaps.
 * - o2hb_region_bitmap allows us to limit the region number to max region.
 * - o2hb_live_region_bitmap tracks live regions (seen steady iterations).
 * - o2hb_quorum_region_bitmap tracks live regions that have seen all nodes
 *   heartbeat on it.
 * - o2hb_failed_region_bitmap tracks the regions that have seen io timeouts.
 */
static unsigned long o2hb_region_bitmap[BITS_TO_LONGS(O2NM_MAX_REGIONS)];
static unsigned long o2hb_live_region_bitmap[BITS_TO_LONGS(O2NM_MAX_REGIONS)];
static unsigned long o2hb_quorum_region_bitmap[BITS_TO_LONGS(O2NM_MAX_REGIONS)];
static unsigned long o2hb_failed_region_bitmap[BITS_TO_LONGS(O2NM_MAX_REGIONS)];

#define O2HB_DB_TYPE_LIVENODES 0
#define O2HB_DB_TYPE_LIVEREGIONS 1
#define O2HB_DB_TYPE_QUORUMREGIONS 2
#define O2HB_DB_TYPE_FAILEDREGIONS 3
struct o2hb_debug_buf {
	int db_type;
	int db_size;
	int db_len;
	void *db_data;
};

static struct o2hb_debug_buf *o2hb_db_livenodes;
static struct o2hb_debug_buf *o2hb_db_liveregions;
static struct o2hb_debug_buf *o2hb_db_quorumregions;
static struct o2hb_debug_buf *o2hb_db_failedregions;

#define O2HB_DEBUG_DIR "o2hb"
#define O2HB_DEBUG_LIVENODES "livenodes"
#define O2HB_DEBUG_LIVEREGIONS "live_regions"
#define O2HB_DEBUG_QUORUMREGIONS "quorum_regions"
#define O2HB_DEBUG_FAILEDREGIONS "failed_regions"

static struct dentry *o2hb_debug_dir;
static struct dentry *o2hb_debug_livenodes;
static struct dentry *o2hb_debug_liveregions;
static struct dentry *o2hb_debug_quorumregions;
static struct dentry *o2hb_debug_failedregions;

static LIST_HEAD(o2hb_all_regions);

static struct o2hb_callback {
	struct list_head list;
} o2hb_callbacks[O2HB_NUM_CB];

static struct o2hb_callback *hbcall_from_type(enum o2hb_callback_type type);

#define O2HB_DEFAULT_BLOCK_BITS 9

enum o2hb_heartbeat_modes {
	O2HB_HEARTBEAT_LOCAL = 0,
	O2HB_HEARTBEAT_GLOBAL,
	O2HB_HEARTBEAT_NUM_MODES,
};

char *o2hb_heartbeat_mode_desc[O2HB_HEARTBEAT_NUM_MODES] = {
	"local", /* O2HB_HEARTBEAT_LOCAL */
	"global", /* O2HB_HEARTBEAT_GLOBAL */
};

unsigned int o2hb_dead_threshold = O2HB_DEFAULT_DEAD_THRESHOLD;
unsigned int o2hb_heartbeat_mode = O2HB_HEARTBEAT_LOCAL;

/* Only sets a new threshold if there are no active regions.
 *
 * No locking or otherwise interesting code is required for reading
 * o2hb_dead_threshold as it can't change once regions are active and
 * it's not interesting to anyone until then anyway. */
static void o2hb_dead_threshold_set(unsigned int threshold)
{
	if (threshold > O2HB_MIN_DEAD_THRESHOLD) {
		spin_lock(&o2hb_live_lock);
		if (list_empty(&o2hb_all_regions))
			o2hb_dead_threshold = threshold;
		spin_unlock(&o2hb_live_lock);
	}
}

static int o2hb_global_hearbeat_mode_set(unsigned int hb_mode)
{
	int ret = -1;

	if (hb_mode < O2HB_HEARTBEAT_NUM_MODES) {
		spin_lock(&o2hb_live_lock);
		if (list_empty(&o2hb_all_regions)) {
			o2hb_heartbeat_mode = hb_mode;
			ret = 0;
		}
		spin_unlock(&o2hb_live_lock);
	}

	return ret;
}

struct o2hb_node_event {
	struct list_head hn_item;
	enum o2hb_callback_type hn_event_type;
	struct o2nm_node *hn_node;
	int hn_node_num;
};

struct o2hb_disk_slot {
	struct o2hb_disk_heartbeat_block *ds_raw_block;
	u8 ds_node_num;
	u64 ds_last_time;
	u64 ds_last_generation;
	u16 ds_equal_samples;
	u16 ds_changed_samples;
	struct list_head ds_live_item;
};

/* each thread owns a region.. when we're asked to tear down the region
 * we ask the thread to stop, who cleans up the region */
struct o2hb_region {
	struct config_item hr_item;

	struct list_head hr_all_item;
	unsigned hr_unclean_stop:1;

	/* protected by the hr_callback_sem */
	struct task_struct *hr_task;

	unsigned int hr_blocks;
	unsigned long long hr_start_block;

	unsigned int hr_block_bits;
	unsigned int hr_block_bytes;

	unsigned int hr_slots_per_page;
	unsigned int hr_num_pages;

	struct page **hr_slot_data;
	struct block_device *hr_bdev;
	struct o2hb_disk_slot *hr_slots;

	/* live node map of this region */
	unsigned long hr_live_node_bitmap[BITS_TO_LONGS(O2NM_MAX_NODES)];
	unsigned int hr_region_num;

	/* let the person setting up hb wait for it to return until it
	 * has reached a 'steady' state. This will be fixed when we have
	 * a more complete api that doesn't lead to this sort of fragility. */
	atomic_t hr_steady_iterations;

	char hr_dev_name[BDEVNAME_SIZE];

	unsigned int hr_timeout_ms;

	/* randomized as the region goes up and down so that a node
	 * recognizes a node going up and down in one iteration */
	u64 hr_generation;

	struct delayed_work hr_write_timeout_work;
	unsigned long hr_last_timeout_start;

	/* Used during o2hb_check_slot to hold a copy of the block
	 * being checked because we temporarily have to zero out the
	 * crc field. */
	struct o2hb_disk_heartbeat_block *hr_tmp_block;
};

struct o2hb_bio_wait_ctxt {
	atomic_t wc_num_reqs;
	struct completion wc_io_complete;
	int wc_error;
};

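/* Count the bits set in a bitmap of 'count' bits. Used below to tally
 * quorum and failed regions when global heartbeat is active. */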
static int o2hb_pop_count(void *map, int count)
{
	int i = -1, pop = 0;

	while ((i = find_next_bit(map, count, i + 1)) < count)
		pop++;
	return pop;
}

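/* Delayed work fired when a heartbeat write has not completed within
 * O2HB_MAX_WRITE_TIMEOUT_MS. With global heartbeat, mark the region
 * failed and only call o2quo_disk_timeout() once at least half of the
 * quorum regions have failed; in local mode, time out immediately. */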
static void o2hb_write_timeout(struct work_struct *work)
{
	int failed, quorum;
	unsigned long flags;
	struct o2hb_region *reg =
		container_of(work, struct o2hb_region,
			     hr_write_timeout_work.work);

	mlog(ML_ERROR, "Heartbeat write timeout to device %s after %u "
	     "milliseconds\n", reg->hr_dev_name,
	     jiffies_to_msecs(jiffies - reg->hr_last_timeout_start));

	if (o2hb_global_heartbeat_active()) {
		spin_lock_irqsave(&o2hb_live_lock, flags);
		if (test_bit(reg->hr_region_num, o2hb_quorum_region_bitmap))
			set_bit(reg->hr_region_num, o2hb_failed_region_bitmap);
		failed = o2hb_pop_count(&o2hb_failed_region_bitmap,
					O2NM_MAX_REGIONS);
		quorum = o2hb_pop_count(&o2hb_quorum_region_bitmap,
					O2NM_MAX_REGIONS);
		spin_unlock_irqrestore(&o2hb_live_lock, flags);

		mlog(ML_HEARTBEAT, "Number of regions %d, failed regions %d\n",
		     quorum, failed);

		/*
		 * Fence if the number of failed regions >= half the number
		 * of quorum regions
		 */
		if ((failed << 1) < quorum)
			return;
	}

	o2quo_disk_timeout();
}

static void o2hb_arm_write_timeout(struct o2hb_region *reg)
{
	mlog(ML_HEARTBEAT, "Queue write timeout for %u ms\n",
	     O2HB_MAX_WRITE_TIMEOUT_MS);

	if (o2hb_global_heartbeat_active()) {
		spin_lock(&o2hb_live_lock);
		clear_bit(reg->hr_region_num, o2hb_failed_region_bitmap);
		spin_unlock(&o2hb_live_lock);
	}
	cancel_delayed_work(&reg->hr_write_timeout_work);
	reg->hr_last_timeout_start = jiffies;
	schedule_delayed_work(&reg->hr_write_timeout_work,
			      msecs_to_jiffies(O2HB_MAX_WRITE_TIMEOUT_MS));
}

static void o2hb_disarm_write_timeout(struct o2hb_region *reg)
{
	cancel_delayed_work(&reg->hr_write_timeout_work);
	flush_scheduled_work();
}

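/* The bio wait context starts with wc_num_reqs at 1: that extra
 * reference belongs to the submitter and is dropped in o2hb_wait_on_io(),
 * so the completion only fires after submission has finished and every
 * bio has ended. */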
static inline void o2hb_bio_wait_init(struct o2hb_bio_wait_ctxt *wc)
{
	atomic_set(&wc->wc_num_reqs, 1);
	init_completion(&wc->wc_io_complete);
	wc->wc_error = 0;
}

/* Used in error paths too */
static inline void o2hb_bio_wait_dec(struct o2hb_bio_wait_ctxt *wc,
				     unsigned int num)
{
	/* sadly atomic_sub_and_test() isn't available on all platforms. The
	 * good news is that the fast path only completes one at a time */
	while(num--) {
		if (atomic_dec_and_test(&wc->wc_num_reqs)) {
			BUG_ON(num > 0);
			complete(&wc->wc_io_complete);
		}
	}
}

static void o2hb_wait_on_io(struct o2hb_region *reg,
			    struct o2hb_bio_wait_ctxt *wc)
{
	struct address_space *mapping = reg->hr_bdev->bd_inode->i_mapping;

	blk_run_address_space(mapping);
	o2hb_bio_wait_dec(wc, 1);

	wait_for_completion(&wc->wc_io_complete);
}

static void o2hb_bio_end_io(struct bio *bio,
			    int error)
{
	struct o2hb_bio_wait_ctxt *wc = bio->bi_private;

	if (error) {
		mlog(ML_ERROR, "IO Error %d\n", error);
		wc->wc_error = error;
	}

	o2hb_bio_wait_dec(wc, 1);
	bio_put(bio);
}

/* Setup a Bio to cover I/O against num_slots slots starting at
 * start_slot. */
static struct bio *o2hb_setup_one_bio(struct o2hb_region *reg,
				      struct o2hb_bio_wait_ctxt *wc,
				      unsigned int *current_slot,
				      unsigned int max_slots)
{
	int len, current_page;
	unsigned int vec_len, vec_start;
	unsigned int bits = reg->hr_block_bits;
	unsigned int spp = reg->hr_slots_per_page;
	unsigned int cs = *current_slot;
	struct bio *bio;
	struct page *page;

	/* Testing has shown this allocation to take long enough under
	 * GFP_KERNEL that the local node can get fenced. It would be
	 * nicest if we could pre-allocate these bios and avoid this
	 * all together. */
	bio = bio_alloc(GFP_ATOMIC, 16);
	if (!bio) {
		mlog(ML_ERROR, "Could not alloc slots BIO!\n");
		bio = ERR_PTR(-ENOMEM);
		goto bail;
	}

	/* Must put everything in 512 byte sectors for the bio... */
	bio->bi_sector = (reg->hr_start_block + cs) << (bits - 9);
	bio->bi_bdev = reg->hr_bdev;
	bio->bi_private = wc;
	bio->bi_end_io = o2hb_bio_end_io;

	vec_start = (cs << bits) % PAGE_CACHE_SIZE;
	while(cs < max_slots) {
		current_page = cs / spp;
		page = reg->hr_slot_data[current_page];

		vec_len = min(PAGE_CACHE_SIZE - vec_start,
			      (max_slots-cs) * (PAGE_CACHE_SIZE/spp) );

		mlog(ML_HB_BIO, "page %d, vec_len = %u, vec_start = %u\n",
		     current_page, vec_len, vec_start);

		len = bio_add_page(bio, page, vec_len, vec_start);
		if (len != vec_len) break;

		cs += vec_len / (PAGE_CACHE_SIZE/spp);
		vec_start = 0;
	}

bail:
	*current_slot = cs;
	return bio;
}

static int o2hb_read_slots(struct o2hb_region *reg,
			   unsigned int max_slots)
{
	unsigned int current_slot=0;
	int status;
	struct o2hb_bio_wait_ctxt wc;
	struct bio *bio;

	o2hb_bio_wait_init(&wc);

	while(current_slot < max_slots) {
		bio = o2hb_setup_one_bio(reg, &wc, &current_slot, max_slots);
		if (IS_ERR(bio)) {
			status = PTR_ERR(bio);
			mlog_errno(status);
			goto bail_and_wait;
		}

		atomic_inc(&wc.wc_num_reqs);
		submit_bio(READ, bio);
	}

	status = 0;

bail_and_wait:
	o2hb_wait_on_io(reg, &wc);
	if (wc.wc_error && !status)
		status = wc.wc_error;

	return status;
}

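/* Write only this node's heartbeat slot. The write is submitted but not
 * waited on here; the caller waits on write_wc later. */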
static int o2hb_issue_node_write(struct o2hb_region *reg,
				 struct o2hb_bio_wait_ctxt *write_wc)
{
	int status;
	unsigned int slot;
	struct bio *bio;

	o2hb_bio_wait_init(write_wc);

	slot = o2nm_this_node();

	bio = o2hb_setup_one_bio(reg, write_wc, &slot, slot+1);
	if (IS_ERR(bio)) {
		status = PTR_ERR(bio);
		mlog_errno(status);
		goto bail;
	}

	atomic_inc(&write_wc->wc_num_reqs);
	submit_bio(WRITE, bio);

	status = 0;
bail:
	return status;
}

static u32 o2hb_compute_block_crc_le(struct o2hb_region *reg,
				     struct o2hb_disk_heartbeat_block *hb_block)
{
	__le32 old_cksum;
	u32 ret;

	/* We want to compute the block crc with a 0 value in the
	 * hb_cksum field. Save it off here and replace after the
	 * crc. */
	old_cksum = hb_block->hb_cksum;
	hb_block->hb_cksum = 0;

	ret = crc32_le(0, (unsigned char *) hb_block, reg->hr_block_bytes);

	hb_block->hb_cksum = old_cksum;

	return ret;
}

static void o2hb_dump_slot(struct o2hb_disk_heartbeat_block *hb_block)
{
	mlog(ML_ERROR, "Dump slot information: seq = 0x%llx, node = %u, "
	     "cksum = 0x%x, generation 0x%llx\n",
	     (long long)le64_to_cpu(hb_block->hb_seq),
	     hb_block->hb_node, le32_to_cpu(hb_block->hb_cksum),
	     (long long)le64_to_cpu(hb_block->hb_generation));
}

static int o2hb_verify_crc(struct o2hb_region *reg,
			   struct o2hb_disk_heartbeat_block *hb_block)
{
	u32 read, computed;

	read = le32_to_cpu(hb_block->hb_cksum);
	computed = o2hb_compute_block_crc_le(reg, hb_block);

	return read == computed;
}

/* We want to make sure that nobody is heartbeating on top of us --
 * this will help detect an invalid configuration. */
static int o2hb_check_last_timestamp(struct o2hb_region *reg)
{
	int node_num, ret;
	struct o2hb_disk_slot *slot;
	struct o2hb_disk_heartbeat_block *hb_block;

	node_num = o2nm_this_node();

	ret = 1;
	slot = &reg->hr_slots[node_num];
	/* Don't check on our 1st timestamp */
	if (slot->ds_last_time) {
		hb_block = slot->ds_raw_block;

		if (le64_to_cpu(hb_block->hb_seq) != slot->ds_last_time)
			ret = 0;
	}

	return ret;
}

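/* Fill our slot's block with the next sequence value (seconds of
 * wall-clock time), our node number, the region generation and the dead
 * threshold in ms, then checksum it last. */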
static inline void o2hb_prepare_block(struct o2hb_region *reg,
				      u64 generation)
{
	int node_num;
	u64 cputime;
	struct o2hb_disk_slot *slot;
	struct o2hb_disk_heartbeat_block *hb_block;

	node_num = o2nm_this_node();
	slot = &reg->hr_slots[node_num];

	hb_block = (struct o2hb_disk_heartbeat_block *)slot->ds_raw_block;
	memset(hb_block, 0, reg->hr_block_bytes);
	/* TODO: time stuff */
	cputime = CURRENT_TIME.tv_sec;
	if (!cputime)
		cputime = 1;

	hb_block->hb_seq = cpu_to_le64(cputime);
	hb_block->hb_node = node_num;
	hb_block->hb_generation = cpu_to_le64(generation);
	hb_block->hb_dead_ms = cpu_to_le32(o2hb_dead_threshold * O2HB_REGION_TIMEOUT_MS);

	/* This step must always happen last! */
	hb_block->hb_cksum = cpu_to_le32(o2hb_compute_block_crc_le(reg,
								   hb_block));

	mlog(ML_HB_BIO, "our node generation = 0x%llx, cksum = 0x%x\n",
	     (long long)generation,
	     le32_to_cpu(hb_block->hb_cksum));
}

static void o2hb_fire_callbacks(struct o2hb_callback *hbcall,
				struct o2nm_node *node,
				int idx)
{
	struct list_head *iter;
	struct o2hb_callback_func *f;

	list_for_each(iter, &hbcall->list) {
		f = list_entry(iter, struct o2hb_callback_func, hc_item);
		mlog(ML_HEARTBEAT, "calling funcs %p\n", f);
		(f->hc_func)(node, idx, f->hc_data);
	}
}

/* Will run the list in order until we process the passed event */
static void o2hb_run_event_list(struct o2hb_node_event *queued_event)
{
	int empty;
	struct o2hb_callback *hbcall;
	struct o2hb_node_event *event;

	spin_lock(&o2hb_live_lock);
	empty = list_empty(&queued_event->hn_item);
	spin_unlock(&o2hb_live_lock);
	if (empty)
		return;

	/* Holding callback sem assures we don't alter the callback
	 * lists when doing this, and serializes ourselves with other
	 * processes wanting callbacks. */
	down_write(&o2hb_callback_sem);

	spin_lock(&o2hb_live_lock);
	while (!list_empty(&o2hb_node_events)
	       && !list_empty(&queued_event->hn_item)) {
		event = list_entry(o2hb_node_events.next,
				   struct o2hb_node_event,
				   hn_item);
		list_del_init(&event->hn_item);
		spin_unlock(&o2hb_live_lock);

		mlog(ML_HEARTBEAT, "Node %s event for %d\n",
		     event->hn_event_type == O2HB_NODE_UP_CB ? "UP" : "DOWN",
		     event->hn_node_num);

		hbcall = hbcall_from_type(event->hn_event_type);

		/* We should *never* have gotten on to the list with a
		 * bad type... This isn't something that we should try
		 * to recover from. */
		BUG_ON(IS_ERR(hbcall));

		o2hb_fire_callbacks(hbcall, event->hn_node, event->hn_node_num);

		spin_lock(&o2hb_live_lock);
	}
	spin_unlock(&o2hb_live_lock);

	up_write(&o2hb_callback_sem);
}

static void o2hb_queue_node_event(struct o2hb_node_event *event,
				  enum o2hb_callback_type type,
				  struct o2nm_node *node,
				  int node_num)
{
	assert_spin_locked(&o2hb_live_lock);

	BUG_ON((!node) && (type != O2HB_NODE_DOWN_CB));

	event->hn_event_type = type;
	event->hn_node = node;
	event->hn_node_num = node_num;

	mlog(ML_HEARTBEAT, "Queue node %s event for node %d\n",
	     type == O2HB_NODE_UP_CB ? "UP" : "DOWN", node_num);

	list_add_tail(&event->hn_item, &o2hb_node_events);
}

static void o2hb_shutdown_slot(struct o2hb_disk_slot *slot)
{
	struct o2hb_node_event event =
		{ .hn_item = LIST_HEAD_INIT(event.hn_item), };
	struct o2nm_node *node;

	node = o2nm_get_node_by_num(slot->ds_node_num);
	if (!node)
		return;

	spin_lock(&o2hb_live_lock);
	if (!list_empty(&slot->ds_live_item)) {
		mlog(ML_HEARTBEAT, "Shutdown, node %d leaves region\n",
		     slot->ds_node_num);

		list_del_init(&slot->ds_live_item);

		if (list_empty(&o2hb_live_slots[slot->ds_node_num])) {
			clear_bit(slot->ds_node_num, o2hb_live_node_bitmap);

			o2hb_queue_node_event(&event, O2HB_NODE_DOWN_CB, node,
					      slot->ds_node_num);
		}
	}
	spin_unlock(&o2hb_live_lock);

	o2hb_run_event_list(&event);

	o2nm_node_put(node);
}

static void o2hb_set_quorum_device(struct o2hb_region *reg,
				   struct o2hb_disk_slot *slot)
{
	assert_spin_locked(&o2hb_live_lock);

	if (!o2hb_global_heartbeat_active())
		return;

	if (test_bit(reg->hr_region_num, o2hb_quorum_region_bitmap))
		return;

	/*
	 * A region can be added to the quorum only when it sees all
	 * live nodes heartbeat on it. In other words, the region has been
	 * added to all nodes.
	 */
	if (memcmp(reg->hr_live_node_bitmap, o2hb_live_node_bitmap,
		   sizeof(o2hb_live_node_bitmap)))
		return;

	if (slot->ds_changed_samples < O2HB_LIVE_THRESHOLD)
		return;

	printk(KERN_NOTICE "o2hb: Region %s is now a quorum device\n",
	       config_item_name(&reg->hr_item));

	set_bit(reg->hr_region_num, o2hb_quorum_region_bitmap);
}

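/* Examine the freshly read block for one slot and drive the live/dead
 * state machine for that node. Returns nonzero if the node's liveness
 * changed during this pass. */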
static int o2hb_check_slot(struct o2hb_region *reg,
			   struct o2hb_disk_slot *slot)
{
	int changed = 0, gen_changed = 0;
	struct o2hb_node_event event =
		{ .hn_item = LIST_HEAD_INIT(event.hn_item), };
	struct o2nm_node *node;
	struct o2hb_disk_heartbeat_block *hb_block = reg->hr_tmp_block;
	u64 cputime;
	unsigned int dead_ms = o2hb_dead_threshold * O2HB_REGION_TIMEOUT_MS;
	unsigned int slot_dead_ms;
	int tmp;

	memcpy(hb_block, slot->ds_raw_block, reg->hr_block_bytes);

	/*
	 * If a node is no longer configured but is still in the livemap, we
	 * may need to clear that bit from the livemap.
	 */
	node = o2nm_get_node_by_num(slot->ds_node_num);
	if (!node) {
		spin_lock(&o2hb_live_lock);
		tmp = test_bit(slot->ds_node_num, o2hb_live_node_bitmap);
		spin_unlock(&o2hb_live_lock);
		if (!tmp)
			return 0;
	}

	if (!o2hb_verify_crc(reg, hb_block)) {
		/* all paths from here will drop o2hb_live_lock for
		 * us. */
		spin_lock(&o2hb_live_lock);

		/* Don't print an error on the console in this case -
		 * a freshly formatted heartbeat area will not have a
		 * crc set on it. */
		if (list_empty(&slot->ds_live_item))
			goto out;

		/* The node is live but pushed out a bad crc. We
		 * consider it a transient miss but don't populate any
		 * other values as they may be junk. */
		mlog(ML_ERROR, "Node %d has written a bad crc to %s\n",
		     slot->ds_node_num, reg->hr_dev_name);
		o2hb_dump_slot(hb_block);

		slot->ds_equal_samples++;
		goto fire_callbacks;
	}

	/* we don't care if these wrap.. the state transitions below
	 * clear at the right places */
	cputime = le64_to_cpu(hb_block->hb_seq);
	if (slot->ds_last_time != cputime)
		slot->ds_changed_samples++;
	else
		slot->ds_equal_samples++;
	slot->ds_last_time = cputime;

	/* The node changed heartbeat generations. We assume this to
	 * mean it dropped off but came back before we timed out. We
	 * want to consider it down for the time being but don't want
	 * to lose any changed_samples state we might build up to
	 * considering it live again. */
	if (slot->ds_last_generation != le64_to_cpu(hb_block->hb_generation)) {
		gen_changed = 1;
		slot->ds_equal_samples = 0;
		mlog(ML_HEARTBEAT, "Node %d changed generation (0x%llx "
		     "to 0x%llx)\n", slot->ds_node_num,
		     (long long)slot->ds_last_generation,
		     (long long)le64_to_cpu(hb_block->hb_generation));
	}

	slot->ds_last_generation = le64_to_cpu(hb_block->hb_generation);

	mlog(ML_HEARTBEAT, "Slot %d gen 0x%llx cksum 0x%x "
	     "seq %llu last %llu changed %u equal %u\n",
	     slot->ds_node_num, (long long)slot->ds_last_generation,
	     le32_to_cpu(hb_block->hb_cksum),
	     (unsigned long long)le64_to_cpu(hb_block->hb_seq),
	     (unsigned long long)slot->ds_last_time, slot->ds_changed_samples,
	     slot->ds_equal_samples);

	spin_lock(&o2hb_live_lock);

fire_callbacks:
	/* dead nodes only come to life after some number of
	 * changes at any time during their dead time */
	if (list_empty(&slot->ds_live_item) &&
	    slot->ds_changed_samples >= O2HB_LIVE_THRESHOLD) {
		mlog(ML_HEARTBEAT, "Node %d (id 0x%llx) joined my region\n",
		     slot->ds_node_num, (long long)slot->ds_last_generation);

		set_bit(slot->ds_node_num, reg->hr_live_node_bitmap);

		/* first on the list generates a callback */
		if (list_empty(&o2hb_live_slots[slot->ds_node_num])) {
			set_bit(slot->ds_node_num, o2hb_live_node_bitmap);

			o2hb_queue_node_event(&event, O2HB_NODE_UP_CB, node,
					      slot->ds_node_num);

			changed = 1;
		}

		list_add_tail(&slot->ds_live_item,
			      &o2hb_live_slots[slot->ds_node_num]);

		slot->ds_equal_samples = 0;

		/* We want to be sure that all nodes agree on the
		 * number of milliseconds before a node will be
		 * considered dead. The self-fencing timeout is
		 * computed from this value, and a discrepancy might
		 * result in heartbeat calling a node dead when it
		 * hasn't self-fenced yet. */
		slot_dead_ms = le32_to_cpu(hb_block->hb_dead_ms);
		if (slot_dead_ms && slot_dead_ms != dead_ms) {
			/* TODO: Perhaps we can fail the region here. */
			mlog(ML_ERROR, "Node %d on device %s has a dead count "
			     "of %u ms, but our count is %u ms.\n"
			     "Please double check your configuration values "
			     "for 'O2CB_HEARTBEAT_THRESHOLD'\n",
			     slot->ds_node_num, reg->hr_dev_name, slot_dead_ms,
			     dead_ms);
		}
		goto out;
	}

	/* if the list is dead, we're done.. */
	if (list_empty(&slot->ds_live_item))
		goto out;

	/* live nodes only go dead after enough consecutive missed
	 * samples.. reset the missed counter whenever we see
	 * activity */
	if (slot->ds_equal_samples >= o2hb_dead_threshold || gen_changed) {
		mlog(ML_HEARTBEAT, "Node %d left my region\n",
		     slot->ds_node_num);

		clear_bit(slot->ds_node_num, reg->hr_live_node_bitmap);

		/* last off the live_slot generates a callback */
		list_del_init(&slot->ds_live_item);
		if (list_empty(&o2hb_live_slots[slot->ds_node_num])) {
			clear_bit(slot->ds_node_num, o2hb_live_node_bitmap);

			/* node can be null */
			o2hb_queue_node_event(&event, O2HB_NODE_DOWN_CB,
					      node, slot->ds_node_num);

			changed = 1;
		}

		/* We don't clear this because the node is still
		 * actually writing new blocks. */
		if (!gen_changed)
			slot->ds_changed_samples = 0;
		goto out;
	}
	if (slot->ds_changed_samples) {
		slot->ds_changed_samples = 0;
		slot->ds_equal_samples = 0;
	}
out:
	o2hb_set_quorum_device(reg, slot);

	spin_unlock(&o2hb_live_lock);

	o2hb_run_event_list(&event);

	if (node)
		o2nm_node_put(node);
	return changed;
}

/* This could be faster if we just implemented a find_last_bit, but I
 * don't think the circumstances warrant it. */
static int o2hb_highest_node(unsigned long *nodes,
			     int numbits)
{
	int highest, node;

	highest = numbits;
	node = -1;
	while ((node = find_next_bit(nodes, numbits, node + 1)) != -1) {
		if (node >= numbits)
			break;

		highest = node;
	}

	return highest;
}

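/* One heartbeat pass: read every slot up to the highest configured (or
 * live) node, write our own slot, check each slot for up/down
 * transitions and re-arm the write timeout on success. */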
static int o2hb_do_disk_heartbeat(struct o2hb_region *reg)
{
	int i, ret, highest_node, change = 0;
	unsigned long configured_nodes[BITS_TO_LONGS(O2NM_MAX_NODES)];
	unsigned long live_node_bitmap[BITS_TO_LONGS(O2NM_MAX_NODES)];
	struct o2hb_bio_wait_ctxt write_wc;

	ret = o2nm_configured_node_map(configured_nodes,
				       sizeof(configured_nodes));
	if (ret) {
		mlog_errno(ret);
		return ret;
	}

	/*
	 * If a node is not configured but is in the livemap, we still need
	 * to read the slot so as to be able to remove it from the livemap.
	 */
	o2hb_fill_node_map(live_node_bitmap, sizeof(live_node_bitmap));
	i = -1;
	while ((i = find_next_bit(live_node_bitmap,
				  O2NM_MAX_NODES, i + 1)) < O2NM_MAX_NODES) {
		set_bit(i, configured_nodes);
	}

	highest_node = o2hb_highest_node(configured_nodes, O2NM_MAX_NODES);
	if (highest_node >= O2NM_MAX_NODES) {
		mlog(ML_NOTICE, "ocfs2_heartbeat: no configured nodes found!\n");
		return -EINVAL;
	}

	/* No sense in reading the slots of nodes that don't exist
	 * yet. Of course, if the node definitions have holes in them
	 * then we're reading an empty slot anyway... Consider this
	 * best-effort. */
	ret = o2hb_read_slots(reg, highest_node + 1);
	if (ret < 0) {
		mlog_errno(ret);
		return ret;
	}

	/* With an up to date view of the slots, we can check that no
	 * other node has been improperly configured to heartbeat in
	 * our slot. */
	if (!o2hb_check_last_timestamp(reg))
		mlog(ML_ERROR, "Device \"%s\": another node is heartbeating "
		     "in our slot!\n", reg->hr_dev_name);

	/* fill in the proper info for our next heartbeat */
	o2hb_prepare_block(reg, reg->hr_generation);

	/* And fire off the write. Note that we don't wait on this I/O
	 * until later. */
	ret = o2hb_issue_node_write(reg, &write_wc);
	if (ret < 0) {
		mlog_errno(ret);
		return ret;
	}

	i = -1;
	while((i = find_next_bit(configured_nodes, O2NM_MAX_NODES, i + 1)) < O2NM_MAX_NODES) {

		change |= o2hb_check_slot(reg, &reg->hr_slots[i]);
	}

	/*
	 * We have to be sure we've advertised ourselves on disk
	 * before we can go to steady state. This ensures that
	 * people we find in our steady state have seen us.
	 */
	o2hb_wait_on_io(reg, &write_wc);
	if (write_wc.wc_error) {
		/* Do not re-arm the write timeout on I/O error - we
		 * can't be sure that the new block ever made it to
		 * disk */
		mlog(ML_ERROR, "Write error %d on device \"%s\"\n",
		     write_wc.wc_error, reg->hr_dev_name);
		return write_wc.wc_error;
	}

	o2hb_arm_write_timeout(reg);

	/* let the person who launched us know when things are steady */
	if (!change && (atomic_read(&reg->hr_steady_iterations) != 0)) {
		if (atomic_dec_and_test(&reg->hr_steady_iterations))
			wake_up(&o2hb_steady_queue);
	}

	return 0;
}

/* Subtract b from a, storing the result in a. a *must* have a larger
 * value than b. */
static void o2hb_tv_subtract(struct timeval *a,
			     struct timeval *b)
{
	/* just return 0 when a is after b */
	if (a->tv_sec < b->tv_sec ||
	    (a->tv_sec == b->tv_sec && a->tv_usec < b->tv_usec)) {
		a->tv_sec = 0;
		a->tv_usec = 0;
		return;
	}

	a->tv_sec -= b->tv_sec;
	a->tv_usec -= b->tv_usec;
	while ( a->tv_usec < 0 ) {
		a->tv_sec--;
		a->tv_usec += 1000000;
	}
}

static unsigned int o2hb_elapsed_msecs(struct timeval *start,
				       struct timeval *end)
{
	struct timeval res = *end;

	o2hb_tv_subtract(&res, start);

	return res.tv_sec * 1000 + res.tv_usec / 1000;
}

/*
 * we ride the region ref that the region dir holds. before the region
 * dir is removed and drops it ref it will wait to tear down this
 * thread.
 */
static int o2hb_thread(void *data)
{
	int i, ret;
	struct o2hb_region *reg = data;
	struct o2hb_bio_wait_ctxt write_wc;
	struct timeval before_hb, after_hb;
	unsigned int elapsed_msec;

	mlog(ML_HEARTBEAT|ML_KTHREAD, "hb thread running\n");

	set_user_nice(current, -20);

	while (!kthread_should_stop() && !reg->hr_unclean_stop) {
		/* We track the time spent inside
		 * o2hb_do_disk_heartbeat so that we avoid more than
		 * hr_timeout_ms between disk writes. On busy systems
		 * this should result in a heartbeat which is less
		 * likely to time itself out. */
		do_gettimeofday(&before_hb);

		i = 0;
		do {
			ret = o2hb_do_disk_heartbeat(reg);
		} while (ret && ++i < 2);

		do_gettimeofday(&after_hb);
		elapsed_msec = o2hb_elapsed_msecs(&before_hb, &after_hb);

		mlog(ML_HEARTBEAT,
		     "start = %lu.%lu, end = %lu.%lu, msec = %u\n",
		     before_hb.tv_sec, (unsigned long) before_hb.tv_usec,
		     after_hb.tv_sec, (unsigned long) after_hb.tv_usec,
		     elapsed_msec);

		if (elapsed_msec < reg->hr_timeout_ms) {
			/* the kthread api has blocked signals for us so no
			 * need to record the return value. */
			msleep_interruptible(reg->hr_timeout_ms - elapsed_msec);
		}
	}

	o2hb_disarm_write_timeout(reg);

	/* unclean stop is only used in very bad situation */
	for(i = 0; !reg->hr_unclean_stop && i < reg->hr_blocks; i++)
		o2hb_shutdown_slot(&reg->hr_slots[i]);

	/* Explicit down notification - avoid forcing the other nodes
	 * to timeout on this region when we could just as easily
	 * write a clear generation - thus indicating to them that
	 * this node has left this region.
	 *
	 * XXX: Should we skip this on unclean_stop? */
	o2hb_prepare_block(reg, 0);
	ret = o2hb_issue_node_write(reg, &write_wc);
	if (ret == 0) {
		o2hb_wait_on_io(reg, &write_wc);
	} else {
		mlog_errno(ret);
	}

	mlog(ML_HEARTBEAT|ML_KTHREAD, "hb thread exiting\n");

	return 0;
}

#ifdef CONFIG_DEBUG_FS
static int o2hb_debug_open(struct inode *inode, struct file *file)
{
	struct o2hb_debug_buf *db = inode->i_private;
	unsigned long map[BITS_TO_LONGS(O2NM_MAX_NODES)];
	char *buf = NULL;
	int i = -1;
	int out = 0;

	/* max_nodes should be the largest bitmap we pass here */
	BUG_ON(sizeof(map) < db->db_size);

	buf = kmalloc(PAGE_SIZE, GFP_KERNEL);
	if (!buf)
		goto bail;

	switch (db->db_type) {
	case O2HB_DB_TYPE_LIVENODES:
	case O2HB_DB_TYPE_LIVEREGIONS:
	case O2HB_DB_TYPE_QUORUMREGIONS:
	case O2HB_DB_TYPE_FAILEDREGIONS:
		spin_lock(&o2hb_live_lock);
		memcpy(map, db->db_data, db->db_size);
		spin_unlock(&o2hb_live_lock);
		break;

	default:
		goto done;
	}

	while ((i = find_next_bit(map, db->db_len, i + 1)) < db->db_len)
		out += snprintf(buf + out, PAGE_SIZE - out, "%d ", i);
	out += snprintf(buf + out, PAGE_SIZE - out, "\n");

done:
	i_size_write(inode, out);

	file->private_data = buf;

	return 0;
bail:
	return -ENOMEM;
}

static int o2hb_debug_release(struct inode *inode, struct file *file)
{
	kfree(file->private_data);
	return 0;
}

static ssize_t o2hb_debug_read(struct file *file, char __user *buf,
			       size_t nbytes, loff_t *ppos)
{
	return simple_read_from_buffer(buf, nbytes, ppos, file->private_data,
				       i_size_read(file->f_mapping->host));
}
#else
static int o2hb_debug_open(struct inode *inode, struct file *file)
{
	return 0;
}
static int o2hb_debug_release(struct inode *inode, struct file *file)
{
	return 0;
}
static ssize_t o2hb_debug_read(struct file *file, char __user *buf,
			       size_t nbytes, loff_t *ppos)
{
	return 0;
}
#endif /* CONFIG_DEBUG_FS */

static const struct file_operations o2hb_debug_fops = {
	.open = o2hb_debug_open,
	.release = o2hb_debug_release,
	.read = o2hb_debug_read,
	.llseek = generic_file_llseek,
};

void o2hb_exit(void)
{
	kfree(o2hb_db_livenodes);
	kfree(o2hb_db_liveregions);
	kfree(o2hb_db_quorumregions);
	kfree(o2hb_db_failedregions);
	debugfs_remove(o2hb_debug_failedregions);
	debugfs_remove(o2hb_debug_quorumregions);
	debugfs_remove(o2hb_debug_liveregions);
	debugfs_remove(o2hb_debug_livenodes);
	debugfs_remove(o2hb_debug_dir);
}

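/* Allocate an o2hb_debug_buf describing one of the bitmaps above and
 * expose it as a read-only debugfs file. */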
static struct dentry *o2hb_debug_create(const char *name, struct dentry *dir,
					struct o2hb_debug_buf **db, int db_len,
					int type, int size, int len, void *data)
{
	*db = kmalloc(db_len, GFP_KERNEL);
	if (!*db)
		return NULL;

	(*db)->db_type = type;
	(*db)->db_size = size;
	(*db)->db_len = len;
	(*db)->db_data = data;

	return debugfs_create_file(name, S_IFREG|S_IRUSR, dir, *db,
				   &o2hb_debug_fops);
}

static int o2hb_debug_init(void)
{
	int ret = -ENOMEM;

	o2hb_debug_dir = debugfs_create_dir(O2HB_DEBUG_DIR, NULL);
	if (!o2hb_debug_dir) {
		mlog_errno(ret);
		goto bail;
	}

	o2hb_debug_livenodes = o2hb_debug_create(O2HB_DEBUG_LIVENODES,
						 o2hb_debug_dir,
						 &o2hb_db_livenodes,
						 sizeof(*o2hb_db_livenodes),
						 O2HB_DB_TYPE_LIVENODES,
						 sizeof(o2hb_live_node_bitmap),
						 O2NM_MAX_NODES,
						 o2hb_live_node_bitmap);
	if (!o2hb_debug_livenodes) {
		mlog_errno(ret);
		goto bail;
	}

	o2hb_debug_liveregions = o2hb_debug_create(O2HB_DEBUG_LIVEREGIONS,
						   o2hb_debug_dir,
						   &o2hb_db_liveregions,
						   sizeof(*o2hb_db_liveregions),
						   O2HB_DB_TYPE_LIVEREGIONS,
						   sizeof(o2hb_live_region_bitmap),
						   O2NM_MAX_REGIONS,
						   o2hb_live_region_bitmap);
	if (!o2hb_debug_liveregions) {
		mlog_errno(ret);
		goto bail;
	}

	o2hb_debug_quorumregions =
			o2hb_debug_create(O2HB_DEBUG_QUORUMREGIONS,
					  o2hb_debug_dir,
					  &o2hb_db_quorumregions,
					  sizeof(*o2hb_db_quorumregions),
					  O2HB_DB_TYPE_QUORUMREGIONS,
					  sizeof(o2hb_quorum_region_bitmap),
					  O2NM_MAX_REGIONS,
					  o2hb_quorum_region_bitmap);
	if (!o2hb_debug_quorumregions) {
		mlog_errno(ret);
		goto bail;
	}

	o2hb_debug_failedregions =
			o2hb_debug_create(O2HB_DEBUG_FAILEDREGIONS,
					  o2hb_debug_dir,
					  &o2hb_db_failedregions,
					  sizeof(*o2hb_db_failedregions),
					  O2HB_DB_TYPE_FAILEDREGIONS,
					  sizeof(o2hb_failed_region_bitmap),
					  O2NM_MAX_REGIONS,
					  o2hb_failed_region_bitmap);
	if (!o2hb_debug_failedregions) {
		mlog_errno(ret);
		goto bail;
	}

	ret = 0;
bail:
	if (ret)
		o2hb_exit();

	return ret;
}

int o2hb_init(void)
{
	int i;

	for (i = 0; i < ARRAY_SIZE(o2hb_callbacks); i++)
		INIT_LIST_HEAD(&o2hb_callbacks[i].list);

	for (i = 0; i < ARRAY_SIZE(o2hb_live_slots); i++)
		INIT_LIST_HEAD(&o2hb_live_slots[i]);

	INIT_LIST_HEAD(&o2hb_node_events);

	memset(o2hb_live_node_bitmap, 0, sizeof(o2hb_live_node_bitmap));
	memset(o2hb_region_bitmap, 0, sizeof(o2hb_region_bitmap));
	memset(o2hb_live_region_bitmap, 0, sizeof(o2hb_live_region_bitmap));
	memset(o2hb_quorum_region_bitmap, 0, sizeof(o2hb_quorum_region_bitmap));
	memset(o2hb_failed_region_bitmap, 0, sizeof(o2hb_failed_region_bitmap));

	return o2hb_debug_init();
}

/* if we're already in a callback then we're already serialized by the sem */
static void o2hb_fill_node_map_from_callback(unsigned long *map,
					     unsigned bytes)
{
	BUG_ON(bytes < (BITS_TO_LONGS(O2NM_MAX_NODES) * sizeof(unsigned long)));

	memcpy(map, &o2hb_live_node_bitmap, bytes);
}

/*
 * get a map of all nodes that are heartbeating in any regions
 */
void o2hb_fill_node_map(unsigned long *map, unsigned bytes)
{
	/* callers want to serialize this map and callbacks so that they
	 * can trust that they don't miss nodes coming to the party */
	down_read(&o2hb_callback_sem);
	spin_lock(&o2hb_live_lock);
	o2hb_fill_node_map_from_callback(map, bytes);
	spin_unlock(&o2hb_live_lock);
	up_read(&o2hb_callback_sem);
}
EXPORT_SYMBOL_GPL(o2hb_fill_node_map);

/*
 * heartbeat configfs bits. The heartbeat set is a default set under
 * the cluster set in nodemanager.c.
 */

static struct o2hb_region *to_o2hb_region(struct config_item *item)
{
	return item ? container_of(item, struct o2hb_region, hr_item) : NULL;
}

/* drop_item only drops its ref after killing the thread, nothing should
 * be using the region anymore. this has to clean up any state that
 * attributes might have built up. */
static void o2hb_region_release(struct config_item *item)
{
	int i;
	struct page *page;
	struct o2hb_region *reg = to_o2hb_region(item);

	if (reg->hr_tmp_block)
		kfree(reg->hr_tmp_block);

	if (reg->hr_slot_data) {
		for (i = 0; i < reg->hr_num_pages; i++) {
			page = reg->hr_slot_data[i];
			if (page)
				__free_page(page);
		}
		kfree(reg->hr_slot_data);
	}

	if (reg->hr_bdev)
		blkdev_put(reg->hr_bdev, FMODE_READ|FMODE_WRITE);

	if (reg->hr_slots)
		kfree(reg->hr_slots);

	spin_lock(&o2hb_live_lock);
	list_del(&reg->hr_all_item);
	spin_unlock(&o2hb_live_lock);

	kfree(reg);
}

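/* Parse a block size written via configfs and validate that it is a
 * power of two between 512 and 4096 bytes. */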
static int o2hb_read_block_input(struct o2hb_region *reg,
				 const char *page,
				 size_t count,
				 unsigned long *ret_bytes,
				 unsigned int *ret_bits)
{
	unsigned long bytes;
	char *p = (char *)page;

	bytes = simple_strtoul(p, &p, 0);
	if (!p || (*p && (*p != '\n')))
		return -EINVAL;

	/* Heartbeat and fs min / max block sizes are the same. */
	if (bytes > 4096 || bytes < 512)
		return -ERANGE;
	if (hweight16(bytes) != 1)
		return -EINVAL;

	if (ret_bytes)
		*ret_bytes = bytes;
	if (ret_bits)
		*ret_bits = ffs(bytes) - 1;

	return 0;
}

static ssize_t o2hb_region_block_bytes_read(struct o2hb_region *reg,
					    char *page)
{
	return sprintf(page, "%u\n", reg->hr_block_bytes);
}

static ssize_t o2hb_region_block_bytes_write(struct o2hb_region *reg,
					     const char *page,
					     size_t count)
{
	int status;
	unsigned long block_bytes;
	unsigned int block_bits;

	if (reg->hr_bdev)
		return -EINVAL;

	status = o2hb_read_block_input(reg, page, count,
				       &block_bytes, &block_bits);
	if (status)
		return status;

	reg->hr_block_bytes = (unsigned int)block_bytes;
	reg->hr_block_bits = block_bits;

	return count;
}

static ssize_t o2hb_region_start_block_read(struct o2hb_region *reg,
					    char *page)
{
	return sprintf(page, "%llu\n", reg->hr_start_block);
}

static ssize_t o2hb_region_start_block_write(struct o2hb_region *reg,
					     const char *page,
					     size_t count)
{
	unsigned long long tmp;
	char *p = (char *)page;

	if (reg->hr_bdev)
		return -EINVAL;

	tmp = simple_strtoull(p, &p, 0);
	if (!p || (*p && (*p != '\n')))
		return -EINVAL;

	reg->hr_start_block = tmp;

	return count;
}

static ssize_t o2hb_region_blocks_read(struct o2hb_region *reg,
				       char *page)
{
	return sprintf(page, "%d\n", reg->hr_blocks);
}

static ssize_t o2hb_region_blocks_write(struct o2hb_region *reg,
					const char *page,
					size_t count)
{
	unsigned long tmp;
	char *p = (char *)page;

	if (reg->hr_bdev)
		return -EINVAL;

	tmp = simple_strtoul(p, &p, 0);
	if (!p || (*p && (*p != '\n')))
		return -EINVAL;

	if (tmp > O2NM_MAX_NODES || tmp == 0)
		return -ERANGE;

	reg->hr_blocks = (unsigned int)tmp;

	return count;
}

static ssize_t o2hb_region_dev_read(struct o2hb_region *reg,
				    char *page)
{
	unsigned int ret = 0;

	if (reg->hr_bdev)
		ret = sprintf(page, "%s\n", reg->hr_dev_name);

	return ret;
}

static void o2hb_init_region_params(struct o2hb_region *reg)
{
	reg->hr_slots_per_page = PAGE_CACHE_SIZE >> reg->hr_block_bits;
	reg->hr_timeout_ms = O2HB_REGION_TIMEOUT_MS;

	mlog(ML_HEARTBEAT, "hr_start_block = %llu, hr_blocks = %u\n",
	     reg->hr_start_block, reg->hr_blocks);
	mlog(ML_HEARTBEAT, "hr_block_bytes = %u, hr_block_bits = %u\n",
	     reg->hr_block_bytes, reg->hr_block_bits);
	mlog(ML_HEARTBEAT, "hr_timeout_ms = %u\n", reg->hr_timeout_ms);
	mlog(ML_HEARTBEAT, "dead threshold = %u\n", o2hb_dead_threshold);
}

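/* Allocate the per-slot tracking array and the pages that back the raw
 * heartbeat blocks, then point each slot at its block within those
 * pages. */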
static int o2hb_map_slot_data(struct o2hb_region *reg)
{
	int i, j;
	unsigned int last_slot;
	unsigned int spp = reg->hr_slots_per_page;
	struct page *page;
	char *raw;
	struct o2hb_disk_slot *slot;

	reg->hr_tmp_block = kmalloc(reg->hr_block_bytes, GFP_KERNEL);
	if (reg->hr_tmp_block == NULL) {
		mlog_errno(-ENOMEM);
		return -ENOMEM;
	}

	reg->hr_slots = kcalloc(reg->hr_blocks,
				sizeof(struct o2hb_disk_slot), GFP_KERNEL);
	if (reg->hr_slots == NULL) {
		mlog_errno(-ENOMEM);
		return -ENOMEM;
	}

	for(i = 0; i < reg->hr_blocks; i++) {
		slot = &reg->hr_slots[i];
		slot->ds_node_num = i;
		INIT_LIST_HEAD(&slot->ds_live_item);
		slot->ds_raw_block = NULL;
	}

	reg->hr_num_pages = (reg->hr_blocks + spp - 1) / spp;
	mlog(ML_HEARTBEAT, "Going to require %u pages to cover %u blocks "
			   "at %u blocks per page\n",
	     reg->hr_num_pages, reg->hr_blocks, spp);

	reg->hr_slot_data = kcalloc(reg->hr_num_pages, sizeof(struct page *),
				    GFP_KERNEL);
	if (!reg->hr_slot_data) {
		mlog_errno(-ENOMEM);
		return -ENOMEM;
	}

	for(i = 0; i < reg->hr_num_pages; i++) {
		page = alloc_page(GFP_KERNEL);
		if (!page) {
			mlog_errno(-ENOMEM);
			return -ENOMEM;
		}

		reg->hr_slot_data[i] = page;

		last_slot = i * spp;
		raw = page_address(page);
		for (j = 0;
		     (j < spp) && ((j + last_slot) < reg->hr_blocks);
		     j++) {
			BUG_ON((j + last_slot) >= reg->hr_blocks);

			slot = &reg->hr_slots[j + last_slot];
			slot->ds_raw_block =
				(struct o2hb_disk_heartbeat_block *) raw;

			raw += reg->hr_block_bytes;
		}
	}

	return 0;
}

/* Read in all the slots available and populate the tracking
 * structures so that we can start with a baseline idea of what's
 * there. */
static int o2hb_populate_slot_data(struct o2hb_region *reg)
{
	int ret, i;
	struct o2hb_disk_slot *slot;
	struct o2hb_disk_heartbeat_block *hb_block;

	mlog_entry_void();

	ret = o2hb_read_slots(reg, reg->hr_blocks);
	if (ret) {
		mlog_errno(ret);
		goto out;
	}

	/* We only want to get an idea of the values initially in each
	 * slot, so we do no verification - o2hb_check_slot will
	 * actually determine if each configured slot is valid and
	 * whether any values have changed. */
	for(i = 0; i < reg->hr_blocks; i++) {
		slot = &reg->hr_slots[i];
		hb_block = (struct o2hb_disk_heartbeat_block *) slot->ds_raw_block;

		/* Only fill the values that o2hb_check_slot uses to
		 * determine changing slots */
		slot->ds_last_time = le64_to_cpu(hb_block->hb_seq);
		slot->ds_last_generation = le64_to_cpu(hb_block->hb_generation);
	}

out:
	mlog_exit(ret);
	return ret;
}

/* this is acting as commit; we set up all of hr_bdev and hr_task or nothing */
static ssize_t o2hb_region_dev_write(struct o2hb_region *reg,
				     const char *page,
				     size_t count)
{
	struct task_struct *hb_task;
	long fd;
	int sectsize;
	char *p = (char *)page;
	struct file *filp = NULL;
	struct inode *inode = NULL;
	ssize_t ret = -EINVAL;

	if (reg->hr_bdev)
		goto out;

	/* We can't heartbeat without having had our node number
	 * configured yet. */
	if (o2nm_this_node() == O2NM_MAX_NODES)
		goto out;

	fd = simple_strtol(p, &p, 0);
	if (!p || (*p && (*p != '\n')))
		goto out;

	if (fd < 0 || fd >= INT_MAX)
		goto out;

	filp = fget(fd);
	if (filp == NULL)
		goto out;

	if (reg->hr_blocks == 0 || reg->hr_start_block == 0 ||
	    reg->hr_block_bytes == 0)
		goto out;

	inode = igrab(filp->f_mapping->host);
	if (inode == NULL)
		goto out;

	if (!S_ISBLK(inode->i_mode))
		goto out;

	reg->hr_bdev = I_BDEV(filp->f_mapping->host);
	ret = blkdev_get(reg->hr_bdev, FMODE_WRITE | FMODE_READ);
	if (ret) {
		reg->hr_bdev = NULL;
		goto out;
	}
	inode = NULL;

	bdevname(reg->hr_bdev, reg->hr_dev_name);

	sectsize = bdev_logical_block_size(reg->hr_bdev);
	if (sectsize != reg->hr_block_bytes) {
		mlog(ML_ERROR,
		     "blocksize %u incorrect for device, expected %d",
		     reg->hr_block_bytes, sectsize);
		ret = -EINVAL;
		goto out;
	}

	o2hb_init_region_params(reg);

	/* Generation of zero is invalid */
	do {
		get_random_bytes(&reg->hr_generation,
				 sizeof(reg->hr_generation));
	} while (reg->hr_generation == 0);

	ret = o2hb_map_slot_data(reg);
	if (ret) {
		mlog_errno(ret);
		goto out;
	}

	ret = o2hb_populate_slot_data(reg);
	if (ret) {
		mlog_errno(ret);
		goto out;
	}

	INIT_DELAYED_WORK(&reg->hr_write_timeout_work, o2hb_write_timeout);

	/*
	 * A node is considered live after it has beat LIVE_THRESHOLD
	 * times. We're not steady until we've given them a chance
	 * _after_ our first read.
	 */
	atomic_set(&reg->hr_steady_iterations, O2HB_LIVE_THRESHOLD + 1);

	hb_task = kthread_run(o2hb_thread, reg, "o2hb-%s",
			      reg->hr_item.ci_name);
	if (IS_ERR(hb_task)) {
		ret = PTR_ERR(hb_task);
		mlog_errno(ret);
		goto out;
	}

	spin_lock(&o2hb_live_lock);
	reg->hr_task = hb_task;
	spin_unlock(&o2hb_live_lock);

	ret = wait_event_interruptible(o2hb_steady_queue,
				atomic_read(&reg->hr_steady_iterations) == 0);
	if (ret) {
		/* We got interrupted (hello ptrace!). Clean up */
		spin_lock(&o2hb_live_lock);
		hb_task = reg->hr_task;
		reg->hr_task = NULL;
		spin_unlock(&o2hb_live_lock);

		if (hb_task)
			kthread_stop(hb_task);
		goto out;
	}

	/* Ok, we were woken. Make sure it wasn't by drop_item() */
	spin_lock(&o2hb_live_lock);
	hb_task = reg->hr_task;
	if (o2hb_global_heartbeat_active())
		set_bit(reg->hr_region_num, o2hb_live_region_bitmap);
	spin_unlock(&o2hb_live_lock);

	if (hb_task)
		ret = count;
	else
		ret = -EIO;

	if (hb_task && o2hb_global_heartbeat_active())
1718 printk(KERN_NOTICE "o2hb: Heartbeat started on region %s\n",
1719 config_item_name(&reg->hr_item));
1720
Mark Fasheha7f6a5f2005-12-15 14:31:23 -08001721out:
1722 if (filp)
1723 fput(filp);
1724 if (inode)
1725 iput(inode);
1726 if (ret < 0) {
1727 if (reg->hr_bdev) {
Al Viro9a1c3542008-02-22 20:40:24 -05001728 blkdev_put(reg->hr_bdev, FMODE_READ|FMODE_WRITE);
Mark Fasheha7f6a5f2005-12-15 14:31:23 -08001729 reg->hr_bdev = NULL;
1730 }
1731 }
1732 return ret;
1733}
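
/*
 * For reference, a minimal sketch of how a userspace helper might arm a
 * region through this attribute (the exact tool and configfs path are
 * assumptions; ocfs2-tools normally drives this): set block_bytes,
 * start_block and blocks first, then hand the open file descriptor of the
 * block device to "dev", e.g.
 *
 *	char num[16];
 *	int devfd = open("/dev/sdb1", O_RDWR);
 *	int cfd = open("/sys/kernel/config/cluster/<cluster>/heartbeat/"
 *		       "<region>/dev", O_WRONLY);
 *	snprintf(num, sizeof(num), "%d", devfd);
 *	write(cfd, num, strlen(num));
 *
 * The write fails unless the local node number is configured and the
 * region geometry attributes have been set, and it blocks until the
 * region has gone steady (or the writer is interrupted).
 */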
1734
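/* Report the pid of the region's heartbeat thread through configfs, or an
 * empty read if no thread is currently running. */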
Zhen Wei92efc152006-12-08 00:48:17 -07001735static ssize_t o2hb_region_pid_read(struct o2hb_region *reg,
1736 char *page)
1737{
Joel Beckere6c352d2007-02-03 03:04:20 -08001738 pid_t pid = 0;
1739
1740 spin_lock(&o2hb_live_lock);
1741 if (reg->hr_task)
Pavel Emelyanovba25f9d2007-10-18 23:40:40 -07001742 pid = task_pid_nr(reg->hr_task);
Joel Beckere6c352d2007-02-03 03:04:20 -08001743 spin_unlock(&o2hb_live_lock);
1744
1745 if (!pid)
Zhen Wei92efc152006-12-08 00:48:17 -07001746 return 0;
1747
Joel Beckere6c352d2007-02-03 03:04:20 -08001748 return sprintf(page, "%u\n", pid);
Zhen Wei92efc152006-12-08 00:48:17 -07001749}
1750
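/*
 * configfs glue: each attribute below maps a file in the region directory
 * to a show/store pair; o2hb_region_show() and o2hb_region_store() simply
 * dispatch to those handlers.
 */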
Mark Fasheha7f6a5f2005-12-15 14:31:23 -08001751struct o2hb_region_attribute {
1752 struct configfs_attribute attr;
1753 ssize_t (*show)(struct o2hb_region *, char *);
1754 ssize_t (*store)(struct o2hb_region *, const char *, size_t);
1755};
1756
1757static struct o2hb_region_attribute o2hb_region_attr_block_bytes = {
1758 .attr = { .ca_owner = THIS_MODULE,
1759 .ca_name = "block_bytes",
1760 .ca_mode = S_IRUGO | S_IWUSR },
1761 .show = o2hb_region_block_bytes_read,
1762 .store = o2hb_region_block_bytes_write,
1763};
1764
1765static struct o2hb_region_attribute o2hb_region_attr_start_block = {
1766 .attr = { .ca_owner = THIS_MODULE,
1767 .ca_name = "start_block",
1768 .ca_mode = S_IRUGO | S_IWUSR },
1769 .show = o2hb_region_start_block_read,
1770 .store = o2hb_region_start_block_write,
1771};
1772
1773static struct o2hb_region_attribute o2hb_region_attr_blocks = {
1774 .attr = { .ca_owner = THIS_MODULE,
1775 .ca_name = "blocks",
1776 .ca_mode = S_IRUGO | S_IWUSR },
1777 .show = o2hb_region_blocks_read,
1778 .store = o2hb_region_blocks_write,
1779};
1780
1781static struct o2hb_region_attribute o2hb_region_attr_dev = {
1782 .attr = { .ca_owner = THIS_MODULE,
1783 .ca_name = "dev",
1784 .ca_mode = S_IRUGO | S_IWUSR },
1785 .show = o2hb_region_dev_read,
1786 .store = o2hb_region_dev_write,
1787};
1788
Zhen Wei92efc152006-12-08 00:48:17 -07001789static struct o2hb_region_attribute o2hb_region_attr_pid = {
1790 .attr = { .ca_owner = THIS_MODULE,
1791 .ca_name = "pid",
1792 .ca_mode = S_IRUGO | S_IRUSR },
1793 .show = o2hb_region_pid_read,
1794};
1795
Mark Fasheha7f6a5f2005-12-15 14:31:23 -08001796static struct configfs_attribute *o2hb_region_attrs[] = {
1797 &o2hb_region_attr_block_bytes.attr,
1798 &o2hb_region_attr_start_block.attr,
1799 &o2hb_region_attr_blocks.attr,
1800 &o2hb_region_attr_dev.attr,
Zhen Wei92efc152006-12-08 00:48:17 -07001801 &o2hb_region_attr_pid.attr,
Mark Fasheha7f6a5f2005-12-15 14:31:23 -08001802 NULL,
1803};
1804
1805static ssize_t o2hb_region_show(struct config_item *item,
1806 struct configfs_attribute *attr,
1807 char *page)
1808{
1809 struct o2hb_region *reg = to_o2hb_region(item);
1810 struct o2hb_region_attribute *o2hb_region_attr =
1811 container_of(attr, struct o2hb_region_attribute, attr);
1812 ssize_t ret = 0;
1813
1814 if (o2hb_region_attr->show)
1815 ret = o2hb_region_attr->show(reg, page);
1816 return ret;
1817}
1818
1819static ssize_t o2hb_region_store(struct config_item *item,
1820 struct configfs_attribute *attr,
1821 const char *page, size_t count)
1822{
1823 struct o2hb_region *reg = to_o2hb_region(item);
1824 struct o2hb_region_attribute *o2hb_region_attr =
1825 container_of(attr, struct o2hb_region_attribute, attr);
1826 ssize_t ret = -EINVAL;
1827
1828 if (o2hb_region_attr->store)
1829 ret = o2hb_region_attr->store(reg, page, count);
1830 return ret;
1831}
1832
1833static struct configfs_item_operations o2hb_region_item_ops = {
1834 .release = o2hb_region_release,
1835 .show_attribute = o2hb_region_show,
1836 .store_attribute = o2hb_region_store,
1837};
1838
1839static struct config_item_type o2hb_region_type = {
1840 .ct_item_ops = &o2hb_region_item_ops,
1841 .ct_attrs = o2hb_region_attrs,
1842 .ct_owner = THIS_MODULE,
1843};
1844
1845/* heartbeat set */
1846
1847struct o2hb_heartbeat_group {
1848 struct config_group hs_group;
1849 /* some stuff? */
1850};
1851
1852static struct o2hb_heartbeat_group *to_o2hb_heartbeat_group(struct config_group *group)
1853{
1854 return group ?
1855 container_of(group, struct o2hb_heartbeat_group, hs_group)
1856 : NULL;
1857}
1858
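/*
 * Called when userspace mkdir()s a new region directory: allocate the
 * region and, in global heartbeat mode, claim the first free slot in
 * o2hb_region_bitmap as its region number.
 */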
Joel Beckerf89ab862008-07-17 14:53:48 -07001859static struct config_item *o2hb_heartbeat_group_make_item(struct config_group *group,
1860 const char *name)
Mark Fasheha7f6a5f2005-12-15 14:31:23 -08001861{
1862 struct o2hb_region *reg = NULL;
Mark Fasheha7f6a5f2005-12-15 14:31:23 -08001863
1864	if (strlen(name) > O2HB_MAX_REGION_NAME_LEN)
1865		return ERR_PTR(-ENAMETOOLONG);
1866
1867	reg = kzalloc(sizeof(struct o2hb_region), GFP_KERNEL);
1868	if (reg == NULL)
1869		return ERR_PTR(-ENOMEM);
1870
Mark Fasheha7f6a5f2005-12-15 14:31:23 -08001871 spin_lock(&o2hb_live_lock);
Sunil Mushran536f0742010-10-07 17:03:07 -07001872 reg->hr_region_num = 0;
1873 if (o2hb_global_heartbeat_active()) {
1874 reg->hr_region_num = find_first_zero_bit(o2hb_region_bitmap,
1875 O2NM_MAX_REGIONS);
1876 if (reg->hr_region_num >= O2NM_MAX_REGIONS) {
1877 spin_unlock(&o2hb_live_lock);
1878 return ERR_PTR(-EFBIG);
1879 }
1880 set_bit(reg->hr_region_num, o2hb_region_bitmap);
1881 }
Mark Fasheha7f6a5f2005-12-15 14:31:23 -08001882 list_add_tail(&reg->hr_all_item, &o2hb_all_regions);
1883 spin_unlock(&o2hb_live_lock);
Mark Fasheha7f6a5f2005-12-15 14:31:23 -08001884
Sunil Mushran536f0742010-10-07 17:03:07 -07001885 config_item_init_type_name(&reg->hr_item, name, &o2hb_region_type);
1886
Joel Beckera6795e92008-07-17 15:21:29 -07001887 return &reg->hr_item;
Mark Fasheha7f6a5f2005-12-15 14:31:23 -08001888}
1889
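/*
 * Called when userspace rmdir()s the region directory: stop the heartbeat
 * thread, clear the region from the global heartbeat bitmaps, and wake any
 * dev_write() that is still waiting for the region to go steady.
 */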
1890static void o2hb_heartbeat_group_drop_item(struct config_group *group,
1891 struct config_item *item)
1892{
Joel Beckere6c352d2007-02-03 03:04:20 -08001893 struct task_struct *hb_task;
Mark Fasheha7f6a5f2005-12-15 14:31:23 -08001894 struct o2hb_region *reg = to_o2hb_region(item);
1895
1896 /* stop the thread when the user removes the region dir */
Joel Beckere6c352d2007-02-03 03:04:20 -08001897 spin_lock(&o2hb_live_lock);
Sunil Mushrane7d656b2010-10-06 17:55:18 -07001898 if (o2hb_global_heartbeat_active()) {
Sunil Mushran536f0742010-10-07 17:03:07 -07001899 clear_bit(reg->hr_region_num, o2hb_region_bitmap);
Sunil Mushrane7d656b2010-10-06 17:55:18 -07001900 clear_bit(reg->hr_region_num, o2hb_live_region_bitmap);
1901 }
Joel Beckere6c352d2007-02-03 03:04:20 -08001902 hb_task = reg->hr_task;
1903 reg->hr_task = NULL;
1904 spin_unlock(&o2hb_live_lock);
1905
1906 if (hb_task)
1907 kthread_stop(hb_task);
Mark Fasheha7f6a5f2005-12-15 14:31:23 -08001908
Joel Beckere6df3a62007-02-06 15:45:39 -08001909 /*
1910 * If we're racing a dev_write(), we need to wake them. They will
1911 * check reg->hr_task
1912 */
1913 if (atomic_read(&reg->hr_steady_iterations) != 0) {
1914 atomic_set(&reg->hr_steady_iterations, 0);
1915 wake_up(&o2hb_steady_queue);
1916 }
1917
Sunil Mushran18c50cb2010-10-06 18:26:59 -07001918 if (o2hb_global_heartbeat_active())
1919 printk(KERN_NOTICE "o2hb: Heartbeat stopped on region %s\n",
1920 config_item_name(&reg->hr_item));
Mark Fasheha7f6a5f2005-12-15 14:31:23 -08001921 config_item_put(item);
1922}
1923
1924struct o2hb_heartbeat_group_attribute {
1925 struct configfs_attribute attr;
1926 ssize_t (*show)(struct o2hb_heartbeat_group *, char *);
1927 ssize_t (*store)(struct o2hb_heartbeat_group *, const char *, size_t);
1928};
1929
1930static ssize_t o2hb_heartbeat_group_show(struct config_item *item,
1931 struct configfs_attribute *attr,
1932 char *page)
1933{
1934 struct o2hb_heartbeat_group *reg = to_o2hb_heartbeat_group(to_config_group(item));
1935 struct o2hb_heartbeat_group_attribute *o2hb_heartbeat_group_attr =
1936 container_of(attr, struct o2hb_heartbeat_group_attribute, attr);
1937 ssize_t ret = 0;
1938
1939 if (o2hb_heartbeat_group_attr->show)
1940 ret = o2hb_heartbeat_group_attr->show(reg, page);
1941 return ret;
1942}
1943
1944static ssize_t o2hb_heartbeat_group_store(struct config_item *item,
1945 struct configfs_attribute *attr,
1946 const char *page, size_t count)
1947{
1948 struct o2hb_heartbeat_group *reg = to_o2hb_heartbeat_group(to_config_group(item));
1949 struct o2hb_heartbeat_group_attribute *o2hb_heartbeat_group_attr =
1950 container_of(attr, struct o2hb_heartbeat_group_attribute, attr);
1951 ssize_t ret = -EINVAL;
1952
1953 if (o2hb_heartbeat_group_attr->store)
1954 ret = o2hb_heartbeat_group_attr->store(reg, page, count);
1955 return ret;
1956}
1957
1958static ssize_t o2hb_heartbeat_group_threshold_show(struct o2hb_heartbeat_group *group,
1959 char *page)
1960{
1961 return sprintf(page, "%u\n", o2hb_dead_threshold);
1962}
1963
1964static ssize_t o2hb_heartbeat_group_threshold_store(struct o2hb_heartbeat_group *group,
1965 const char *page,
1966 size_t count)
1967{
1968 unsigned long tmp;
1969 char *p = (char *)page;
1970
1971 tmp = simple_strtoul(p, &p, 10);
1972 if (!p || (*p && (*p != '\n')))
1973 return -EINVAL;
1974
1975 /* this will validate ranges for us. */
1976 o2hb_dead_threshold_set((unsigned int) tmp);
1977
1978 return count;
1979}
1980
Sunil Mushran54b51872010-10-07 15:26:08 -07001981static
1982ssize_t o2hb_heartbeat_group_mode_show(struct o2hb_heartbeat_group *group,
1983 char *page)
1984{
1985 return sprintf(page, "%s\n",
1986 o2hb_heartbeat_mode_desc[o2hb_heartbeat_mode]);
1987}
1988
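/*
 * Accept a heartbeat mode by name, matched case-insensitively against
 * o2hb_heartbeat_mode_desc[], with an optional trailing newline.
 */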
1989static
1990ssize_t o2hb_heartbeat_group_mode_store(struct o2hb_heartbeat_group *group,
1991 const char *page, size_t count)
1992{
1993 unsigned int i;
1994 int ret;
1995 size_t len;
1996
1997 len = (page[count - 1] == '\n') ? count - 1 : count;
1998 if (!len)
1999 return -EINVAL;
2000
2001 for (i = 0; i < O2HB_HEARTBEAT_NUM_MODES; ++i) {
2002 if (strnicmp(page, o2hb_heartbeat_mode_desc[i], len))
2003 continue;
2004
2005 ret = o2hb_global_hearbeat_mode_set(i);
2006 if (!ret)
Sunil Mushran18c50cb2010-10-06 18:26:59 -07002007 printk(KERN_NOTICE "o2hb: Heartbeat mode set to %s\n",
Sunil Mushran54b51872010-10-07 15:26:08 -07002008 o2hb_heartbeat_mode_desc[i]);
2009 return count;
2010 }
2011
2012 return -EINVAL;
2013
2014}
2015
Mark Fasheha7f6a5f2005-12-15 14:31:23 -08002016static struct o2hb_heartbeat_group_attribute o2hb_heartbeat_group_attr_threshold = {
2017 .attr = { .ca_owner = THIS_MODULE,
2018 .ca_name = "dead_threshold",
2019 .ca_mode = S_IRUGO | S_IWUSR },
2020 .show = o2hb_heartbeat_group_threshold_show,
2021 .store = o2hb_heartbeat_group_threshold_store,
2022};
2023
Sunil Mushran54b51872010-10-07 15:26:08 -07002024static struct o2hb_heartbeat_group_attribute o2hb_heartbeat_group_attr_mode = {
2025 .attr = { .ca_owner = THIS_MODULE,
2026 .ca_name = "mode",
2027 .ca_mode = S_IRUGO | S_IWUSR },
2028 .show = o2hb_heartbeat_group_mode_show,
2029 .store = o2hb_heartbeat_group_mode_store,
2030};
2031
Mark Fasheha7f6a5f2005-12-15 14:31:23 -08002032static struct configfs_attribute *o2hb_heartbeat_group_attrs[] = {
2033 &o2hb_heartbeat_group_attr_threshold.attr,
Sunil Mushran54b51872010-10-07 15:26:08 -07002034 &o2hb_heartbeat_group_attr_mode.attr,
Mark Fasheha7f6a5f2005-12-15 14:31:23 -08002035 NULL,
2036};
2037
2038static struct configfs_item_operations o2hb_hearbeat_group_item_ops = {
2039 .show_attribute = o2hb_heartbeat_group_show,
2040 .store_attribute = o2hb_heartbeat_group_store,
2041};
2042
2043static struct configfs_group_operations o2hb_heartbeat_group_group_ops = {
2044 .make_item = o2hb_heartbeat_group_make_item,
2045 .drop_item = o2hb_heartbeat_group_drop_item,
2046};
2047
2048static struct config_item_type o2hb_heartbeat_group_type = {
2049 .ct_group_ops = &o2hb_heartbeat_group_group_ops,
2050 .ct_item_ops = &o2hb_hearbeat_group_item_ops,
2051 .ct_attrs = o2hb_heartbeat_group_attrs,
2052 .ct_owner = THIS_MODULE,
2053};
2054
2055/* this is just here to avoid touching group in heartbeat.h which the
2056 * entire damn world #includes */
2057struct config_group *o2hb_alloc_hb_set(void)
2058{
2059 struct o2hb_heartbeat_group *hs = NULL;
2060 struct config_group *ret = NULL;
2061
Robert P. J. Daycd861282006-12-13 00:34:52 -08002062 hs = kzalloc(sizeof(struct o2hb_heartbeat_group), GFP_KERNEL);
Mark Fasheha7f6a5f2005-12-15 14:31:23 -08002063 if (hs == NULL)
2064 goto out;
2065
2066 config_group_init_type_name(&hs->hs_group, "heartbeat",
2067 &o2hb_heartbeat_group_type);
2068
2069 ret = &hs->hs_group;
2070out:
2071 if (ret == NULL)
2072 kfree(hs);
2073 return ret;
2074}
2075
2076void o2hb_free_hb_set(struct config_group *group)
2077{
2078 struct o2hb_heartbeat_group *hs = to_o2hb_heartbeat_group(group);
2079 kfree(hs);
2080}
2081
2082/* hb callback registration and issuing */
2083
2084static struct o2hb_callback *hbcall_from_type(enum o2hb_callback_type type)
2085{
2086 if (type == O2HB_NUM_CB)
2087 return ERR_PTR(-EINVAL);
2088
2089 return &o2hb_callbacks[type];
2090}
2091
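/*
 * Initialize a callback descriptor before registration: @func is invoked
 * with @data when heartbeat events of @type occur, and @priority orders it
 * relative to other callbacks of the same type.
 */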
2092void o2hb_setup_callback(struct o2hb_callback_func *hc,
2093 enum o2hb_callback_type type,
2094 o2hb_cb_func *func,
2095 void *data,
2096 int priority)
2097{
2098 INIT_LIST_HEAD(&hc->hc_item);
2099 hc->hc_func = func;
2100 hc->hc_data = data;
2101 hc->hc_priority = priority;
2102 hc->hc_type = type;
2103 hc->hc_magic = O2HB_CB_MAGIC;
2104}
2105EXPORT_SYMBOL_GPL(o2hb_setup_callback);
2106
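/* Look a region up by its configfs item name (the region UUID). The caller
 * must hold o2hb_live_lock. */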
Joel Becker14829422007-06-14 21:40:49 -07002107static struct o2hb_region *o2hb_find_region(const char *region_uuid)
2108{
2109 struct o2hb_region *p, *reg = NULL;
2110
2111 assert_spin_locked(&o2hb_live_lock);
2112
2113 list_for_each_entry(p, &o2hb_all_regions, hr_all_item) {
2114 if (!strcmp(region_uuid, config_item_name(&p->hr_item))) {
2115 reg = p;
2116 break;
2117 }
2118 }
2119
2120 return reg;
2121}
2122
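/*
 * Pin a region (and this node's configfs item) so that neither can be torn
 * down underneath a registered heartbeat callback; o2hb_region_put() drops
 * both dependencies.
 */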
2123static int o2hb_region_get(const char *region_uuid)
2124{
2125 int ret = 0;
2126 struct o2hb_region *reg;
2127
2128 spin_lock(&o2hb_live_lock);
2129
2130 reg = o2hb_find_region(region_uuid);
2131 if (!reg)
2132 ret = -ENOENT;
2133 spin_unlock(&o2hb_live_lock);
2134
Joel Becker16c6a4f2007-06-19 11:34:03 -07002135 if (ret)
2136 goto out;
Joel Becker14829422007-06-14 21:40:49 -07002137
Joel Becker16c6a4f2007-06-19 11:34:03 -07002138 ret = o2nm_depend_this_node();
2139 if (ret)
2140 goto out;
2141
2142 ret = o2nm_depend_item(&reg->hr_item);
2143 if (ret)
2144 o2nm_undepend_this_node();
2145
2146out:
Joel Becker14829422007-06-14 21:40:49 -07002147 return ret;
2148}
2149
2150static void o2hb_region_put(const char *region_uuid)
2151{
2152 struct o2hb_region *reg;
2153
2154 spin_lock(&o2hb_live_lock);
2155
2156 reg = o2hb_find_region(region_uuid);
2157
2158 spin_unlock(&o2hb_live_lock);
2159
Joel Becker16c6a4f2007-06-19 11:34:03 -07002160 if (reg) {
Joel Becker14829422007-06-14 21:40:49 -07002161 o2nm_undepend_item(&reg->hr_item);
Joel Becker16c6a4f2007-06-19 11:34:03 -07002162 o2nm_undepend_this_node();
2163 }
Joel Becker14829422007-06-14 21:40:49 -07002164}
2165
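/*
 * Add @hc to the list for its callback type, keeping the list sorted by
 * ascending hc_priority. If @region_uuid is given, the region is pinned
 * for the lifetime of the registration.
 */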
2166int o2hb_register_callback(const char *region_uuid,
2167 struct o2hb_callback_func *hc)
Mark Fasheha7f6a5f2005-12-15 14:31:23 -08002168{
2169 struct o2hb_callback_func *tmp;
2170 struct list_head *iter;
2171 struct o2hb_callback *hbcall;
2172 int ret;
2173
2174 BUG_ON(hc->hc_magic != O2HB_CB_MAGIC);
2175 BUG_ON(!list_empty(&hc->hc_item));
2176
2177 hbcall = hbcall_from_type(hc->hc_type);
2178 if (IS_ERR(hbcall)) {
2179 ret = PTR_ERR(hbcall);
2180 goto out;
2181 }
2182
Joel Becker14829422007-06-14 21:40:49 -07002183 if (region_uuid) {
2184 ret = o2hb_region_get(region_uuid);
2185 if (ret)
2186 goto out;
2187 }
2188
Mark Fasheha7f6a5f2005-12-15 14:31:23 -08002189 down_write(&o2hb_callback_sem);
2190
2191 list_for_each(iter, &hbcall->list) {
2192 tmp = list_entry(iter, struct o2hb_callback_func, hc_item);
2193 if (hc->hc_priority < tmp->hc_priority) {
2194 list_add_tail(&hc->hc_item, iter);
2195 break;
2196 }
2197 }
2198 if (list_empty(&hc->hc_item))
2199 list_add_tail(&hc->hc_item, &hbcall->list);
2200
2201 up_write(&o2hb_callback_sem);
2202 ret = 0;
2203out:
2204 mlog(ML_HEARTBEAT, "returning %d on behalf of %p for funcs %p\n",
2205 ret, __builtin_return_address(0), hc);
2206 return ret;
2207}
2208EXPORT_SYMBOL_GPL(o2hb_register_callback);
2209
Joel Becker14829422007-06-14 21:40:49 -07002210void o2hb_unregister_callback(const char *region_uuid,
2211 struct o2hb_callback_func *hc)
Mark Fasheha7f6a5f2005-12-15 14:31:23 -08002212{
2213 BUG_ON(hc->hc_magic != O2HB_CB_MAGIC);
2214
2215 mlog(ML_HEARTBEAT, "on behalf of %p for funcs %p\n",
2216 __builtin_return_address(0), hc);
2217
Joel Becker14829422007-06-14 21:40:49 -07002218 /* XXX Can this happen _with_ a region reference? */
Mark Fasheha7f6a5f2005-12-15 14:31:23 -08002219 if (list_empty(&hc->hc_item))
Joel Beckerc24f72c2007-02-03 03:14:30 -08002220 return;
Mark Fasheha7f6a5f2005-12-15 14:31:23 -08002221
Joel Becker14829422007-06-14 21:40:49 -07002222 if (region_uuid)
2223 o2hb_region_put(region_uuid);
2224
Mark Fasheha7f6a5f2005-12-15 14:31:23 -08002225 down_write(&o2hb_callback_sem);
2226
2227 list_del_init(&hc->hc_item);
2228
2229 up_write(&o2hb_callback_sem);
Mark Fasheha7f6a5f2005-12-15 14:31:23 -08002230}
2231EXPORT_SYMBOL_GPL(o2hb_unregister_callback);
2232
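/* Returns 1 if @node_num is currently seen heartbeating in any region,
 * 0 otherwise. */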
2233int o2hb_check_node_heartbeating(u8 node_num)
2234{
2235 unsigned long testing_map[BITS_TO_LONGS(O2NM_MAX_NODES)];
2236
2237 o2hb_fill_node_map(testing_map, sizeof(testing_map));
2238 if (!test_bit(node_num, testing_map)) {
2239 mlog(ML_HEARTBEAT,
2240 "node (%u) does not have heartbeating enabled.\n",
2241 node_num);
2242 return 0;
2243 }
2244
2245 return 1;
2246}
2247EXPORT_SYMBOL_GPL(o2hb_check_node_heartbeating);
2248
2249int o2hb_check_node_heartbeating_from_callback(u8 node_num)
2250{
2251 unsigned long testing_map[BITS_TO_LONGS(O2NM_MAX_NODES)];
2252
2253 o2hb_fill_node_map_from_callback(testing_map, sizeof(testing_map));
2254 if (!test_bit(node_num, testing_map)) {
2255 mlog(ML_HEARTBEAT,
2256 "node (%u) does not have heartbeating enabled.\n",
2257 node_num);
2258 return 0;
2259 }
2260
2261 return 1;
2262}
2263EXPORT_SYMBOL_GPL(o2hb_check_node_heartbeating_from_callback);
2264
2265/* Makes sure our local node is configured with a node number, and is
2266 * heartbeating. */
2267int o2hb_check_local_node_heartbeating(void)
2268{
2269 u8 node_num;
2270
2271 /* if this node was set then we have networking */
2272 node_num = o2nm_this_node();
2273 if (node_num == O2NM_MAX_NODES) {
2274 mlog(ML_HEARTBEAT, "this node has not been configured.\n");
2275 return 0;
2276 }
2277
2278 return o2hb_check_node_heartbeating(node_num);
2279}
2280EXPORT_SYMBOL_GPL(o2hb_check_local_node_heartbeating);
2281
2282/*
2283 * this is just a hack until we get the plumbing which flips file systems
2284 * read only and drops the hb ref instead of killing the node dead.
2285 */
2286void o2hb_stop_all_regions(void)
2287{
2288 struct o2hb_region *reg;
2289
2290 mlog(ML_ERROR, "stopping heartbeat on all active regions.\n");
2291
2292 spin_lock(&o2hb_live_lock);
2293
2294 list_for_each_entry(reg, &o2hb_all_regions, hr_all_item)
2295 reg->hr_unclean_stop = 1;
2296
2297 spin_unlock(&o2hb_live_lock);
2298}
2299EXPORT_SYMBOL_GPL(o2hb_stop_all_regions);
Sunil Mushranb3c85c42010-10-07 14:31:06 -07002300
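/*
 * Copy up to @max_regions region names, O2HB_MAX_REGION_NAME_LEN bytes
 * each, into @region_uuids. Returns the total number of configured
 * regions, which may exceed @max_regions.
 */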
2301int o2hb_get_all_regions(char *region_uuids, u8 max_regions)
2302{
2303 struct o2hb_region *reg;
2304 int numregs = 0;
2305 char *p;
2306
2307 spin_lock(&o2hb_live_lock);
2308
2309 p = region_uuids;
2310 list_for_each_entry(reg, &o2hb_all_regions, hr_all_item) {
2311 mlog(0, "Region: %s\n", config_item_name(&reg->hr_item));
2312 if (numregs < max_regions) {
2313 memcpy(p, config_item_name(&reg->hr_item),
2314 O2HB_MAX_REGION_NAME_LEN);
2315 p += O2HB_MAX_REGION_NAME_LEN;
2316 }
2317 numregs++;
2318 }
2319
2320 spin_unlock(&o2hb_live_lock);
2321
2322 return numregs;
2323}
2324EXPORT_SYMBOL_GPL(o2hb_get_all_regions);
2325
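/* Global heartbeat is not yet switched on at this point; always report
 * that only local heartbeat is active. */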
2326int o2hb_global_heartbeat_active(void)
2327{
2328 return 0;
2329}
2330EXPORT_SYMBOL(o2hb_global_heartbeat_active);