/* -*- mode: c; c-basic-offset: 8; -*-
 * vim: noexpandtab sw=8 ts=8 sts=0:
 *
 * dlmrecovery.c
 *
 * dlm recovery: remastering of locks held by dead nodes
 *
 * Copyright (C) 2004 Oracle.  All rights reserved.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public
 * License as published by the Free Software Foundation; either
 * version 2 of the License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License for more details.
 *
 * You should have received a copy of the GNU General Public
 * License along with this program; if not, write to the
 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
 * Boston, MA 02111-1307, USA.
 *
 */


#include <linux/module.h>
#include <linux/fs.h>
#include <linux/types.h>
#include <linux/slab.h>
#include <linux/highmem.h>
#include <linux/utsname.h>
#include <linux/init.h>
#include <linux/sysctl.h>
#include <linux/random.h>
#include <linux/blkdev.h>
#include <linux/socket.h>
#include <linux/inet.h>
#include <linux/timer.h>
#include <linux/kthread.h>
#include <linux/delay.h>


#include "cluster/heartbeat.h"
#include "cluster/nodemanager.h"
#include "cluster/tcp.h"

#include "dlmapi.h"
#include "dlmcommon.h"
#include "dlmdomain.h"

#define MLOG_MASK_PREFIX (ML_DLM|ML_DLM_RECOVERY)
#include "cluster/masklog.h"

static void dlm_do_local_recovery_cleanup(struct dlm_ctxt *dlm, u8 dead_node);

static int dlm_recovery_thread(void *data);
void dlm_complete_recovery_thread(struct dlm_ctxt *dlm);
int dlm_launch_recovery_thread(struct dlm_ctxt *dlm);
void dlm_kick_recovery_thread(struct dlm_ctxt *dlm);
static int dlm_do_recovery(struct dlm_ctxt *dlm);

static int dlm_pick_recovery_master(struct dlm_ctxt *dlm);
static int dlm_remaster_locks(struct dlm_ctxt *dlm, u8 dead_node);
static int dlm_init_recovery_area(struct dlm_ctxt *dlm, u8 dead_node);
static int dlm_request_all_locks(struct dlm_ctxt *dlm,
				 u8 request_from, u8 dead_node);
static void dlm_destroy_recovery_area(struct dlm_ctxt *dlm, u8 dead_node);

static inline int dlm_num_locks_in_lockres(struct dlm_lock_resource *res);
static void dlm_init_migratable_lockres(struct dlm_migratable_lockres *mres,
					const char *lockname, int namelen,
					int total_locks, u64 cookie,
					u8 flags, u8 master);
static int dlm_send_mig_lockres_msg(struct dlm_ctxt *dlm,
				    struct dlm_migratable_lockres *mres,
				    u8 send_to,
				    struct dlm_lock_resource *res,
				    int total_locks);
static int dlm_process_recovery_data(struct dlm_ctxt *dlm,
				     struct dlm_lock_resource *res,
				     struct dlm_migratable_lockres *mres);
static int dlm_send_finalize_reco_message(struct dlm_ctxt *dlm);
static int dlm_send_all_done_msg(struct dlm_ctxt *dlm,
				 u8 dead_node, u8 send_to);
static int dlm_send_begin_reco_message(struct dlm_ctxt *dlm, u8 dead_node);
static void dlm_move_reco_locks_to_list(struct dlm_ctxt *dlm,
					struct list_head *list, u8 dead_node);
static void dlm_finish_local_lockres_recovery(struct dlm_ctxt *dlm,
					      u8 dead_node, u8 new_master);
static void dlm_reco_ast(void *astdata);
static void dlm_reco_bast(void *astdata, int blocked_type);
static void dlm_reco_unlock_ast(void *astdata, enum dlm_status st);
static void dlm_request_all_locks_worker(struct dlm_work_item *item,
					 void *data);
static void dlm_mig_lockres_worker(struct dlm_work_item *item, void *data);
static int dlm_lockres_master_requery(struct dlm_ctxt *dlm,
				      struct dlm_lock_resource *res,
				      u8 *real_master);

static u64 dlm_get_next_mig_cookie(void);

static DEFINE_SPINLOCK(dlm_reco_state_lock);
static DEFINE_SPINLOCK(dlm_mig_cookie_lock);
static u64 dlm_mig_cookie = 1;

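/* hand out a migration cookie from a global counter.  the counter
 * starts at 1 and wraps from ~0ULL back to 1, so a zero cookie is
 * never issued. */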
static u64 dlm_get_next_mig_cookie(void)
{
	u64 c;
	spin_lock(&dlm_mig_cookie_lock);
	c = dlm_mig_cookie;
	if (dlm_mig_cookie == (~0ULL))
		dlm_mig_cookie = 1;
	else
		dlm_mig_cookie++;
	spin_unlock(&dlm_mig_cookie_lock);
	return c;
}

static inline void dlm_set_reco_dead_node(struct dlm_ctxt *dlm,
					  u8 dead_node)
{
	assert_spin_locked(&dlm->spinlock);
	if (dlm->reco.dead_node != dead_node)
		mlog(0, "%s: changing dead_node from %u to %u\n",
		     dlm->name, dlm->reco.dead_node, dead_node);
	dlm->reco.dead_node = dead_node;
}

static inline void dlm_set_reco_master(struct dlm_ctxt *dlm,
				       u8 master)
{
	assert_spin_locked(&dlm->spinlock);
	mlog(0, "%s: changing new_master from %u to %u\n",
	     dlm->name, dlm->reco.new_master, master);
	dlm->reco.new_master = master;
}

static inline void __dlm_reset_recovery(struct dlm_ctxt *dlm)
{
	assert_spin_locked(&dlm->spinlock);
	clear_bit(dlm->reco.dead_node, dlm->recovery_map);
	dlm_set_reco_dead_node(dlm, O2NM_INVALID_NODE_NUM);
	dlm_set_reco_master(dlm, O2NM_INVALID_NODE_NUM);
}

static inline void dlm_reset_recovery(struct dlm_ctxt *dlm)
{
	spin_lock(&dlm->spinlock);
	__dlm_reset_recovery(dlm);
	spin_unlock(&dlm->spinlock);
}

/* Worker function used during recovery. */
void dlm_dispatch_work(struct work_struct *work)
{
	struct dlm_ctxt *dlm =
		container_of(work, struct dlm_ctxt, dispatched_work);
	LIST_HEAD(tmp_list);
	struct list_head *iter, *iter2;
	struct dlm_work_item *item;
	dlm_workfunc_t *workfunc;
	int tot = 0;

	spin_lock(&dlm->work_lock);
	list_splice_init(&dlm->work_list, &tmp_list);
	spin_unlock(&dlm->work_lock);

	list_for_each_safe(iter, iter2, &tmp_list) {
		tot++;
	}
	mlog(0, "%s: work thread has %d work items\n", dlm->name, tot);

	list_for_each_safe(iter, iter2, &tmp_list) {
		item = list_entry(iter, struct dlm_work_item, list);
		workfunc = item->func;
		list_del_init(&item->list);

		/* already have ref on dlm to avoid having
		 * it disappear.  just double-check. */
		BUG_ON(item->dlm != dlm);

		/* this is allowed to sleep and
		 * call network stuff */
		workfunc(item, item->data);

		dlm_put(dlm);
		kfree(item);
	}
}

/*
 * RECOVERY THREAD
 */

void dlm_kick_recovery_thread(struct dlm_ctxt *dlm)
{
	/* wake the recovery thread
	 * this will wake the reco thread in one of three places
	 * 1) sleeping with no recovery happening
	 * 2) sleeping with recovery mastered elsewhere
	 * 3) recovery mastered here, waiting on reco data */

	wake_up(&dlm->dlm_reco_thread_wq);
}

/* Launch the recovery thread */
int dlm_launch_recovery_thread(struct dlm_ctxt *dlm)
{
	mlog(0, "starting dlm recovery thread...\n");

	dlm->dlm_reco_thread_task = kthread_run(dlm_recovery_thread, dlm,
						"dlm_reco_thread");
	if (IS_ERR(dlm->dlm_reco_thread_task)) {
		mlog_errno(PTR_ERR(dlm->dlm_reco_thread_task));
		dlm->dlm_reco_thread_task = NULL;
		return -EINVAL;
	}

	return 0;
}

void dlm_complete_recovery_thread(struct dlm_ctxt *dlm)
{
	if (dlm->dlm_reco_thread_task) {
		mlog(0, "waiting for dlm recovery thread to exit\n");
		kthread_stop(dlm->dlm_reco_thread_task);
		dlm->dlm_reco_thread_task = NULL;
	}
}


/*
 * this is lame, but here's how recovery works...
 * 1) all recovery threads cluster wide will work on recovering
 *    ONE node at a time
 * 2) negotiate who will take over all the locks for the dead node.
 *    that's right... ALL the locks.
 * 3) once a new master is chosen, everyone scans all locks
 *    and moves aside those mastered by the dead node
 * 4) each of these locks should be locked until recovery is done
 * 5) the new master collects up all of the secondary lock queue info
 *    one lock at a time, forcing each node to communicate back
 *    before continuing
 * 6) each secondary lock queue responds with the full known lock info
 * 7) once the new master has run all its locks, it sends an ALLDONE!
 *    message to everyone
 * 8) upon receiving this message, the secondary queue node unlocks
 *    and responds to the ALLDONE
 * 9) once the new master gets responses from everyone, it unlocks
 *    everything and recovery for this dead node is done
 *10) go back to 2) while there are still dead nodes
 *
 */
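/*
 * Illustrative sketch (summarized from the handlers in this file): on
 * the recovery master, each live node's dlm_reco_node_data moves
 * through these states while the steps above run:
 *
 *   INIT -> REQUESTING -> REQUESTED -> RECEIVING -> DONE -> FINALIZE_SENT
 *              \
 *               -> DEAD  (the queried node died; its info is skipped)
 *
 * FINALIZE_SENT is set by the finalize message path, which lives
 * outside this excerpt.
 */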

static void dlm_print_reco_node_status(struct dlm_ctxt *dlm)
{
	struct dlm_reco_node_data *ndata;
	struct dlm_lock_resource *res;

	mlog(ML_NOTICE, "%s(%d): recovery info, state=%s, dead=%u, master=%u\n",
	     dlm->name, dlm->dlm_reco_thread_task->pid,
	     dlm->reco.state & DLM_RECO_STATE_ACTIVE ? "ACTIVE" : "inactive",
	     dlm->reco.dead_node, dlm->reco.new_master);

	list_for_each_entry(ndata, &dlm->reco.node_data, list) {
		char *st = "unknown";
		switch (ndata->state) {
		case DLM_RECO_NODE_DATA_INIT:
			st = "init";
			break;
		case DLM_RECO_NODE_DATA_REQUESTING:
			st = "requesting";
			break;
		case DLM_RECO_NODE_DATA_DEAD:
			st = "dead";
			break;
		case DLM_RECO_NODE_DATA_RECEIVING:
			st = "receiving";
			break;
		case DLM_RECO_NODE_DATA_REQUESTED:
			st = "requested";
			break;
		case DLM_RECO_NODE_DATA_DONE:
			st = "done";
			break;
		case DLM_RECO_NODE_DATA_FINALIZE_SENT:
			st = "finalize-sent";
			break;
		default:
			st = "bad";
			break;
		}
		mlog(ML_NOTICE, "%s: reco state, node %u, state=%s\n",
		     dlm->name, ndata->node_num, st);
	}
	list_for_each_entry(res, &dlm->reco.resources, recovering) {
		mlog(ML_NOTICE, "%s: lockres %.*s on recovering list\n",
		     dlm->name, res->lockname.len, res->lockname.name);
	}
}

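/* interval after which the sleeping recovery thread wakes up and
 * re-checks for dead nodes; also the poll interval used while the
 * recovery master waits for node data below */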
#define DLM_RECO_THREAD_TIMEOUT_MS (5 * 1000)

static int dlm_recovery_thread(void *data)
{
	int status;
	struct dlm_ctxt *dlm = data;
	unsigned long timeout = msecs_to_jiffies(DLM_RECO_THREAD_TIMEOUT_MS);

	mlog(0, "dlm thread running for %s...\n", dlm->name);

	while (!kthread_should_stop()) {
		if (dlm_joined(dlm)) {
			status = dlm_do_recovery(dlm);
			if (status == -EAGAIN) {
				/* do not sleep, recheck immediately. */
				continue;
			}
			if (status < 0)
				mlog_errno(status);
		}

		wait_event_interruptible_timeout(dlm->dlm_reco_thread_wq,
						 kthread_should_stop(),
						 timeout);
	}

	mlog(0, "quitting DLM recovery thread\n");
	return 0;
}

/* returns true when the recovery master has contacted us */
static int dlm_reco_master_ready(struct dlm_ctxt *dlm)
{
	int ready;
	spin_lock(&dlm->spinlock);
	ready = (dlm->reco.new_master != O2NM_INVALID_NODE_NUM);
	spin_unlock(&dlm->spinlock);
	return ready;
}

/* returns true if node is no longer in the domain
 * could be dead or just not joined */
int dlm_is_node_dead(struct dlm_ctxt *dlm, u8 node)
{
	int dead;
	spin_lock(&dlm->spinlock);
	dead = !test_bit(node, dlm->domain_map);
	spin_unlock(&dlm->spinlock);
	return dead;
}

/* returns true once the node has been cleared from the recovery map,
 * i.e. recovery of it has completed (or it was never being recovered) */
static int dlm_is_node_recovered(struct dlm_ctxt *dlm, u8 node)
{
	int recovered;
	spin_lock(&dlm->spinlock);
	recovered = !test_bit(node, dlm->recovery_map);
	spin_unlock(&dlm->spinlock);
	return recovered;
}


int dlm_wait_for_node_death(struct dlm_ctxt *dlm, u8 node, int timeout)
{
	if (timeout) {
		mlog(ML_NOTICE, "%s: waiting %dms for notification of "
		     "death of node %u\n", dlm->name, timeout, node);
		wait_event_timeout(dlm->dlm_reco_thread_wq,
				   dlm_is_node_dead(dlm, node),
				   msecs_to_jiffies(timeout));
	} else {
		mlog(ML_NOTICE, "%s: waiting indefinitely for notification "
		     "of death of node %u\n", dlm->name, node);
		wait_event(dlm->dlm_reco_thread_wq,
			   dlm_is_node_dead(dlm, node));
	}
	/* for now, return 0 */
	return 0;
}

int dlm_wait_for_node_recovery(struct dlm_ctxt *dlm, u8 node, int timeout)
{
	if (timeout) {
		mlog(0, "%s: waiting %dms for notification of "
		     "recovery of node %u\n", dlm->name, timeout, node);
		wait_event_timeout(dlm->dlm_reco_thread_wq,
				   dlm_is_node_recovered(dlm, node),
				   msecs_to_jiffies(timeout));
	} else {
		mlog(0, "%s: waiting indefinitely for notification "
		     "of recovery of node %u\n", dlm->name, node);
		wait_event(dlm->dlm_reco_thread_wq,
			   dlm_is_node_recovered(dlm, node));
	}
	/* for now, return 0 */
	return 0;
}

/* callers of the top-level api calls (dlmlock/dlmunlock) should
 * block on the dlm->reco.event when recovery is in progress.
 * the dlm recovery thread will set this state when it begins
 * recovering a dead node (as the new master or not) and clear
 * the state and wake as soon as all affected lock resources have
 * been marked with the RECOVERY flag */
static int dlm_in_recovery(struct dlm_ctxt *dlm)
{
	int in_recovery;
	spin_lock(&dlm->spinlock);
	in_recovery = !!(dlm->reco.state & DLM_RECO_STATE_ACTIVE);
	spin_unlock(&dlm->spinlock);
	return in_recovery;
}


void dlm_wait_for_recovery(struct dlm_ctxt *dlm)
{
	if (dlm_in_recovery(dlm)) {
		mlog(0, "%s: reco thread %d in recovery: "
		     "state=%d, master=%u, dead=%u\n",
		     dlm->name, dlm->dlm_reco_thread_task->pid,
		     dlm->reco.state, dlm->reco.new_master,
		     dlm->reco.dead_node);
	}
	wait_event(dlm->reco.event, !dlm_in_recovery(dlm));
}

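/* flag recovery as active; new dlmlock/dlmunlock callers will block
 * in dlm_wait_for_recovery() until dlm_end_recovery() clears the
 * flag and wakes them */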
static void dlm_begin_recovery(struct dlm_ctxt *dlm)
{
	spin_lock(&dlm->spinlock);
	BUG_ON(dlm->reco.state & DLM_RECO_STATE_ACTIVE);
	dlm->reco.state |= DLM_RECO_STATE_ACTIVE;
	spin_unlock(&dlm->spinlock);
}

static void dlm_end_recovery(struct dlm_ctxt *dlm)
{
	spin_lock(&dlm->spinlock);
	BUG_ON(!(dlm->reco.state & DLM_RECO_STATE_ACTIVE));
	dlm->reco.state &= ~DLM_RECO_STATE_ACTIVE;
	spin_unlock(&dlm->spinlock);
	wake_up(&dlm->reco.event);
}

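/* one pass of the recovery state machine.  returns 0 when there is
 * nothing to recover or another node is mastering the session (the
 * thread goes back to sleep), and -EAGAIN after mastering a recovery
 * pass here, so the caller rechecks immediately for more dead nodes. */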
static int dlm_do_recovery(struct dlm_ctxt *dlm)
{
	int status = 0;
	int ret;

	spin_lock(&dlm->spinlock);

	/* check to see if the new master has died */
	if (dlm->reco.new_master != O2NM_INVALID_NODE_NUM &&
	    test_bit(dlm->reco.new_master, dlm->recovery_map)) {
		mlog(0, "new master %u died while recovering %u!\n",
		     dlm->reco.new_master, dlm->reco.dead_node);
		/* unset the new_master, leave dead_node */
		dlm_set_reco_master(dlm, O2NM_INVALID_NODE_NUM);
	}

	/* select a target to recover */
	if (dlm->reco.dead_node == O2NM_INVALID_NODE_NUM) {
		int bit;

		bit = find_next_bit(dlm->recovery_map, O2NM_MAX_NODES+1, 0);
		if (bit >= O2NM_MAX_NODES || bit < 0)
			dlm_set_reco_dead_node(dlm, O2NM_INVALID_NODE_NUM);
		else
			dlm_set_reco_dead_node(dlm, bit);
	} else if (!test_bit(dlm->reco.dead_node, dlm->recovery_map)) {
		/* BUG? */
		mlog(ML_ERROR, "dead_node %u no longer in recovery map!\n",
		     dlm->reco.dead_node);
		dlm_set_reco_dead_node(dlm, O2NM_INVALID_NODE_NUM);
	}

	if (dlm->reco.dead_node == O2NM_INVALID_NODE_NUM) {
		// mlog(0, "nothing to recover!  sleeping now!\n");
		spin_unlock(&dlm->spinlock);
		/* return to main thread loop and sleep. */
		return 0;
	}
	mlog(0, "%s(%d):recovery thread found node %u in the recovery map!\n",
	     dlm->name, dlm->dlm_reco_thread_task->pid,
	     dlm->reco.dead_node);
	spin_unlock(&dlm->spinlock);

	/* take write barrier */
	/* (stops the list reshuffling thread, proxy ast handling) */
	dlm_begin_recovery(dlm);

	if (dlm->reco.new_master == dlm->node_num)
		goto master_here;

	if (dlm->reco.new_master == O2NM_INVALID_NODE_NUM) {
		/* choose a new master, returns 0 if this node
		 * is the master, -EEXIST if it's another node.
		 * this does not return until a new master is chosen
		 * or recovery completes entirely. */
		ret = dlm_pick_recovery_master(dlm);
		if (!ret) {
			/* already notified everyone.  go. */
			goto master_here;
		}
		mlog(0, "another node will master this recovery session.\n");
	}
	mlog(0, "dlm=%s (%d), new_master=%u, this node=%u, dead_node=%u\n",
	     dlm->name, dlm->dlm_reco_thread_task->pid, dlm->reco.new_master,
	     dlm->node_num, dlm->reco.dead_node);

	/* it is safe to start everything back up here
	 * because all of the dead node's lock resources
	 * have been marked as in-recovery */
	dlm_end_recovery(dlm);

	/* sleep out in main dlm_recovery_thread loop. */
	return 0;

master_here:
	mlog(0, "(%d) mastering recovery of %s:%u here(this=%u)!\n",
	     dlm->dlm_reco_thread_task->pid,
	     dlm->name, dlm->reco.dead_node, dlm->node_num);

	status = dlm_remaster_locks(dlm, dlm->reco.dead_node);
	if (status < 0) {
		/* we should never hit this anymore */
		mlog(ML_ERROR, "error %d remastering locks for node %u, "
		     "retrying.\n", status, dlm->reco.dead_node);
		/* yield a bit to allow any final network messages
		 * to get handled on remaining nodes */
		msleep(100);
	} else {
		/* success!  see if any other nodes need recovery */
		mlog(0, "DONE mastering recovery of %s:%u here(this=%u)!\n",
		     dlm->name, dlm->reco.dead_node, dlm->node_num);
		dlm_reset_recovery(dlm);
	}
	dlm_end_recovery(dlm);

	/* continue and look for another dead node */
	return -EAGAIN;
}

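/* runs only on the node that has become recovery master.  pulls the
 * dead node's lock state from every live node, waits until all of
 * them report DONE, then broadcasts the finalize message and marks
 * the local lock resources as remastered here. */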
static int dlm_remaster_locks(struct dlm_ctxt *dlm, u8 dead_node)
{
	int status = 0;
	struct dlm_reco_node_data *ndata;
	struct list_head *iter;
	int all_nodes_done;
	int destroy = 0;
	int pass = 0;

	do {
		/* we have become recovery master.  there is no escaping
		 * this, so just keep trying until we get it. */
		status = dlm_init_recovery_area(dlm, dead_node);
		if (status < 0) {
			mlog(ML_ERROR, "%s: failed to alloc recovery area, "
			     "retrying\n", dlm->name);
			msleep(1000);
		}
	} while (status != 0);

	/* safe to access the node data list without a lock, since this
	 * process is the only one to change the list */
	list_for_each(iter, &dlm->reco.node_data) {
		ndata = list_entry(iter, struct dlm_reco_node_data, list);
		BUG_ON(ndata->state != DLM_RECO_NODE_DATA_INIT);
		ndata->state = DLM_RECO_NODE_DATA_REQUESTING;

		mlog(0, "requesting lock info from node %u\n",
		     ndata->node_num);

		if (ndata->node_num == dlm->node_num) {
			ndata->state = DLM_RECO_NODE_DATA_DONE;
			continue;
		}

		do {
			status = dlm_request_all_locks(dlm, ndata->node_num,
						       dead_node);
			if (status < 0) {
				mlog_errno(status);
				if (dlm_is_host_down(status)) {
					/* node died, ignore it for recovery */
					status = 0;
					ndata->state = DLM_RECO_NODE_DATA_DEAD;
					/* wait for the domain map to catch up
					 * with the network state. */
					wait_event_timeout(dlm->dlm_reco_thread_wq,
							   dlm_is_node_dead(dlm,
								ndata->node_num),
							   msecs_to_jiffies(1000));
					mlog(0, "waited 1 sec for %u, "
					     "dead? %s\n", ndata->node_num,
					     dlm_is_node_dead(dlm, ndata->node_num) ?
					     "yes" : "no");
				} else {
					/* -ENOMEM on the other node */
					mlog(0, "%s: node %u returned "
					     "%d during recovery, retrying "
					     "after a short wait\n",
					     dlm->name, ndata->node_num,
					     status);
					msleep(100);
				}
			}
		} while (status != 0);

		switch (ndata->state) {
		case DLM_RECO_NODE_DATA_INIT:
		case DLM_RECO_NODE_DATA_FINALIZE_SENT:
		case DLM_RECO_NODE_DATA_REQUESTED:
			BUG();
			break;
		case DLM_RECO_NODE_DATA_DEAD:
			mlog(0, "node %u died after requesting "
			     "recovery info for node %u\n",
			     ndata->node_num, dead_node);
			/* fine.  don't need this node's info.
			 * continue without it. */
			break;
		case DLM_RECO_NODE_DATA_REQUESTING:
			ndata->state = DLM_RECO_NODE_DATA_REQUESTED;
			mlog(0, "now receiving recovery data from "
			     "node %u for dead node %u\n",
			     ndata->node_num, dead_node);
			break;
		case DLM_RECO_NODE_DATA_RECEIVING:
			mlog(0, "already receiving recovery data from "
			     "node %u for dead node %u\n",
			     ndata->node_num, dead_node);
			break;
		case DLM_RECO_NODE_DATA_DONE:
			mlog(0, "already DONE receiving recovery data "
			     "from node %u for dead node %u\n",
			     ndata->node_num, dead_node);
			break;
		}
	}

	mlog(0, "done requesting all lock info\n");

	/* nodes should be sending reco data now
	 * just need to wait */

	while (1) {
		/* check all the nodes now to see if we are
		 * done, or if anyone died */
		all_nodes_done = 1;
		spin_lock(&dlm_reco_state_lock);
		list_for_each(iter, &dlm->reco.node_data) {
			ndata = list_entry(iter, struct dlm_reco_node_data, list);

			mlog(0, "checking recovery state of node %u\n",
			     ndata->node_num);
			switch (ndata->state) {
			case DLM_RECO_NODE_DATA_INIT:
			case DLM_RECO_NODE_DATA_REQUESTING:
				mlog(ML_ERROR, "bad ndata state for "
				     "node %u: state=%d\n",
				     ndata->node_num, ndata->state);
				BUG();
				break;
			case DLM_RECO_NODE_DATA_DEAD:
				mlog(0, "node %u died after "
				     "requesting recovery info for "
				     "node %u\n", ndata->node_num,
				     dead_node);
				break;
			case DLM_RECO_NODE_DATA_RECEIVING:
			case DLM_RECO_NODE_DATA_REQUESTED:
				mlog(0, "%s: node %u still in state %s\n",
				     dlm->name, ndata->node_num,
				     ndata->state == DLM_RECO_NODE_DATA_RECEIVING ?
				     "receiving" : "requested");
				all_nodes_done = 0;
				break;
			case DLM_RECO_NODE_DATA_DONE:
				mlog(0, "%s: node %u state is done\n",
				     dlm->name, ndata->node_num);
				break;
			case DLM_RECO_NODE_DATA_FINALIZE_SENT:
				mlog(0, "%s: node %u state is finalize\n",
				     dlm->name, ndata->node_num);
				break;
			}
		}
		spin_unlock(&dlm_reco_state_lock);

		mlog(0, "pass #%d, all_nodes_done?: %s\n", ++pass,
		     all_nodes_done ? "yes" : "no");
		if (all_nodes_done) {
			int ret;

			/* all nodes are now in DLM_RECO_NODE_DATA_DONE state
			 * just send a finalize message to everyone and
			 * clean up */
			mlog(0, "all nodes are done! send finalize\n");
			ret = dlm_send_finalize_reco_message(dlm);
			if (ret < 0)
				mlog_errno(ret);

			spin_lock(&dlm->spinlock);
			dlm_finish_local_lockres_recovery(dlm, dead_node,
							  dlm->node_num);
			spin_unlock(&dlm->spinlock);
			mlog(0, "should be done with recovery!\n");

			mlog(0, "finishing recovery of %s at %lu, "
			     "dead=%u, this=%u, new=%u\n", dlm->name,
			     jiffies, dlm->reco.dead_node,
			     dlm->node_num, dlm->reco.new_master);
			destroy = 1;
			status = 0;
			/* rescan everything marked dirty along the way */
			dlm_kick_thread(dlm, NULL);
			break;
		}
		/* wait to be signalled, with periodic timeout
		 * to check for node death */
		wait_event_interruptible_timeout(dlm->dlm_reco_thread_wq,
						 kthread_should_stop(),
						 msecs_to_jiffies(DLM_RECO_THREAD_TIMEOUT_MS));

	}

	if (destroy)
		dlm_destroy_recovery_area(dlm, dead_node);

	mlog_exit(status);
	return status;
}

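/* allocate a dlm_reco_node_data, in INIT state, for every node still
 * set in the domain map.  the dead node must already have been
 * removed from that map, hence the BUG_ON below. */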
static int dlm_init_recovery_area(struct dlm_ctxt *dlm, u8 dead_node)
{
	int num = 0;
	struct dlm_reco_node_data *ndata;

	spin_lock(&dlm->spinlock);
	memcpy(dlm->reco.node_map, dlm->domain_map, sizeof(dlm->domain_map));
	/* nodes can only be removed (by dying) after dropping
	 * this lock, and death will be trapped later, so this should do */
	spin_unlock(&dlm->spinlock);

	while (1) {
		num = find_next_bit(dlm->reco.node_map, O2NM_MAX_NODES, num);
		if (num >= O2NM_MAX_NODES) {
			break;
		}
		BUG_ON(num == dead_node);

		ndata = kzalloc(sizeof(*ndata), GFP_NOFS);
		if (!ndata) {
			dlm_destroy_recovery_area(dlm, dead_node);
			return -ENOMEM;
		}
		ndata->node_num = num;
		ndata->state = DLM_RECO_NODE_DATA_INIT;
		spin_lock(&dlm_reco_state_lock);
		list_add_tail(&ndata->list, &dlm->reco.node_data);
		spin_unlock(&dlm_reco_state_lock);
		num++;
	}

	return 0;
}

static void dlm_destroy_recovery_area(struct dlm_ctxt *dlm, u8 dead_node)
{
	struct list_head *iter, *iter2;
	struct dlm_reco_node_data *ndata;
	LIST_HEAD(tmplist);

	spin_lock(&dlm_reco_state_lock);
	list_splice_init(&dlm->reco.node_data, &tmplist);
	spin_unlock(&dlm_reco_state_lock);

	list_for_each_safe(iter, iter2, &tmplist) {
		ndata = list_entry(iter, struct dlm_reco_node_data, list);
		list_del_init(&ndata->list);
		kfree(ndata);
	}
}

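/* ask request_from to send us (the recovery master) everything it
 * knows about locks for dead_node.  the reply only acknowledges the
 * request; the lock data itself arrives asynchronously via
 * DLM_MIG_LOCKRES_MSG messages from that node's worker. */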
static int dlm_request_all_locks(struct dlm_ctxt *dlm, u8 request_from,
				 u8 dead_node)
{
	struct dlm_lock_request lr;
	enum dlm_status ret;

	mlog(0, "\n");


	mlog(0, "dlm_request_all_locks: dead node is %u, sending request "
	     "to %u\n", dead_node, request_from);

	memset(&lr, 0, sizeof(lr));
	lr.node_idx = dlm->node_num;
	lr.dead_node = dead_node;

	/* send message */
	ret = DLM_NOLOCKMGR;
	ret = o2net_send_message(DLM_LOCK_REQUEST_MSG, dlm->key,
				 &lr, sizeof(lr), request_from, NULL);

	/* negative status is handled by caller */
	if (ret < 0)
		mlog_errno(ret);

	/* return from here, then
	 * sleep until all received or error */
	return ret;

}

int dlm_request_all_locks_handler(struct o2net_msg *msg, u32 len, void *data,
				  void **ret_data)
{
	struct dlm_ctxt *dlm = data;
	struct dlm_lock_request *lr = (struct dlm_lock_request *)msg->buf;
	char *buf = NULL;
	struct dlm_work_item *item = NULL;

	if (!dlm_grab(dlm))
		return -EINVAL;

	if (lr->dead_node != dlm->reco.dead_node) {
		mlog(ML_ERROR, "%s: node %u sent dead_node=%u, but local "
		     "dead_node is %u\n", dlm->name, lr->node_idx,
		     lr->dead_node, dlm->reco.dead_node);
		dlm_print_reco_node_status(dlm);
		/* this is a hack */
		dlm_put(dlm);
		return -ENOMEM;
	}
	BUG_ON(lr->dead_node != dlm->reco.dead_node);

	item = kzalloc(sizeof(*item), GFP_NOFS);
	if (!item) {
		dlm_put(dlm);
		return -ENOMEM;
	}

	/* this will get freed by dlm_request_all_locks_worker */
	buf = (char *) __get_free_page(GFP_NOFS);
	if (!buf) {
		kfree(item);
		dlm_put(dlm);
		return -ENOMEM;
	}

	/* queue up work for dlm_request_all_locks_worker */
	dlm_grab(dlm);  /* get an extra ref for the work item */
	dlm_init_work_item(dlm, item, dlm_request_all_locks_worker, buf);
	item->u.ral.reco_master = lr->node_idx;
	item->u.ral.dead_node = lr->dead_node;
	spin_lock(&dlm->work_lock);
	list_add_tail(&item->list, &dlm->work_list);
	spin_unlock(&dlm->work_lock);
	queue_work(dlm->dlm_worker, &dlm->dispatched_work);

	dlm_put(dlm);
	return 0;
}

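/* runs in dlm_worker context on the node that was asked for its lock
 * state.  walks the recovering lockres list and streams each lockres
 * that the dead node owned (or whose owner is unknown) to the
 * recovery master, one page-sized message at a time. */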
static void dlm_request_all_locks_worker(struct dlm_work_item *item, void *data)
{
	struct dlm_migratable_lockres *mres;
	struct dlm_lock_resource *res;
	struct dlm_ctxt *dlm;
	LIST_HEAD(resources);
	struct list_head *iter;
	int ret;
	u8 dead_node, reco_master;
	int skip_all_done = 0;

	dlm = item->dlm;
	dead_node = item->u.ral.dead_node;
	reco_master = item->u.ral.reco_master;
	mres = (struct dlm_migratable_lockres *)data;

	mlog(0, "%s: recovery worker started, dead=%u, master=%u\n",
	     dlm->name, dead_node, reco_master);

	if (dead_node != dlm->reco.dead_node ||
	    reco_master != dlm->reco.new_master) {
		/* worker could have been created before the recovery master
		 * died.  if so, do not continue, but do not error. */
		if (dlm->reco.new_master == O2NM_INVALID_NODE_NUM) {
			mlog(ML_NOTICE, "%s: will not send recovery state, "
			     "recovery master %u died, thread=(dead=%u,mas=%u)"
			     " current=(dead=%u,mas=%u)\n", dlm->name,
			     reco_master, dead_node, reco_master,
			     dlm->reco.dead_node, dlm->reco.new_master);
		} else {
			mlog(ML_NOTICE, "%s: reco state invalid: reco(dead=%u, "
			     "master=%u), request(dead=%u, master=%u)\n",
			     dlm->name, dlm->reco.dead_node,
			     dlm->reco.new_master, dead_node, reco_master);
		}
		goto leave;
	}

	/* lock resources should have already been moved to the
	 * dlm->reco.resources list.  now move items from that list
	 * to a temp list if the dead owner matches.  note that the
	 * whole cluster recovers only one node at a time, so we
	 * can safely move UNKNOWN lock resources for each recovery
	 * session. */
	dlm_move_reco_locks_to_list(dlm, &resources, dead_node);

	/* now we can begin blasting lockreses without the dlm lock */

	/* any errors returned will be due to the new_master dying,
	 * the dlm_reco_thread should detect this */
	list_for_each(iter, &resources) {
		res = list_entry(iter, struct dlm_lock_resource, recovering);
		ret = dlm_send_one_lockres(dlm, res, mres, reco_master,
					   DLM_MRES_RECOVERY);
		if (ret < 0) {
			mlog(ML_ERROR, "%s: node %u went down while sending "
			     "recovery state for dead node %u, ret=%d\n", dlm->name,
			     reco_master, dead_node, ret);
			skip_all_done = 1;
			break;
		}
	}

	/* move the resources back to the list */
	spin_lock(&dlm->spinlock);
	list_splice_init(&resources, &dlm->reco.resources);
	spin_unlock(&dlm->spinlock);

	if (!skip_all_done) {
		ret = dlm_send_all_done_msg(dlm, dead_node, reco_master);
		if (ret < 0) {
			mlog(ML_ERROR, "%s: node %u went down while sending "
			     "recovery all-done for dead node %u, ret=%d\n",
			     dlm->name, reco_master, dead_node, ret);
		}
	}
leave:
	free_page((unsigned long)data);
}


static int dlm_send_all_done_msg(struct dlm_ctxt *dlm, u8 dead_node, u8 send_to)
{
	int ret, tmpret;
	struct dlm_reco_data_done done_msg;

	memset(&done_msg, 0, sizeof(done_msg));
	done_msg.node_idx = dlm->node_num;
	done_msg.dead_node = dead_node;
	mlog(0, "sending DATA DONE message to %u, "
	     "my node=%u, dead node=%u\n", send_to, done_msg.node_idx,
	     done_msg.dead_node);

	ret = o2net_send_message(DLM_RECO_DATA_DONE_MSG, dlm->key, &done_msg,
				 sizeof(done_msg), send_to, &tmpret);
	if (ret < 0) {
		if (!dlm_is_host_down(ret)) {
			mlog_errno(ret);
			mlog(ML_ERROR, "%s: unknown error sending data-done "
			     "to %u\n", dlm->name, send_to);
			BUG();
		}
	} else
		ret = tmpret;
	return ret;
}


int dlm_reco_data_done_handler(struct o2net_msg *msg, u32 len, void *data,
			       void **ret_data)
{
	struct dlm_ctxt *dlm = data;
	struct dlm_reco_data_done *done = (struct dlm_reco_data_done *)msg->buf;
	struct list_head *iter;
	struct dlm_reco_node_data *ndata = NULL;
	int ret = -EINVAL;

	if (!dlm_grab(dlm))
		return -EINVAL;

	mlog(0, "got DATA DONE: dead_node=%u, reco.dead_node=%u, "
	     "node_idx=%u, this node=%u\n", done->dead_node,
	     dlm->reco.dead_node, done->node_idx, dlm->node_num);

	mlog_bug_on_msg((done->dead_node != dlm->reco.dead_node),
			"Got DATA DONE: dead_node=%u, reco.dead_node=%u, "
			"node_idx=%u, this node=%u\n", done->dead_node,
			dlm->reco.dead_node, done->node_idx, dlm->node_num);

	spin_lock(&dlm_reco_state_lock);
	list_for_each(iter, &dlm->reco.node_data) {
		ndata = list_entry(iter, struct dlm_reco_node_data, list);
		if (ndata->node_num != done->node_idx)
			continue;

		switch (ndata->state) {
		/* should have moved beyond INIT but not to FINALIZE yet */
		case DLM_RECO_NODE_DATA_INIT:
		case DLM_RECO_NODE_DATA_DEAD:
		case DLM_RECO_NODE_DATA_FINALIZE_SENT:
			mlog(ML_ERROR, "bad ndata state for node %u:"
			     " state=%d\n", ndata->node_num,
			     ndata->state);
			BUG();
			break;
		/* these states are possible at this point, anywhere along
		 * the line of recovery */
		case DLM_RECO_NODE_DATA_DONE:
		case DLM_RECO_NODE_DATA_RECEIVING:
		case DLM_RECO_NODE_DATA_REQUESTED:
		case DLM_RECO_NODE_DATA_REQUESTING:
			mlog(0, "node %u is DONE sending "
			     "recovery data!\n",
			     ndata->node_num);

			ndata->state = DLM_RECO_NODE_DATA_DONE;
			ret = 0;
			break;
		}
	}
	spin_unlock(&dlm_reco_state_lock);

	/* wake the recovery thread, some node is done */
	if (!ret)
		dlm_kick_recovery_thread(dlm);

	if (ret < 0)
		mlog(ML_ERROR, "failed to find recovery node data for node "
		     "%u\n", done->node_idx);
	dlm_put(dlm);

	mlog(0, "leaving reco data done handler, ret=%d\n", ret);
	return ret;
}

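/* under dlm->spinlock, move every lockres owned by dead_node (or with
 * an UNKNOWN owner) from dlm->reco.resources to the caller's list.
 * $RECOVERY lockres entries are never moved; instead, any granted
 * lock still held by the dead node is pruned from them so later
 * recovery cannot hang on it. */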
static void dlm_move_reco_locks_to_list(struct dlm_ctxt *dlm,
					struct list_head *list,
					u8 dead_node)
{
	struct dlm_lock_resource *res;
	struct list_head *iter, *iter2;
	struct dlm_lock *lock;

	spin_lock(&dlm->spinlock);
	list_for_each_safe(iter, iter2, &dlm->reco.resources) {
		res = list_entry(iter, struct dlm_lock_resource, recovering);
		/* always prune any $RECOVERY entries for dead nodes,
		 * otherwise hangs can occur during later recovery */
		if (dlm_is_recovery_lock(res->lockname.name,
					 res->lockname.len)) {
			spin_lock(&res->spinlock);
			list_for_each_entry(lock, &res->granted, list) {
				if (lock->ml.node == dead_node) {
					mlog(0, "AHA! there was "
					     "a $RECOVERY lock for dead "
					     "node %u (%s)!\n",
					     dead_node, dlm->name);
					list_del_init(&lock->list);
					dlm_lock_put(lock);
					break;
				}
			}
			spin_unlock(&res->spinlock);
			continue;
		}

		if (res->owner == dead_node) {
			mlog(0, "found lockres owned by dead node while "
			     "doing recovery for node %u. sending it.\n",
			     dead_node);
			list_move_tail(&res->recovering, list);
		} else if (res->owner == DLM_LOCK_RES_OWNER_UNKNOWN) {
			mlog(0, "found UNKNOWN owner while doing recovery "
			     "for node %u. sending it.\n", dead_node);
			list_move_tail(&res->recovering, list);
		}
	}
	spin_unlock(&dlm->spinlock);
}

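/* count locks on all three queues (granted, converting, blocked).
 * the queue++ pointer arithmetic below relies on those list heads
 * being laid out consecutively in struct dlm_lock_resource. */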
static inline int dlm_num_locks_in_lockres(struct dlm_lock_resource *res)
{
	int total_locks = 0;
	struct list_head *iter, *queue = &res->granted;
	int i;

	for (i = 0; i < 3; i++) {
		list_for_each(iter, queue)
			total_locks++;
		queue++;
	}
	return total_locks;
}


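/* ship the (partially) packed mres page to send_to, then zero and
 * reinitialize it so the caller can keep packing locks.  the
 * DLM_MRES_ALL_DONE flag is added to the final message so the
 * receiver knows the lockres transfer is complete. */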
static int dlm_send_mig_lockres_msg(struct dlm_ctxt *dlm,
				    struct dlm_migratable_lockres *mres,
				    u8 send_to,
				    struct dlm_lock_resource *res,
				    int total_locks)
{
	u64 mig_cookie = be64_to_cpu(mres->mig_cookie);
	int mres_total_locks = be32_to_cpu(mres->total_locks);
	int sz, ret = 0, status = 0;
	u8 orig_flags = mres->flags,
	   orig_master = mres->master;

	BUG_ON(mres->num_locks > DLM_MAX_MIGRATABLE_LOCKS);
	if (!mres->num_locks)
		return 0;

	sz = sizeof(struct dlm_migratable_lockres) +
		(mres->num_locks * sizeof(struct dlm_migratable_lock));

	/* add an all-done flag if we reached the last lock */
	orig_flags = mres->flags;
	BUG_ON(total_locks > mres_total_locks);
	if (total_locks == mres_total_locks)
		mres->flags |= DLM_MRES_ALL_DONE;

	mlog(0, "%s:%.*s: sending mig lockres (%s) to %u\n",
	     dlm->name, res->lockname.len, res->lockname.name,
	     orig_flags & DLM_MRES_MIGRATION ? "migrate" : "recovery",
	     send_to);

	/* send it */
	ret = o2net_send_message(DLM_MIG_LOCKRES_MSG, dlm->key, mres,
				 sz, send_to, &status);
	if (ret < 0) {
		/* XXX: negative status is not handled.
		 * this will end up killing this node. */
		mlog_errno(ret);
	} else {
		/* might get an -ENOMEM back here */
		ret = status;
		if (ret < 0) {
			mlog_errno(ret);

			if (ret == -EFAULT) {
				mlog(ML_ERROR, "node %u told me to kill "
				     "myself!\n", send_to);
				BUG();
			}
		}
	}

	/* zero and reinit the message buffer */
	dlm_init_migratable_lockres(mres, res->lockname.name,
				    res->lockname.len, mres_total_locks,
				    mig_cookie, orig_flags, orig_master);
	return ret;
}

|  | 1164 | static void dlm_init_migratable_lockres(struct dlm_migratable_lockres *mres, | 
|  | 1165 | const char *lockname, int namelen, | 
|  | 1166 | int total_locks, u64 cookie, | 
|  | 1167 | u8 flags, u8 master) | 
|  | 1168 | { | 
|  | 1169 | /* mres here is one full page */ | 
|  | 1170 | memset(mres, 0, PAGE_SIZE); | 
|  | 1171 | mres->lockname_len = namelen; | 
|  | 1172 | memcpy(mres->lockname, lockname, namelen); | 
|  | 1173 | mres->num_locks = 0; | 
|  | 1174 | mres->total_locks = cpu_to_be32(total_locks); | 
|  | 1175 | mres->mig_cookie = cpu_to_be64(cookie); | 
|  | 1176 | mres->flags = flags; | 
|  | 1177 | mres->master = master; | 
|  | 1178 | } | 
|  | 1179 |  | 
|  | 1180 |  | 
|  | 1181 | /* returns 1 if this lock fills the network structure, | 
|  | 1182 | * 0 otherwise */ | 
|  | 1183 | static int dlm_add_lock_to_array(struct dlm_lock *lock, | 
|  | 1184 | struct dlm_migratable_lockres *mres, int queue) | 
|  | 1185 | { | 
|  | 1186 | struct dlm_migratable_lock *ml; | 
|  | 1187 | int lock_num = mres->num_locks; | 
|  | 1188 |  | 
|  | 1189 | ml = &(mres->ml[lock_num]); | 
|  | 1190 | ml->cookie = lock->ml.cookie; | 
|  | 1191 | ml->type = lock->ml.type; | 
|  | 1192 | ml->convert_type = lock->ml.convert_type; | 
|  | 1193 | ml->highest_blocked = lock->ml.highest_blocked; | 
|  | 1194 | ml->list = queue; | 
|  | 1195 | if (lock->lksb) { | 
|  | 1196 | ml->flags = lock->lksb->flags; | 
|  | 1197 | /* send our current lvb */ | 
|  | 1198 | if (ml->type == LKM_EXMODE || | 
|  | 1199 | ml->type == LKM_PRMODE) { | 
|  | 1200 | /* if the message lvb is already set, this had better | 
|  | 1201 | * be another PR holder's value and it has to match ours */ | 
| Kurt Hackel | 8bc674c | 2006-04-27 18:02:10 -0700 | [diff] [blame] | 1202 | if (!dlm_lvb_is_empty(mres->lvb) && | 
|  | 1203 | (ml->type == LKM_EXMODE || | 
|  | 1204 | memcmp(mres->lvb, lock->lksb->lvb, DLM_LVB_LEN))) { | 
| Kurt Hackel | 6714d8e | 2005-12-15 14:31:23 -0800 | [diff] [blame] | 1205 | mlog(ML_ERROR, "mismatched lvbs!\n"); | 
|  | 1206 | __dlm_print_one_lock_resource(lock->lockres); | 
|  | 1207 | BUG(); | 
|  | 1208 | } | 
|  | 1209 | memcpy(mres->lvb, lock->lksb->lvb, DLM_LVB_LEN); | 
|  | 1210 | } | 
|  | 1211 | } | 
|  | 1212 | ml->node = lock->ml.node; | 
|  | 1213 | mres->num_locks++; | 
|  | 1214 | /* we reached the max, send this network message */ | 
|  | 1215 | if (mres->num_locks == DLM_MAX_MIGRATABLE_LOCKS) | 
|  | 1216 | return 1; | 
|  | 1217 | return 0; | 
|  | 1218 | } | 
|  | 1219 |  | 
| Kurt Hackel | ba2bf21 | 2006-12-01 14:47:20 -0800 | [diff] [blame] | 1220 | static void dlm_add_dummy_lock(struct dlm_ctxt *dlm, | 
|  | 1221 | struct dlm_migratable_lockres *mres) | 
|  | 1222 | { | 
|  | 1223 | struct dlm_lock dummy; | 
|  | 1224 | memset(&dummy, 0, sizeof(dummy)); | 
|  | 1225 | dummy.ml.cookie = 0; | 
|  | 1226 | dummy.ml.type = LKM_IVMODE; | 
|  | 1227 | dummy.ml.convert_type = LKM_IVMODE; | 
|  | 1228 | dummy.ml.highest_blocked = LKM_IVMODE; | 
|  | 1229 | dummy.lksb = NULL; | 
|  | 1230 | dummy.ml.node = dlm->node_num; | 
|  | 1231 | dlm_add_lock_to_array(&dummy, mres, DLM_BLOCKED_LIST); | 
|  | 1232 | } | 
|  | 1233 |  | 
|  | 1234 | static inline int dlm_is_dummy_lock(struct dlm_ctxt *dlm, | 
|  | 1235 | struct dlm_migratable_lock *ml, | 
|  | 1236 | u8 *nodenum) | 
|  | 1237 | { | 
|  | 1238 | if (unlikely(ml->cookie == 0 && | 
|  | 1239 | ml->type == LKM_IVMODE && | 
|  | 1240 | ml->convert_type == LKM_IVMODE && | 
|  | 1241 | ml->highest_blocked == LKM_IVMODE && | 
|  | 1242 | ml->list == DLM_BLOCKED_LIST)) { | 
|  | 1243 | *nodenum = ml->node; | 
|  | 1244 | return 1; | 
|  | 1245 | } | 
|  | 1246 | return 0; | 
|  | 1247 | } | 
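|  |  | /* NOTE: the dummy lock is a placeholder for a node that holds a | 
|  |  | * refmap reference but no actual locks.  an all-zero cookie with | 
|  |  | * IVMODE everywhere on the blocked list can never describe a real | 
|  |  | * lock, so dlm_process_recovery_data() can recognize it and just | 
|  |  | * set the sender's refmap bit instead of creating a lock. */ | 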
| Kurt Hackel | 6714d8e | 2005-12-15 14:31:23 -0800 | [diff] [blame] | 1248 |  | 
|  | 1249 | int dlm_send_one_lockres(struct dlm_ctxt *dlm, struct dlm_lock_resource *res, | 
|  | 1250 | struct dlm_migratable_lockres *mres, | 
|  | 1251 | u8 send_to, u8 flags) | 
|  | 1252 | { | 
|  | 1253 | struct list_head *queue, *iter; | 
|  | 1254 | int total_locks, i; | 
|  | 1255 | u64 mig_cookie = 0; | 
|  | 1256 | struct dlm_lock *lock; | 
|  | 1257 | int ret = 0; | 
|  | 1258 |  | 
|  | 1259 | BUG_ON(!(flags & (DLM_MRES_RECOVERY|DLM_MRES_MIGRATION))); | 
|  | 1260 |  | 
|  | 1261 | mlog(0, "sending to %u\n", send_to); | 
|  | 1262 |  | 
|  | 1263 | total_locks = dlm_num_locks_in_lockres(res); | 
|  | 1264 | if (total_locks > DLM_MAX_MIGRATABLE_LOCKS) { | 
|  | 1265 | /* rare, but possible */ | 
|  | 1266 | mlog(0, "argh.  lockres has %d locks.  this will " | 
|  | 1267 | "require more than one network packet to " | 
|  | 1268 | "migrate\n", total_locks); | 
|  | 1269 | mig_cookie = dlm_get_next_mig_cookie(); | 
|  | 1270 | } | 
|  | 1271 |  | 
|  | 1272 | dlm_init_migratable_lockres(mres, res->lockname.name, | 
|  | 1273 | res->lockname.len, total_locks, | 
|  | 1274 | mig_cookie, flags, res->owner); | 
|  | 1275 |  | 
|  | 1276 | total_locks = 0; | 
|  | 1277 | for (i=DLM_GRANTED_LIST; i<=DLM_BLOCKED_LIST; i++) { | 
|  | 1278 | queue = dlm_list_idx_to_ptr(res, i); | 
|  | 1279 | list_for_each(iter, queue) { | 
|  | 1280 | lock = list_entry(iter, struct dlm_lock, list); | 
|  | 1281 |  | 
|  | 1282 | /* add another lock. */ | 
|  | 1283 | total_locks++; | 
|  | 1284 | if (!dlm_add_lock_to_array(lock, mres, i)) | 
|  | 1285 | continue; | 
|  | 1286 |  | 
|  | 1287 | /* this filled the lock message, | 
|  | 1288 | * we must send it immediately. */ | 
|  | 1289 | ret = dlm_send_mig_lockres_msg(dlm, mres, send_to, | 
|  | 1290 | res, total_locks); | 
| Kurt Hackel | 29c0fa0 | 2006-04-27 18:06:58 -0700 | [diff] [blame] | 1291 | if (ret < 0) | 
|  | 1292 | goto error; | 
| Kurt Hackel | 6714d8e | 2005-12-15 14:31:23 -0800 | [diff] [blame] | 1293 | } | 
|  | 1294 | } | 
| Kurt Hackel | ba2bf21 | 2006-12-01 14:47:20 -0800 | [diff] [blame] | 1295 | if (total_locks == 0) { | 
|  | 1296 | /* send a dummy lock to indicate a mastery reference only */ | 
|  | 1297 | mlog(0, "%s:%.*s: sending dummy lock to %u, %s\n", | 
|  | 1298 | dlm->name, res->lockname.len, res->lockname.name, | 
|  | 1299 | send_to, flags & DLM_MRES_RECOVERY ? "recovery" : | 
|  | 1300 | "migration"); | 
|  | 1301 | dlm_add_dummy_lock(dlm, mres); | 
|  | 1302 | } | 
| Kurt Hackel | 6714d8e | 2005-12-15 14:31:23 -0800 | [diff] [blame] | 1303 | /* flush any remaining locks */ | 
|  | 1304 | ret = dlm_send_mig_lockres_msg(dlm, mres, send_to, res, total_locks); | 
| Kurt Hackel | 29c0fa0 | 2006-04-27 18:06:58 -0700 | [diff] [blame] | 1305 | if (ret < 0) | 
|  | 1306 | goto error; | 
|  | 1307 | return ret; | 
|  | 1308 |  | 
|  | 1309 | error: | 
|  | 1310 | mlog(ML_ERROR, "%s: dlm_send_mig_lockres_msg returned %d\n", | 
|  | 1311 | dlm->name, ret); | 
|  | 1312 | if (!dlm_is_host_down(ret)) | 
| Kurt Hackel | 6714d8e | 2005-12-15 14:31:23 -0800 | [diff] [blame] | 1313 | BUG(); | 
| Kurt Hackel | 29c0fa0 | 2006-04-27 18:06:58 -0700 | [diff] [blame] | 1314 | mlog(0, "%s: node %u went down while sending %s " | 
|  | 1315 | "lockres %.*s\n", dlm->name, send_to, | 
|  | 1316 | flags & DLM_MRES_RECOVERY ?  "recovery" : "migration", | 
|  | 1317 | res->lockname.len, res->lockname.name); | 
| Kurt Hackel | 6714d8e | 2005-12-15 14:31:23 -0800 | [diff] [blame] | 1318 | return ret; | 
|  | 1319 | } | 
|  | 1320 |  | 
|  | 1321 |  | 
|  | 1322 |  | 
|  | 1323 | /* | 
|  | 1324 | * this message will contain no more than one page worth of | 
|  | 1325 | * recovery data, and it will work on only one lockres. | 
|  | 1326 | * there may be many locks in this page, and we may need to wait | 
|  | 1327 | * for additional packets to complete all the locks (rare, but | 
|  | 1328 | * possible). | 
|  | 1329 | */ | 
|  | 1330 | /* | 
|  | 1331 | * NOTE: the allocation error cases here are scary | 
|  | 1332 | * we really cannot afford to fail an alloc in recovery | 
|  | 1333 | * do we spin?  returning an error only delays the problem really | 
|  | 1334 | */ | 
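|  |  | /* NOTE: as the code stands, a failed allocation below is fatal for | 
|  |  | * the sender: the -ENOMEM travels back through the o2net status, | 
|  |  | * dlm_send_mig_lockres_msg() returns it, and dlm_send_one_lockres() | 
|  |  | * BUG()s on any error that is not a host-down. */ | 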
|  | 1335 |  | 
| Kurt Hackel | d74c980 | 2007-01-17 17:04:25 -0800 | [diff] [blame] | 1336 | int dlm_mig_lockres_handler(struct o2net_msg *msg, u32 len, void *data, | 
|  | 1337 | void **ret_data) | 
| Kurt Hackel | 6714d8e | 2005-12-15 14:31:23 -0800 | [diff] [blame] | 1338 | { | 
|  | 1339 | struct dlm_ctxt *dlm = data; | 
|  | 1340 | struct dlm_migratable_lockres *mres = | 
|  | 1341 | (struct dlm_migratable_lockres *)msg->buf; | 
|  | 1342 | int ret = 0; | 
|  | 1343 | u8 real_master; | 
|  | 1344 | char *buf = NULL; | 
|  | 1345 | struct dlm_work_item *item = NULL; | 
|  | 1346 | struct dlm_lock_resource *res = NULL; | 
|  | 1347 |  | 
|  | 1348 | if (!dlm_grab(dlm)) | 
|  | 1349 | return -EINVAL; | 
|  | 1350 |  | 
|  | 1351 | BUG_ON(!(mres->flags & (DLM_MRES_RECOVERY|DLM_MRES_MIGRATION))); | 
|  | 1352 |  | 
|  | 1353 | real_master = mres->master; | 
|  | 1354 | if (real_master == DLM_LOCK_RES_OWNER_UNKNOWN) { | 
|  | 1355 | /* cannot migrate a lockres with no master */ | 
|  | 1356 | BUG_ON(!(mres->flags & DLM_MRES_RECOVERY)); | 
|  | 1357 | } | 
|  | 1358 |  | 
|  | 1359 | mlog(0, "%s message received from node %u\n", | 
|  | 1360 | (mres->flags & DLM_MRES_RECOVERY) ? | 
|  | 1361 | "recovery" : "migration", mres->master); | 
|  | 1362 | if (mres->flags & DLM_MRES_ALL_DONE) | 
|  | 1363 | mlog(0, "all done flag.  all lockres data received!\n"); | 
|  | 1364 |  | 
|  | 1365 | ret = -ENOMEM; | 
| Kurt Hackel | ad8100e | 2006-05-01 14:25:21 -0700 | [diff] [blame] | 1366 | buf = kmalloc(be16_to_cpu(msg->data_len), GFP_NOFS); | 
| Robert P. J. Day | cd86128 | 2006-12-13 00:34:52 -0800 | [diff] [blame] | 1367 | item = kzalloc(sizeof(*item), GFP_NOFS); | 
| Kurt Hackel | 6714d8e | 2005-12-15 14:31:23 -0800 | [diff] [blame] | 1368 | if (!buf || !item) | 
|  | 1369 | goto leave; | 
|  | 1370 |  | 
|  | 1371 | /* look up the lockres to see if we have a secondary queue for this | 
|  | 1372 | * already...  just add the locks in and this will have its owner | 
|  | 1373 | * and RECOVERY flag changed when it completes. */ | 
|  | 1374 | res = dlm_lookup_lockres(dlm, mres->lockname, mres->lockname_len); | 
|  | 1375 | if (res) { | 
|  | 1376 | /* this will get a ref on res */ | 
|  | 1377 | /* mark it as recovering/migrating and hash it */ | 
|  | 1378 | spin_lock(&res->spinlock); | 
|  | 1379 | if (mres->flags & DLM_MRES_RECOVERY) { | 
|  | 1380 | res->state |= DLM_LOCK_RES_RECOVERING; | 
|  | 1381 | } else { | 
|  | 1382 | if (res->state & DLM_LOCK_RES_MIGRATING) { | 
|  | 1383 | /* this is at least the second | 
|  | 1384 | * lockres message */ | 
|  | 1385 | mlog(0, "lock %.*s is already migrating\n", | 
|  | 1386 | mres->lockname_len, | 
|  | 1387 | mres->lockname); | 
|  | 1388 | } else if (res->state & DLM_LOCK_RES_RECOVERING) { | 
|  | 1389 | /* caller should BUG */ | 
|  | 1390 | mlog(ML_ERROR, "node is attempting to migrate " | 
|  | 1391 | "lock %.*s, but marked as recovering!\n", | 
|  | 1392 | mres->lockname_len, mres->lockname); | 
|  | 1393 | ret = -EFAULT; | 
|  | 1394 | spin_unlock(&res->spinlock); | 
|  | 1395 | goto leave; | 
|  | 1396 | } | 
|  | 1397 | res->state |= DLM_LOCK_RES_MIGRATING; | 
|  | 1398 | } | 
|  | 1399 | spin_unlock(&res->spinlock); | 
|  | 1400 | } else { | 
|  | 1401 | /* need to allocate, just like if it was | 
|  | 1402 | * mastered here normally  */ | 
|  | 1403 | res = dlm_new_lockres(dlm, mres->lockname, mres->lockname_len); | 
|  | 1404 | if (!res) | 
|  | 1405 | goto leave; | 
|  | 1406 |  | 
|  | 1407 | /* to match the ref that we would have gotten if | 
|  | 1408 | * dlm_lookup_lockres had succeeded */ | 
|  | 1409 | dlm_lockres_get(res); | 
|  | 1410 |  | 
|  | 1411 | /* mark it as recovering/migrating and hash it */ | 
|  | 1412 | if (mres->flags & DLM_MRES_RECOVERY) | 
|  | 1413 | res->state |= DLM_LOCK_RES_RECOVERING; | 
|  | 1414 | else | 
|  | 1415 | res->state |= DLM_LOCK_RES_MIGRATING; | 
|  | 1416 |  | 
|  | 1417 | spin_lock(&dlm->spinlock); | 
|  | 1418 | __dlm_insert_lockres(dlm, res); | 
|  | 1419 | spin_unlock(&dlm->spinlock); | 
|  | 1420 |  | 
|  | 1421 | /* now that the new lockres is inserted, | 
|  | 1422 | * make it usable by other processes */ | 
|  | 1423 | spin_lock(&res->spinlock); | 
|  | 1424 | res->state &= ~DLM_LOCK_RES_IN_PROGRESS; | 
|  | 1425 | spin_unlock(&res->spinlock); | 
| Kurt Hackel | a6fa364 | 2007-01-17 14:59:12 -0800 | [diff] [blame] | 1426 | wake_up(&res->wq); | 
| Kurt Hackel | 6714d8e | 2005-12-15 14:31:23 -0800 | [diff] [blame] | 1427 |  | 
|  | 1428 | /* add an extra ref for just-allocated lockres | 
|  | 1429 | * otherwise the lockres will be purged immediately */ | 
|  | 1430 | dlm_lockres_get(res); | 
| Kurt Hackel | 6714d8e | 2005-12-15 14:31:23 -0800 | [diff] [blame] | 1431 | } | 
|  | 1432 |  | 
|  | 1433 | /* at this point we have allocated everything we need, | 
|  | 1434 | * and we have a hashed lockres with an extra ref and | 
|  | 1435 | * the proper res->state flags. */ | 
|  | 1436 | ret = 0; | 
| Kurt Hackel | ba2bf21 | 2006-12-01 14:47:20 -0800 | [diff] [blame] | 1437 | spin_lock(&res->spinlock); | 
|  | 1438 | /* drop this either when master requery finds a different master | 
|  | 1439 | * or when a lock is added by the recovery worker */ | 
|  | 1440 | dlm_lockres_grab_inflight_ref(dlm, res); | 
| Kurt Hackel | 6714d8e | 2005-12-15 14:31:23 -0800 | [diff] [blame] | 1441 | if (mres->master == DLM_LOCK_RES_OWNER_UNKNOWN) { | 
|  | 1442 | /* migration cannot have an unknown master */ | 
|  | 1443 | BUG_ON(!(mres->flags & DLM_MRES_RECOVERY)); | 
|  | 1444 | mlog(0, "recovery has passed me a lockres with an " | 
|  | 1445 | "unknown owner.. will need to requery: " | 
|  | 1446 | "%.*s\n", mres->lockname_len, mres->lockname); | 
|  | 1447 | } else { | 
| Kurt Hackel | ba2bf21 | 2006-12-01 14:47:20 -0800 | [diff] [blame] | 1448 | /* take a reference now to pin the lockres, drop it | 
|  | 1449 | * when locks are added in the worker */ | 
| Kurt Hackel | 6714d8e | 2005-12-15 14:31:23 -0800 | [diff] [blame] | 1450 | dlm_change_lockres_owner(dlm, res, dlm->node_num); | 
| Kurt Hackel | 6714d8e | 2005-12-15 14:31:23 -0800 | [diff] [blame] | 1451 | } | 
| Kurt Hackel | ba2bf21 | 2006-12-01 14:47:20 -0800 | [diff] [blame] | 1452 | spin_unlock(&res->spinlock); | 
| Kurt Hackel | 6714d8e | 2005-12-15 14:31:23 -0800 | [diff] [blame] | 1453 |  | 
|  | 1454 | /* queue up work for dlm_mig_lockres_worker */ | 
|  | 1455 | dlm_grab(dlm);  /* get an extra ref for the work item */ | 
|  | 1456 | memcpy(buf, msg->buf, be16_to_cpu(msg->data_len));  /* copy the whole message */ | 
|  | 1457 | dlm_init_work_item(dlm, item, dlm_mig_lockres_worker, buf); | 
|  | 1458 | item->u.ml.lockres = res; /* already have a ref */ | 
|  | 1459 | item->u.ml.real_master = real_master; | 
|  | 1460 | spin_lock(&dlm->work_lock); | 
|  | 1461 | list_add_tail(&item->list, &dlm->work_list); | 
|  | 1462 | spin_unlock(&dlm->work_lock); | 
| Kurt Hackel | 3156d26 | 2006-05-01 14:39:29 -0700 | [diff] [blame] | 1463 | queue_work(dlm->dlm_worker, &dlm->dispatched_work); | 
| Kurt Hackel | 6714d8e | 2005-12-15 14:31:23 -0800 | [diff] [blame] | 1464 |  | 
|  | 1465 | leave: | 
|  | 1466 | dlm_put(dlm); | 
|  | 1467 | if (ret < 0) { | 
|  | 1468 | /* kfree() happily ignores NULL, so no | 
|  | 1469 | * need to test these individually */ | 
|  | 1470 | kfree(buf); | 
|  | 1471 | kfree(item); | 
|  | 1472 | } | 
|  | 1473 |  | 
|  | 1474 | mlog_exit(ret); | 
|  | 1475 | return ret; | 
|  | 1476 | } | 
|  | 1477 |  | 
|  | 1478 |  | 
|  | 1479 | static void dlm_mig_lockres_worker(struct dlm_work_item *item, void *data) | 
|  | 1480 | { | 
|  | 1481 | struct dlm_ctxt *dlm; | 
|  | 1482 | struct dlm_migratable_lockres *mres; | 
|  | 1483 | int ret = 0; | 
|  | 1484 | struct dlm_lock_resource *res; | 
|  | 1485 | u8 real_master; | 
|  | 1486 |  | 
|  | 1487 | dlm = item->dlm; | 
|  | 1488 | mres = (struct dlm_migratable_lockres *)data; | 
|  | 1489 |  | 
|  | 1490 | res = item->u.ml.lockres; | 
|  | 1491 | real_master = item->u.ml.real_master; | 
|  | 1492 |  | 
|  | 1493 | if (real_master == DLM_LOCK_RES_OWNER_UNKNOWN) { | 
|  | 1494 | /* this case is super-rare. only occurs if | 
|  | 1495 | * node death happens during migration. */ | 
|  | 1496 | again: | 
|  | 1497 | ret = dlm_lockres_master_requery(dlm, res, &real_master); | 
|  | 1498 | if (ret < 0) { | 
| Kurt Hackel | e2faea4 | 2006-01-12 14:24:55 -0800 | [diff] [blame] | 1499 | mlog(0, "dlm_lockres_master_requery ret=%d\n", | 
| Kurt Hackel | 6714d8e | 2005-12-15 14:31:23 -0800 | [diff] [blame] | 1500 | ret); | 
|  | 1501 | goto again; | 
|  | 1502 | } | 
|  | 1503 | if (real_master == DLM_LOCK_RES_OWNER_UNKNOWN) { | 
|  | 1504 | mlog(0, "lockres %.*s not claimed.  " | 
|  | 1505 | "this node will take it.\n", | 
|  | 1506 | res->lockname.len, res->lockname.name); | 
|  | 1507 | } else { | 
| Kurt Hackel | ba2bf21 | 2006-12-01 14:47:20 -0800 | [diff] [blame] | 1508 | spin_lock(&res->spinlock); | 
|  | 1509 | dlm_lockres_drop_inflight_ref(dlm, res); | 
|  | 1510 | spin_unlock(&res->spinlock); | 
| Kurt Hackel | 6714d8e | 2005-12-15 14:31:23 -0800 | [diff] [blame] | 1511 | mlog(0, "master needs to respond to sender " | 
|  | 1512 | "that node %u still owns %.*s\n", | 
|  | 1513 | real_master, res->lockname.len, | 
|  | 1514 | res->lockname.name); | 
|  | 1515 | /* cannot touch this lockres */ | 
|  | 1516 | goto leave; | 
|  | 1517 | } | 
|  | 1518 | } | 
|  | 1519 |  | 
|  | 1520 | ret = dlm_process_recovery_data(dlm, res, mres); | 
|  | 1521 | if (ret < 0) | 
|  | 1522 | mlog(0, "dlm_process_recovery_data returned  %d\n", ret); | 
|  | 1523 | else | 
|  | 1524 | mlog(0, "dlm_process_recovery_data succeeded\n"); | 
|  | 1525 |  | 
|  | 1526 | if ((mres->flags & (DLM_MRES_MIGRATION|DLM_MRES_ALL_DONE)) == | 
|  | 1527 | (DLM_MRES_MIGRATION|DLM_MRES_ALL_DONE)) { | 
|  | 1528 | ret = dlm_finish_migration(dlm, res, mres->master); | 
|  | 1529 | if (ret < 0) | 
|  | 1530 | mlog_errno(ret); | 
|  | 1531 | } | 
|  | 1532 |  | 
|  | 1533 | leave: | 
|  | 1534 | kfree(data); | 
|  | 1535 | mlog_exit(ret); | 
|  | 1536 | } | 
|  | 1537 |  | 
|  | 1538 |  | 
|  | 1539 |  | 
| Adrian Bunk | 8169cae | 2006-03-31 16:53:55 +0200 | [diff] [blame] | 1540 | static int dlm_lockres_master_requery(struct dlm_ctxt *dlm, | 
|  | 1541 | struct dlm_lock_resource *res, | 
|  | 1542 | u8 *real_master) | 
| Kurt Hackel | 6714d8e | 2005-12-15 14:31:23 -0800 | [diff] [blame] | 1543 | { | 
|  | 1544 | struct dlm_node_iter iter; | 
|  | 1545 | int nodenum; | 
|  | 1546 | int ret = 0; | 
|  | 1547 |  | 
|  | 1548 | *real_master = DLM_LOCK_RES_OWNER_UNKNOWN; | 
|  | 1549 |  | 
|  | 1550 | /* we only reach here if one of the two nodes in a | 
|  | 1551 | * migration died while the migration was in progress. | 
|  | 1552 | * at this point we need to requery the master.  we | 
|  | 1553 | * know that the new_master got as far as creating | 
|  | 1554 | * an mle on at least one node, but we do not know | 
|  | 1555 | * if any nodes had actually cleared the mle and set | 
|  | 1556 | * the master to the new_master.  the old master | 
|  | 1557 | * is supposed to set the owner to UNKNOWN in the | 
|  | 1558 | * event of a new_master death, so the only possible | 
|  | 1559 | * responses that we can get from nodes here are | 
|  | 1560 | * that the master is new_master, or that the master | 
|  | 1561 | * is UNKNOWN. | 
|  | 1562 | * if all nodes come back with UNKNOWN then we know | 
|  | 1563 | * the lock needs remastering here. | 
|  | 1564 | * if any node comes back with a valid master, check | 
|  | 1565 | * to see if that master is the one that we are | 
|  | 1566 | * recovering.  if so, then the new_master died and | 
|  | 1567 | * we need to remaster this lock.  if not, then the | 
|  | 1568 | * new_master survived and that node will respond to | 
|  | 1569 | * other nodes about the owner. | 
|  | 1570 | * if there is an owner, this node needs to dump this | 
|  | 1571 | * lockres and alert the sender that this lockres | 
|  | 1572 | * was rejected. */ | 
|  | 1573 | spin_lock(&dlm->spinlock); | 
|  | 1574 | dlm_node_iter_init(dlm->domain_map, &iter); | 
|  | 1575 | spin_unlock(&dlm->spinlock); | 
|  | 1576 |  | 
|  | 1577 | while ((nodenum = dlm_node_iter_next(&iter)) >= 0) { | 
|  | 1578 | /* do not send to self */ | 
|  | 1579 | if (nodenum == dlm->node_num) | 
|  | 1580 | continue; | 
|  | 1581 | ret = dlm_do_master_requery(dlm, res, nodenum, real_master); | 
|  | 1582 | if (ret < 0) { | 
|  | 1583 | mlog_errno(ret); | 
| Kurt Hackel | c03872f | 2006-03-06 14:08:49 -0800 | [diff] [blame] | 1584 | if (!dlm_is_host_down(ret)) | 
|  | 1585 | BUG(); | 
|  | 1586 | /* host is down, so answer for that node would be | 
|  | 1587 | * DLM_LOCK_RES_OWNER_UNKNOWN.  continue. */ | 
| Kurt Hackel | 6714d8e | 2005-12-15 14:31:23 -0800 | [diff] [blame] | 1588 | } | 
|  | 1589 | if (*real_master != DLM_LOCK_RES_OWNER_UNKNOWN) { | 
|  | 1590 | mlog(0, "lock master is %u\n", *real_master); | 
|  | 1591 | break; | 
|  | 1592 | } | 
|  | 1593 | } | 
|  | 1594 | return ret; | 
|  | 1595 | } | 
|  | 1596 |  | 
|  | 1597 |  | 
| Kurt Hackel | c03872f | 2006-03-06 14:08:49 -0800 | [diff] [blame] | 1598 | int dlm_do_master_requery(struct dlm_ctxt *dlm, struct dlm_lock_resource *res, | 
|  | 1599 | u8 nodenum, u8 *real_master) | 
| Kurt Hackel | 6714d8e | 2005-12-15 14:31:23 -0800 | [diff] [blame] | 1600 | { | 
|  | 1601 | int ret = -EINVAL; | 
|  | 1602 | struct dlm_master_requery req; | 
|  | 1603 | int status = DLM_LOCK_RES_OWNER_UNKNOWN; | 
|  | 1604 |  | 
|  | 1605 | memset(&req, 0, sizeof(req)); | 
|  | 1606 | req.node_idx = dlm->node_num; | 
|  | 1607 | req.namelen = res->lockname.len; | 
|  | 1608 | memcpy(req.name, res->lockname.name, res->lockname.len); | 
|  | 1609 |  | 
|  | 1610 | ret = o2net_send_message(DLM_MASTER_REQUERY_MSG, dlm->key, | 
|  | 1611 | &req, sizeof(req), nodenum, &status); | 
|  | 1612 | /* XXX: negative status not handled properly here. */ | 
|  | 1613 | if (ret < 0) | 
|  | 1614 | mlog_errno(ret); | 
|  | 1615 | else { | 
|  | 1616 | BUG_ON(status < 0); | 
|  | 1617 | BUG_ON(status > DLM_LOCK_RES_OWNER_UNKNOWN); | 
|  | 1618 | *real_master = (u8) (status & 0xff); | 
|  | 1619 | mlog(0, "node %u responded to master requery with %u\n", | 
|  | 1620 | nodenum, *real_master); | 
|  | 1621 | ret = 0; | 
|  | 1622 | } | 
|  | 1623 | return ret; | 
|  | 1624 | } | 
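|  |  | /* NOTE: the requery answer rides in the o2net status word: the | 
|  |  | * handler below returns the owner's node number directly (or | 
|  |  | * DLM_LOCK_RES_OWNER_UNKNOWN), which is why the BUG_ONs above can | 
|  |  | * insist that only the transport itself ever returns an error. */ | 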
|  | 1625 |  | 
|  | 1626 |  | 
|  | 1627 | /* this function cannot error, so unless the sending | 
|  | 1628 | * or receiving of the message failed, the owner can | 
|  | 1629 | * be trusted */ | 
| Kurt Hackel | d74c980 | 2007-01-17 17:04:25 -0800 | [diff] [blame] | 1630 | int dlm_master_requery_handler(struct o2net_msg *msg, u32 len, void *data, | 
|  | 1631 | void **ret_data) | 
| Kurt Hackel | 6714d8e | 2005-12-15 14:31:23 -0800 | [diff] [blame] | 1632 | { | 
|  | 1633 | struct dlm_ctxt *dlm = data; | 
|  | 1634 | struct dlm_master_requery *req = (struct dlm_master_requery *)msg->buf; | 
|  | 1635 | struct dlm_lock_resource *res = NULL; | 
| Mark Fasheh | a3d3329 | 2006-03-09 17:55:56 -0800 | [diff] [blame] | 1636 | unsigned int hash; | 
| Kurt Hackel | 6714d8e | 2005-12-15 14:31:23 -0800 | [diff] [blame] | 1637 | int master = DLM_LOCK_RES_OWNER_UNKNOWN; | 
|  | 1638 | u32 flags = DLM_ASSERT_MASTER_REQUERY; | 
|  | 1639 |  | 
|  | 1640 | if (!dlm_grab(dlm)) { | 
|  | 1641 | /* since the domain has gone away on this | 
|  | 1642 | * node, the proper response is UNKNOWN */ | 
|  | 1643 | return master; | 
|  | 1644 | } | 
|  | 1645 |  | 
| Mark Fasheh | a3d3329 | 2006-03-09 17:55:56 -0800 | [diff] [blame] | 1646 | hash = dlm_lockid_hash(req->name, req->namelen); | 
|  | 1647 |  | 
| Kurt Hackel | 6714d8e | 2005-12-15 14:31:23 -0800 | [diff] [blame] | 1648 | spin_lock(&dlm->spinlock); | 
| Mark Fasheh | a3d3329 | 2006-03-09 17:55:56 -0800 | [diff] [blame] | 1649 | res = __dlm_lookup_lockres(dlm, req->name, req->namelen, hash); | 
| Kurt Hackel | 6714d8e | 2005-12-15 14:31:23 -0800 | [diff] [blame] | 1650 | if (res) { | 
|  | 1651 | spin_lock(&res->spinlock); | 
|  | 1652 | master = res->owner; | 
|  | 1653 | if (master == dlm->node_num) { | 
|  | 1654 | int ret = dlm_dispatch_assert_master(dlm, res, | 
|  | 1655 | 0, 0, flags); | 
|  | 1656 | if (ret < 0) { | 
|  | 1657 | mlog_errno(ret); | 
|  | 1658 | /* retry!? */ | 
|  | 1659 | BUG(); | 
|  | 1660 | } | 
|  | 1661 | } | 
|  | 1662 | spin_unlock(&res->spinlock); | 
|  | 1663 | } | 
|  | 1664 | spin_unlock(&dlm->spinlock); | 
|  | 1665 |  | 
|  | 1666 | dlm_put(dlm); | 
|  | 1667 | return master; | 
|  | 1668 | } | 
|  | 1669 |  | 
|  | 1670 | static inline struct list_head * | 
|  | 1671 | dlm_list_num_to_pointer(struct dlm_lock_resource *res, int list_num) | 
|  | 1672 | { | 
|  | 1673 | struct list_head *ret; | 
|  | 1674 | BUG_ON(list_num < 0); | 
|  | 1675 | BUG_ON(list_num > 2); | 
|  | 1676 | ret = &(res->granted); | 
|  | 1677 | ret += list_num; | 
|  | 1678 | return ret; | 
|  | 1679 | } | 
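|  |  | /* NOTE: list_num arrives off the wire (ml->list), hence the | 
|  |  | * explicit bounds checks.  the pointer arithmetic relies on the | 
|  |  | * same granted/converting/blocked adjacency as | 
|  |  | * dlm_num_locks_in_lockres(). */ | 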
|  | 1680 | /* TODO: do ast flush business | 
|  | 1681 | * TODO: do MIGRATING and RECOVERING spinning | 
|  | 1682 | */ | 
|  | 1683 |  | 
|  | 1684 | /* | 
|  | 1685 | * NOTE about in-flight requests during migration: | 
|  | 1686 | * | 
|  | 1687 | * Before attempting the migrate, the master has marked the lockres as | 
|  | 1688 | * MIGRATING and then flushed all of its pending ASTS.  So any in-flight | 
|  | 1689 | * requests either got queued before the MIGRATING flag got set, in which | 
|  | 1690 | * case the lock data will reflect the change and a return message is on | 
|  | 1691 | * the way, or the request failed to get in before MIGRATING got set.  In | 
|  | 1692 | * this case, the caller will be told to spin and wait for the MIGRATING | 
|  | 1693 | * flag to be dropped, then recheck the master. | 
|  | 1694 | * This holds true for the convert, cancel and unlock cases, and since lvb | 
|  | 1695 | * updates are tied to these same messages, it applies to lvb updates as | 
|  | 1696 | * well.  For the lock case, there is no way a lock can be on the master | 
|  | 1697 | * queue and not be on the secondary queue since the lock is always added | 
|  | 1698 | * locally first.  This means that the new target node will never be sent | 
|  | 1699 | * a lock that he doesn't already have on the list. | 
|  | 1700 | * In total, this means that the local lock is correct and should not be | 
|  | 1701 | * updated to match the one sent by the master.  Any messages sent back | 
|  | 1702 | * from the master before the MIGRATING flag will bring the lock properly | 
|  | 1703 | * up-to-date, and the change will be ordered properly for the waiter. | 
|  | 1704 | * We will *not* attempt to modify the lock underneath the waiter. | 
|  | 1705 | */ | 
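|  |  | /* Example: node A is migrating a lockres to node B while B itself | 
|  |  | * has a convert in flight to A.  either the convert reached A before | 
|  |  | * MIGRATING was set -- then A's reply is already on the wire and the | 
|  |  | * shipped lock state reflects the change -- or it bounced, and B | 
|  |  | * spins until MIGRATING clears and retries against the new master. | 
|  |  | * either way B's local copy of its own lock is correct, which is why | 
|  |  | * dlm_process_recovery_data() below only *moves* a local lock | 
|  |  | * between queues and never overwrites it with the master's data. */ | 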
|  | 1706 |  | 
|  | 1707 | static int dlm_process_recovery_data(struct dlm_ctxt *dlm, | 
|  | 1708 | struct dlm_lock_resource *res, | 
|  | 1709 | struct dlm_migratable_lockres *mres) | 
|  | 1710 | { | 
|  | 1711 | struct dlm_migratable_lock *ml; | 
|  | 1712 | struct list_head *queue; | 
| Kurt Hackel | e17e75e | 2007-01-05 15:04:49 -0800 | [diff] [blame] | 1713 | struct list_head *tmpq = NULL; | 
| Kurt Hackel | 6714d8e | 2005-12-15 14:31:23 -0800 | [diff] [blame] | 1714 | struct dlm_lock *newlock = NULL; | 
|  | 1715 | struct dlm_lockstatus *lksb = NULL; | 
|  | 1716 | int ret = 0; | 
| Kurt Hackel | e17e75e | 2007-01-05 15:04:49 -0800 | [diff] [blame] | 1717 | int i, j, bad; | 
| Kurt Hackel | 6714d8e | 2005-12-15 14:31:23 -0800 | [diff] [blame] | 1718 | struct list_head *iter; | 
|  | 1719 | struct dlm_lock *lock = NULL; | 
| Kurt Hackel | ba2bf21 | 2006-12-01 14:47:20 -0800 | [diff] [blame] | 1720 | u8 from = O2NM_MAX_NODES; | 
|  | 1721 | unsigned int added = 0; | 
| Kurt Hackel | 6714d8e | 2005-12-15 14:31:23 -0800 | [diff] [blame] | 1722 |  | 
|  | 1723 | mlog(0, "running %d locks for this lockres\n", mres->num_locks); | 
|  | 1724 | for (i=0; i<mres->num_locks; i++) { | 
|  | 1725 | ml = &(mres->ml[i]); | 
| Kurt Hackel | ba2bf21 | 2006-12-01 14:47:20 -0800 | [diff] [blame] | 1726 |  | 
|  | 1727 | if (dlm_is_dummy_lock(dlm, ml, &from)) { | 
|  | 1728 | /* placeholder, just need to set the refmap bit */ | 
|  | 1729 | BUG_ON(mres->num_locks != 1); | 
|  | 1730 | mlog(0, "%s:%.*s: dummy lock for %u\n", | 
|  | 1731 | dlm->name, mres->lockname_len, mres->lockname, | 
|  | 1732 | from); | 
|  | 1733 | spin_lock(&res->spinlock); | 
|  | 1734 | dlm_lockres_set_refmap_bit(from, res); | 
|  | 1735 | spin_unlock(&res->spinlock); | 
|  | 1736 | added++; | 
|  | 1737 | break; | 
|  | 1738 | } | 
| Kurt Hackel | 6714d8e | 2005-12-15 14:31:23 -0800 | [diff] [blame] | 1739 | BUG_ON(ml->highest_blocked != LKM_IVMODE); | 
|  | 1740 | newlock = NULL; | 
|  | 1741 | lksb = NULL; | 
|  | 1742 |  | 
|  | 1743 | queue = dlm_list_num_to_pointer(res, ml->list); | 
| Kurt Hackel | e17e75e | 2007-01-05 15:04:49 -0800 | [diff] [blame] | 1744 | tmpq = NULL; | 
| Kurt Hackel | 6714d8e | 2005-12-15 14:31:23 -0800 | [diff] [blame] | 1745 |  | 
|  | 1746 | /* if the lock is for the local node it needs to | 
|  | 1747 | * be moved to the proper location within the queue. | 
|  | 1748 | * do not allocate a new lock structure. */ | 
|  | 1749 | if (ml->node == dlm->node_num) { | 
|  | 1750 | /* MIGRATION ONLY! */ | 
|  | 1751 | BUG_ON(!(mres->flags & DLM_MRES_MIGRATION)); | 
|  | 1752 |  | 
|  | 1753 | spin_lock(&res->spinlock); | 
| Kurt Hackel | e17e75e | 2007-01-05 15:04:49 -0800 | [diff] [blame] | 1754 | for (j = DLM_GRANTED_LIST; j <= DLM_BLOCKED_LIST; j++) { | 
|  | 1755 | tmpq = dlm_list_idx_to_ptr(res, j); | 
|  | 1756 | list_for_each(iter, tmpq) { | 
|  | 1757 | lock = list_entry(iter, struct dlm_lock, list); | 
|  | 1758 | if (lock->ml.cookie != ml->cookie) | 
|  | 1759 | lock = NULL; | 
|  | 1760 | else | 
|  | 1761 | break; | 
|  | 1762 | } | 
|  | 1763 | if (lock) | 
| Kurt Hackel | 6714d8e | 2005-12-15 14:31:23 -0800 | [diff] [blame] | 1764 | break; | 
|  | 1765 | } | 
|  | 1766 |  | 
|  | 1767 | /* lock is always created locally first, and | 
|  | 1768 | * destroyed locally last.  it must be on the list */ | 
|  | 1769 | if (!lock) { | 
| Kurt Hackel | 2900485 | 2006-03-02 16:43:36 -0800 | [diff] [blame] | 1770 | u64 c = ml->cookie; | 
| Kurt Hackel | 6714d8e | 2005-12-15 14:31:23 -0800 | [diff] [blame] | 1771 | mlog(ML_ERROR, "could not find local lock " | 
| Kurt Hackel | 2900485 | 2006-03-02 16:43:36 -0800 | [diff] [blame] | 1772 | "with cookie %u:%llu!\n", | 
| Kurt Hackel | 74aa258 | 2007-01-17 15:11:36 -0800 | [diff] [blame] | 1773 | dlm_get_lock_cookie_node(be64_to_cpu(c)), | 
|  | 1774 | dlm_get_lock_cookie_seq(be64_to_cpu(c))); | 
| Kurt Hackel | 71ac106 | 2007-01-05 15:02:30 -0800 | [diff] [blame] | 1775 | __dlm_print_one_lock_resource(res); | 
| Kurt Hackel | 6714d8e | 2005-12-15 14:31:23 -0800 | [diff] [blame] | 1776 | BUG(); | 
|  | 1777 | } | 
|  | 1778 | BUG_ON(lock->ml.node != ml->node); | 
|  | 1779 |  | 
| Kurt Hackel | e17e75e | 2007-01-05 15:04:49 -0800 | [diff] [blame] | 1780 | if (tmpq != queue) { | 
|  | 1781 | mlog(0, "lock was on %u instead of %u for %.*s\n", | 
|  | 1782 | j, ml->list, res->lockname.len, res->lockname.name); | 
|  | 1783 | spin_unlock(&res->spinlock); | 
|  | 1784 | continue; | 
|  | 1785 | } | 
|  | 1786 |  | 
| Kurt Hackel | 6714d8e | 2005-12-15 14:31:23 -0800 | [diff] [blame] | 1787 | /* see NOTE above about why we do not update | 
|  | 1788 | * to match the master here */ | 
|  | 1789 |  | 
|  | 1790 | /* move the lock to its proper place */ | 
|  | 1791 | /* do not alter lock refcount.  switching lists. */ | 
| Akinobu Mita | f116629 | 2006-06-26 00:24:46 -0700 | [diff] [blame] | 1792 | list_move_tail(&lock->list, queue); | 
| Kurt Hackel | 6714d8e | 2005-12-15 14:31:23 -0800 | [diff] [blame] | 1793 | spin_unlock(&res->spinlock); | 
| Kurt Hackel | ba2bf21 | 2006-12-01 14:47:20 -0800 | [diff] [blame] | 1794 | added++; | 
| Kurt Hackel | 6714d8e | 2005-12-15 14:31:23 -0800 | [diff] [blame] | 1795 |  | 
|  | 1796 | mlog(0, "just reordered a local lock!\n"); | 
|  | 1797 | continue; | 
|  | 1798 | } | 
|  | 1799 |  | 
|  | 1800 | /* lock is for another node. */ | 
|  | 1801 | newlock = dlm_new_lock(ml->type, ml->node, | 
|  | 1802 | be64_to_cpu(ml->cookie), NULL); | 
|  | 1803 | if (!newlock) { | 
|  | 1804 | ret = -ENOMEM; | 
|  | 1805 | goto leave; | 
|  | 1806 | } | 
|  | 1807 | lksb = newlock->lksb; | 
|  | 1808 | dlm_lock_attach_lockres(newlock, res); | 
|  | 1809 |  | 
|  | 1810 | if (ml->convert_type != LKM_IVMODE) { | 
|  | 1811 | BUG_ON(queue != &res->converting); | 
|  | 1812 | newlock->ml.convert_type = ml->convert_type; | 
|  | 1813 | } | 
|  | 1814 | lksb->flags |= (ml->flags & | 
|  | 1815 | (DLM_LKSB_PUT_LVB|DLM_LKSB_GET_LVB)); | 
| Kurt Hackel | ccd8b1f | 2006-05-01 11:32:14 -0700 | [diff] [blame] | 1816 |  | 
|  | 1817 | if (ml->type == LKM_NLMODE) | 
|  | 1818 | goto skip_lvb; | 
|  | 1819 |  | 
| Kurt Hackel | 8bc674c | 2006-04-27 18:02:10 -0700 | [diff] [blame] | 1820 | if (!dlm_lvb_is_empty(mres->lvb)) { | 
| Kurt Hackel | 6714d8e | 2005-12-15 14:31:23 -0800 | [diff] [blame] | 1821 | if (lksb->flags & DLM_LKSB_PUT_LVB) { | 
|  | 1822 | /* other node was trying to update | 
|  | 1823 | * lvb when node died.  recreate the | 
|  | 1824 | * lksb with the updated lvb. */ | 
|  | 1825 | memcpy(lksb->lvb, mres->lvb, DLM_LVB_LEN); | 
| Kurt Hackel | ccd8b1f | 2006-05-01 11:32:14 -0700 | [diff] [blame] | 1826 | /* the lock resource lvb update must happen | 
|  | 1827 | * NOW, before the spinlock is dropped. | 
|  | 1828 | * we no longer wait for the AST to update | 
|  | 1829 | * the lvb. */ | 
|  | 1830 | memcpy(res->lvb, mres->lvb, DLM_LVB_LEN); | 
| Kurt Hackel | 6714d8e | 2005-12-15 14:31:23 -0800 | [diff] [blame] | 1831 | } else { | 
|  | 1832 | /* otherwise, the node is sending its | 
|  | 1833 | * most recent valid lvb info */ | 
|  | 1834 | BUG_ON(ml->type != LKM_EXMODE && | 
|  | 1835 | ml->type != LKM_PRMODE); | 
| Kurt Hackel | 8bc674c | 2006-04-27 18:02:10 -0700 | [diff] [blame] | 1836 | if (!dlm_lvb_is_empty(res->lvb) && | 
| Kurt Hackel | ccd8b1f | 2006-05-01 11:32:14 -0700 | [diff] [blame] | 1837 | (ml->type == LKM_EXMODE || | 
|  | 1838 | memcmp(res->lvb, mres->lvb, DLM_LVB_LEN))) { | 
|  | 1839 | int i; | 
|  | 1840 | mlog(ML_ERROR, "%s:%.*s: received bad " | 
|  | 1841 | "lvb! type=%d\n", dlm->name, | 
|  | 1842 | res->lockname.len, | 
|  | 1843 | res->lockname.name, ml->type); | 
|  | 1844 | printk("lockres lvb=["); | 
|  | 1845 | for (i=0; i<DLM_LVB_LEN; i++) | 
|  | 1846 | printk("%02x", res->lvb[i]); | 
|  | 1847 | printk("]\nmigrated lvb=["); | 
|  | 1848 | for (i=0; i<DLM_LVB_LEN; i++) | 
|  | 1849 | printk("%02x", mres->lvb[i]); | 
|  | 1850 | printk("]\n"); | 
|  | 1851 | dlm_print_one_lock_resource(res); | 
|  | 1852 | BUG(); | 
| Kurt Hackel | 6714d8e | 2005-12-15 14:31:23 -0800 | [diff] [blame] | 1853 | } | 
|  | 1854 | memcpy(res->lvb, mres->lvb, DLM_LVB_LEN); | 
|  | 1855 | } | 
|  | 1856 | } | 
| Kurt Hackel | ccd8b1f | 2006-05-01 11:32:14 -0700 | [diff] [blame] | 1857 | skip_lvb: | 
| Kurt Hackel | 6714d8e | 2005-12-15 14:31:23 -0800 | [diff] [blame] | 1858 |  | 
|  | 1859 | /* NOTE: | 
|  | 1860 | * wrt lock queue ordering and recovery: | 
|  | 1861 | *    1. order of locks on granted queue is | 
|  | 1862 | *       meaningless. | 
|  | 1863 | *    2. order of locks on converting queue is | 
|  | 1864 | *       LOST with the node death.  sorry charlie. | 
|  | 1865 | *    3. order of locks on the blocked queue is | 
|  | 1866 | *       also LOST. | 
|  | 1867 | * order of locks does not affect integrity, it | 
|  | 1868 | * just means that a lock request may get pushed | 
|  | 1869 | * back in line as a result of the node death. | 
|  | 1870 | * also note that for a given node the lock order | 
|  | 1871 | * for its secondary queue locks is preserved | 
|  | 1872 | * relative to each other, but clearly *not* | 
|  | 1873 | * preserved relative to locks from other nodes. | 
|  | 1874 | */ | 
| Kurt Hackel | c3187ce | 2006-04-27 18:05:41 -0700 | [diff] [blame] | 1875 | bad = 0; | 
| Kurt Hackel | 6714d8e | 2005-12-15 14:31:23 -0800 | [diff] [blame] | 1876 | spin_lock(&res->spinlock); | 
| Kurt Hackel | c3187ce | 2006-04-27 18:05:41 -0700 | [diff] [blame] | 1877 | list_for_each_entry(lock, queue, list) { | 
|  | 1878 | if (lock->ml.cookie == ml->cookie) { | 
|  | 1879 | u64 c = lock->ml.cookie; | 
|  | 1880 | mlog(ML_ERROR, "%s:%.*s: %u:%llu: lock already " | 
|  | 1881 | "exists on this lockres!\n", dlm->name, | 
|  | 1882 | res->lockname.len, res->lockname.name, | 
| Kurt Hackel | 74aa258 | 2007-01-17 15:11:36 -0800 | [diff] [blame] | 1883 | dlm_get_lock_cookie_node(be64_to_cpu(c)), | 
|  | 1884 | dlm_get_lock_cookie_seq(be64_to_cpu(c))); | 
| Kurt Hackel | c3187ce | 2006-04-27 18:05:41 -0700 | [diff] [blame] | 1885 |  | 
|  | 1886 | mlog(ML_NOTICE, "sent lock: type=%d, conv=%d, " | 
|  | 1887 | "node=%u, cookie=%u:%llu, queue=%d\n", | 
|  | 1888 | ml->type, ml->convert_type, ml->node, | 
| Kurt Hackel | 74aa258 | 2007-01-17 15:11:36 -0800 | [diff] [blame] | 1889 | dlm_get_lock_cookie_node(be64_to_cpu(ml->cookie)), | 
|  | 1890 | dlm_get_lock_cookie_seq(be64_to_cpu(ml->cookie)), | 
| Kurt Hackel | c3187ce | 2006-04-27 18:05:41 -0700 | [diff] [blame] | 1891 | ml->list); | 
|  | 1892 |  | 
|  | 1893 | __dlm_print_one_lock_resource(res); | 
|  | 1894 | bad = 1; | 
|  | 1895 | break; | 
|  | 1896 | } | 
|  | 1897 | } | 
|  | 1898 | if (!bad) { | 
|  | 1899 | dlm_lock_get(newlock); | 
|  | 1900 | list_add_tail(&newlock->list, queue); | 
| Kurt Hackel | ba2bf21 | 2006-12-01 14:47:20 -0800 | [diff] [blame] | 1901 | mlog(0, "%s:%.*s: added lock for node %u, " | 
|  | 1902 | "setting refmap bit\n", dlm->name, | 
|  | 1903 | res->lockname.len, res->lockname.name, ml->node); | 
|  | 1904 | dlm_lockres_set_refmap_bit(ml->node, res); | 
|  | 1905 | added++; | 
| Kurt Hackel | c3187ce | 2006-04-27 18:05:41 -0700 | [diff] [blame] | 1906 | } | 
| Kurt Hackel | 6714d8e | 2005-12-15 14:31:23 -0800 | [diff] [blame] | 1907 | spin_unlock(&res->spinlock); | 
|  | 1908 | } | 
|  | 1909 | mlog(0, "done running all the locks\n"); | 
|  | 1910 |  | 
|  | 1911 | leave: | 
| Kurt Hackel | ba2bf21 | 2006-12-01 14:47:20 -0800 | [diff] [blame] | 1912 | /* balance the ref taken when the work was queued */ | 
| Kurt Hackel | 50635f1 | 2007-01-17 14:54:39 -0800 | [diff] [blame] | 1913 | spin_lock(&res->spinlock); | 
|  | 1914 | dlm_lockres_drop_inflight_ref(dlm, res); | 
|  | 1915 | spin_unlock(&res->spinlock); | 
| Kurt Hackel | ba2bf21 | 2006-12-01 14:47:20 -0800 | [diff] [blame] | 1916 |  | 
| Kurt Hackel | 6714d8e | 2005-12-15 14:31:23 -0800 | [diff] [blame] | 1917 | if (ret < 0) { | 
|  | 1918 | mlog_errno(ret); | 
|  | 1919 | if (newlock) | 
|  | 1920 | dlm_lock_put(newlock); | 
|  | 1921 | } | 
|  | 1922 |  | 
|  | 1923 | mlog_exit(ret); | 
|  | 1924 | return ret; | 
|  | 1925 | } | 
|  | 1926 |  | 
|  | 1927 | void dlm_move_lockres_to_recovery_list(struct dlm_ctxt *dlm, | 
|  | 1928 | struct dlm_lock_resource *res) | 
|  | 1929 | { | 
|  | 1930 | int i; | 
|  | 1931 | struct list_head *queue, *iter, *iter2; | 
|  | 1932 | struct dlm_lock *lock; | 
|  | 1933 |  | 
|  | 1934 | res->state |= DLM_LOCK_RES_RECOVERING; | 
| Kurt Hackel | 69d72b0 | 2006-05-01 10:57:51 -0700 | [diff] [blame] | 1935 | if (!list_empty(&res->recovering)) { | 
|  | 1936 | mlog(0, | 
|  | 1937 | "Recovering res %s:%.*s, is already on recovery list!\n", | 
|  | 1938 | dlm->name, res->lockname.len, res->lockname.name); | 
| Kurt Hackel | 6714d8e | 2005-12-15 14:31:23 -0800 | [diff] [blame] | 1939 | list_del_init(&res->recovering); | 
| Kurt Hackel | 69d72b0 | 2006-05-01 10:57:51 -0700 | [diff] [blame] | 1940 | } | 
|  | 1941 | /* We need to hold a reference while on the recovery list */ | 
|  | 1942 | dlm_lockres_get(res); | 
| Kurt Hackel | 6714d8e | 2005-12-15 14:31:23 -0800 | [diff] [blame] | 1943 | list_add_tail(&res->recovering, &dlm->reco.resources); | 
|  | 1944 |  | 
|  | 1945 | /* find any pending locks and put them back on proper list */ | 
|  | 1946 | for (i=DLM_BLOCKED_LIST; i>=DLM_GRANTED_LIST; i--) { | 
|  | 1947 | queue = dlm_list_idx_to_ptr(res, i); | 
|  | 1948 | list_for_each_safe(iter, iter2, queue) { | 
|  | 1949 | lock = list_entry(iter, struct dlm_lock, list); | 
|  | 1950 | dlm_lock_get(lock); | 
|  | 1951 | if (lock->convert_pending) { | 
|  | 1952 | /* move converting lock back to granted */ | 
|  | 1953 | BUG_ON(i != DLM_CONVERTING_LIST); | 
|  | 1954 | mlog(0, "node died with convert pending " | 
|  | 1955 | "on %.*s. move back to granted list.\n", | 
|  | 1956 | res->lockname.len, res->lockname.name); | 
|  | 1957 | dlm_revert_pending_convert(res, lock); | 
|  | 1958 | lock->convert_pending = 0; | 
|  | 1959 | } else if (lock->lock_pending) { | 
|  | 1960 | /* remove pending lock requests completely */ | 
|  | 1961 | BUG_ON(i != DLM_BLOCKED_LIST); | 
|  | 1962 | mlog(0, "node died with lock pending " | 
|  | 1963 | "on %.*s. remove from blocked list and skip.\n", | 
|  | 1964 | res->lockname.len, res->lockname.name); | 
|  | 1965 | /* lock will be floating until ref in | 
|  | 1966 | * dlmlock_remote is freed after the network | 
|  | 1967 | * call returns.  ok for it to not be on any | 
|  | 1968 | * list since no ast can be called | 
|  | 1969 | * (the master is dead). */ | 
|  | 1970 | dlm_revert_pending_lock(res, lock); | 
|  | 1971 | lock->lock_pending = 0; | 
|  | 1972 | } else if (lock->unlock_pending) { | 
|  | 1973 | /* if an unlock was in progress, treat as | 
|  | 1974 | * if this had completed successfully | 
|  | 1975 | * before sending this lock state to the | 
|  | 1976 | * new master.  note that the dlm_unlock | 
|  | 1977 | * call is still responsible for calling | 
|  | 1978 | * the unlockast.  that will happen after | 
|  | 1979 | * the network call times out.  for now, | 
|  | 1980 | * just move lists to prepare the new | 
|  | 1981 | * recovery master.  */ | 
|  | 1982 | BUG_ON(i != DLM_GRANTED_LIST); | 
|  | 1983 | mlog(0, "node died with unlock pending " | 
|  | 1984 | "on %.*s. remove from blocked list and skip.\n", | 
|  | 1985 | res->lockname.len, res->lockname.name); | 
|  | 1986 | dlm_commit_pending_unlock(res, lock); | 
|  | 1987 | lock->unlock_pending = 0; | 
|  | 1988 | } else if (lock->cancel_pending) { | 
|  | 1989 | /* if a cancel was in progress, treat as | 
|  | 1990 | * if this had completed successfully | 
|  | 1991 | * before sending this lock state to the | 
|  | 1992 | * new master */ | 
|  | 1993 | BUG_ON(i != DLM_CONVERTING_LIST); | 
|  | 1994 | mlog(0, "node died with cancel pending " | 
|  | 1995 | "on %.*s. move back to granted list.\n", | 
|  | 1996 | res->lockname.len, res->lockname.name); | 
|  | 1997 | dlm_commit_pending_cancel(res, lock); | 
|  | 1998 | lock->cancel_pending = 0; | 
|  | 1999 | } | 
|  | 2000 | dlm_lock_put(lock); | 
|  | 2001 | } | 
|  | 2002 | } | 
|  | 2003 | } | 
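|  |  | /* to summarize the pending-op fixups above: a pending convert is | 
|  |  | * rolled back to its pre-convert mode, a pending lock request is | 
|  |  | * dropped entirely (no AST can ever arrive from the dead master), | 
|  |  | * a pending unlock is treated as having completed, and a pending | 
|  |  | * cancel is treated as having succeeded.  the state handed to the | 
|  |  | * recovery master thus contains no half-finished operations. */ | 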
|  | 2004 |  | 
|  | 2005 |  | 
|  | 2006 |  | 
|  | 2007 | /* removes all recovered locks from the recovery list. | 
|  | 2008 | * sets the res->owner to the new master. | 
|  | 2009 | * unsets the RECOVERY flag and wakes waiters. */ | 
|  | 2010 | static void dlm_finish_local_lockres_recovery(struct dlm_ctxt *dlm, | 
|  | 2011 | u8 dead_node, u8 new_master) | 
|  | 2012 | { | 
|  | 2013 | int i; | 
| Mark Fasheh | 81f2094 | 2006-02-28 17:31:22 -0800 | [diff] [blame] | 2014 | struct list_head *iter, *iter2; | 
|  | 2015 | struct hlist_node *hash_iter; | 
|  | 2016 | struct hlist_head *bucket; | 
|  | 2017 |  | 
| Kurt Hackel | 6714d8e | 2005-12-15 14:31:23 -0800 | [diff] [blame] | 2018 | struct dlm_lock_resource *res; | 
|  | 2019 |  | 
|  | 2020 | mlog_entry_void(); | 
|  | 2021 |  | 
|  | 2022 | assert_spin_locked(&dlm->spinlock); | 
|  | 2023 |  | 
|  | 2024 | list_for_each_safe(iter, iter2, &dlm->reco.resources) { | 
|  | 2025 | res = list_entry(iter, struct dlm_lock_resource, recovering); | 
|  | 2026 | if (res->owner == dead_node) { | 
|  | 2027 | list_del_init(&res->recovering); | 
|  | 2028 | spin_lock(&res->spinlock); | 
| Kurt Hackel | ba2bf21 | 2006-12-01 14:47:20 -0800 | [diff] [blame] | 2029 | /* new_master has our reference from | 
|  | 2030 | * the lock state sent during recovery */ | 
| Kurt Hackel | 6714d8e | 2005-12-15 14:31:23 -0800 | [diff] [blame] | 2031 | dlm_change_lockres_owner(dlm, res, new_master); | 
|  | 2032 | res->state &= ~DLM_LOCK_RES_RECOVERING; | 
| Kurt Hackel | ba2bf21 | 2006-12-01 14:47:20 -0800 | [diff] [blame] | 2033 | if (__dlm_lockres_has_locks(res)) | 
| Kurt Hackel | 69d72b0 | 2006-05-01 10:57:51 -0700 | [diff] [blame] | 2034 | __dlm_dirty_lockres(dlm, res); | 
| Kurt Hackel | 6714d8e | 2005-12-15 14:31:23 -0800 | [diff] [blame] | 2035 | spin_unlock(&res->spinlock); | 
|  | 2036 | wake_up(&res->wq); | 
| Kurt Hackel | 69d72b0 | 2006-05-01 10:57:51 -0700 | [diff] [blame] | 2037 | dlm_lockres_put(res); | 
| Kurt Hackel | 6714d8e | 2005-12-15 14:31:23 -0800 | [diff] [blame] | 2038 | } | 
|  | 2039 | } | 
|  | 2040 |  | 
|  | 2041 | /* this will become unnecessary eventually, but | 
|  | 2042 | * for now we need to run the whole hash, clear | 
|  | 2043 | * the RECOVERING state and set the owner | 
|  | 2044 | * if necessary */ | 
| Mark Fasheh | 81f2094 | 2006-02-28 17:31:22 -0800 | [diff] [blame] | 2045 | for (i = 0; i < DLM_HASH_BUCKETS; i++) { | 
| Daniel Phillips | 03d864c | 2006-03-10 18:08:16 -0800 | [diff] [blame] | 2046 | bucket = dlm_lockres_hash(dlm, i); | 
| Mark Fasheh | 81f2094 | 2006-02-28 17:31:22 -0800 | [diff] [blame] | 2047 | hlist_for_each_entry(res, hash_iter, bucket, hash_node) { | 
| Kurt Hackel | 6714d8e | 2005-12-15 14:31:23 -0800 | [diff] [blame] | 2048 | if (res->state & DLM_LOCK_RES_RECOVERING) { | 
|  | 2049 | if (res->owner == dead_node) { | 
|  | 2050 | mlog(0, "(this=%u) res %.*s owner=%u " | 
|  | 2051 | "was not on recovering list, but " | 
|  | 2052 | "clearing state anyway\n", | 
|  | 2053 | dlm->node_num, res->lockname.len, | 
|  | 2054 | res->lockname.name, new_master); | 
|  | 2055 | } else if (res->owner == dlm->node_num) { | 
|  | 2056 | mlog(0, "(this=%u) res %.*s owner=%u " | 
|  | 2057 | "was not on recovering list, " | 
|  | 2058 | "owner is THIS node, clearing\n", | 
|  | 2059 | dlm->node_num, res->lockname.len, | 
|  | 2060 | res->lockname.name, new_master); | 
|  | 2061 | } else | 
|  | 2062 | continue; | 
|  | 2063 |  | 
| Kurt Hackel | c03872f | 2006-03-06 14:08:49 -0800 | [diff] [blame] | 2064 | if (!list_empty(&res->recovering)) { | 
|  | 2065 | mlog(0, "%s:%.*s: lockres was " | 
|  | 2066 | "marked RECOVERING, owner=%u\n", | 
|  | 2067 | dlm->name, res->lockname.len, | 
|  | 2068 | res->lockname.name, res->owner); | 
|  | 2069 | list_del_init(&res->recovering); | 
| Kurt Hackel | 69d72b0 | 2006-05-01 10:57:51 -0700 | [diff] [blame] | 2070 | dlm_lockres_put(res); | 
| Kurt Hackel | c03872f | 2006-03-06 14:08:49 -0800 | [diff] [blame] | 2071 | } | 
| Kurt Hackel | 6714d8e | 2005-12-15 14:31:23 -0800 | [diff] [blame] | 2072 | spin_lock(&res->spinlock); | 
| Kurt Hackel | ba2bf21 | 2006-12-01 14:47:20 -0800 | [diff] [blame] | 2073 | /* new_master has our reference from | 
|  | 2074 | * the lock state sent during recovery */ | 
| Kurt Hackel | 6714d8e | 2005-12-15 14:31:23 -0800 | [diff] [blame] | 2075 | dlm_change_lockres_owner(dlm, res, new_master); | 
|  | 2076 | res->state &= ~DLM_LOCK_RES_RECOVERING; | 
| Kurt Hackel | ba2bf21 | 2006-12-01 14:47:20 -0800 | [diff] [blame] | 2077 | if (__dlm_lockres_has_locks(res)) | 
| Kurt Hackel | 69d72b0 | 2006-05-01 10:57:51 -0700 | [diff] [blame] | 2078 | __dlm_dirty_lockres(dlm, res); | 
| Kurt Hackel | 6714d8e | 2005-12-15 14:31:23 -0800 | [diff] [blame] | 2079 | spin_unlock(&res->spinlock); | 
|  | 2080 | wake_up(&res->wq); | 
|  | 2081 | } | 
|  | 2082 | } | 
|  | 2083 | } | 
|  | 2084 | } | 
|  | 2085 |  | 
|  | 2086 | static inline int dlm_lvb_needs_invalidation(struct dlm_lock *lock, int local) | 
|  | 2087 | { | 
|  | 2088 | if (local) { | 
|  | 2089 | if (lock->ml.type != LKM_EXMODE && | 
|  | 2090 | lock->ml.type != LKM_PRMODE) | 
|  | 2091 | return 1; | 
|  | 2092 | } else if (lock->ml.type == LKM_EXMODE) | 
|  | 2093 | return 1; | 
|  | 2094 | return 0; | 
|  | 2095 | } | 
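|  |  | /* in table form: | 
|  |  | * | 
|  |  | *   caller              locks scanned      lvb blanked when | 
|  |  | *   ------------------  -----------------  ------------------------- | 
|  |  | *   non-master (local)  this node's locks  we hold neither EX nor PR | 
|  |  | *   master              dead node's locks  dead node held EX | 
|  |  | * | 
|  |  | * a secondary copy is only trustworthy while this node itself holds | 
|  |  | * a PR or EX lock; the master copy only goes stale if the dead node | 
|  |  | * could have been writing it. */ | 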
|  | 2096 |  | 
|  | 2097 | static void dlm_revalidate_lvb(struct dlm_ctxt *dlm, | 
|  | 2098 | struct dlm_lock_resource *res, u8 dead_node) | 
|  | 2099 | { | 
|  | 2100 | struct list_head *iter, *queue; | 
|  | 2101 | struct dlm_lock *lock; | 
|  | 2102 | int blank_lvb = 0, local = 0; | 
|  | 2103 | int i; | 
|  | 2104 | u8 search_node; | 
|  | 2105 |  | 
|  | 2106 | assert_spin_locked(&dlm->spinlock); | 
|  | 2107 | assert_spin_locked(&res->spinlock); | 
|  | 2108 |  | 
|  | 2109 | if (res->owner == dlm->node_num) | 
|  | 2110 | /* if this node owned the lockres, and if the dead node | 
|  | 2111 | * had an EX when he died, blank out the lvb */ | 
|  | 2112 | search_node = dead_node; | 
|  | 2113 | else { | 
|  | 2114 | /* if this is a secondary lockres, and we had no EX or PR | 
|  | 2115 | * locks granted, we can no longer trust the lvb */ | 
|  | 2116 | search_node = dlm->node_num; | 
|  | 2117 | local = 1;  /* check local state for valid lvb */ | 
|  | 2118 | } | 
|  | 2119 |  | 
|  | 2120 | for (i=DLM_GRANTED_LIST; i<=DLM_CONVERTING_LIST; i++) { | 
|  | 2121 | queue = dlm_list_idx_to_ptr(res, i); | 
|  | 2122 | list_for_each(iter, queue) { | 
|  | 2123 | lock = list_entry(iter, struct dlm_lock, list); | 
|  | 2124 | if (lock->ml.node == search_node) { | 
|  | 2125 | if (dlm_lvb_needs_invalidation(lock, local)) { | 
|  | 2126 | /* zero the lksb lvb and lockres lvb */ | 
|  | 2127 | blank_lvb = 1; | 
|  | 2128 | memset(lock->lksb->lvb, 0, DLM_LVB_LEN); | 
|  | 2129 | } | 
|  | 2130 | } | 
|  | 2131 | } | 
|  | 2132 | } | 
|  | 2133 |  | 
|  | 2134 | if (blank_lvb) { | 
|  | 2135 | mlog(0, "clearing %.*s lvb, dead node %u had EX\n", | 
|  | 2136 | res->lockname.len, res->lockname.name, dead_node); | 
|  | 2137 | memset(res->lvb, 0, DLM_LVB_LEN); | 
|  | 2138 | } | 
|  | 2139 | } | 
|  | 2140 |  | 
|  | 2141 | static void dlm_free_dead_locks(struct dlm_ctxt *dlm, | 
|  | 2142 | struct dlm_lock_resource *res, u8 dead_node) | 
|  | 2143 | { | 
|  | 2144 | struct list_head *iter, *tmpiter; | 
|  | 2145 | struct dlm_lock *lock; | 
| Kurt Hackel | ba2bf21 | 2006-12-01 14:47:20 -0800 | [diff] [blame] | 2146 | unsigned int freed = 0; | 
| Kurt Hackel | 6714d8e | 2005-12-15 14:31:23 -0800 | [diff] [blame] | 2147 |  | 
|  | 2148 | /* this node is the lockres master: | 
|  | 2149 | * 1) remove any stale locks for the dead node | 
|  | 2150 | * 2) if the dead node had an EX when he died, blank out the lvb | 
|  | 2151 | */ | 
|  | 2152 | assert_spin_locked(&dlm->spinlock); | 
|  | 2153 | assert_spin_locked(&res->spinlock); | 
|  | 2154 |  | 
|  | 2155 | /* TODO: check pending_asts, pending_basts here */ | 
|  | 2156 | list_for_each_safe(iter, tmpiter, &res->granted) { | 
|  | 2157 | lock = list_entry(iter, struct dlm_lock, list); | 
|  | 2158 | if (lock->ml.node == dead_node) { | 
|  | 2159 | list_del_init(&lock->list); | 
|  | 2160 | dlm_lock_put(lock); | 
| Kurt Hackel | ba2bf21 | 2006-12-01 14:47:20 -0800 | [diff] [blame] | 2161 | freed++; | 
| Kurt Hackel | 6714d8e | 2005-12-15 14:31:23 -0800 | [diff] [blame] | 2162 | } | 
|  | 2163 | } | 
|  | 2164 | list_for_each_safe(iter, tmpiter, &res->converting) { | 
|  | 2165 | lock = list_entry(iter, struct dlm_lock, list); | 
|  | 2166 | if (lock->ml.node == dead_node) { | 
|  | 2167 | list_del_init(&lock->list); | 
|  | 2168 | dlm_lock_put(lock); | 
| Kurt Hackel | ba2bf21 | 2006-12-01 14:47:20 -0800 | [diff] [blame] | 2169 | freed++; | 
| Kurt Hackel | 6714d8e | 2005-12-15 14:31:23 -0800 | [diff] [blame] | 2170 | } | 
|  | 2171 | } | 
|  | 2172 | list_for_each_safe(iter, tmpiter, &res->blocked) { | 
|  | 2173 | lock = list_entry(iter, struct dlm_lock, list); | 
|  | 2174 | if (lock->ml.node == dead_node) { | 
|  | 2175 | list_del_init(&lock->list); | 
|  | 2176 | dlm_lock_put(lock); | 
| Kurt Hackel | ba2bf21 | 2006-12-01 14:47:20 -0800 | [diff] [blame] | 2177 | freed++; | 
| Kurt Hackel | 6714d8e | 2005-12-15 14:31:23 -0800 | [diff] [blame] | 2178 | } | 
|  | 2179 | } | 
|  | 2180 |  | 
| Kurt Hackel | ba2bf21 | 2006-12-01 14:47:20 -0800 | [diff] [blame] | 2181 | if (freed) { | 
|  | 2182 | mlog(0, "%s:%.*s: freed %u locks for dead node %u, " | 
|  | 2183 | "dropping ref from lockres\n", dlm->name, | 
|  | 2184 | res->lockname.len, res->lockname.name, freed, dead_node); | 
|  | 2185 | BUG_ON(!test_bit(dead_node, res->refmap)); | 
|  | 2186 | dlm_lockres_clear_refmap_bit(dead_node, res); | 
|  | 2187 | } else if (test_bit(dead_node, res->refmap)) { | 
|  | 2188 | mlog(0, "%s:%.*s: dead node %u had a ref, but had " | 
|  | 2189 | "no locks and had not purged before dying\n", dlm->name, | 
|  | 2190 | res->lockname.len, res->lockname.name, dead_node); | 
|  | 2191 | dlm_lockres_clear_refmap_bit(dead_node, res); | 
|  | 2192 | } | 
|  | 2193 |  | 
| Kurt Hackel | 6714d8e | 2005-12-15 14:31:23 -0800 | [diff] [blame] | 2194 | /* do not kick thread yet */ | 
|  | 2195 | __dlm_dirty_lockres(dlm, res); | 
|  | 2196 | } | 
|  | 2197 |  | 
|  | 2198 | /* if this node had no EX or PR locks granted on a given | 
|  | 2199 | * lockres it does not own, its cached lvb can no longer be | 
|  | 2200 | * trusted once the master dies, so dlm_revalidate_lvb() | 
|  | 2201 | * above zeroes it before the lock state is sent to the | 
|  | 2202 | * recovery master. */ | 
|  | 2203 |  | 
|  | 2204 |  | 
|  | 2205 | static void dlm_do_local_recovery_cleanup(struct dlm_ctxt *dlm, u8 dead_node) | 
|  | 2206 | { | 
| Mark Fasheh | 81f2094 | 2006-02-28 17:31:22 -0800 | [diff] [blame] | 2207 | struct hlist_node *iter; | 
| Kurt Hackel | 6714d8e | 2005-12-15 14:31:23 -0800 | [diff] [blame] | 2208 | struct dlm_lock_resource *res; | 
|  | 2209 | int i; | 
| Mark Fasheh | 81f2094 | 2006-02-28 17:31:22 -0800 | [diff] [blame] | 2210 | struct hlist_head *bucket; | 
| Kurt Hackel | e2faea4 | 2006-01-12 14:24:55 -0800 | [diff] [blame] | 2211 | struct dlm_lock *lock; | 
| Kurt Hackel | 6714d8e | 2005-12-15 14:31:23 -0800 | [diff] [blame] | 2212 |  | 
|  | 2213 |  | 
|  | 2214 | /* purge any stale mles */ | 
|  | 2215 | dlm_clean_master_list(dlm, dead_node); | 
|  | 2216 |  | 
|  | 2217 | /* | 
|  | 2218 | * now clean up all lock resources.  there are two rules: | 
|  | 2219 | * | 
|  | 2220 | * 1) if the dead node was the master, move the lockres | 
|  | 2221 | *    to the recovering list.  set the RECOVERING flag. | 
|  | 2222 | *    this lockres needs to be cleaned up before it can | 
|  | 2223 | *    be used further. | 
|  | 2224 | * | 
|  | 2225 | * 2) if this node was the master, remove all locks from | 
|  | 2226 | *    each of the lockres queues that were owned by the | 
|  | 2227 | *    dead node.  once recovery finishes, the dlm thread | 
|  | 2228 | *    can be kicked again to see if any ASTs or BASTs | 
|  | 2229 | *    need to be fired as a result. | 
|  | 2230 | */ | 
| Mark Fasheh | 81f2094 | 2006-02-28 17:31:22 -0800 | [diff] [blame] | 2231 | for (i = 0; i < DLM_HASH_BUCKETS; i++) { | 
| Daniel Phillips | 03d864c | 2006-03-10 18:08:16 -0800 | [diff] [blame] | 2232 | bucket = dlm_lockres_hash(dlm, i); | 
| Mark Fasheh | 81f2094 | 2006-02-28 17:31:22 -0800 | [diff] [blame] | 2233 | hlist_for_each_entry(res, iter, bucket, hash_node) { | 
| Kurt Hackel | e2faea4 | 2006-01-12 14:24:55 -0800 | [diff] [blame] | 2234 | /* always prune any $RECOVERY entries for dead nodes, | 
|  | 2235 | * otherwise hangs can occur during later recovery */ | 
| Kurt Hackel | 6714d8e | 2005-12-15 14:31:23 -0800 | [diff] [blame] | 2236 | if (dlm_is_recovery_lock(res->lockname.name, | 
| Kurt Hackel | e2faea4 | 2006-01-12 14:24:55 -0800 | [diff] [blame] | 2237 | res->lockname.len)) { | 
|  | 2238 | spin_lock(&res->spinlock); | 
|  | 2239 | list_for_each_entry(lock, &res->granted, list) { | 
|  | 2240 | if (lock->ml.node == dead_node) { | 
|  | 2241 | mlog(0, "AHA! there was " | 
|  | 2242 | "a $RECOVERY lock for dead " | 
|  | 2243 | "node %u (%s)!\n", | 
|  | 2244 | dead_node, dlm->name); | 
|  | 2245 | list_del_init(&lock->list); | 
|  | 2246 | dlm_lock_put(lock); | 
|  | 2247 | break; | 
|  | 2248 | } | 
|  | 2249 | } | 
|  | 2250 | spin_unlock(&res->spinlock); | 
| Kurt Hackel | 6714d8e | 2005-12-15 14:31:23 -0800 | [diff] [blame] | 2251 | continue; | 
| Kurt Hackel | e2faea4 | 2006-01-12 14:24:55 -0800 | [diff] [blame] | 2252 | } | 
| Kurt Hackel | 6714d8e | 2005-12-15 14:31:23 -0800 | [diff] [blame] | 2253 | spin_lock(&res->spinlock); | 
|  | 2254 | /* zero the lvb if necessary */ | 
|  | 2255 | dlm_revalidate_lvb(dlm, res, dead_node); | 
| Kurt Hackel | ba2bf21 | 2006-12-01 14:47:20 -0800 | [diff] [blame] | 2256 | if (res->owner == dead_node) { | 
|  | 2257 | if (res->state & DLM_LOCK_RES_DROPPING_REF) | 
|  | 2258 | mlog(0, "%s:%.*s: owned by " | 
|  | 2259 | "dead node %u, this node was " | 
|  | 2260 | "dropping its ref when it died. " | 
|  | 2261 | "continue, dropping the flag.\n", | 
|  | 2262 | dlm->name, res->lockname.len, | 
|  | 2263 | res->lockname.name, dead_node); | 
|  | 2264 |  | 
|  | 2265 | /* the wake_up for this will happen when the | 
|  | 2266 | * RECOVERING flag is dropped later */ | 
|  | 2267 | res->state &= ~DLM_LOCK_RES_DROPPING_REF; | 
|  | 2268 |  | 
| Kurt Hackel | 6714d8e | 2005-12-15 14:31:23 -0800 | [diff] [blame] | 2269 | dlm_move_lockres_to_recovery_list(dlm, res); | 
| Kurt Hackel | ba2bf21 | 2006-12-01 14:47:20 -0800 | [diff] [blame] | 2270 | } else if (res->owner == dlm->node_num) { | 
| Kurt Hackel | 6714d8e | 2005-12-15 14:31:23 -0800 | [diff] [blame] | 2271 | dlm_free_dead_locks(dlm, res, dead_node); | 
|  | 2272 | __dlm_lockres_calc_usage(dlm, res); | 
|  | 2273 | } | 
|  | 2274 | spin_unlock(&res->spinlock); | 
|  | 2275 | } | 
|  | 2276 | } | 
|  | 2277 |  | 
|  | 2278 | } | 
|  | 2279 |  | 
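|  |  | /* For reference, a hedged sketch of dlm_is_recovery_lock(), which is | 
|  |  | * called above but defined in dlmcommon.h rather than here; it is | 
|  |  | * assumed to amount to a simple name check against the special | 
|  |  | * "$RECOVERY" lockres: */ | 
|  |  | static inline int dlm_is_recovery_lock_sketch(const char *lock_name, | 
|  |  | int name_len) | 
|  |  | { | 
|  |  | return (name_len == DLM_RECOVERY_LOCK_NAME_LEN && | 
|  |  | memcmp(lock_name, DLM_RECOVERY_LOCK_NAME, name_len) == 0); | 
|  |  | } | 
|  |  |  | 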
|  | 2280 | static void __dlm_hb_node_down(struct dlm_ctxt *dlm, int idx) | 
|  | 2281 | { | 
|  | 2282 | assert_spin_locked(&dlm->spinlock); | 
|  | 2283 |  | 
| Kurt Hackel | 466d1a4 | 2006-05-01 11:11:13 -0700 | [diff] [blame] | 2284 | if (dlm->reco.new_master == idx) { | 
|  | 2285 | mlog(0, "%s: recovery master %d just died\n", | 
|  | 2286 | dlm->name, idx); | 
|  | 2287 | if (dlm->reco.state & DLM_RECO_STATE_FINALIZE) { | 
|  | 2288 | /* finalize1 was reached, so it is safe to clear | 
|  | 2289 | * the new_master and dead_node.  that recovery | 
|  | 2290 | * is complete. */ | 
|  | 2291 | mlog(0, "%s: dead master %d had reached " | 
|  | 2292 | "finalize1 state, clearing\n", dlm->name, idx); | 
|  | 2293 | dlm->reco.state &= ~DLM_RECO_STATE_FINALIZE; | 
|  | 2294 | __dlm_reset_recovery(dlm); | 
|  | 2295 | } | 
|  | 2296 | } | 
|  | 2297 |  | 
| Kurt Hackel | 6714d8e | 2005-12-15 14:31:23 -0800 | [diff] [blame] | 2298 | /* check to see if the node is already considered dead */ | 
|  | 2299 | if (!test_bit(idx, dlm->live_nodes_map)) { | 
|  | 2300 | mlog(0, "for domain %s, node %d is already dead. " | 
|  | 2301 | "another node likely did recovery already.\n", | 
|  | 2302 | dlm->name, idx); | 
|  | 2303 | return; | 
|  | 2304 | } | 
|  | 2305 |  | 
|  | 2306 | /* ignore this node if it is not one we care about */ | 
|  | 2307 | if (!test_bit(idx, dlm->domain_map)) { | 
|  | 2308 | /* This also catches the case that we get a node down | 
|  | 2309 | * but haven't joined the domain yet. */ | 
|  | 2310 | mlog(0, "node %u already removed from domain!\n", idx); | 
|  | 2311 | return; | 
|  | 2312 | } | 
|  | 2313 |  | 
|  | 2314 | clear_bit(idx, dlm->live_nodes_map); | 
|  | 2315 |  | 
|  | 2316 | /* Clean up join state on node death. */ | 
|  | 2317 | if (dlm->joining_node == idx) { | 
|  | 2318 | mlog(0, "Clearing join state for node %u\n", idx); | 
|  | 2319 | __dlm_set_joining_node(dlm, DLM_LOCK_RES_OWNER_UNKNOWN); | 
|  | 2320 | } | 
|  | 2321 |  | 
|  | 2322 | /* make sure local cleanup occurs before the heartbeat events */ | 
|  | 2323 | if (!test_bit(idx, dlm->recovery_map)) | 
|  | 2324 | dlm_do_local_recovery_cleanup(dlm, idx); | 
|  | 2325 |  | 
|  | 2326 | /* notify anything attached to the heartbeat events */ | 
|  | 2327 | dlm_hb_event_notify_attached(dlm, idx, 0); | 
|  | 2328 |  | 
|  | 2329 | mlog(0, "node %u being removed from domain map!\n", idx); | 
|  | 2330 | clear_bit(idx, dlm->domain_map); | 
|  | 2331 | /* wake up migration waiters if a node goes down. | 
|  | 2332 | * perhaps later we can genericize this for other waiters. */ | 
|  | 2333 | wake_up(&dlm->migration_wq); | 
|  | 2334 |  | 
|  | 2335 | if (test_bit(idx, dlm->recovery_map)) | 
|  | 2336 | mlog(0, "domain %s, node %u already added " | 
|  | 2337 | "to recovery map!\n", dlm->name, idx); | 
|  | 2338 | else | 
|  | 2339 | set_bit(idx, dlm->recovery_map); | 
|  | 2340 | } | 
|  | 2341 |  | 
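|  |  | /* Editorial recap of the bitmap transitions above (comment only, not | 
|  |  | * from the original file): a dying node moves through three maps -- | 
|  |  | * | 
|  |  | * live_nodes_map: cleared here, set again by dlm_hb_node_up_cb() | 
|  |  | * domain_map:     cleared here, once local cleanup has been queued | 
|  |  | * recovery_map:   set here, cleared once that node's recovery is done | 
|  |  | */ | 
|  |  |  | 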
|  | 2342 | void dlm_hb_node_down_cb(struct o2nm_node *node, int idx, void *data) | 
|  | 2343 | { | 
|  | 2344 | struct dlm_ctxt *dlm = data; | 
|  | 2345 |  | 
|  | 2346 | if (!dlm_grab(dlm)) | 
|  | 2347 | return; | 
|  | 2348 |  | 
|  | 2349 | spin_lock(&dlm->spinlock); | 
|  | 2350 | __dlm_hb_node_down(dlm, idx); | 
|  | 2351 | spin_unlock(&dlm->spinlock); | 
|  | 2352 |  | 
|  | 2353 | dlm_put(dlm); | 
|  | 2354 | } | 
|  | 2355 |  | 
|  | 2356 | void dlm_hb_node_up_cb(struct o2nm_node *node, int idx, void *data) | 
|  | 2357 | { | 
|  | 2358 | struct dlm_ctxt *dlm = data; | 
|  | 2359 |  | 
|  | 2360 | if (!dlm_grab(dlm)) | 
|  | 2361 | return; | 
|  | 2362 |  | 
|  | 2363 | spin_lock(&dlm->spinlock); | 
| Kurt Hackel | 6714d8e | 2005-12-15 14:31:23 -0800 | [diff] [blame] | 2364 | set_bit(idx, dlm->live_nodes_map); | 
| Kurt Hackel | e2faea4 | 2006-01-12 14:24:55 -0800 | [diff] [blame] | 2365 | /* do NOT notify mles attached to the heartbeat events. | 
|  | 2366 | * new nodes are of no interest in mastery until they have joined. */ | 
| Kurt Hackel | 6714d8e | 2005-12-15 14:31:23 -0800 | [diff] [blame] | 2367 | spin_unlock(&dlm->spinlock); | 
|  | 2368 |  | 
|  | 2369 | dlm_put(dlm); | 
|  | 2370 | } | 
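|  |  | /* Editorial sketch of how the two callbacks above are wired up; the | 
|  |  | * actual registration lives in dlmdomain.c, and the field and | 
|  |  | * priority names used here are assumptions based on this tree's | 
|  |  | * conventions. */ | 
|  |  | static int dlm_register_hb_callbacks_sketch(struct dlm_ctxt *dlm) | 
|  |  | { | 
|  |  | int status; | 
|  |  |  | 
|  |  | o2hb_setup_callback(&dlm->dlm_hb_down, O2HB_NODE_DOWN_CB, | 
|  |  | dlm_hb_node_down_cb, dlm, DLM_HB_NODE_DOWN_PRI); | 
|  |  | status = o2hb_register_callback(&dlm->dlm_hb_down); | 
|  |  | if (status) | 
|  |  | return status; | 
|  |  |  | 
|  |  | o2hb_setup_callback(&dlm->dlm_hb_up, O2HB_NODE_UP_CB, | 
|  |  | dlm_hb_node_up_cb, dlm, DLM_HB_NODE_UP_PRI); | 
|  |  | return o2hb_register_callback(&dlm->dlm_hb_up); | 
|  |  | } | 
|  |  |  | 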
|  | 2371 |  | 
|  | 2372 | static void dlm_reco_ast(void *astdata) | 
|  | 2373 | { | 
|  | 2374 | struct dlm_ctxt *dlm = astdata; | 
|  | 2375 | mlog(0, "ast for recovery lock fired!, this=%u, dlm=%s\n", | 
|  | 2376 | dlm->node_num, dlm->name); | 
|  | 2377 | } | 
|  | 2378 | static void dlm_reco_bast(void *astdata, int blocked_type) | 
|  | 2379 | { | 
|  | 2380 | struct dlm_ctxt *dlm = astdata; | 
|  | 2381 | mlog(0, "bast for recovery lock fired!, this=%u, dlm=%s\n", | 
|  | 2382 | dlm->node_num, dlm->name); | 
|  | 2383 | } | 
|  | 2384 | static void dlm_reco_unlock_ast(void *astdata, enum dlm_status st) | 
|  | 2385 | { | 
|  | 2386 | mlog(0, "unlockast for recovery lock fired!\n"); | 
|  | 2387 | } | 
|  | 2388 |  | 
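|  |  | /* Editorial sketch (assumed to mirror dlm_reco_master_ready(), which | 
|  |  | * is defined earlier in this file and only called in this section): | 
|  |  | * a recovery master is "ready" once reco.new_master holds a valid | 
|  |  | * node number. */ | 
|  |  | static int dlm_reco_master_ready_sketch(struct dlm_ctxt *dlm) | 
|  |  | { | 
|  |  | int ready; | 
|  |  |  | 
|  |  | spin_lock(&dlm->spinlock); | 
|  |  | ready = (dlm->reco.new_master != O2NM_INVALID_NODE_NUM); | 
|  |  | spin_unlock(&dlm->spinlock); | 
|  |  | return ready; | 
|  |  | } | 
|  |  |  | 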
| Kurt Hackel | e2faea4 | 2006-01-12 14:24:55 -0800 | [diff] [blame] | 2389 | /* | 
|  | 2390 | * dlm_pick_recovery_master will continually attempt to use | 
|  | 2391 | * dlmlock() on the special "$RECOVERY" lockres with the | 
|  | 2392 | * LKM_NOQUEUE flag to get an EX.  every thread that enters | 
|  | 2393 | * this function on each node racing to become the recovery | 
|  | 2394 | * master will not stop attempting this until either: | 
|  | 2395 | * a) this node gets the EX (and becomes the recovery master), | 
|  | 2396 | * or b) dlm->reco.new_master gets set to some nodenum | 
|  | 2397 | * != O2NM_INVALID_NODE_NUM (another node will do the reco). | 
|  | 2398 | * so each time a recovery master is needed, the entire cluster | 
|  | 2399 | * will sync at this point.  if the new master dies, that will | 
|  | 2400 | * be detected in dlm_do_recovery */ | 
| Kurt Hackel | 6714d8e | 2005-12-15 14:31:23 -0800 | [diff] [blame] | 2401 | static int dlm_pick_recovery_master(struct dlm_ctxt *dlm) | 
|  | 2402 | { | 
|  | 2403 | enum dlm_status ret; | 
|  | 2404 | struct dlm_lockstatus lksb; | 
|  | 2405 | int status = -EINVAL; | 
|  | 2406 |  | 
|  | 2407 | mlog(0, "starting recovery of %s at %lu, dead=%u, this=%u\n", | 
|  | 2408 | dlm->name, jiffies, dlm->reco.dead_node, dlm->node_num); | 
| Kurt Hackel | e2faea4 | 2006-01-12 14:24:55 -0800 | [diff] [blame] | 2409 | again: | 
| Kurt Hackel | 6714d8e | 2005-12-15 14:31:23 -0800 | [diff] [blame] | 2410 | memset(&lksb, 0, sizeof(lksb)); | 
|  | 2411 |  | 
|  | 2412 | ret = dlmlock(dlm, LKM_EXMODE, &lksb, LKM_NOQUEUE|LKM_RECOVERY, | 
| Mark Fasheh | 3384f3d | 2006-09-08 11:38:29 -0700 | [diff] [blame] | 2413 | DLM_RECOVERY_LOCK_NAME, DLM_RECOVERY_LOCK_NAME_LEN, | 
|  | 2414 | dlm_reco_ast, dlm, dlm_reco_bast); | 
| Kurt Hackel | 6714d8e | 2005-12-15 14:31:23 -0800 | [diff] [blame] | 2415 |  | 
| Kurt Hackel | e2faea4 | 2006-01-12 14:24:55 -0800 | [diff] [blame] | 2416 | mlog(0, "%s: dlmlock($RECOVERY) returned %d, lksb.status=%d\n", | 
|  | 2417 | dlm->name, ret, lksb.status); | 
|  | 2418 |  | 
| Kurt Hackel | 6714d8e | 2005-12-15 14:31:23 -0800 | [diff] [blame] | 2419 | if (ret == DLM_NORMAL) { | 
|  | 2420 | mlog(0, "dlm=%s dlmlock says I got it (this=%u)\n", | 
|  | 2421 | dlm->name, dlm->node_num); | 
| Kurt Hackel | e2faea4 | 2006-01-12 14:24:55 -0800 | [diff] [blame] | 2422 |  | 
|  | 2423 | /* got the EX lock.  check to see if another node | 
|  | 2424 | * just became the reco master */ | 
|  | 2425 | if (dlm_reco_master_ready(dlm)) { | 
|  | 2426 | mlog(0, "%s: got reco EX lock, but %u will " | 
|  | 2427 | "do the recovery\n", dlm->name, | 
|  | 2428 | dlm->reco.new_master); | 
|  | 2429 | status = -EEXIST; | 
|  | 2430 | } else { | 
| Kurt Hackel | 898effa | 2006-01-18 17:01:25 -0800 | [diff] [blame] | 2431 | status = 0; | 
|  | 2432 |  | 
|  | 2433 | /* see if recovery was already finished elsewhere */ | 
|  | 2434 | spin_lock(&dlm->spinlock); | 
|  | 2435 | if (dlm->reco.dead_node == O2NM_INVALID_NODE_NUM) { | 
|  | 2436 | status = -EINVAL; | 
|  | 2437 | mlog(0, "%s: got reco EX lock, but " | 
|  | 2438 | "node got recovered already\n", dlm->name); | 
|  | 2439 | if (dlm->reco.new_master != O2NM_INVALID_NODE_NUM) { | 
|  | 2440 | mlog(ML_ERROR, "%s: new master is %u " | 
|  | 2441 | "but no dead node!\n", | 
|  | 2442 | dlm->name, dlm->reco.new_master); | 
|  | 2443 | BUG(); | 
|  | 2444 | } | 
|  | 2445 | } | 
|  | 2446 | spin_unlock(&dlm->spinlock); | 
|  | 2447 | } | 
|  | 2448 |  | 
|  | 2449 | /* if this node has actually become the recovery master, | 
|  | 2450 | * set the master and send the messages to begin recovery */ | 
|  | 2451 | if (!status) { | 
|  | 2452 | mlog(0, "%s: dead=%u, this=%u, sending " | 
|  | 2453 | "begin_reco now\n", dlm->name, | 
|  | 2454 | dlm->reco.dead_node, dlm->node_num); | 
| Kurt Hackel | e2faea4 | 2006-01-12 14:24:55 -0800 | [diff] [blame] | 2455 | status = dlm_send_begin_reco_message(dlm, | 
|  | 2456 | dlm->reco.dead_node); | 
|  | 2457 | /* this always succeeds */ | 
|  | 2458 | BUG_ON(status); | 
|  | 2459 |  | 
|  | 2460 | /* set the new_master to this node */ | 
|  | 2461 | spin_lock(&dlm->spinlock); | 
| Kurt Hackel | ab27eb6 | 2006-04-27 18:03:49 -0700 | [diff] [blame] | 2462 | dlm_set_reco_master(dlm, dlm->node_num); | 
| Kurt Hackel | e2faea4 | 2006-01-12 14:24:55 -0800 | [diff] [blame] | 2463 | spin_unlock(&dlm->spinlock); | 
|  | 2464 | } | 
| Kurt Hackel | 6714d8e | 2005-12-15 14:31:23 -0800 | [diff] [blame] | 2465 |  | 
|  | 2466 | /* recovery lock is a special case.  ast will not get fired, | 
|  | 2467 | * so just go ahead and unlock it. */ | 
|  | 2468 | ret = dlmunlock(dlm, &lksb, 0, dlm_reco_unlock_ast, dlm); | 
| Kurt Hackel | e2faea4 | 2006-01-12 14:24:55 -0800 | [diff] [blame] | 2469 | if (ret == DLM_DENIED) { | 
|  | 2470 | mlog(0, "got DLM_DENIED, trying LKM_CANCEL\n"); | 
|  | 2471 | ret = dlmunlock(dlm, &lksb, LKM_CANCEL, dlm_reco_unlock_ast, dlm); | 
|  | 2472 | } | 
| Kurt Hackel | 6714d8e | 2005-12-15 14:31:23 -0800 | [diff] [blame] | 2473 | if (ret != DLM_NORMAL) { | 
|  | 2474 | /* this would really suck. this could only happen | 
|  | 2475 | * if there was a network error during the unlock | 
|  | 2476 | * because of node death.  this means the unlock | 
|  | 2477 | * is actually "done" and the lock structure is | 
|  | 2478 | * even freed.  we can continue, but only | 
|  | 2479 | * because this specific lock name is special. */ | 
| Kurt Hackel | e2faea4 | 2006-01-12 14:24:55 -0800 | [diff] [blame] | 2480 | mlog(ML_ERROR, "dlmunlock returned %d\n", ret); | 
| Kurt Hackel | 6714d8e | 2005-12-15 14:31:23 -0800 | [diff] [blame] | 2481 | } | 
|  | 2482 | } else if (ret == DLM_NOTQUEUED) { | 
|  | 2483 | mlog(0, "dlm=%s dlmlock says another node got it (this=%u)\n", | 
|  | 2484 | dlm->name, dlm->node_num); | 
|  | 2485 | /* another node is master. wait on | 
| Kurt Hackel | e2faea4 | 2006-01-12 14:24:55 -0800 | [diff] [blame] | 2486 | * reco.new_master != O2NM_INVALID_NODE_NUM | 
|  | 2487 | * for at most one second */ | 
|  | 2488 | wait_event_timeout(dlm->dlm_reco_thread_wq, | 
|  | 2489 | dlm_reco_master_ready(dlm), | 
|  | 2490 | msecs_to_jiffies(1000)); | 
|  | 2491 | if (!dlm_reco_master_ready(dlm)) { | 
|  | 2492 | mlog(0, "%s: reco master taking a while\n", | 
|  | 2493 | dlm->name); | 
|  | 2494 | goto again; | 
|  | 2495 | } | 
|  | 2496 | /* another node has informed this one that it is reco master */ | 
|  | 2497 | mlog(0, "%s: reco master %u is ready to recover %u\n", | 
|  | 2498 | dlm->name, dlm->reco.new_master, dlm->reco.dead_node); | 
| Kurt Hackel | 6714d8e | 2005-12-15 14:31:23 -0800 | [diff] [blame] | 2499 | status = -EEXIST; | 
| Kurt Hackel | c8df412 | 2006-05-01 13:47:50 -0700 | [diff] [blame] | 2500 | } else if (ret == DLM_RECOVERING) { | 
|  | 2501 | mlog(0, "dlm=%s dlmlock says master node died (this=%u)\n", | 
|  | 2502 | dlm->name, dlm->node_num); | 
|  | 2503 | goto again; | 
| Kurt Hackel | e2faea4 | 2006-01-12 14:24:55 -0800 | [diff] [blame] | 2504 | } else { | 
|  | 2505 | struct dlm_lock_resource *res; | 
|  | 2506 |  | 
|  | 2507 | /* dlmlock returned something other than NORMAL, NOTQUEUED, or RECOVERING */ | 
|  | 2508 | mlog(ML_ERROR, "%s: got %s from dlmlock($RECOVERY), " | 
|  | 2509 | "lksb.status=%s\n", dlm->name, dlm_errname(ret), | 
|  | 2510 | dlm_errname(lksb.status)); | 
|  | 2511 | res = dlm_lookup_lockres(dlm, DLM_RECOVERY_LOCK_NAME, | 
|  | 2512 | DLM_RECOVERY_LOCK_NAME_LEN); | 
|  | 2513 | if (res) { | 
|  | 2514 | dlm_print_one_lock_resource(res); | 
|  | 2515 | dlm_lockres_put(res); | 
|  | 2516 | } else { | 
|  | 2517 | mlog(ML_ERROR, "recovery lock not found\n"); | 
|  | 2518 | } | 
|  | 2519 | BUG(); | 
| Kurt Hackel | 6714d8e | 2005-12-15 14:31:23 -0800 | [diff] [blame] | 2520 | } | 
|  | 2521 |  | 
|  | 2522 | return status; | 
|  | 2523 | } | 
|  | 2524 |  | 
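|  |  | /* Editorial sketch of the wire format used below.  The real struct is | 
|  |  | * defined in dlmcommon.h; the layout shown here is an assumption for | 
|  |  | * illustration only: begin-reco carries just the sender and the node | 
|  |  | * being recovered, padded out for alignment. */ | 
|  |  | struct dlm_begin_reco_sketch { | 
|  |  | u8 node_idx;   /* node asserting itself as recovery master */ | 
|  |  | u8 dead_node;  /* node to be recovered */ | 
|  |  | __be16 pad1; | 
|  |  | __be32 pad2; | 
|  |  | }; | 
|  |  |  | 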
|  | 2525 | static int dlm_send_begin_reco_message(struct dlm_ctxt *dlm, u8 dead_node) | 
|  | 2526 | { | 
|  | 2527 | struct dlm_begin_reco br; | 
|  | 2528 | int ret = 0; | 
|  | 2529 | struct dlm_node_iter iter; | 
|  | 2530 | int nodenum; | 
|  | 2531 | int status; | 
|  | 2532 |  | 
|  | 2533 | mlog_entry("%u\n", dead_node); | 
|  | 2534 |  | 
| Kurt Hackel | d6dea6e | 2006-04-27 18:08:51 -0700 | [diff] [blame] | 2535 | mlog(0, "%s: dead node is %u\n", dlm->name, dead_node); | 
| Kurt Hackel | 6714d8e | 2005-12-15 14:31:23 -0800 | [diff] [blame] | 2536 |  | 
|  | 2537 | spin_lock(&dlm->spinlock); | 
|  | 2538 | dlm_node_iter_init(dlm->domain_map, &iter); | 
|  | 2539 | spin_unlock(&dlm->spinlock); | 
|  | 2540 |  | 
|  | 2541 | clear_bit(dead_node, iter.node_map); | 
|  | 2542 |  | 
|  | 2543 | memset(&br, 0, sizeof(br)); | 
|  | 2544 | br.node_idx = dlm->node_num; | 
|  | 2545 | br.dead_node = dead_node; | 
|  | 2546 |  | 
|  | 2547 | while ((nodenum = dlm_node_iter_next(&iter)) >= 0) { | 
|  | 2548 | ret = 0; | 
|  | 2549 | if (nodenum == dead_node) { | 
|  | 2550 | mlog(0, "not sending begin reco to dead node " | 
|  | 2551 | "%u\n", dead_node); | 
|  | 2552 | continue; | 
|  | 2553 | } | 
|  | 2554 | if (nodenum == dlm->node_num) { | 
|  | 2555 | mlog(0, "not sending begin reco to self\n"); | 
|  | 2556 | continue; | 
|  | 2557 | } | 
| Kurt Hackel | e2faea4 | 2006-01-12 14:24:55 -0800 | [diff] [blame] | 2558 | retry: | 
| Kurt Hackel | 6714d8e | 2005-12-15 14:31:23 -0800 | [diff] [blame] | 2559 | ret = -EINVAL; | 
|  | 2560 | mlog(0, "attempting to send begin reco msg to %d\n", | 
|  | 2561 | nodenum); | 
|  | 2562 | ret = o2net_send_message(DLM_BEGIN_RECO_MSG, dlm->key, | 
|  | 2563 | &br, sizeof(br), nodenum, &status); | 
|  | 2564 | /* negative status is handled ok by caller here */ | 
|  | 2565 | if (ret >= 0) | 
|  | 2566 | ret = status; | 
| Kurt Hackel | e2faea4 | 2006-01-12 14:24:55 -0800 | [diff] [blame] | 2567 | if (dlm_is_host_down(ret)) { | 
|  | 2568 | /* node is down.  not involved in recovery | 
|  | 2569 | * so just keep going */ | 
|  | 2570 | mlog(0, "%s: node %u was down when sending " | 
|  | 2571 | "begin reco msg (%d)\n", dlm->name, nodenum, ret); | 
|  | 2572 | ret = 0; | 
|  | 2573 | } | 
| Kurt Hackel | 6714d8e | 2005-12-15 14:31:23 -0800 | [diff] [blame] | 2574 | if (ret < 0) { | 
|  | 2575 | struct dlm_lock_resource *res; | 
| Kurt Hackel | e2faea4 | 2006-01-12 14:24:55 -0800 | [diff] [blame] | 2576 | /* this is now a serious problem, possibly ENOMEM | 
|  | 2577 | * in the network stack.  must retry */ | 
| Kurt Hackel | 6714d8e | 2005-12-15 14:31:23 -0800 | [diff] [blame] | 2578 | mlog_errno(ret); | 
|  | 2579 | mlog(ML_ERROR, "begin reco of dlm %s to node %u " | 
|  | 2580 | " returned %d\n", dlm->name, nodenum, ret); | 
|  | 2581 | res = dlm_lookup_lockres(dlm, DLM_RECOVERY_LOCK_NAME, | 
|  | 2582 | DLM_RECOVERY_LOCK_NAME_LEN); | 
|  | 2583 | if (res) { | 
|  | 2584 | dlm_print_one_lock_resource(res); | 
|  | 2585 | dlm_lockres_put(res); | 
|  | 2586 | } else { | 
|  | 2587 | mlog(ML_ERROR, "recovery lock not found\n"); | 
|  | 2588 | } | 
| Kurt Hackel | e2faea4 | 2006-01-12 14:24:55 -0800 | [diff] [blame] | 2589 | /* sleep for a bit in hopes that we can avoid | 
|  | 2590 | * another ENOMEM */ | 
|  | 2591 | msleep(100); | 
|  | 2592 | goto retry; | 
| Kurt Hackel | 466d1a4 | 2006-05-01 11:11:13 -0700 | [diff] [blame] | 2593 | } else if (ret == EAGAIN) { | 
|  | 2594 | mlog(0, "%s: trying to start recovery of node " | 
|  | 2595 | "%u, but node %u is waiting for last recovery " | 
|  | 2596 | "to complete, backoff for a bit\n", dlm->name, | 
|  | 2597 | dead_node, nodenum); | 
|  | 2598 | /* TODO Look into replacing msleep with cond_resched() */ | 
|  | 2599 | msleep(100); | 
|  | 2600 | goto retry; | 
| Kurt Hackel | 6714d8e | 2005-12-15 14:31:23 -0800 | [diff] [blame] | 2601 | } | 
|  | 2602 | } | 
|  | 2603 |  | 
|  | 2604 | return ret; | 
|  | 2605 | } | 
|  | 2606 |  | 
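|  |  | /* Editorial sketch of the node iterator used above and below (the | 
|  |  | * real helpers live in dlmcommon.h; this is an assumed equivalent). | 
|  |  | * It also shows why setting iter.curnode = -1, as the finalize | 
|  |  | * sender later does, rewinds iteration to the first set bit. */ | 
|  |  | static inline int dlm_node_iter_next_sketch(struct dlm_node_iter *iter) | 
|  |  | { | 
|  |  | int bit = find_next_bit(iter->node_map, O2NM_MAX_NODES, | 
|  |  | iter->curnode + 1); | 
|  |  | if (bit >= O2NM_MAX_NODES) { | 
|  |  | iter->curnode = O2NM_MAX_NODES; | 
|  |  | return -ENOENT; | 
|  |  | } | 
|  |  | iter->curnode = bit; | 
|  |  | return bit; | 
|  |  | } | 
|  |  |  | 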
| Kurt Hackel | d74c980 | 2007-01-17 17:04:25 -0800 | [diff] [blame] | 2607 | int dlm_begin_reco_handler(struct o2net_msg *msg, u32 len, void *data, | 
|  | 2608 | void **ret_data) | 
| Kurt Hackel | 6714d8e | 2005-12-15 14:31:23 -0800 | [diff] [blame] | 2609 | { | 
|  | 2610 | struct dlm_ctxt *dlm = data; | 
|  | 2611 | struct dlm_begin_reco *br = (struct dlm_begin_reco *)msg->buf; | 
|  | 2612 |  | 
|  | 2613 | /* ok to return 0, domain has gone away */ | 
|  | 2614 | if (!dlm_grab(dlm)) | 
|  | 2615 | return 0; | 
|  | 2616 |  | 
| Kurt Hackel | 466d1a4 | 2006-05-01 11:11:13 -0700 | [diff] [blame] | 2617 | spin_lock(&dlm->spinlock); | 
|  | 2618 | if (dlm->reco.state & DLM_RECO_STATE_FINALIZE) { | 
|  | 2619 | mlog(0, "%s: node %u wants to recover node %u (%u:%u) " | 
|  | 2620 | "but this node is in finalize state, waiting on finalize2\n", | 
|  | 2621 | dlm->name, br->node_idx, br->dead_node, | 
|  | 2622 | dlm->reco.dead_node, dlm->reco.new_master); | 
|  | 2623 | spin_unlock(&dlm->spinlock); | 
|  | 2624 | return EAGAIN; | 
|  | 2625 | } | 
|  | 2626 | spin_unlock(&dlm->spinlock); | 
|  | 2627 |  | 
| Kurt Hackel | d6dea6e | 2006-04-27 18:08:51 -0700 | [diff] [blame] | 2628 | mlog(0, "%s: node %u wants to recover node %u (%u:%u)\n", | 
|  | 2629 | dlm->name, br->node_idx, br->dead_node, | 
|  | 2630 | dlm->reco.dead_node, dlm->reco.new_master); | 
| Kurt Hackel | 6714d8e | 2005-12-15 14:31:23 -0800 | [diff] [blame] | 2631 |  | 
|  | 2632 | dlm_fire_domain_eviction_callbacks(dlm, br->dead_node); | 
|  | 2633 |  | 
|  | 2634 | spin_lock(&dlm->spinlock); | 
|  | 2635 | if (dlm->reco.new_master != O2NM_INVALID_NODE_NUM) { | 
| Kurt Hackel | e2faea4 | 2006-01-12 14:24:55 -0800 | [diff] [blame] | 2636 | if (test_bit(dlm->reco.new_master, dlm->recovery_map)) { | 
|  | 2637 | mlog(0, "%s: new_master %u died, changing " | 
|  | 2638 | "to %u\n", dlm->name, dlm->reco.new_master, | 
|  | 2639 | br->node_idx); | 
|  | 2640 | } else { | 
|  | 2641 | mlog(0, "%s: new_master %u NOT DEAD, changing " | 
|  | 2642 | "to %u\n", dlm->name, dlm->reco.new_master, | 
|  | 2643 | br->node_idx); | 
|  | 2644 | /* may not have seen the new master as dead yet */ | 
|  | 2645 | } | 
| Kurt Hackel | 6714d8e | 2005-12-15 14:31:23 -0800 | [diff] [blame] | 2646 | } | 
|  | 2647 | if (dlm->reco.dead_node != O2NM_INVALID_NODE_NUM) { | 
| Kurt Hackel | e2faea4 | 2006-01-12 14:24:55 -0800 | [diff] [blame] | 2648 | mlog(ML_NOTICE, "%s: dead_node previously set to %u, " | 
|  | 2649 | "node %u changing it to %u\n", dlm->name, | 
|  | 2650 | dlm->reco.dead_node, br->node_idx, br->dead_node); | 
| Kurt Hackel | 6714d8e | 2005-12-15 14:31:23 -0800 | [diff] [blame] | 2651 | } | 
| Kurt Hackel | ab27eb6 | 2006-04-27 18:03:49 -0700 | [diff] [blame] | 2652 | dlm_set_reco_master(dlm, br->node_idx); | 
|  | 2653 | dlm_set_reco_dead_node(dlm, br->dead_node); | 
| Kurt Hackel | 6714d8e | 2005-12-15 14:31:23 -0800 | [diff] [blame] | 2654 | if (!test_bit(br->dead_node, dlm->recovery_map)) { | 
| Kurt Hackel | e2faea4 | 2006-01-12 14:24:55 -0800 | [diff] [blame] | 2655 | mlog(0, "recovery master %u sees %u as dead, but this " | 
| Kurt Hackel | 6714d8e | 2005-12-15 14:31:23 -0800 | [diff] [blame] | 2656 | "node has not yet.  marking %u as dead\n", | 
|  | 2657 | br->node_idx, br->dead_node, br->dead_node); | 
| Kurt Hackel | e2faea4 | 2006-01-12 14:24:55 -0800 | [diff] [blame] | 2658 | if (!test_bit(br->dead_node, dlm->domain_map) || | 
|  | 2659 | !test_bit(br->dead_node, dlm->live_nodes_map)) | 
|  | 2660 | mlog(0, "%u not in domain/live_nodes map " | 
|  | 2661 | "so setting it in reco map manually\n", | 
|  | 2662 | br->dead_node); | 
| Kurt Hackel | c03872f | 2006-03-06 14:08:49 -0800 | [diff] [blame] | 2663 | /* force the recovery cleanup in __dlm_hb_node_down; | 
|  | 2664 | * both of these bits will be cleared in a moment */ | 
|  | 2665 | set_bit(br->dead_node, dlm->domain_map); | 
|  | 2666 | set_bit(br->dead_node, dlm->live_nodes_map); | 
| Kurt Hackel | 6714d8e | 2005-12-15 14:31:23 -0800 | [diff] [blame] | 2667 | __dlm_hb_node_down(dlm, br->dead_node); | 
|  | 2668 | } | 
|  | 2669 | spin_unlock(&dlm->spinlock); | 
|  | 2670 |  | 
|  | 2671 | dlm_kick_recovery_thread(dlm); | 
| Kurt Hackel | d6dea6e | 2006-04-27 18:08:51 -0700 | [diff] [blame] | 2672 |  | 
|  | 2673 | mlog(0, "%s: recovery started by node %u, for %u (%u:%u)\n", | 
|  | 2674 | dlm->name, br->node_idx, br->dead_node, | 
|  | 2675 | dlm->reco.dead_node, dlm->reco.new_master); | 
|  | 2676 |  | 
| Kurt Hackel | 6714d8e | 2005-12-15 14:31:23 -0800 | [diff] [blame] | 2677 | dlm_put(dlm); | 
|  | 2678 | return 0; | 
|  | 2679 | } | 
|  | 2680 |  | 
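|  |  | /* Editorial note on the EAGAIN path above: o2net hands a message | 
|  |  | * handler's return value back to the sender through the status | 
|  |  | * out-parameter of o2net_send_message(), and | 
|  |  | * dlm_send_begin_reco_message() copies that status into ret -- which | 
|  |  | * is why its retry loop compares against positive EAGAIN rather than | 
|  |  | * -EAGAIN. */ | 
|  |  |  | 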
| Kurt Hackel | 466d1a4 | 2006-05-01 11:11:13 -0700 | [diff] [blame] | 2681 | #define DLM_FINALIZE_STAGE2  0x01 | 
| Kurt Hackel | 6714d8e | 2005-12-15 14:31:23 -0800 | [diff] [blame] | 2682 | static int dlm_send_finalize_reco_message(struct dlm_ctxt *dlm) | 
|  | 2683 | { | 
|  | 2684 | int ret = 0; | 
|  | 2685 | struct dlm_finalize_reco fr; | 
|  | 2686 | struct dlm_node_iter iter; | 
|  | 2687 | int nodenum; | 
|  | 2688 | int status; | 
| Kurt Hackel | 466d1a4 | 2006-05-01 11:11:13 -0700 | [diff] [blame] | 2689 | int stage = 1; | 
| Kurt Hackel | 6714d8e | 2005-12-15 14:31:23 -0800 | [diff] [blame] | 2690 |  | 
| Kurt Hackel | 466d1a4 | 2006-05-01 11:11:13 -0700 | [diff] [blame] | 2691 | mlog(0, "finishing recovery for node %s:%u, " | 
|  | 2692 | "stage %d\n", dlm->name, dlm->reco.dead_node, stage); | 
| Kurt Hackel | 6714d8e | 2005-12-15 14:31:23 -0800 | [diff] [blame] | 2693 |  | 
|  | 2694 | spin_lock(&dlm->spinlock); | 
|  | 2695 | dlm_node_iter_init(dlm->domain_map, &iter); | 
|  | 2696 | spin_unlock(&dlm->spinlock); | 
|  | 2697 |  | 
| Kurt Hackel | 466d1a4 | 2006-05-01 11:11:13 -0700 | [diff] [blame] | 2698 | stage2: | 
| Kurt Hackel | 6714d8e | 2005-12-15 14:31:23 -0800 | [diff] [blame] | 2699 | memset(&fr, 0, sizeof(fr)); | 
|  | 2700 | fr.node_idx = dlm->node_num; | 
|  | 2701 | fr.dead_node = dlm->reco.dead_node; | 
| Kurt Hackel | 466d1a4 | 2006-05-01 11:11:13 -0700 | [diff] [blame] | 2702 | if (stage == 2) | 
|  | 2703 | fr.flags |= DLM_FINALIZE_STAGE2; | 
| Kurt Hackel | 6714d8e | 2005-12-15 14:31:23 -0800 | [diff] [blame] | 2704 |  | 
|  | 2705 | while ((nodenum = dlm_node_iter_next(&iter)) >= 0) { | 
|  | 2706 | if (nodenum == dlm->node_num) | 
|  | 2707 | continue; | 
|  | 2708 | ret = o2net_send_message(DLM_FINALIZE_RECO_MSG, dlm->key, | 
|  | 2709 | &fr, sizeof(fr), nodenum, &status); | 
| Kurt Hackel | 466d1a4 | 2006-05-01 11:11:13 -0700 | [diff] [blame] | 2710 | if (ret >= 0) | 
| Kurt Hackel | 6714d8e | 2005-12-15 14:31:23 -0800 | [diff] [blame] | 2711 | ret = status; | 
| Kurt Hackel | 466d1a4 | 2006-05-01 11:11:13 -0700 | [diff] [blame] | 2712 | if (ret < 0) { | 
|  | 2713 | mlog_errno(ret); | 
| Kurt Hackel | 6714d8e | 2005-12-15 14:31:23 -0800 | [diff] [blame] | 2714 | if (dlm_is_host_down(ret)) { | 
|  | 2715 | /* this has no effect on this recovery | 
|  | 2716 | * session, so set the status to zero to | 
|  | 2717 | * finish out the last recovery */ | 
|  | 2718 | mlog(ML_ERROR, "node %u went down after this " | 
|  | 2719 | "node finished recovery.\n", nodenum); | 
|  | 2720 | ret = 0; | 
| Kurt Hackel | c27069e | 2006-05-01 13:51:49 -0700 | [diff] [blame] | 2721 | continue; | 
| Kurt Hackel | 6714d8e | 2005-12-15 14:31:23 -0800 | [diff] [blame] | 2722 | } | 
| Kurt Hackel | 6714d8e | 2005-12-15 14:31:23 -0800 | [diff] [blame] | 2723 | break; | 
|  | 2724 | } | 
|  | 2725 | } | 
| Kurt Hackel | 466d1a4 | 2006-05-01 11:11:13 -0700 | [diff] [blame] | 2726 | if (stage == 1) { | 
|  | 2727 | /* reset the node_iter back to the top and send finalize2 */ | 
|  | 2728 | iter.curnode = -1; | 
|  | 2729 | stage = 2; | 
|  | 2730 | goto stage2; | 
|  | 2731 | } | 
| Kurt Hackel | 6714d8e | 2005-12-15 14:31:23 -0800 | [diff] [blame] | 2732 |  | 
|  | 2733 | return ret; | 
|  | 2734 | } | 
|  | 2735 |  | 
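|  |  | /* Editorial sketch of the finalize message used above and below (real | 
|  |  | * definition in dlmcommon.h; this layout is an assumption): the same | 
|  |  | * shape as begin-reco plus a flags byte so that DLM_FINALIZE_STAGE2 | 
|  |  | * can mark the second round trip. */ | 
|  |  | struct dlm_finalize_reco_sketch { | 
|  |  | u8 node_idx;   /* the recovery master */ | 
|  |  | u8 dead_node;  /* the node that was recovered */ | 
|  |  | u8 flags;      /* DLM_FINALIZE_STAGE2 on the second pass */ | 
|  |  | u8 pad1; | 
|  |  | __be32 pad2; | 
|  |  | }; | 
|  |  |  | 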
| Kurt Hackel | d74c980 | 2007-01-17 17:04:25 -0800 | [diff] [blame] | 2736 | int dlm_finalize_reco_handler(struct o2net_msg *msg, u32 len, void *data, | 
|  | 2737 | void **ret_data) | 
| Kurt Hackel | 6714d8e | 2005-12-15 14:31:23 -0800 | [diff] [blame] | 2738 | { | 
|  | 2739 | struct dlm_ctxt *dlm = data; | 
|  | 2740 | struct dlm_finalize_reco *fr = (struct dlm_finalize_reco *)msg->buf; | 
| Kurt Hackel | 466d1a4 | 2006-05-01 11:11:13 -0700 | [diff] [blame] | 2741 | int stage = 1; | 
| Kurt Hackel | 6714d8e | 2005-12-15 14:31:23 -0800 | [diff] [blame] | 2742 |  | 
|  | 2743 | /* ok to return 0, domain has gone away */ | 
|  | 2744 | if (!dlm_grab(dlm)) | 
|  | 2745 | return 0; | 
|  | 2746 |  | 
| Kurt Hackel | 466d1a4 | 2006-05-01 11:11:13 -0700 | [diff] [blame] | 2747 | if (fr->flags & DLM_FINALIZE_STAGE2) | 
|  | 2748 | stage = 2; | 
| Kurt Hackel | 6714d8e | 2005-12-15 14:31:23 -0800 | [diff] [blame] | 2749 |  | 
| Kurt Hackel | 466d1a4 | 2006-05-01 11:11:13 -0700 | [diff] [blame] | 2750 | mlog(0, "%s: node %u finalizing recovery stage %d of " | 
|  | 2751 | "node %u (%u:%u)\n", dlm->name, fr->node_idx, stage, | 
|  | 2752 | fr->dead_node, dlm->reco.dead_node, dlm->reco.new_master); | 
|  | 2753 |  | 
| Kurt Hackel | 6714d8e | 2005-12-15 14:31:23 -0800 | [diff] [blame] | 2754 | spin_lock(&dlm->spinlock); | 
|  | 2755 |  | 
|  | 2756 | if (dlm->reco.new_master != fr->node_idx) { | 
|  | 2757 | mlog(ML_ERROR, "node %u sent recovery finalize msg, but node " | 
|  | 2758 | "%u is supposed to be the new master, dead=%u\n", | 
|  | 2759 | fr->node_idx, dlm->reco.new_master, fr->dead_node); | 
|  | 2760 | BUG(); | 
|  | 2761 | } | 
|  | 2762 | if (dlm->reco.dead_node != fr->dead_node) { | 
|  | 2763 | mlog(ML_ERROR, "node %u sent recovery finalize msg for dead " | 
|  | 2764 | "node %u, but node %u is supposed to be dead\n", | 
|  | 2765 | fr->node_idx, fr->dead_node, dlm->reco.dead_node); | 
|  | 2766 | BUG(); | 
|  | 2767 | } | 
|  | 2768 |  | 
| Kurt Hackel | 466d1a4 | 2006-05-01 11:11:13 -0700 | [diff] [blame] | 2769 | switch (stage) { | 
|  | 2770 | case 1: | 
|  | 2771 | dlm_finish_local_lockres_recovery(dlm, fr->dead_node, fr->node_idx); | 
|  | 2772 | if (dlm->reco.state & DLM_RECO_STATE_FINALIZE) { | 
|  | 2773 | mlog(ML_ERROR, "%s: received finalize1 from " | 
|  | 2774 | "new master %u for dead node %u, but " | 
|  | 2775 | "this node has already received it!\n", | 
|  | 2776 | dlm->name, fr->node_idx, fr->dead_node); | 
|  | 2777 | dlm_print_reco_node_status(dlm); | 
|  | 2778 | BUG(); | 
|  | 2779 | } | 
|  | 2780 | dlm->reco.state |= DLM_RECO_STATE_FINALIZE; | 
|  | 2781 | spin_unlock(&dlm->spinlock); | 
|  | 2782 | break; | 
|  | 2783 | case 2: | 
|  | 2784 | if (!(dlm->reco.state & DLM_RECO_STATE_FINALIZE)) { | 
|  | 2785 | mlog(ML_ERROR, "%s: received finalize2 from " | 
|  | 2786 | "new master %u for dead node %u, but " | 
|  | 2787 | "this node did not have finalize1!\n", | 
|  | 2788 | dlm->name, fr->node_idx, fr->dead_node); | 
|  | 2789 | dlm_print_reco_node_status(dlm); | 
|  | 2790 | BUG(); | 
|  | 2791 | } | 
|  | 2792 | dlm->reco.state &= ~DLM_RECO_STATE_FINALIZE; | 
|  | 2793 | spin_unlock(&dlm->spinlock); | 
|  | 2794 | dlm_reset_recovery(dlm); | 
|  | 2795 | dlm_kick_recovery_thread(dlm); | 
|  | 2796 | break; | 
|  | 2797 | default: | 
|  | 2798 | BUG(); | 
|  | 2799 | } | 
| Kurt Hackel | 6714d8e | 2005-12-15 14:31:23 -0800 | [diff] [blame] | 2800 |  | 
| Kurt Hackel | d6dea6e | 2006-04-27 18:08:51 -0700 | [diff] [blame] | 2801 | mlog(0, "%s: recovery done, reco master was %u, dead now %u, master now %u\n", | 
|  | 2802 | dlm->name, fr->node_idx, dlm->reco.dead_node, dlm->reco.new_master); | 
|  | 2803 |  | 
| Kurt Hackel | 6714d8e | 2005-12-15 14:31:23 -0800 | [diff] [blame] | 2804 | dlm_put(dlm); | 
|  | 2805 | return 0; | 
|  | 2806 | } |
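|  |  |  | 
|  |  | /* Editorial recap of the two-stage finalize handshake implemented | 
|  |  | * above (comment only, not from the original file): | 
|  |  | * | 
|  |  | * recovery master                 every other live node | 
|  |  | * ---------------                 --------------------- | 
|  |  | * finalize1  ------------------>  set DLM_RECO_STATE_FINALIZE | 
|  |  | * finalize2  ------------------>  clear FINALIZE, reset recovery | 
|  |  | *                                 state, kick the recovery thread | 
|  |  | * | 
|  |  | * If the master dies between the two stages, __dlm_hb_node_down() | 
|  |  | * sees DLM_RECO_STATE_FINALIZE set and resets recovery itself, so | 
|  |  | * the cluster does not wait forever for a finalize2 that will never | 
|  |  | * arrive. */ | 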