pacemaker  1.1.16-94ff4df
Scalable High-Availability cluster resource manager
 All Data Structures Files Functions Variables Typedefs Enumerations Enumerator Macros Modules
election.c
Go to the documentation of this file.
1 /*
2  * Copyright (C) 2004-2016 Andrew Beekhof <andrew@beekhof.net>
3  *
4  * This source code is licensed under the GNU Lesser General Public License
5  * version 2.1 or later (LGPLv2.1+) WITHOUT ANY WARRANTY.
6  */
7 
8 #include <crm_internal.h>
9 
10 #include <sys/time.h>
11 #include <sys/resource.h>
12 
13 #include <crm/msg_xml.h>
14 #include <crm/common/xml.h>
15 
16 #include <crm/common/mainloop.h>
17 #include <crm/cluster/internal.h>
18 #include <crm/cluster/election.h>
19 #include <crm/crm.h>
20 
/* Time window used in two places in this file: crm_uptime() reuses its cached
 * getrusage() sample until this long after the last access, and
 * election_count_vote() counts election wins within a window of this length
 * when looking for an election storm.
 */
21 #define STORM_INTERVAL 2 /* in seconds */
/* NOTE(review): not referenced anywhere in the visible part of this file */
22 #define STORM_MULTIPLIER 5 /* multiplied by the number of nodes */
23 
/* Per-election bookkeeping; allocated by election_init() and released by
 * election_fini().
 */
24 struct election_s
25 {
26  enum election_result state; /* Latest outcome; updated by vote/count/complete */
27  guint count; /* Incremented for every election_vote(); sent as the election ID */
28  char *name; /* "election-<name>" label used in logs and as the timer name */
29  char *uname; /* Copy of the node name given to election_init() */
30  GSourceFunc cb; /* Optional callback invoked (with this object) by election_complete() */
31  GHashTable *voted; /* Node name -> operation received; created lazily, NULL when empty */
32  mainloop_timer_t *timeout; /* When to stop if not everyone casts a vote */
33 };
34 
35 static void election_complete(election_t *e)
36 {
37  crm_info("Election %s complete", e->name);
38  e->state = election_won;
39 
40  if(e->cb) {
41  e->cb(e);
42  }
43 
44  election_reset(e);
45 }
46 
47 static gboolean election_timer_cb(gpointer user_data)
48 {
49  election_t *e = user_data;
50 
51  crm_info("Election %s %p timed out", e->name, e);
52  election_complete(e);
53  return FALSE;
54 }
55 
56 enum election_result
58 {
59  if(e) {
60  return e->state;
61  }
62  return election_error;
63 }
64 
65 election_t *
66 election_init(const char *name, const char *uname, guint period_ms, GSourceFunc cb)
67 {
68  static guint count = 0;
69  election_t *e = calloc(1, sizeof(election_t));
70 
71  if(e != NULL) {
72  if(name) {
73  e->name = crm_strdup_printf("election-%s", name);
74  } else {
75  e->name = crm_strdup_printf("election-%u", count++);
76  }
77 
78  e->cb = cb;
79  e->uname = strdup(uname);
80  e->timeout = mainloop_timer_add(e->name, period_ms, FALSE, election_timer_cb, e);
81  crm_trace("Created %s %p", e->name, e);
82  }
83  return e;
84 }
85 
86 void
88 {
89  if(e && uname && e->voted) {
90  g_hash_table_remove(e->voted, uname);
91  }
92 }
93 
94 void
96 {
97  crm_trace("Resetting election %s", e->name);
98  if(e) {
99  mainloop_timer_stop(e->timeout);
100  }
101  if (e && e->voted) {
102  crm_trace("Destroying voted cache with %d members", g_hash_table_size(e->voted));
103  g_hash_table_destroy(e->voted);
104  e->voted = NULL;
105  }
106 }
107 
108 void
110 {
111  if(e) {
112  election_reset(e);
113  crm_trace("Destroying %s", e->name);
114  mainloop_timer_del(e->timeout);
115  free(e->uname);
116  free(e->name);
117  free(e);
118  }
119 }
120 
121 static void
122 election_timeout_start(election_t *e)
123 {
124  if(e) {
125  mainloop_timer_start(e->timeout);
126  }
127 }
128 
129 void
131 {
132  if(e) {
133  mainloop_timer_stop(e->timeout);
134  }
135 }
136 
137 void
139 {
140  if(e) {
141  mainloop_timer_set_period(e->timeout, period);
142  } else {
143  crm_err("No election defined");
144  }
145 }
146 
147 static int
148 crm_uptime(struct timeval *output)
149 {
150  static time_t expires = 0;
151  static struct rusage info;
152 
153  time_t tm_now = time(NULL);
154 
155  if (expires < tm_now) {
156  int rc = 0;
157 
158  info.ru_utime.tv_sec = 0;
159  info.ru_utime.tv_usec = 0;
160  rc = getrusage(RUSAGE_SELF, &info);
161 
162  output->tv_sec = 0;
163  output->tv_usec = 0;
164 
165  if (rc < 0) {
166  crm_perror(LOG_ERR, "Could not calculate the current uptime");
167  expires = 0;
168  return -1;
169  }
170 
171  crm_debug("Current CPU usage is: %lds, %ldus", (long)info.ru_utime.tv_sec,
172  (long)info.ru_utime.tv_usec);
173  }
174 
175  expires = tm_now + STORM_INTERVAL; /* N seconds after the last _access_ */
176  output->tv_sec = info.ru_utime.tv_sec;
177  output->tv_usec = info.ru_utime.tv_usec;
178 
179  return 1;
180 }
181 
/*
 * Compare our own age (as reported by crm_uptime()) with a peer's.
 *
 * your_age: the peer's age
 *
 * Returns 1 if we are older (we win), -1 if the peer is older (we lose) and
 * 0 if both ages are identical.  Seconds are compared first; microseconds
 * only break a tie.
 */
static int
crm_compare_age(struct timeval your_age)
{
    struct timeval our_age;

    /* If an error occurred, our_age will be compared as {0,0} */
    crm_uptime(&our_age);

    if (our_age.tv_sec != your_age.tv_sec) {
        if (our_age.tv_sec > your_age.tv_sec) {
            crm_debug("Win: %ld vs %ld (seconds)", (long)our_age.tv_sec, (long)your_age.tv_sec);
            return 1;
        }
        crm_debug("Lose: %ld vs %ld (seconds)", (long)our_age.tv_sec, (long)your_age.tv_sec);
        return -1;
    }

    if (our_age.tv_usec != your_age.tv_usec) {
        if (our_age.tv_usec > your_age.tv_usec) {
            crm_debug("Win: %ld.%ld vs %ld.%ld (usec)",
                      (long)our_age.tv_sec, (long)our_age.tv_usec, (long)your_age.tv_sec, (long)your_age.tv_usec);
            return 1;
        }
        crm_debug("Lose: %ld.%ld vs %ld.%ld (usec)",
                  (long)our_age.tv_sec, (long)our_age.tv_usec, (long)your_age.tv_sec, (long)your_age.tv_usec);
        return -1;
    }

    return 0;
}
207 
208 void
210 {
211  struct timeval age;
212  xmlNode *vote = NULL;
213  crm_node_t *our_node;
214 
215  if(e == NULL) {
216  crm_trace("Not voting in election: not initialized");
217  return;
218  }
219 
220  our_node = crm_get_peer(0, e->uname);
221  if (our_node == NULL || crm_is_peer_active(our_node) == FALSE) {
222  crm_trace("Cannot vote yet: %p", our_node);
223  return;
224  }
225 
226  e->state = election_in_progress;
227  vote = create_request(CRM_OP_VOTE, NULL, NULL, CRM_SYSTEM_CRMD, CRM_SYSTEM_CRMD, NULL);
228 
229  e->count++;
230  crm_xml_add(vote, F_CRM_ELECTION_OWNER, our_node->uuid);
231  crm_xml_add_int(vote, F_CRM_ELECTION_ID, e->count);
232 
233  crm_uptime(&age);
234  crm_xml_add_int(vote, F_CRM_ELECTION_AGE_S, age.tv_sec);
235  crm_xml_add_int(vote, F_CRM_ELECTION_AGE_US, age.tv_usec);
236 
237  send_cluster_message(NULL, crm_msg_crmd, vote, TRUE);
238  free_xml(vote);
239 
240  crm_debug("Started election %d", e->count);
241  if (e->voted) {
242  g_hash_table_destroy(e->voted);
243  e->voted = NULL;
244  }
245 
246  election_timeout_start(e);
247  return;
248 }
249 
250 bool
252 {
253  int voted_size = 0;
254  int num_members = crm_active_peers();
255 
256  if(e == NULL) {
257  crm_trace("not initialized");
258  return FALSE;
259  }
260 
261  if (e->voted) {
262  voted_size = g_hash_table_size(e->voted);
263  }
264  /* in the case of #voted > #members, it is better to
265  * wait for the timeout and give the cluster time to
266  * stabilize
267  */
268  if (voted_size >= num_members) {
269  /* we won and everyone has voted */
271  if (voted_size > num_members) {
272  GHashTableIter gIter;
273  const crm_node_t *node;
274  char *key = NULL;
275 
276  g_hash_table_iter_init(&gIter, crm_peer_cache);
277  while (g_hash_table_iter_next(&gIter, NULL, (gpointer *) & node)) {
278  if (crm_is_peer_active(node)) {
279  crm_err("member: %s proc=%.32x", node->uname, node->processes);
280  }
281  }
282 
283  g_hash_table_iter_init(&gIter, e->voted);
284  while (g_hash_table_iter_next(&gIter, (gpointer *) & key, NULL)) {
285  crm_err("voted: %s", key);
286  }
287 
288  }
289 
290  election_complete(e);
291  return TRUE;
292 
293  } else {
294  crm_debug("Still waiting on %d non-votes (%d total)",
295  num_members - voted_size, num_members);
296  }
297 
298  return FALSE;
299 }
300 
/* Minimum time after losing an election before this node will start a new
 * one in response to a vote it could win (see the end of election_count_vote())
 */
301 #define loss_dampen 2 /* in seconds */
302 
303 /* A_ELECTION_COUNT */
/*
 * Process one election message (a vote or a no-vote) received from a peer.
 *
 * e:       election object; if NULL the message is not processed
 * vote:    the received message; must not be NULL
 * can_win: FALSE if this node is not eligible to win, which forces a loss
 *
 * Returns:
 *  - election_error  if vote is NULL (via CRM_CHECK)
 *  - election_lost   if e is NULL, there is no peer cache, or we concede to
 *                    the sender (a CRM_OP_NOVOTE reply is sent in that case)
 *  - election_start  if we beat the sender and have not lost an election
 *                    within the last loss_dampen seconds
 *  - e->state        unchanged, if the message was merely recorded or ignored
 *
 * The counters election_wins, expires and last_election_loss are
 * function-level statics and are therefore shared by every election object.
 *
 * NOTE(review): the embedded listing numbers skip in several places inside
 * this function (351->353, 397->400, 468->470, 488->490, 500->502, 505->507).
 * Lines appear to be missing there; each spot is flagged below and must be
 * restored from the real source before this can compile.
 */
304 enum election_result
305 election_count_vote(election_t *e, xmlNode *vote, bool can_win)
306 {
307  int age = 0;
308  int election_id = -1;
309  int log_level = LOG_INFO;
310  gboolean use_born_on = FALSE;
311  gboolean done = FALSE;
312  gboolean we_lose = FALSE;
313  const char *op = NULL;
314  const char *from = NULL;
315  const char *reason = "unknown";
316  const char *election_owner = NULL;
317  crm_node_t *our_node = NULL, *your_node = NULL;
318 
319  static int election_wins = 0;
320 
321  xmlNode *novote = NULL;
322  time_t tm_now = time(NULL);
323  static time_t expires = 0;
324  static time_t last_election_loss = 0;
325 
326  /* if the membership copy is NULL we REALLY shouldn't be voting
327  * the question is how we managed to get here.
328  */
329 
330  CRM_CHECK(vote != NULL, return election_error);
331 
332  if(e == NULL) {
333  crm_info("Not voting in election: not initialized");
334  return election_lost;
335 
336  } else if(crm_peer_cache == NULL) {
337  crm_info("Not voting in election: no peer cache");
338  return election_lost;
339  }
340 
/* Pull the fields we need out of the message */
341  op = crm_element_value(vote, F_CRM_TASK);
342  from = crm_element_value(vote, F_CRM_HOST_FROM);
343  election_owner = crm_element_value(vote, F_CRM_ELECTION_OWNER);
344  crm_element_value_int(vote, F_CRM_ELECTION_ID, &election_id);
345 
346  your_node = crm_get_peer(0, from);
347  our_node = crm_get_peer(0, e->uname);
348 
349  if (e->voted == NULL) {
350  crm_debug("Created voted hash");
351  e->voted = g_hash_table_new_full(crm_str_hash, g_str_equal,
/* NOTE(review): the call above is not terminated in this listing; the
 * continuation line (presumably the key and value destroy functions) is
 * missing -- confirm against the real source.
 */
353  }
354 
/* The "born" tie-breaker further down is only used on these cluster types */
355  if (is_heartbeat_cluster()) {
356  use_born_on = TRUE;
357  } else if (is_classic_ais_cluster()) {
358  use_born_on = TRUE;
359  }
360 
/* Decide the outcome: each branch sets a reason plus either we_lose (concede),
 * done (nothing further to decide) or neither (we beat the sender).
 */
361  if(can_win == FALSE) {
362  reason = "Not eligible";
363  we_lose = TRUE;
364 
365  } else if (our_node == NULL || crm_is_peer_active(our_node) == FALSE) {
366  reason = "We are not part of the cluster";
367  log_level = LOG_ERR;
368  we_lose = TRUE;
369 
/* A message about one of our own elections, but not the current round */
370  } else if (election_id != e->count && crm_str_eq(our_node->uuid, election_owner, TRUE)) {
371  log_level = LOG_TRACE;
372  reason = "Superseded";
373  done = TRUE;
374 
375  } else if (your_node == NULL || crm_is_peer_active(your_node) == FALSE) {
376  /* Possibly we cached the message in the FSA queue at a point that it wasn't */
377  reason = "Peer is not part of our cluster";
378  log_level = LOG_WARNING;
379  done = TRUE;
380 
/* The sender conceded to us: remember that it has voted */
381  } else if (crm_str_eq(op, CRM_OP_NOVOTE, TRUE)) {
382  char *op_copy = strdup(op);
383  char *uname_copy = strdup(from);
384 
385  CRM_ASSERT(crm_str_eq(our_node->uuid, election_owner, TRUE));
386 
387  /* update the list of nodes that have voted */
388  g_hash_table_replace(e->voted, uname_copy, op_copy);
389  reason = "Recorded";
390  done = TRUE;
391 
392  } else {
393  struct timeval your_age;
394  const char *your_version = crm_element_value(vote, F_CRM_VERSION);
395  int tv_sec = 0;
396  int tv_usec = 0;
397 
/* NOTE(review): two listing lines are missing here.  As shown, tv_sec and
 * tv_usec are never assigned, so your_age would always be {0,0}; presumably
 * the missing lines read the sender's age fields from the vote -- confirm.
 */
400 
401  your_age.tv_sec = tv_sec;
402  your_age.tv_usec = tv_usec;
403 
404  age = crm_compare_age(your_age);
/* Our own vote coming back to us: just record it */
405  if (crm_str_eq(from, e->uname, TRUE)) {
406  char *op_copy = strdup(op);
407  char *uname_copy = strdup(from);
408 
409  CRM_ASSERT(crm_str_eq(our_node->uuid, election_owner, TRUE));
410 
411  /* update ourselves in the list of nodes that have voted */
412  g_hash_table_replace(e->voted, uname_copy, op_copy);
413  reason = "Recorded";
414  done = TRUE;
415 
/* Tie-breakers, in order: feature-set version, age, "born" value (only when
 * use_born_on is set), and finally a case-insensitive host name comparison.
 */
416  } else if (compare_version(your_version, CRM_FEATURE_SET) < 0) {
417  reason = "Version";
418  we_lose = TRUE;
419 
420  } else if (compare_version(your_version, CRM_FEATURE_SET) > 0) {
421  reason = "Version";
422 
423  } else if (age < 0) {
424  reason = "Uptime";
425  we_lose = TRUE;
426 
427  } else if (age > 0) {
428  reason = "Uptime";
429 
430  /* TODO: Check for y(our) born < 0 */
431  } else if (use_born_on && your_node->born < our_node->born) {
432  reason = "Born";
433  we_lose = TRUE;
434 
435  } else if (use_born_on && your_node->born > our_node->born) {
436  reason = "Born";
437 
438  } else if (e->uname == NULL) {
439  reason = "Unknown host name";
440  we_lose = TRUE;
441 
442  } else if (strcasecmp(e->uname, from) > 0) {
443  reason = "Host name";
444  we_lose = TRUE;
445 
446  } else {
447  reason = "Host name";
448  CRM_ASSERT(strcasecmp(e->uname, from) < 0);
449 /* can't happen...
450  * } else if(strcasecmp(e->uname, from) == 0) {
451  *
452  */
453  }
454  }
455 
/* Election storm detection: count the votes we beat within one
 * STORM_INTERVAL window and complain once the count exceeds (peers)^2.
 */
456  if (expires < tm_now) {
457  election_wins = 0;
458  expires = tm_now + STORM_INTERVAL;
459 
460  } else if (done == FALSE && we_lose == FALSE) {
461  int peers = 1 + g_hash_table_size(crm_peer_cache);
462 
463  /* If every node has to vote down every other node, thats N*(N-1) total elections
464  * Allow some leway before _really_ complaining
465  */
466  election_wins++;
467  if (election_wins > (peers * peers)) {
468  crm_warn("Election storm detected: %d elections in %d seconds", election_wins,
/* NOTE(review): the crm_warn() call above is not terminated in this listing;
 * the line carrying the second %d argument (presumably STORM_INTERVAL) is
 * missing -- confirm.
 */
470  election_wins = 0;
471  expires = tm_now + STORM_INTERVAL;
472  crm_write_blackbox(0, NULL);
473  }
474  }
475 
/* Nothing further to decide: log (at log_level + 1) and keep the current state */
476  if (done) {
477  do_crm_log(log_level + 1, "Election %d (current: %d, owner: %s): Processed %s from %s (%s)",
478  election_id, e->count, election_owner, op, from, reason);
479  return e->state;
480 
481  } else if (we_lose == FALSE) {
482  do_crm_log(log_level, "Election %d (owner: %s) pass: %s from %s (%s)",
483  election_id, election_owner, op, from, reason);
484 
/* Only start a counter-election if we have not lost one very recently */
485  if (last_election_loss == 0
486  || tm_now - last_election_loss > (time_t) loss_dampen) {
487 
488  last_election_loss = 0;
/* NOTE(review): listing line 489 is missing here; a statement may have been
 * dropped -- verify against the real source.
 */
490 
491  /* Start a new election by voting down this, and other, peers */
492  e->state = election_start;
493  return e->state;
494  }
495 
/* Dampened: fall through and concede to the sender instead */
496  crm_info("Election %d ignore: We already lost an election less than %ds ago (%s)",
497  election_id, loss_dampen, ctime(&last_election_loss));
498  }
499 
/* We lose (or are dampened): tell the sender that we concede */
500  novote = create_request(CRM_OP_NOVOTE, NULL, from,
/* NOTE(review): the create_request() call above is not terminated in this
 * listing; the line with the remaining arguments is missing -- confirm.
 */
502 
503  do_crm_log(log_level, "Election %d (owner: %s) lost: %s from %s (%s)",
504  election_id, election_owner, op, from, reason);
505 
/* NOTE(review): listing line 506 is missing here; a statement may have been
 * dropped -- verify against the real source.
 */
507 
508  crm_xml_add(novote, F_CRM_ELECTION_OWNER, election_owner);
509  crm_xml_add_int(novote, F_CRM_ELECTION_ID, election_id);
510 
511  send_cluster_message(your_node, crm_msg_crmd, novote, TRUE);
512  free_xml(novote);
513 
514  last_election_loss = tm_now;
515  e->state = election_lost;
516  return e->state;
517 }
#define F_CRM_TASK
Definition: msg_xml.h:56
#define LOG_TRACE
Definition: logging.h:29
#define CRM_CHECK(expr, failure_action)
Definition: logging.h:164
void crm_write_blackbox(int nsig, struct qb_log_callsite *callsite)
Definition: logging.c:419
A dumping ground.
void mainloop_timer_start(mainloop_timer_t *t)
Definition: mainloop.c:1186
guint mainloop_timer_set_period(mainloop_timer_t *t, guint period_ms)
Definition: mainloop.c:1204
void mainloop_timer_del(mainloop_timer_t *t)
Definition: mainloop.c:1242
gboolean is_heartbeat_cluster(void)
Definition: cluster.c:645
gboolean crm_is_peer_active(const crm_node_t *node)
Definition: membership.c:293
uint64_t born
Definition: cluster.h:74
char * uuid
Definition: cluster.h:83
#define STORM_INTERVAL
Definition: election.c:21
#define CRM_FEATURE_SET
Definition: crm.h:36
#define F_CRM_HOST_FROM
Definition: msg_xml.h:61
struct mainloop_timer_s mainloop_timer_t
Definition: mainloop.h:37
crm_node_t * crm_get_peer(unsigned int id, const char *uname)
Definition: membership.c:674
void election_timeout_stop(election_t *e)
Definition: election.c:130
#define CRM_OP_NOVOTE
Definition: crm.h:111
guint crm_active_peers(void)
Definition: membership.c:391
void mainloop_timer_stop(mainloop_timer_t *t)
Definition: mainloop.c:1195
#define F_CRM_ELECTION_AGE_S
Definition: msg_xml.h:68
Wrappers for and extensions to glib mainloop.
struct election_s election_t
Definition: election.h:27
enum election_result election_count_vote(election_t *e, xmlNode *vote, bool can_win)
Definition: election.c:305
char uname[MAX_NAME]
Definition: internal.h:53
#define crm_warn(fmt, args...)
Definition: logging.h:249
uint32_t processes
Definition: cluster.h:79
#define crm_debug(fmt, args...)
Definition: logging.h:253
election_result
Definition: election.h:29
void election_vote(election_t *e)
Definition: election.c:209
#define crm_trace(fmt, args...)
Definition: logging.h:254
#define do_crm_log(level, fmt, args...)
Log a message.
Definition: logging.h:129
Wrappers for and extensions to libxml2.
int crm_element_value_int(xmlNode *data, const char *name, int *dest)
Definition: xml.c:3745
const char * crm_element_value(xmlNode *data, const char *name)
Definition: xml.c:4987
void free_xml(xmlNode *child)
Definition: xml.c:2587
gboolean crm_str_eq(const char *a, const char *b, gboolean use_case)
Definition: strings.c:213
void election_timeout_set_period(election_t *e, guint period)
Definition: election.c:138
election_t * election_init(const char *name, const char *uname, guint period_ms, GSourceFunc cb)
Definition: election.c:66
void election_fini(election_t *e)
Definition: election.c:109
#define CRM_SYSTEM_CRMD
Definition: crm.h:90
#define CRM_OP_VOTE
Definition: crm.h:110
const char * crm_xml_add(xmlNode *node, const char *name, const char *value)
Definition: xml.c:2434
const char * crm_xml_add_int(xmlNode *node, const char *name, int value)
Definition: xml.c:2522
#define F_CRM_ELECTION_AGE_US
Definition: msg_xml.h:69
#define loss_dampen
Definition: election.c:301
#define crm_perror(level, fmt, args...)
Log a system error message.
Definition: logging.h:226
void election_reset(election_t *e)
Definition: election.c:95
#define crm_err(fmt, args...)
Definition: logging.h:248
int compare_version(const char *version1, const char *version2)
Definition: utils.c:466
#define CRM_ASSERT(expr)
Definition: error.h:35
mainloop_timer_t * mainloop_timer_add(const char *name, guint period_ms, bool repeat, GSourceFunc cb, void *userdata)
Definition: mainloop.c:1221
char * uname
Definition: cluster.h:82
bool election_check(election_t *e)
Definition: election.c:251
#define F_CRM_ELECTION_ID
Definition: msg_xml.h:67
char * crm_strdup_printf(char const *format,...) __attribute__((__format__(__printf__
#define crm_str_hash
Definition: crm.h:204
gboolean send_cluster_message(crm_node_t *node, enum crm_ais_msg_types service, xmlNode *data, gboolean ordered)
Definition: cluster.c:271
#define create_request(task, xml_data, host_to, sys_to, sys_from, uuid_from)
Definition: ipc.h:34
GHashTable * crm_peer_cache
Definition: membership.c:42
#define crm_info(fmt, args...)
Definition: logging.h:251
void g_hash_destroy_str(gpointer data)
Definition: strings.c:74
#define F_CRM_VERSION
Definition: msg_xml.h:63
void election_remove(election_t *e, const char *uname)
Definition: election.c:87
enum election_result election_state(election_t *e)
Definition: election.c:57
gboolean is_classic_ais_cluster(void)
Definition: cluster.c:624
Functions for conducting elections.
#define F_CRM_ELECTION_OWNER
Definition: msg_xml.h:70