roll in Michael's "limp along" fix for when a slave drops temporarily from the mcast...
[l2tpns.git] / cluster.c
index 6f4695d..d9c619e 100644 (file)
--- a/cluster.c
+++ b/cluster.c
@@ -1,5 +1,6 @@
 // L2TPNS Clustering Stuff
 // L2TPNS Clustering Stuff
-// $Id: cluster.c,v 1.3 2004-06-23 03:52:24 fred_nerk Exp $
+
+char const *cvs_id_cluster = "$Id: cluster.c,v 1.6 2004-07-05 06:54:01 bodea Exp $";
 
 #include <stdio.h>
 #include <sys/file.h>
 
 #include <stdio.h>
 #include <sys/file.h>
@@ -134,7 +135,7 @@ int cluster_init()
        return -1;
     }
 
        return -1;
     }
 
-    config->cluster_last_hb = config->current_time;
+    config->cluster_last_hb = TIME;
     config->cluster_seq_number = -1;
 
     return cluster_sockfd;
     config->cluster_seq_number = -1;
 
     return cluster_sockfd;
@@ -342,6 +343,7 @@ static void send_heartbeat(int seq, char * data, int size)
     if (size > sizeof(past_hearts[0].data)) {
        log(0,0,0,0, "Tried to heartbeat something larger than the maximum packet!\n");
        kill(0, SIGTERM);
     if (size > sizeof(past_hearts[0].data)) {
        log(0,0,0,0, "Tried to heartbeat something larger than the maximum packet!\n");
        kill(0, SIGTERM);
+       exit(1);
     }
     i = seq % HB_HISTORY_SIZE;
     past_hearts[i].seq = seq;
     }
     i = seq % HB_HISTORY_SIZE;
     past_hearts[i].seq = seq;
@@ -435,18 +437,32 @@ void cluster_check_master(void)
        int i, count, tcount, high_sid = 0;
        int last_free = 0;
        int had_peers = have_peers;
        int i, count, tcount, high_sid = 0;
        int last_free = 0;
        int had_peers = have_peers;
-       clockt t = config->current_time;
+       clockt t = TIME;
+       static int probed = 0;
+
+               // Is the master late? If so, try probing it...
+       if (TIME > (config->cluster_last_hb + config->cluster_hb_timeout/8 + 11)) {
+               if (!probed) {
+                       if (config->cluster_master_address) {
+                               peer_send_message(config->cluster_master_address,
+                                       C_LASTSEEN, config->cluster_seq_number, NULL, 0);
+                               probed = 1;
+                       }
+               }
+       } else {        // We got a recent heartbeat; reset the probe flag.
+               probed = 0;
+       }
 
 
-       if (config->current_time < (config->cluster_last_hb + HB_TIMEOUT) )
+       if (TIME < (config->cluster_last_hb + config->cluster_hb_timeout) )
                return;         // Everything's ok. return.
 
        if (!config->cluster_iam_master)
                log(0,0,0,0, "Master timed out! Holding election...\n");
 
                return;         // Everything's ok. return.
 
        if (!config->cluster_iam_master)
                log(0,0,0,0, "Master timed out! Holding election...\n");
 
-       config->cluster_last_hb = config->current_time + 1;
+       config->cluster_last_hb = TIME + 1;
 
        for (i = have_peers = 0; i < num_peers ; ++i) {
 
        for (i = have_peers = 0; i < num_peers ; ++i) {
-               if ((peers[i].timestamp + HB_TIMEOUT) < t)
+               if ((peers[i].timestamp + config->cluster_hb_timeout) < t)
                        continue;       // Stale peer! Skip them.
 
                if (!peers[i].basetime)
                        continue;       // Stale peer! Skip them.
 
                if (!peers[i].basetime)
@@ -721,7 +737,7 @@ void cluster_heartbeat(int highsession, int freesession, int hightunnel)
        }
 
        if (p > (buff + sizeof(buff))) {        // Did we somehow manage to overun the buffer?
        }
 
        if (p > (buff + sizeof(buff))) {        // Did we somehow manage to overun the buffer?
-               log(0,0,0,0, "Overrun the heartbeat buffer! This is fatal. Exiting. (size %d)\n", p - buff);
+               log(0,0,0,0, "FATAL: Overran the heartbeat buffer! This is fatal. Exiting. (size %d)\n", p - buff);
                kill(0, SIGTERM);
        }
 
                kill(0, SIGTERM);
        }
 
@@ -744,6 +760,9 @@ void cluster_heartbeat(int highsession, int freesession, int hightunnel)
 
                //
                // Fill out the packet with tunnels from the tunnel table...
 
                //
                // Fill out the packet with tunnels from the tunnel table...
+               // This effectively means we walk the tunnel table more quickly
+               // than the session table. This is good because stuffing up a 
+               // tunnel is a much bigger deal than stuffing up a session.
                //
        while ( (p + sizeof(u32) * 2 + sizeof(tunnelt) ) < (buff + MAX_HEART_SIZE) ) {
 
                //
        while ( (p + sizeof(u32) * 2 + sizeof(tunnelt) ) < (buff + MAX_HEART_SIZE) ) {
 
@@ -766,8 +785,8 @@ void cluster_heartbeat(int highsession, int freesession, int hightunnel)
                kill(0, SIGTERM);
        }
 
                kill(0, SIGTERM);
        }
 
-       log(3,0,0,0, "Sending heartbeat with %d changes (%d x-sess, %d x-tunnels, %d highsess, %d hightun size %d)\n",
-                       config->cluster_num_changes, count, tcount, config->cluster_highest_sessionid,
+       log(3,0,0,0, "Sending heartbeat #%d with %d changes (%d x-sess, %d x-tunnels, %d highsess, %d hightun size %d)\n",
+                       h.seq, config->cluster_num_changes, count, tcount, config->cluster_highest_sessionid,
                        config->cluster_highest_tunnelid, (p-buff));
 
        config->cluster_num_changes = 0;
                        config->cluster_highest_tunnelid, (p-buff));
 
        config->cluster_num_changes = 0;
@@ -891,7 +910,7 @@ int cluster_add_peer(u32 peer, time_t basetime, pingt *p)
 
                // This peer already exists. Just update the timestamp.
                peers[i].basetime = basetime;
 
                // This peer already exists. Just update the timestamp.
                peers[i].basetime = basetime;
-               peers[i].timestamp = config->current_time;
+               peers[i].timestamp = TIME;
                break;
        }
 
                break;
        }
 
@@ -904,7 +923,7 @@ int cluster_add_peer(u32 peer, time_t basetime, pingt *p)
                {
                        if (peers[i].peer != peer)
                                continue;
                {
                        if (peers[i].peer != peer)
                                continue;
-                       if ((peers[i].timestamp + HB_TIMEOUT * 10) < config->current_time) // Stale.
+                       if ((peers[i].timestamp + config->cluster_hb_timeout * 10) < TIME) // Stale.
                                break;
                }
 
                                break;
                }
 
@@ -917,7 +936,7 @@ int cluster_add_peer(u32 peer, time_t basetime, pingt *p)
 
                peers[i].peer = peer;
                peers[i].basetime = basetime;
 
                peers[i].peer = peer;
                peers[i].basetime = basetime;
-               peers[i].timestamp = config->current_time;
+               peers[i].timestamp = TIME;
                if (i == num_peers)
                        ++num_peers;
 
                if (i == num_peers)
                        ++num_peers;
 
@@ -1064,16 +1083,18 @@ static int cluster_process_heartbeat_v2(u8 * data, int size, int more, u8 * p, u
 
                log(0,0,0,0, "I just got a packet claiming to be from a master but _I_ am the master!\n");
                if (!h->basetime) {
 
                log(0,0,0,0, "I just got a packet claiming to be from a master but _I_ am the master!\n");
                if (!h->basetime) {
-                       log(0,0,0,0, "Heartbeat from addr %s with zero basetime!\n", inet_toa(htonl(addr)) );
+                       log(0,0,0,0, "Heartbeat from addr %s with zero basetime!\n", inet_toa(addr) );
                        return -1; // Skip it.
                }
                if (basetime > h->basetime) {
                        return -1; // Skip it.
                }
                if (basetime > h->basetime) {
-                       log(0,0,0,0, "They're (%s) an older master than me so I'm gone!\n", inet_toa(htonl(addr)));
+                       log(0,0,0,0, "They're (%s) an older master than me so I'm gone!\n", inet_toa(addr));
                        kill(0, SIGTERM);
                        kill(0, SIGTERM);
+                       exit(1);
                }
                if (basetime == h->basetime && my_address < addr) { // Tie breaker.
                        log(0,0,0,0, "They're a higher IP address than me, so I'm gone!\n");
                        kill(0, SIGTERM);
                }
                if (basetime == h->basetime && my_address < addr) { // Tie breaker.
                        log(0,0,0,0, "They're a higher IP address than me, so I'm gone!\n");
                        kill(0, SIGTERM);
+                       exit(1);
                }
                return -1; // Skip it.
        }
                }
                return -1; // Skip it.
        }
@@ -1081,14 +1102,14 @@ static int cluster_process_heartbeat_v2(u8 * data, int size, int more, u8 * p, u
        if (config->cluster_seq_number == -1)   // Don't have one. Just align to the master...
                config->cluster_seq_number = h->seq;
 
        if (config->cluster_seq_number == -1)   // Don't have one. Just align to the master...
                config->cluster_seq_number = h->seq;
 
-       config->cluster_last_hb = config->current_time; // Reset to ensure that we don't become master!!
+       config->cluster_last_hb = TIME; // Reset to ensure that we don't become master!!
 
        if (config->cluster_seq_number != h->seq) {     // Out of sequence heartbeat!
                log(1,0,0,0, "HB: Got seq# %d but was expecting %d. asking for resend.\n", h->seq, config->cluster_seq_number);
 
                peer_send_message(addr, C_LASTSEEN, config->cluster_seq_number, NULL, 0);
 
 
        if (config->cluster_seq_number != h->seq) {     // Out of sequence heartbeat!
                log(1,0,0,0, "HB: Got seq# %d but was expecting %d. asking for resend.\n", h->seq, config->cluster_seq_number);
 
                peer_send_message(addr, C_LASTSEEN, config->cluster_seq_number, NULL, 0);
 
-               config->cluster_last_hb = config->current_time; // Reset to ensure that we don't become master!!
+               config->cluster_last_hb = TIME; // Reset to ensure that we don't become master!!
 
                        // Just drop the packet. The master will resend it as part of the catchup.
 
 
                        // Just drop the packet. The master will resend it as part of the catchup.
 
@@ -1187,7 +1208,7 @@ static int cluster_process_heartbeat_v2(u8 * data, int size, int more, u8 * p, u
        }
 
        config->cluster_master_address = addr;
        }
 
        config->cluster_master_address = addr;
-       config->cluster_last_hb = config->current_time; // Successfully received a heartbeat!
+       config->cluster_last_hb = TIME; // Successfully received a heartbeat!
        return 0;
 
 shortpacket:
        return 0;
 
 shortpacket:
@@ -1282,12 +1303,12 @@ int processcluster(char * data, int size, u32 addr)
                }
 
                if (addr != config->cluster_master_address) {
                }
 
                if (addr != config->cluster_master_address) {
-                       log(0,0,0,0, "Received a C_KILL from %s which doesn't match config->cluster_master_address (%x)",
+                       log(0,0,0,0, "Received a C_KILL from %s which doesn't match config->cluster_master_address (%x)\n",
                                inet_toa(addr), config->cluster_master_address);
                        // We can only warn about it. The master might really have switched!
                }
 
                                inet_toa(addr), config->cluster_master_address);
                        // We can only warn about it. The master might really have switched!
                }
 
-               log(0,0,0,0, "Received a valid C_KILL: I'm going to die now.");
+               log(0,0,0,0, "Received a valid C_KILL: I'm going to die now.\n");
                kill(0, SIGTERM);
                exit(0);        // Lets be paranoid;
                return -1;              // Just signalling the compiler.
                kill(0, SIGTERM);
                exit(0);        // Lets be paranoid;
                return -1;              // Just signalling the compiler.
@@ -1313,6 +1334,9 @@ int cmd_show_cluster(struct cli_def *cli, char *command, char **argv, int argc)
 {
        int i;
 
 {
        int i;
 
+       if (CLI_HELP_REQUESTED)
+               return CLI_HELP_NO_ARGS;
+
         cli_print(cli, "Cluster status   : %s", config->cluster_iam_master ? "Master" : "Slave" );
        cli_print(cli, "My address       : %s", inet_toa(my_address));
        cli_print(cli, "VIP address      : %s", inet_toa(config->bind_address));
         cli_print(cli, "Cluster status   : %s", config->cluster_iam_master ? "Master" : "Slave" );
        cli_print(cli, "My address       : %s", inet_toa(my_address));
        cli_print(cli, "VIP address      : %s", inet_toa(config->bind_address));
@@ -1322,7 +1346,7 @@ int cmd_show_cluster(struct cli_def *cli, char *command, char **argv, int argc)
         if (!config->cluster_iam_master) {
                cli_print(cli, "My master        : %s (last heartbeat %.1f seconds old)",
                        config->cluster_master_address ? inet_toa(config->cluster_master_address) : "Not defined",
         if (!config->cluster_iam_master) {
                cli_print(cli, "My master        : %s (last heartbeat %.1f seconds old)",
                        config->cluster_master_address ? inet_toa(config->cluster_master_address) : "Not defined",
-                       0.1 * (config->current_time - config->cluster_last_hb));
+                       0.1 * (TIME - config->cluster_last_hb));
                 cli_print(cli, "Uptodate         : %s", config->cluster_iam_uptodate ? "Yes" : "No");
                cli_print(cli, "Next sequence number expected: %d", config->cluster_seq_number);
                cli_print(cli, "%d sessions undefined of %d", config->cluster_undefined_sessions, config->cluster_highest_sessionid);
                 cli_print(cli, "Uptodate         : %s", config->cluster_iam_uptodate ? "Yes" : "No");
                cli_print(cli, "Next sequence number expected: %d", config->cluster_seq_number);
                cli_print(cli, "%d sessions undefined of %d", config->cluster_undefined_sessions, config->cluster_highest_sessionid);
@@ -1339,7 +1363,7 @@ int cmd_show_cluster(struct cli_def *cli, char *command, char **argv, int argc)
                cli_print(cli, "%20s  %10s %8s", "Address", "Basetime", "Age");
        for (i = 0; i < num_peers; ++i) {
                cli_print(cli, "%20s  %10d %8d", inet_toa(peers[i].peer),
                cli_print(cli, "%20s  %10s %8s", "Address", "Basetime", "Age");
        for (i = 0; i < num_peers; ++i) {
                cli_print(cli, "%20s  %10d %8d", inet_toa(peers[i].peer),
-                       peers[i].basetime, config->current_time - peers[i].timestamp);
+                       peers[i].basetime, TIME - peers[i].timestamp);
        }
        return CLI_OK;
 }
        }
        return CLI_OK;
 }