comment typo
[l2tpns.git] / l2tpns.c
index 9e9b34a..7ce6e6b 100644 (file)
--- a/l2tpns.c
+++ b/l2tpns.c
@@ -4,7 +4,7 @@
 // Copyright (c) 2002 FireBrick (Andrews & Arnold Ltd / Watchfront Ltd) - GPL licenced
 // vim: sw=8 ts=8
 
-char const *cvs_id_l2tpns = "$Id: l2tpns.c,v 1.103 2005/05/13 01:29:40 bodea Exp $";
+char const *cvs_id_l2tpns = "$Id: l2tpns.c,v 1.111 2005/06/14 04:47:24 bodea Exp $";
 
 #include <arpa/inet.h>
 #include <assert.h>
@@ -52,7 +52,7 @@ char const *cvs_id_l2tpns = "$Id: l2tpns.c,v 1.103 2005/05/13 01:29:40 bodea Exp
 
 #ifdef BGP
 #include "bgp.h"
-#endif /* BGP */
+#endif
 
 // Globals
 configt *config = NULL;                // all configuration
@@ -64,13 +64,14 @@ int snoopfd = -1;           // UDP file handle for sending out intercept data
 int *radfds = NULL;            // RADIUS requests file handles
 int ifrfd = -1;                        // File descriptor for routing, etc
 int ifr6fd = -1;               // File descriptor for IPv6 routing, etc
-static int rand_fd = -1;       // Random data source
+int rand_fd = -1;              // Random data source
+int cluster_sockfd = -1;       // Intra-cluster communications socket.
+int epollfd = -1;              // event polling
 time_t basetime = 0;           // base clock
 char hostname[1000] = "";      // us.
 static int tunidx;             // ifr_ifindex of tun device
 static int syslog_log = 0;     // are we logging to syslog
 static FILE *log_stream = 0;   // file handle for direct logging (i.e. direct into file, not via syslog).
-extern int cluster_sockfd;     // Intra-cluster communications socket.
 uint32_t last_id = 0;          // Unique ID for radius accounting
 
 struct cli_session_actions *cli_session_actions = NULL;        // Pending session changes requested by CLI
@@ -90,7 +91,7 @@ uint32_t eth_tx = 0;
 static uint32_t ip_pool_size = 1;      // Size of the pool of addresses used for dynamic address allocation.
 time_t time_now = 0;                   // Current time in seconds since epoch.
 static char time_now_string[64] = {0}; // Current time as a string.
-static char main_quit = 0;             // True if we're in the process of exiting.
+char main_quit = 0;                    // True if we're in the process of exiting.
 linked_list *loaded_plugins;
 linked_list *plugins[MAX_PLUGIN_TYPES];
 
@@ -113,6 +114,7 @@ config_descriptt config_values[] = {
        CONFIG("radius_interim", radius_interim, INT),
        CONFIG("radius_secret", radiussecret, STRING),
        CONFIG("radius_authtypes", radius_authtypes_s, STRING),
+       CONFIG("allow_duplicate_users", allow_duplicate_users, BOOL),
        CONFIG("bind_address", bind_address, IPv4),
        CONFIG("peer_address", peer_address, IPv4),
        CONFIG("send_garp", send_garp, BOOL),
@@ -121,7 +123,6 @@ config_descriptt config_values[] = {
        CONFIG("accounting_dir", accounting_dir, STRING),
        CONFIG("setuid", target_uid, INT),
        CONFIG("dump_speed", dump_speed, BOOL),
-       CONFIG("cleanup_interval", cleanup_interval, INT),
        CONFIG("multi_read_count", multi_read_count, INT),
        CONFIG("scheduler_fifo", scheduler_fifo, BOOL),
        CONFIG("lock_pages", lock_pages, BOOL),
@@ -131,6 +132,7 @@ config_descriptt config_values[] = {
        CONFIG("cluster_interface", cluster_interface, STRING),
        CONFIG("cluster_hb_interval", cluster_hb_interval, INT),
        CONFIG("cluster_hb_timeout", cluster_hb_timeout, INT),
+       CONFIG("cluster_master_min_adv", cluster_master_min_adv, INT),
        CONFIG("ipv6_prefix", ipv6_prefix, IPv6),
        { NULL, 0, 0, 0 },
 };
@@ -175,8 +177,7 @@ static void free_ip_address(sessionidt s);
 static void dump_acct_info(int all);
 static void sighup_handler(int sig);
 static void sigalrm_handler(int sig);
-static void sigterm_handler(int sig);
-static void sigquit_handler(int sig);
+static void shutdown_handler(int sig);
 static void sigchild_handler(int sig);
 static void build_chap_response(char *challenge, uint8_t id, uint16_t challenge_length, char **challenge_response);
 static void update_config(void);
@@ -189,11 +190,19 @@ static void processcontrol(uint8_t *buf, int len, struct sockaddr_in *addr, int
 static tunnelidt new_tunnel(void);
 static void unhide_value(uint8_t *value, size_t len, uint16_t type, uint8_t *vector, size_t vec_len);
 
-// return internal time (10ths since process startup)
-static clockt now(void)
+// on slaves, allow BGP to withdraw cleanly before exiting
+#define QUIT_DELAY     5
+
+// quit actions (master)
+#define QUIT_FAILOVER  1 // SIGTERM: exit when all control messages have been acked (for cluster failover)
+#define QUIT_SHUTDOWN  2 // SIGQUIT: shutdown sessions/tunnels, reject new connections
+
+// return internal time (10ths of a second since process startup); if f is non-NULL, also store the current time of day in seconds (with fraction) in *f
+static clockt now(double *f)
 {
        struct timeval t;
        gettimeofday(&t, 0);
+       if (f) *f = t.tv_sec + t.tv_usec / 1000000.0;
        return (t.tv_sec - basetime) * 10 + t.tv_usec / 100000 + 1;
 }
 
@@ -203,7 +212,7 @@ static clockt now(void)
 clockt backoff(uint8_t try)
 {
        if (try > 5) try = 5;                  // max backoff
-       return now() + 10 * (1 << try);
+       return now(NULL) + 10 * (1 << try);
 }
 
 
@@ -297,6 +306,16 @@ void _log_hex(int level, const char *title, const char *data, int maxsize)
        }
 }
 
+// update a counter, accumulating 2^32 wraps
+void increment_counter(uint32_t *counter, uint32_t *wrap, uint32_t delta)
+{
+       uint32_t new = *counter + delta;
+       if (new < *counter)
+               (*wrap)++;
+
+       *counter = new;
+}
+
 // initialise the random generator
 static void initrandom(char *source)
 {
@@ -307,7 +326,8 @@ static void initrandom(char *source)
                return;
 
        // close previous source, if any
-       if (rand_fd >= 0) close(rand_fd);
+       if (rand_fd >= 0)
+               close(rand_fd);
 
        rand_fd = -1;
 
@@ -324,13 +344,6 @@ static void initrandom(char *source)
                                        path, strerror(errno));
                }
        }
-
-       // no source: seed prng
-       {
-               unsigned seed = time_now ^ getpid();
-               LOG(4, 0, 0, "Seeding the pseudo random generator: %u\n", seed);
-               srand(seed);
-       }
 }
 
 // fill buffer with random data
@@ -351,7 +364,7 @@ void random_data(uint8_t *buf, int len)
                                        strerror(errno));
 
                                // fall back to rand()
-                               initrandom(0);
+                               initrandom(NULL);
                        }
 
                        n = 0;
@@ -997,7 +1010,8 @@ static void processipout(uint8_t * buf, int len)
                if (rate++ < config->icmp_rate) // Only send a max of icmp_rate per second.
                {
                        LOG(4, 0, 0, "IP: Sending ICMP host unreachable to %s\n", fmtaddr(*(in_addr_t *)(buf + 12), 0));
-                       host_unreachable(*(in_addr_t *)(buf + 12), *(uint16_t *)(buf + 4), ip, buf, (len < 64) ? 64 : len);
+                       host_unreachable(*(in_addr_t *)(buf + 12), *(uint16_t *)(buf + 4),
+                               config->bind_address ? config->bind_address : my_address, buf, len);
                }
                return;
        }
@@ -1076,11 +1090,13 @@ static void processipout(uint8_t * buf, int len)
        if (sp->snoop_ip && sp->snoop_port)
                snoop_send_packet(buf, len, sp->snoop_ip, sp->snoop_port);
 
-       sp->cout += len; // byte count
-       sp->total_cout += len; // byte count
+       increment_counter(&sp->cout, &sp->cout_wrap, len); // byte count
+       sp->cout_delta += len;
        sp->pout++;
        udp_tx += len;
+
        sess_local[s].cout += len;      // To send to master..
+       sess_local[s].pout++;
 }
 
 // process outgoing (to tunnel) IPv6
@@ -1185,11 +1201,13 @@ static void processipv6out(uint8_t * buf, int len)
        if (sp->snoop_ip && sp->snoop_port)
                snoop_send_packet(buf, len, sp->snoop_ip, sp->snoop_port);
 
-       sp->cout += len; // byte count
-       sp->total_cout += len; // byte count
+       increment_counter(&sp->cout, &sp->cout_wrap, len); // byte count
+       sp->cout_delta += len;
        sp->pout++;
        udp_tx += len;
+
        sess_local[s].cout += len;      // To send to master..
+       sess_local[s].pout++;
 }
 
 //
@@ -1235,11 +1253,13 @@ static void send_ipout(sessionidt s, uint8_t *buf, int len)
        if (sp->snoop_ip && sp->snoop_port)
                snoop_send_packet(buf, len, sp->snoop_ip, sp->snoop_port);
 
-       sp->cout += len; // byte count
-       sp->total_cout += len; // byte count
+       increment_counter(&sp->cout, &sp->cout_wrap, len); // byte count
+       sp->cout_delta += len;
        sp->pout++;
        udp_tx += len;
+
        sess_local[s].cout += len;      // To send to master..
+       sess_local[s].pout++;
 }
 
 // add an AVP (16 bit)
@@ -1566,7 +1586,7 @@ void sendipcp(tunnelidt t, sessionidt s)
        if (!q) return;
 
        *q = ConfigReq;
-       q[1] = r << RADIUS_SHIFT;                    // ID, dont care, we only send one type of request
+       q[1] = r >> RADIUS_SHIFT;                    // ID, don't care, we only send one type of request
        *(uint16_t *) (q + 2) = htons(10);
        q[4] = 3;
        q[5] = 6;
@@ -1588,7 +1608,7 @@ void sendipcp(tunnelidt t, sessionidt s)
                if (!q) return;
 
                *q = ConfigReq;
-               q[1] = r << RADIUS_SHIFT;               // ID, don't care, we
+               q[1] = r >> RADIUS_SHIFT;               // ID, don't care, we
                                                        // only send one type
                                                        // of request
                *(uint16_t *) (q + 2) = htons(14);
@@ -1694,7 +1714,7 @@ static void tunnelshutdown(tunnelidt t, char *reason, int result, int error, cha
        // close session
        for (s = 1; s <= config->cluster_highest_sessionid ; ++s)
                if (session[s].tunnel == t)
-                       sessionshutdown(s, reason, 3, 0);
+                       sessionshutdown(s, reason, 0, 0);
 
        tunnel[t].state = TUNNELDIE;
        tunnel[t].die = TIME + 700; // Clean up in 70 seconds
@@ -2004,8 +2024,6 @@ void processudp(uint8_t * buf, int len, struct sockaddr_in *addr)
                                                continue;
                                        }
 
-                                       LOG(4, s, t, "Hidden AVP\n");
-
                                        // Unhide the AVP
                                        unhide_value(b, n, mtype, session[s].random_vector, session[s].random_vector_length);
 
@@ -2026,7 +2044,9 @@ void processudp(uint8_t * buf, int len, struct sockaddr_in *addr)
                                        n = orig_len;
                                }
 
-                               LOG(4, s, t, "   AVP %d (%s) len %d\n", mtype, avp_name(mtype), n);
+                               LOG(4, s, t, "   AVP %d (%s) len %d%s%s\n", mtype, avp_name(mtype), n,
+                                       flags & 0x40 ? ", hidden" : "", flags & 0x80 ? ", mandatory" : "");
+
                                switch (mtype)
                                {
                                case 0:     // message type
@@ -2248,6 +2268,8 @@ void processudp(uint8_t * buf, int len, struct sockaddr_in *addr)
                                case 36:    // Random Vector
                                        LOG(4, s, t, "   Random Vector received.  Enabled AVP Hiding.\n");
                                        memset(session[s].random_vector, 0, sizeof(session[s].random_vector));
+                                       if (n > sizeof(session[s].random_vector))
+                                               n = sizeof(session[s].random_vector);
                                        memcpy(session[s].random_vector, b, n);
                                        session[s].random_vector_length = n;
                                        break;
@@ -2270,6 +2292,8 @@ void processudp(uint8_t * buf, int len, struct sockaddr_in *addr)
                                switch (message)
                                {
                                case 1:       // SCCRQ - Start Control Connection Request
+                                       tunnel[t].state = TUNNELOPENING;
+                                       if (main_quit != QUIT_SHUTDOWN)
                                        {
                                                controlt *c = controlnew(2); // sending SCCRP
                                                control16(c, 2, version, 1); // protocol version
@@ -2279,7 +2303,10 @@ void processudp(uint8_t * buf, int len, struct sockaddr_in *addr)
                                                control16(c, 9, t, 1); // assigned tunnel
                                                controladd(c, t, 0); // send the resply
                                        }
-                                       tunnel[t].state = TUNNELOPENING;
+                                       else
+                                       {
+                                               tunnelshutdown(t, "Shutting down", 6, 0, 0);
+                                       }
                                        break;
                                case 2:       // SCCRP
                                        tunnel[t].state = TUNNELOPEN;
@@ -2305,7 +2332,7 @@ void processudp(uint8_t * buf, int len, struct sockaddr_in *addr)
                                        // TBA
                                        break;
                                case 10:      // ICRQ
-                                       if (sessionfree)
+                                       if (sessionfree && main_quit != QUIT_SHUTDOWN)
                                        {
                                                uint16_t r;
 
@@ -2347,8 +2374,12 @@ void processudp(uint8_t * buf, int len, struct sockaddr_in *addr)
 
                                        {
                                                controlt *c = controlnew(14); // CDN
-                                               control16(c, 1, 4, 1); // temporary lack of resources
-                                               controladd(c, session[s].tunnel, asession); // send the message
+                                               if (main_quit == QUIT_SHUTDOWN)
+                                                       control16(c, 1, 2, 7); // try another
+                                               else
+                                                       control16(c, 1, 4, 0); // temporary lack of resources
+
+                                               controladd(c, t, asession); // send the message
                                        }
                                        return;
                                case 11:      // ICRP
@@ -2550,42 +2581,61 @@ static void processtun(uint8_t * buf, int len)
        // Else discard.
 }
 
-//
-// Maximum number of actions to complete.
-// This is to avoid sending out too many packets
-// at once.
-#define MAX_ACTIONS 500
-
-static int regular_cleanups(void)
+// Handle retries, timeouts.  Runs every 1/10th sec, want to ensure
+// that we look at the whole of the tunnel, radius and session tables
+// every second
+static void regular_cleanups(double period)
 {
-       static sessionidt s = 0;        // Next session to check for actions on.
-       tunnelidt t;
-       int count=0,i;
-       uint16_t r;
-       static clockt next_acct = 0;
-       static clockt next_shut_acct = 0;
+       // Next tunnel, radius and session to check for actions on.
+       static tunnelidt t = 0;
+       static int r = 0;
+       static sessionidt s = 0;
+
+       int t_actions = 0;
+       int r_actions = 0;
+       int s_actions = 0;
+
+       int t_slice;
+       int r_slice;
+       int s_slice;
+
+       int i;
        int a;
 
-       LOG(3, 0, 0, "Begin regular cleanup\n");
+       // divide up tables into slices based on the last run
+       t_slice = config->cluster_highest_tunnelid  * period;
+       r_slice = (MAXRADIUS - 1)                   * period;
+       s_slice = config->cluster_highest_sessionid * period;
 
-       for (r = 1; r < MAXRADIUS; r++)
-       {
-               if (!radius[r].state)
-                       continue;
-               if (radius[r].retry)
-               {
-                       if (radius[r].retry <= TIME)
-                               radiusretry(r);
-               } else
-                       radius[r].retry = backoff(radius[r].try+1);     // Is this really needed? --mo
-       }
-       for (t = 1; t <= config->cluster_highest_tunnelid; t++)
+       if (t_slice < 1)
+           t_slice = 1;
+       else if (t_slice > config->cluster_highest_tunnelid)
+           t_slice = config->cluster_highest_tunnelid;
+
+       if (r_slice < 1)
+           r_slice = 1;
+       else if (r_slice > (MAXRADIUS - 1))
+           r_slice = MAXRADIUS - 1;
+
+       if (s_slice < 1)
+           s_slice = 1;
+       else if (s_slice > config->cluster_highest_sessionid)
+           s_slice = config->cluster_highest_sessionid;
+
+       LOG(4, 0, 0, "Begin regular cleanup (last %f seconds ago)\n", period);
+
+       for (i = 0; i < t_slice; i++)
        {
+               t++;
+               if (t > config->cluster_highest_tunnelid)
+                       t = 1;
+
                // check for expired tunnels
                if (tunnel[t].die && tunnel[t].die <= TIME)
                {
                        STAT(tunnel_timeout);
                        tunnelkill(t, "Expired");
+                       t_actions++;
                        continue;
                }
                // check for message resend
@@ -2605,6 +2655,8 @@ static int regular_cleanups(void)
                                                tunnelsend(c->buf, c->length, t);
                                                c = c->next;
                                        }
+
+                               t_actions++;
                        }
                }
                // Send hello
@@ -2613,6 +2665,7 @@ static int regular_cleanups(void)
                        controlt *c = controlnew(6); // sending HELLO
                        controladd(c, t, 0); // send the message
                        LOG(3, 0, t, "Sending HELLO message\n");
+                       t_actions++;
                }
 
                // Check for tunnel changes requested from the CLI
@@ -2623,13 +2676,28 @@ static int regular_cleanups(void)
                        {
                                LOG(2, 0, t, "Dropping tunnel by CLI\n");
                                tunnelshutdown(t, "Requested by administrator", 1, 0, 0);
+                               t_actions++;
                        }
                }
+       }
 
+       for (i = 0; i < r_slice; i++)
+       {
+               r++;
+               if (r >= MAXRADIUS)
+                       r = 1;
+
+               if (!radius[r].state)
+                       continue;
+
+               if (radius[r].retry <= TIME)
+               {
+                       radiusretry(r);
+                       r_actions++;
+               }
        }
 
-       count = 0;
-       for (i = 1; i <= config->cluster_highest_sessionid; i++)
+       for (i = 0; i < s_slice; i++)
        {
                s++;
                if (s > config->cluster_highest_sessionid)
@@ -2644,7 +2712,7 @@ static int regular_cleanups(void)
                        if (session[s].die <= TIME)
                        {
                                sessionkill(s, "Expired");
-                               if (++count >= MAX_ACTIONS) break;
+                               s_actions++;
                        }
                        continue;
                }
@@ -2654,6 +2722,7 @@ static int regular_cleanups(void)
                        // IPCP has not completed yet. Resend
                        LOG(3, s, session[s].tunnel, "No ACK for initial IPCP ConfigReq... resending\n");
                        sendipcp(session[s].tunnel, s);
+                       s_actions++;
                }
 
                // Drop sessions who have not responded within IDLE_TIMEOUT seconds
@@ -2661,7 +2730,7 @@ static int regular_cleanups(void)
                {
                        sessionshutdown(s, "No response to LCP ECHO requests.", 3, 0);
                        STAT(session_timeout);
-                       if (++count >= MAX_ACTIONS) break;
+                       s_actions++;
                        continue;
                }
 
@@ -2681,7 +2750,7 @@ static int regular_cleanups(void)
                        LOG(4, s, session[s].tunnel, "No data in %d seconds, sending LCP ECHO\n",
                                        (int)(time_now - session[s].last_packet));
                        tunnelsend(b, 24, session[s].tunnel); // send it
-                       if (++count >= MAX_ACTIONS) break;
+                       s_actions++;
                }
 
                // Check for actions requested from the CLI
@@ -2695,6 +2764,7 @@ static int regular_cleanups(void)
                                LOG(2, s, session[s].tunnel, "Dropping session by CLI\n");
                                sessionshutdown(s, "Requested by administrator.", 3, 0);
                                a = 0; // dead, no need to check for other actions
+                               s_actions++;
                        }
 
                        if (a & CLI_SESS_NOSNOOP)
@@ -2702,6 +2772,7 @@ static int regular_cleanups(void)
                                LOG(2, s, session[s].tunnel, "Unsnooping session by CLI\n");
                                session[s].snoop_ip = 0;
                                session[s].snoop_port = 0;
+                               s_actions++;
                                send++;
                        }
                        else if (a & CLI_SESS_SNOOP)
@@ -2712,6 +2783,7 @@ static int regular_cleanups(void)
 
                                session[s].snoop_ip = cli_session_actions[s].snoop_ip;
                                session[s].snoop_port = cli_session_actions[s].snoop_port;
+                               s_actions++;
                                send++;
                        }
 
@@ -2719,6 +2791,7 @@ static int regular_cleanups(void)
                        {
                                LOG(2, s, session[s].tunnel, "Un-throttling session by CLI\n");
                                throttle_session(s, 0, 0);
+                               s_actions++;
                                send++;
                        }
                        else if (a & CLI_SESS_THROTTLE)
@@ -2728,6 +2801,7 @@ static int regular_cleanups(void)
                                    cli_session_actions[s].throttle_out);
 
                                throttle_session(s, cli_session_actions[s].throttle_in, cli_session_actions[s].throttle_out);
+                               s_actions++;
                                send++;
                        }
 
@@ -2735,6 +2809,7 @@ static int regular_cleanups(void)
                        {
                                LOG(2, s, session[s].tunnel, "Un-filtering session by CLI\n");
                                filter_session(s, 0, 0);
+                               s_actions++;
                                send++;
                        }
                        else if (a & CLI_SESS_FILTER)
@@ -2744,13 +2819,12 @@ static int regular_cleanups(void)
                                    cli_session_actions[s].filter_out);
 
                                filter_session(s, cli_session_actions[s].filter_in, cli_session_actions[s].filter_out);
+                               s_actions++;
                                send++;
                        }
 
                        if (send)
                                cluster_send_session(s);
-
-                       if (++count >= MAX_ACTIONS) break;
                }
 
                // RADIUS interim accounting
@@ -2771,38 +2845,14 @@ static int regular_cleanups(void)
 
                        radiussend(r, RADIUSINTERIM);
                        sess_local[s].last_interim = time_now;
-
-                       if (++count >= MAX_ACTIONS)
-                               break;
-               }
-       }
-
-       if (*config->accounting_dir)
-       {
-               if (next_acct <= TIME)
-               {
-                       // Dump accounting data
-                       next_acct = TIME + ACCT_TIME;
-                       next_shut_acct = TIME + ACCT_SHUT_TIME;
-                       dump_acct_info(1);
-               }
-               else if (next_shut_acct <= TIME)
-               {
-                       // Dump accounting data for shutdown sessions
-                       next_shut_acct = TIME + ACCT_SHUT_TIME;
-                       if (shut_acct_n)
-                               dump_acct_info(0);
+                       s_actions++;
                }
        }
 
-       if (count >= MAX_ACTIONS)
-               return 1;       // Didn't finish!
-
-       LOG(3, 0, 0, "End regular cleanup (%d actions), next in %d seconds\n", count, config->cleanup_interval);
-       return 0;
+       LOG(4, 0, 0, "End regular cleanup: checked %d/%d/%d tunnels/radius/sessions; %d/%d/%d actions\n",
+               t_slice, r_slice, s_slice, t_actions, r_actions, s_actions);
 }
 
-
 //
 // Are we in the middle of a tunnel update, or radius
 // requests??
@@ -2812,6 +2862,53 @@ static int still_busy(void)
        int i;
        static clockt last_talked = 0;
        static clockt start_busy_wait = 0;
+
+       if (!config->cluster_iam_master)
+       {
+#ifdef BGP
+               static time_t stopped_bgp = 0;
+               if (bgp_configured)
+               {
+                       if (!stopped_bgp)
+                       {
+                               LOG(1, 0, 0, "Shutting down in %d seconds, stopping BGP...\n", QUIT_DELAY);
+
+                               for (i = 0; i < BGP_NUM_PEERS; i++)
+                                       if (bgp_peers[i].state == Established)
+                                               bgp_stop(&bgp_peers[i]);
+
+                               stopped_bgp = time_now;
+
+                               // we don't want to become master
+                               cluster_send_ping(0);
+
+                               return 1;
+                       }
+
+                       if (time_now < (stopped_bgp + QUIT_DELAY))
+                               return 1;
+               }
+#endif /* BGP */
+
+               return 0;
+       }
+
+       if (main_quit == QUIT_SHUTDOWN)
+       {
+               static int dropped = 0;
+               if (!dropped)
+               {
+                       int i;
+
+                       LOG(1, 0, 0, "Dropping sessions and tunnels\n");
+                       for (i = 1; i < MAXTUNNEL; i++)
+                               if (tunnel[i].ip || tunnel[i].state)
+                                       tunnelshutdown(i, "L2TPNS Closing", 6, 0, 0);
+
+                       dropped = 1;
+               }
+       }
+
        if (start_busy_wait == 0)
                start_busy_wait = TIME;
 
@@ -2853,80 +2950,104 @@ static int still_busy(void)
        return 0;
 }
 
-static fd_set readset;
-static int readset_n = 0;
+#ifdef HAVE_EPOLL
+# include <sys/epoll.h>
+#else
+# define FAKE_EPOLL_IMPLEMENTATION /* include the functions */
+# include "fake_epoll.h"
+#endif
+
+// the base set of fds polled: control, cli, udp, tun, cluster
+#define BASE_FDS       5
+
+// additional polled fds
+#ifdef BGP
+# define EXTRA_FDS     BGP_NUM_PEERS
+#else
+# define EXTRA_FDS     0
+#endif
 
 // main loop - gets packets on tun or udp and processes them
 static void mainloop(void)
 {
        int i;
        uint8_t buf[65536];
-       struct timeval to;
        clockt next_cluster_ping = 0;   // send initial ping immediately
-       time_t next_clean = time_now + config->cleanup_interval;
+       struct epoll_event events[BASE_FDS + RADIUS_FDS + EXTRA_FDS];
+       int maxevent = sizeof(events)/sizeof(*events);
+
+       if ((epollfd = epoll_create(maxevent)) < 0)
+       {
+               LOG(0, 0, 0, "epoll_create failed: %s\n", strerror(errno));
+               exit(1);
+       }
 
        LOG(4, 0, 0, "Beginning of main loop.  udpfd=%d, tunfd=%d, cluster_sockfd=%d, controlfd=%d\n",
                udpfd, tunfd, cluster_sockfd, controlfd);
 
-       FD_ZERO(&readset);
-       FD_SET(udpfd, &readset);
-       FD_SET(tunfd, &readset);
-       FD_SET(controlfd, &readset);
-       FD_SET(clifd, &readset);
-       if (cluster_sockfd) FD_SET(cluster_sockfd, &readset);
-       readset_n = udpfd;
-       if (tunfd > readset_n)          readset_n = tunfd;
-       if (controlfd > readset_n)      readset_n = controlfd;
-       if (clifd > readset_n)          readset_n = clifd;
-       if (cluster_sockfd > readset_n) readset_n = cluster_sockfd;
-
-       while (!main_quit || still_busy())
+       /* setup our fds to poll for input */
        {
-               fd_set r;
-               int n = readset_n;
+               static struct event_data d[BASE_FDS];
+               struct epoll_event e;
+
+               e.events = EPOLLIN;
+               i = 0;
+
+               d[i].type = FD_TYPE_CONTROL;
+               e.data.ptr = &d[i++];
+               epoll_ctl(epollfd, EPOLL_CTL_ADD, controlfd, &e);
+
+               d[i].type = FD_TYPE_CLI;
+               e.data.ptr = &d[i++];
+               epoll_ctl(epollfd, EPOLL_CTL_ADD, clifd, &e);
+
+               d[i].type = FD_TYPE_UDP;
+               e.data.ptr = &d[i++];
+               epoll_ctl(epollfd, EPOLL_CTL_ADD, udpfd, &e);
+
+               d[i].type = FD_TYPE_TUN;
+               e.data.ptr = &d[i++];
+               epoll_ctl(epollfd, EPOLL_CTL_ADD, tunfd, &e);
+
+               d[i].type = FD_TYPE_CLUSTER;
+               e.data.ptr = &d[i++];
+               epoll_ctl(epollfd, EPOLL_CTL_ADD, cluster_sockfd, &e);
+       }
+
 #ifdef BGP
-               fd_set w;
-               int bgp_set[BGP_NUM_PEERS];
+       signal(SIGPIPE, SIG_IGN);
+       bgp_setup(config->as_number);
+       if (config->bind_address)
+               bgp_add_route(config->bind_address, 0xffffffff);
+
+       for (i = 0; i < BGP_NUM_PEERS; i++)
+       {
+               if (config->neighbour[i].name[0])
+                       bgp_start(&bgp_peers[i], config->neighbour[i].name,
+                               config->neighbour[i].as, config->neighbour[i].keepalive,
+                               config->neighbour[i].hold, 0); /* 0 = routing disabled */
+       }
 #endif /* BGP */
 
+       while (!main_quit || still_busy())
+       {
+               int more = 0;
+               int n;
+
                if (config->reload_config)
                {
                        // Update the config state based on config settings
                        update_config();
                }
 
-               memcpy(&r, &readset, sizeof(fd_set));
-               to.tv_sec = 0;
-               to.tv_usec = 100000; // 1/10th of a second.
-
 #ifdef BGP
-               FD_ZERO(&w);
-               for (i = 0; i < BGP_NUM_PEERS; i++)
-               {
-                       bgp_set[i] = bgp_select_state(&bgp_peers[i]);
-                       if (bgp_set[i] & 1)
-                       {
-                               FD_SET(bgp_peers[i].sock, &r);
-                               if (bgp_peers[i].sock > n)
-                                       n = bgp_peers[i].sock;
-                       }
-
-                       if (bgp_set[i] & 2)
-                       {
-                               FD_SET(bgp_peers[i].sock, &w);
-                               if (bgp_peers[i].sock > n)
-                                       n = bgp_peers[i].sock;
-                       }
-               }
-
-               n = select(n + 1, &r, &w, 0, &to);
-#else /* BGP */
-               n = select(n + 1, &r, 0, 0, &to);
+               bgp_set_poll();
 #endif /* BGP */
 
+               n = epoll_wait(epollfd, events, maxevent, 100); // timeout 100ms (1/10th sec)
                STAT(select_called);
 
-               TIME = now();
+               TIME = now(NULL);
                if (n < 0)
                {
                        if (errno == EINTR ||
@@ -2934,70 +3055,85 @@ static void mainloop(void)
                                continue;
 
                        LOG(0, 0, 0, "Error returned from select(): %s\n", strerror(errno));
-                       main_quit++;
-                       break;
+                       break; // exit
                }
-               else if (n)
+
+               if (n)
                {
                        struct sockaddr_in addr;
                        int alen, c, s;
+                       int udp_ready = 0;
+                       int tun_ready = 0;
+                       int cluster_ready = 0;
                        int udp_pkts = 0;
                        int tun_pkts = 0;
                        int cluster_pkts = 0;
+#ifdef BGP
+                       uint32_t bgp_events[BGP_NUM_PEERS];
+                       memset(bgp_events, 0, sizeof(bgp_events));
+#endif /* BGP */
 
-                       // nsctl commands
-                       if (FD_ISSET(controlfd, &r))
+                       for (c = n, i = 0; i < c; i++)
                        {
-                               alen = sizeof(addr);
-                               processcontrol(buf, recvfrom(controlfd, buf, sizeof(buf), MSG_WAITALL, (void *) &addr, &alen), &addr, alen);
-                               n--;
-                       }
+                               struct event_data *d = events[i].data.ptr;
+                               switch (d->type)
+                               {
+                               case FD_TYPE_CONTROL: // nsctl commands
+                                       alen = sizeof(addr);
+                                       processcontrol(buf, recvfrom(controlfd, buf, sizeof(buf), MSG_WAITALL, (void *) &addr, &alen), &addr, alen);
+                                       n--;
+                                       break;
 
-                       // RADIUS responses
-                       if (config->cluster_iam_master)
-                       {
-                               for (i = 0; i < config->num_radfds; i++)
+                               case FD_TYPE_CLI: // CLI connections
                                {
-                                       if (FD_ISSET(radfds[i], &r))
+                                       int cli;
+                                       
+                                       alen = sizeof(addr);
+                                       if ((cli = accept(clifd, (struct sockaddr *)&addr, &alen)) >= 0)
                                        {
-                                               processrad(buf, recv(radfds[i], buf, sizeof(buf), 0), i);
-                                               n--;
+                                               cli_do(cli);
+                                               close(cli);
                                        }
-                               }
-                       }
+                                       else
+                                               LOG(0, 0, 0, "accept error: %s\n", strerror(errno));
 
-                       // CLI connections
-                       if (FD_ISSET(clifd, &r))
-                       {
-                               int cli;
-                               
-                               alen = sizeof(addr);
-                               if ((cli = accept(clifd, (struct sockaddr *)&addr, &alen)) >= 0)
-                               {
-                                       cli_do(cli);
-                                       close(cli);
+                                       n--;
+                                       break;
                                }
-                               else
-                                       LOG(0, 0, 0, "accept error: %s\n", strerror(errno));
 
-                               n--;
-                       }
+                               // these are handled below, with multiple interleaved reads
+                               case FD_TYPE_UDP:       udp_ready++; break;
+                               case FD_TYPE_TUN:       tun_ready++; break;
+                               case FD_TYPE_CLUSTER:   cluster_ready++; break;
+
+                               case FD_TYPE_RADIUS: // RADIUS response
+                                       s = recv(radfds[d->index], buf, sizeof(buf), 0);
+                                       if (s >= 0 && config->cluster_iam_master)
+                                               processrad(buf, s, d->index);
+
+                                       n--;
+                                       break;
 
 #ifdef BGP
-                       for (i = 0; i < BGP_NUM_PEERS; i++)
-                       {
-                               int isr = bgp_set[i] ? FD_ISSET(bgp_peers[i].sock, &r) : 0;
-                               int isw = bgp_set[i] ? FD_ISSET(bgp_peers[i].sock, &w) : 0;
-                               bgp_process(&bgp_peers[i], isr, isw);
-                               if (isr) n--;
-                               if (isw) n--;
+                               case FD_TYPE_BGP:
+                                       bgp_events[d->index] = events[i].events;
+                                       n--;
+                                       break;
+#endif /* BGP */
+
+                               default:
+                                       LOG(0, 0, 0, "Unexpected fd type returned from epoll_wait: %d\n", d->type);
+                               }
                        }
+
+#ifdef BGP
+                       bgp_process(bgp_events);
 #endif /* BGP */
 
                        for (c = 0; n && c < config->multi_read_count; c++)
                        {
                                // L2TP
-                               if (FD_ISSET(udpfd, &r))
+                               if (udp_ready)
                                {
                                        alen = sizeof(addr);
                                        if ((s = recvfrom(udpfd, buf, sizeof(buf), 0, (void *) &addr, &alen)) > 0)
@@ -3007,13 +3143,13 @@ static void mainloop(void)
                                        }
                                        else
                                        {
-                                               FD_CLR(udpfd, &r);
+                                               udp_ready = 0;
                                                n--;
                                        }
                                }
 
                                // incoming IP
-                               if (FD_ISSET(tunfd, &r))
+                               if (tun_ready)
                                {
                                        if ((s = read(tunfd, buf, sizeof(buf))) > 0)
                                        {
@@ -3022,13 +3158,13 @@ static void mainloop(void)
                                        }
                                        else
                                        {
-                                               FD_CLR(tunfd, &r);
+                                               tun_ready = 0;
                                                n--;
                                        }
                                }
 
                                // cluster
-                               if (FD_ISSET(cluster_sockfd, &r))
+                               if (cluster_ready)
                                {
                                        alen = sizeof(addr);
                                        if ((s = recvfrom(cluster_sockfd, buf, sizeof(buf), MSG_WAITALL, (void *) &addr, &alen)) > 0)
@@ -3038,7 +3174,7 @@ static void mainloop(void)
                                        }
                                        else
                                        {
-                                               FD_CLR(cluster_sockfd, &r);
+                                               cluster_ready = 0;
                                                n--;
                                        }
                                }
@@ -3053,11 +3189,12 @@ static void mainloop(void)
                                        config->multi_read_count, udp_pkts, tun_pkts, cluster_pkts);
 
                                STAT(multi_read_exceeded);
+                               more++;
                        }
                }
 
                        // Runs on every machine (master and slaves).
-               if (cluster_sockfd && next_cluster_ping <= TIME)
+               if (next_cluster_ping <= TIME)
                {
                        // Check to see which of the cluster is still alive..
 
@@ -3075,9 +3212,11 @@ static void mainloop(void)
                                next_cluster_ping = TIME + config->cluster_hb_interval;
                }
 
+               if (!config->cluster_iam_master)
+                       continue;
+
                        // Run token bucket filtering queue..
                        // Only run it every 1/10th of a second.
-                       // Runs on all machines both master and slave.
                {
                        static clockt last_run = 0;
                        if (last_run != TIME)
@@ -3087,20 +3226,42 @@ static void mainloop(void)
                        }
                }
 
-               /* Handle timeouts. Make sure that this gets run anyway, even if there was
-                * something to read, else under load this will never actually run....
-                *
-                */
-               if (config->cluster_iam_master && next_clean <= time_now)
+                       // Handle timeouts, retries etc.
                {
-                       if (regular_cleanups())
+                       static double last_clean = 0;
+                       double this_clean;
+                       double diff;
+
+                       TIME = now(&this_clean);
+                       diff = this_clean - last_clean;
+
+                       // Run during idle time (after we've handled
+                       // all incoming packets) or every 1/10th sec
+                       if (!more || diff > 0.1)
                        {
-                               // Did it finish?
-                               next_clean = time_now + 1 ;     // Didn't finish. Check quickly.
+                               regular_cleanups(diff);
+                               last_clean = this_clean;
                        }
-                       else
+               }
+
+               if (*config->accounting_dir)
+               {
+                       static clockt next_acct = 0;
+                       static clockt next_shut_acct = 0;
+
+                       if (next_acct <= TIME)
+                       {
+                               // Dump accounting data
+                               next_acct = TIME + ACCT_TIME;
+                               next_shut_acct = TIME + ACCT_SHUT_TIME;
+                               dump_acct_info(1);
+                       }
+                       else if (next_shut_acct <= TIME)
                        {
-                               next_clean = time_now + config->cleanup_interval; // Did. Move to next interval.
+                               // Dump accounting data for shutdown sessions
+                               next_shut_acct = TIME + ACCT_SHUT_TIME;
+                               if (shut_acct_n)
+                                       dump_acct_info(0);
                        }
                }
        }
@@ -3115,6 +3276,7 @@ static void mainloop(void)
 
        //
        // Important!!! We MUST not process any packets past this point!
+       LOG(1, 0, 0, "Shutdown complete\n");
 }
 
 static void stripdomain(char *host)
@@ -3199,6 +3361,7 @@ static void initdata(int optdebug, char *optconfig)
        config->debug = optdebug;
        config->num_tbfs = MAXTBFS;
        config->rl_rate = 28; // 28kbps
+       config->cluster_master_min_adv = 1;
        strcpy(config->random_device, RANDOMDEVICE);
 
        log_stream = stderr;
@@ -3588,7 +3751,7 @@ void snoop_send_packet(char *packet, uint16_t size, in_addr_t destination, uint1
 
 static int dump_session(FILE **f, sessiont *s)
 {
-       if (!s->opened || !s->ip || !(s->cin || s->cout) || !*s->user || s->walled_garden)
+       if (!s->opened || !s->ip || !(s->cin_delta || s->cout_delta) || !*s->user || s->walled_garden)
                return 1;
 
        if (!*f)
@@ -3609,10 +3772,12 @@ static int dump_session(FILE **f, sessiont *s)
                LOG(3, 0, 0, "Dumping accounting information to %s\n", filename);
                fprintf(*f, "# dslwatch.pl dump file V1.01\n"
                        "# host: %s\n"
+                       "# endpoint: %s\n"
                        "# time: %ld\n"
                        "# uptime: %ld\n"
                        "# format: username ip qos uptxoctets downrxoctets\n",
                        hostname,
+                       fmtaddr(config->bind_address ? config->bind_address : my_address, 0),
                        now,
                        now - basetime);
        }
@@ -3622,11 +3787,10 @@ static int dump_session(FILE **f, sessiont *s)
                s->user,                                                // username
                fmtaddr(htonl(s->ip), 0),                               // ip
                (s->throttle_in || s->throttle_out) ? 2 : 1,            // qos
-               (uint32_t) s->cin,                                      // uptxoctets
-               (uint32_t) s->cout);                                    // downrxoctets
+               (uint32_t) s->cin_delta,                                // uptxoctets
+               (uint32_t) s->cout_delta);                              // downrxoctets
 
-       s->pin = s->cin = 0;
-       s->pout = s->cout = 0;
+       s->cin_delta = s->cout_delta = 0;
 
        return 1;
 }
@@ -3754,19 +3918,6 @@ int main(int argc, char *argv[])
        if (cluster_init() < 0)
                exit(1);
 
-#ifdef BGP
-       signal(SIGPIPE, SIG_IGN);
-       bgp_setup(config->as_number);
-       bgp_add_route(config->bind_address, 0xffffffff);
-       for (i = 0; i < BGP_NUM_PEERS; i++)
-       {
-               if (config->neighbour[i].name[0])
-                       bgp_start(&bgp_peers[i], config->neighbour[i].name,
-                               config->neighbour[i].as, config->neighbour[i].keepalive,
-                               config->neighbour[i].hold, 0); /* 0 = routing disabled */
-       }
-#endif /* BGP */
-
        inittun();
        LOG(1, 0, 0, "Set up on interface %s\n", config->tundevice);
 
@@ -3774,11 +3925,18 @@ int main(int argc, char *argv[])
        initrad();
        initippool();
 
-       signal(SIGHUP, sighup_handler);
-       signal(SIGTERM, sigterm_handler);
-       signal(SIGINT, sigterm_handler);
-       signal(SIGQUIT, sigquit_handler);
+       // seed prng
+       {
+               unsigned seed = time_now ^ getpid();
+               LOG(4, 0, 0, "Seeding the pseudo random generator: %u\n", seed);
+               srand(seed);
+       }
+
+       signal(SIGHUP,  sighup_handler);
        signal(SIGCHLD, sigchild_handler);
+       signal(SIGTERM, shutdown_handler);
+       signal(SIGINT,  shutdown_handler);
+       signal(SIGQUIT, shutdown_handler);
 
        // Prevent us from getting paged out
        if (config->lock_pages)
@@ -3797,14 +3955,6 @@ int main(int argc, char *argv[])
 
        mainloop();
 
-#ifdef BGP
-       /* try to shut BGP down cleanly; with luck the sockets will be
-          writable since we're out of the select */
-       for (i = 0; i < BGP_NUM_PEERS; i++)
-               if (bgp_peers[i].state == Established)
-                       bgp_stop(&bgp_peers[i]);
-#endif /* BGP */
-
        /* remove plugins (so cleanup code gets run) */
        plugins_done();
 
@@ -3864,33 +4014,10 @@ static void sigalrm_handler(int sig)
 
 }
 
-static void sigterm_handler(int sig)
+static void shutdown_handler(int sig) // shared handler; main() installs it for SIGTERM, SIGINT and SIGQUIT
 {
-       LOG(1, 0, 0, "Shutting down cleanly\n");
-       main_quit++;
-}
-
-static void sigquit_handler(int sig)
-{
-       int i;
-
-       LOG(1, 0, 0, "Shutting down without saving sessions\n");
-
-       if (config->cluster_iam_master)
-       {
-               for (i = 1; i < MAXSESSION; i++)
-               {
-                       if (session[i].opened)
-                               sessionkill(i, "L2TPNS Closing");
-               }
-               for (i = 1; i < MAXTUNNEL; i++)
-               {
-                       if (tunnel[i].ip || tunnel[i].state)
-                               tunnelshutdown(i, "L2TPNS Closing", 6, 0, 0);
-               }
-       }
-
-       main_quit++;
+       LOG(1, 0, 0, "Shutting down\n"); // NOTE(review): runs in signal context; confirm LOG() is async-signal-safe
+       main_quit = (sig == SIGQUIT) ? QUIT_SHUTDOWN : QUIT_FAILOVER; // only records the requested mode; mainloop() tests main_quit in its loop condition
 }
 
 static void sigchild_handler(int sig)
@@ -4005,8 +4132,6 @@ static void update_config()
        if (!config->numradiusservers)
                LOG(0, 0, 0, "No RADIUS servers defined!\n");
 
-       config->num_radfds = 2 << RADIUS_SHIFT;
-
        // parse radius_authtypes_s
        config->radius_authtypes = config->radius_authprefer = 0;
        p = config->radius_authtypes_s;
@@ -4081,7 +4206,6 @@ static void update_config()
        }
 
        memcpy(config->old_plugins, config->plugins, sizeof(config->plugins));
-       if (!config->cleanup_interval) config->cleanup_interval = 10;
        if (!config->multi_read_count) config->multi_read_count = 10;
        if (!config->cluster_address) config->cluster_address = inet_addr(DEFAULT_MCAST_ADDR);
        if (!*config->cluster_interface)
@@ -4168,7 +4292,7 @@ int sessionsetup(tunnelidt t, sessionidt s)
                if (!session[s].ip)
                {
                        LOG(0, s, t, "   No IP allocated.  The IP address pool is FULL!\n");
-                       sessionshutdown(s, "No IP addresses available.", 2, 7);
+                       sessionshutdown(s, "No IP addresses available.", 2, 7); // try another
                        return 0;
                }
                LOG(3, s, t, "   No IP allocated.  Assigned %s from pool\n",
@@ -4188,8 +4312,16 @@ int sessionsetup(tunnelidt t, sessionidt s)
                for (i = 1; i <= config->cluster_highest_sessionid; i++)
                {
                        if (i == s) continue;
-                       if (ip == session[i].ip) sessionkill(i, "Duplicate IP address");
-                       if (!session[s].walled_garden && !session[i].walled_garden && strcasecmp(user, session[i].user) == 0)
+                       if (!session[s].opened) continue;
+                       if (ip == session[i].ip)
+                       {
+                               sessionkill(i, "Duplicate IP address");
+                               continue;
+                       }
+
+                       if (config->allow_duplicate_users) continue;
+                       if (session[s].walled_garden || session[i].walled_garden) continue;
+                       if (!strcasecmp(user, session[i].user))
                                sessionkill(i, "Duplicate session for users");
                }
        }
@@ -4757,6 +4889,9 @@ static tunnelidt new_tunnel()
 void become_master(void)
 {
        int s, i;
+       static struct event_data d[RADIUS_FDS];
+       struct epoll_event e;
+
        run_plugins(PLUGIN_BECOME_MASTER, NULL);
 
        // running a bunch of iptables commands is slow and can cause
@@ -4775,11 +4910,14 @@ void become_master(void)
        }
 
        // add radius fds
-       for (i = 0; i < config->num_radfds; i++)
+       e.events = EPOLLIN;
+       for (i = 0; i < RADIUS_FDS; i++)
        {
-               FD_SET(radfds[i], &readset);
-               if (radfds[i] > readset_n)
-                       readset_n = radfds[i];
+               d[i].type = FD_TYPE_RADIUS;
+               d[i].index = i;
+               e.data.ptr = &d[i];
+
+               epoll_ctl(epollfd, EPOLL_CTL_ADD, radfds[i], &e);
        }
 }
 
@@ -4873,11 +5011,11 @@ static void unhide_value(uint8_t *value, size_t len, uint16_t type, uint8_t *vec
        uint8_t digest[16];
        uint8_t *last;
        size_t d = 0;
+       uint16_t m = htons(type);
 
        // Compute initial pad
        MD5Init(&ctx);
-       MD5Update(&ctx, (uint8_t) (type >> 8) & 0xff, 1);
-       MD5Update(&ctx, (uint8_t)  type       & 0xff, 1);
+       MD5Update(&ctx, (unsigned char *) &m, 2);
        MD5Update(&ctx, config->l2tpsecret, strlen(config->l2tpsecret));
        MD5Update(&ctx, vector, vec_len);
        MD5Final(digest, &ctx);