set a default limit
[l2tpns.git] / l2tpns.c
index 6317c71..733ca6d 100644 (file)
--- a/l2tpns.c
+++ b/l2tpns.c
@@ -4,7 +4,7 @@
 // Copyright (c) 2002 FireBrick (Andrews & Arnold Ltd / Watchfront Ltd) - GPL licenced
 // vim: sw=8 ts=8
 
-char const *cvs_id_l2tpns = "$Id: l2tpns.c,v 1.100 2005/05/09 20:48:00 bodea Exp $";
+char const *cvs_id_l2tpns = "$Id: l2tpns.c,v 1.108 2005/06/04 15:42:35 bodea Exp $";
 
 #include <arpa/inet.h>
 #include <assert.h>
@@ -52,7 +52,7 @@ char const *cvs_id_l2tpns = "$Id: l2tpns.c,v 1.100 2005/05/09 20:48:00 bodea Exp
 
 #ifdef BGP
 #include "bgp.h"
-#endif /* BGP */
+#endif
 
 // Globals
 configt *config = NULL;                // all configuration
@@ -64,13 +64,14 @@ int snoopfd = -1;           // UDP file handle for sending out intercept data
 int *radfds = NULL;            // RADIUS requests file handles
 int ifrfd = -1;                        // File descriptor for routing, etc
 int ifr6fd = -1;               // File descriptor for IPv6 routing, etc
-static int rand_fd = -1;       // Random data source
+int rand_fd = -1;              // Random data source
+int cluster_sockfd = -1;       // Intra-cluster communications socket.
+int epollfd = -1;              // event polling
 time_t basetime = 0;           // base clock
 char hostname[1000] = "";      // us.
 static int tunidx;             // ifr_ifindex of tun device
 static int syslog_log = 0;     // are we logging to syslog
 static FILE *log_stream = 0;   // file handle for direct logging (i.e. direct into file, not via syslog).
-extern int cluster_sockfd;     // Intra-cluster communications socket.
 uint32_t last_id = 0;          // Unique ID for radius accounting
 
 struct cli_session_actions *cli_session_actions = NULL;        // Pending session changes requested by CLI
@@ -90,7 +91,7 @@ uint32_t eth_tx = 0;
 static uint32_t ip_pool_size = 1;      // Size of the pool of addresses used for dynamic address allocation.
 time_t time_now = 0;                   // Current time in seconds since epoch.
 static char time_now_string[64] = {0}; // Current time as a string.
-static char main_quit = 0;             // True if we're in the process of exiting.
+char main_quit = 0;                    // True if we're in the process of exiting.
 linked_list *loaded_plugins;
 linked_list *plugins[MAX_PLUGIN_TYPES];
 
@@ -113,6 +114,7 @@ config_descriptt config_values[] = {
        CONFIG("radius_interim", radius_interim, INT),
        CONFIG("radius_secret", radiussecret, STRING),
        CONFIG("radius_authtypes", radius_authtypes_s, STRING),
+       CONFIG("allow_duplicate_users", allow_duplicate_users, BOOL),
        CONFIG("bind_address", bind_address, IPv4),
        CONFIG("peer_address", peer_address, IPv4),
        CONFIG("send_garp", send_garp, BOOL),
@@ -121,7 +123,6 @@ config_descriptt config_values[] = {
        CONFIG("accounting_dir", accounting_dir, STRING),
        CONFIG("setuid", target_uid, INT),
        CONFIG("dump_speed", dump_speed, BOOL),
-       CONFIG("cleanup_interval", cleanup_interval, INT),
        CONFIG("multi_read_count", multi_read_count, INT),
        CONFIG("scheduler_fifo", scheduler_fifo, BOOL),
        CONFIG("lock_pages", lock_pages, BOOL),
@@ -131,6 +132,7 @@ config_descriptt config_values[] = {
        CONFIG("cluster_interface", cluster_interface, STRING),
        CONFIG("cluster_hb_interval", cluster_hb_interval, INT),
        CONFIG("cluster_hb_timeout", cluster_hb_timeout, INT),
+       CONFIG("cluster_master_min_adv", cluster_master_min_adv, INT),
        CONFIG("ipv6_prefix", ipv6_prefix, IPv6),
        { NULL, 0, 0, 0 },
 };
@@ -189,11 +191,15 @@ static void processcontrol(uint8_t *buf, int len, struct sockaddr_in *addr, int
 static tunnelidt new_tunnel(void);
 static void unhide_value(uint8_t *value, size_t len, uint16_t type, uint8_t *vector, size_t vec_len);
 
-// return internal time (10ths since process startup)
-static clockt now(void)
+// on slaves, allow BGP to withdraw cleanly before exiting
+#define QUIT_DELAY     5
+
+// return internal time in 10ths of a second relative to basetime (plus 1); if f is non-NULL, also store the gettimeofday() time in *f as seconds with a fractional part
+static clockt now(double *f)
 {
        struct timeval t;
        gettimeofday(&t, 0);
+       if (f) *f = t.tv_sec + t.tv_usec / 1000000.0;
        return (t.tv_sec - basetime) * 10 + t.tv_usec / 100000 + 1;
 }
 
@@ -203,7 +209,7 @@ static clockt now(void)
 clockt backoff(uint8_t try)
 {
        if (try > 5) try = 5;                  // max backoff
-       return now() + 10 * (1 << try);
+       return now(NULL) + 10 * (1 << try);    // 10 * 2^try tenths of a second (2^try seconds) from now
 }
 
 
@@ -297,6 +303,16 @@ void _log_hex(int level, const char *title, const char *data, int maxsize)
        }
 }
 
+// add delta to *counter (32-bit, wraps modulo 2^32), counting each wrap in *wrap
+void increment_counter(uint32_t *counter, uint32_t *wrap, uint32_t delta)
+{
+       const uint32_t sum = *counter + delta;
+
+       if (sum < *counter)     // unsigned addition wrapped past 2^32
+               *wrap += 1;
+       *counter = sum;
+}
+
 // initialise the random generator
 static void initrandom(char *source)
 {
@@ -997,7 +1013,8 @@ static void processipout(uint8_t * buf, int len)
                if (rate++ < config->icmp_rate) // Only send a max of icmp_rate per second.
                {
                        LOG(4, 0, 0, "IP: Sending ICMP host unreachable to %s\n", fmtaddr(*(in_addr_t *)(buf + 12), 0));
-                       host_unreachable(*(in_addr_t *)(buf + 12), *(uint16_t *)(buf + 4), ip, buf, (len < 64) ? 64 : len);
+                       host_unreachable(*(in_addr_t *)(buf + 12), *(uint16_t *)(buf + 4),
+                               config->bind_address ? config->bind_address : my_address, buf, len);
                }
                return;
        }
@@ -1076,11 +1093,13 @@ static void processipout(uint8_t * buf, int len)
        if (sp->snoop_ip && sp->snoop_port)
                snoop_send_packet(buf, len, sp->snoop_ip, sp->snoop_port);
 
-       sp->cout += len; // byte count
-       sp->total_cout += len; // byte count
+       increment_counter(&sp->cout, &sp->cout_wrap, len); // byte count
+       sp->cout_delta += len;
        sp->pout++;
        udp_tx += len;
+
        sess_local[s].cout += len;      // To send to master..
+       sess_local[s].pout++;
 }
 
 // process outgoing (to tunnel) IPv6
@@ -1185,11 +1204,13 @@ static void processipv6out(uint8_t * buf, int len)
        if (sp->snoop_ip && sp->snoop_port)
                snoop_send_packet(buf, len, sp->snoop_ip, sp->snoop_port);
 
-       sp->cout += len; // byte count
-       sp->total_cout += len; // byte count
+       increment_counter(&sp->cout, &sp->cout_wrap, len); // byte count
+       sp->cout_delta += len;
        sp->pout++;
        udp_tx += len;
+
        sess_local[s].cout += len;      // To send to master..
+       sess_local[s].pout++;
 }
 
 //
@@ -1235,11 +1256,13 @@ static void send_ipout(sessionidt s, uint8_t *buf, int len)
        if (sp->snoop_ip && sp->snoop_port)
                snoop_send_packet(buf, len, sp->snoop_ip, sp->snoop_port);
 
-       sp->cout += len; // byte count
-       sp->total_cout += len; // byte count
+       increment_counter(&sp->cout, &sp->cout_wrap, len); // byte count
+       sp->cout_delta += len;
        sp->pout++;
        udp_tx += len;
+
        sess_local[s].cout += len;      // To send to master..
+       sess_local[s].pout++;
 }
 
 // add an AVP (16 bit)
@@ -1323,11 +1346,11 @@ static void controlnull(tunnelidt t)
 }
 
 // add a control message to a tunnel, and send if within window
-static void controladd(controlt * c, tunnelidt t, sessionidt s)
+static void controladd(controlt * c, tunnelidt t, sessionidt far)
 {
        *(uint16_t *) (c->buf + 2) = htons(c->length); // length
        *(uint16_t *) (c->buf + 4) = htons(tunnel[t].far); // tunnel
-       *(uint16_t *) (c->buf + 6) = htons(s ? session[s].far : 0); // session
+       *(uint16_t *) (c->buf + 6) = htons(far); // session
        *(uint16_t *) (c->buf + 8) = htons(tunnel[t].ns); // sequence
        tunnel[t].ns++;              // advance sequence
        // link in message in to queue
@@ -1518,7 +1541,7 @@ void sessionshutdown(sessionidt s, char *reason, int result, int error)
                        control16(c, 1, result, 1);
 
                control16(c, 14, s, 1);   // assigned session (our end)
-               controladd(c, session[s].tunnel, s); // send the message
+               controladd(c, session[s].tunnel, session[s].far); // send the message
        }
 
        if (!session[s].die)
@@ -1566,7 +1589,7 @@ void sendipcp(tunnelidt t, sessionidt s)
        if (!q) return;
 
        *q = ConfigReq;
-       q[1] = r << RADIUS_SHIFT;                    // ID, dont care, we only send one type of request
+       q[1] = r >> RADIUS_SHIFT;                    // ID, dont care, we only send one type of request
        *(uint16_t *) (q + 2) = htons(10);
        q[4] = 3;
        q[5] = 6;
@@ -1588,7 +1611,7 @@ void sendipcp(tunnelidt t, sessionidt s)
                if (!q) return;
 
                *q = ConfigReq;
-               q[1] = r << RADIUS_SHIFT;               // ID, don't care, we
+               q[1] = r >> RADIUS_SHIFT;               // ID, don't care, we
                                                        // only send one type
                                                        // of request
                *(uint16_t *) (q + 2) = htons(14);
@@ -2277,7 +2300,7 @@ void processudp(uint8_t * buf, int len, struct sockaddr_in *addr)
                                                controls(c, 7, tunnel[t].hostname, 1); // host name (TBA)
                                                if (chapresponse) controlb(c, 13, chapresponse, 16, 1); // Challenge response
                                                control16(c, 9, t, 1); // assigned tunnel
-                                               controladd(c, t, s); // send the resply
+                                               controladd(c, t, 0); // send the reply
                                        }
                                        tunnel[t].state = TUNNELOPENING;
                                        break;
@@ -2305,16 +2328,9 @@ void processudp(uint8_t * buf, int len, struct sockaddr_in *addr)
                                        // TBA
                                        break;
                                case 10:      // ICRQ
-                                       if (!sessionfree)
-                                       {
-                                               STAT(session_overflow);
-                                               LOG(1, 0, t, "No free sessions\n");
-                                               return;
-                                       }
-                                       else
+                                       if (sessionfree)
                                        {
                                                uint16_t r;
-                                               controlt *c;
 
                                                s = sessionfree;
                                                sessionfree = session[s].next;
@@ -2324,28 +2340,40 @@ void processudp(uint8_t * buf, int len, struct sockaddr_in *addr)
                                                        config->cluster_highest_sessionid = s;
 
                                                // make a RADIUS session
-                                               if (!(r = radiusnew(s)))
+                                               if ((r = radiusnew(s)))
                                                {
-                                                       LOG(1, s, t, "No free RADIUS sessions for ICRQ\n");
-                                                       sessionclear(s);
-                                                       return;
+                                                       controlt *c = controlnew(11); // sending ICRP
+                                                       session[s].opened = time_now;
+                                                       session[s].tunnel = t;
+                                                       session[s].far = asession;
+                                                       session[s].last_packet = time_now;
+                                                       LOG(3, s, t, "New session (%d/%d)\n", tunnel[t].far, session[s].far);
+                                                       control16(c, 14, s, 1); // assigned session
+                                                       controladd(c, t, asession); // send the reply
+
+                                                       strncpy(radius[r].calling, calling, sizeof(radius[r].calling) - 1);
+                                                       strncpy(session[s].called, called, sizeof(session[s].called) - 1);
+                                                       strncpy(session[s].calling, calling, sizeof(session[s].calling) - 1);
+                                                       STAT(session_created);
+                                                       break;
                                                }
 
-                                               c = controlnew(11); // sending ICRP
-                                               session[s].opened = time_now;
-                                               session[s].tunnel = t;
-                                               session[s].far = asession;
-                                               session[s].last_packet = time_now;
-                                               LOG(3, s, t, "New session (%d/%d)\n", tunnel[t].far, session[s].far);
-                                               control16(c, 14, s, 1); // assigned session
-                                               controladd(c, t, s); // send the reply
-
-                                               strncpy(radius[r].calling, calling, sizeof(radius[r].calling) - 1);
-                                               strncpy(session[s].called, called, sizeof(session[s].called) - 1);
-                                               strncpy(session[s].calling, calling, sizeof(session[s].calling) - 1);
-                                               STAT(session_created);
+
+                                               LOG(1, s, t, "No free RADIUS sessions for ICRQ\n");
+                                               sessionclear(s);
                                        }
-                                       break;
+                                       else
+                                       {
+                                               STAT(session_overflow);
+                                               LOG(1, 0, t, "No free sessions\n");
+                                       }
+
+                                       {
+                                               controlt *c = controlnew(14); // CDN: refuse the incoming call
+                                               control16(c, 1, 4, 1); // temporary lack of resources
+                                               controladd(c, t, asession); // queue on tunnel t: session[s].tunnel is only assigned on the success path above
+                                       }
+                                       return;
                                case 11:      // ICRP
                                        // TBA
                                        break;
@@ -2356,8 +2384,11 @@ void processudp(uint8_t * buf, int len, struct sockaddr_in *addr)
                                        LOG(3, s, t, "Magic %X Flags %X\n", amagic, aflags);
                                        controlnull(t); // ack
                                        // proxy authentication type is not supported
-                                       if (authtype && !(config->radius_authtypes & authtype))
-                                               sendlcp(t, s, config->radius_authprefer);
+                                       if (!(config->radius_authtypes & authtype))
+                                               authtype = config->radius_authprefer;
+
+                                       // start LCP
+                                       sendlcp(t, s, authtype);
                                        break;
                                case 14:      // CDN
                                        controlnull(t); // ack
@@ -2542,42 +2573,61 @@ static void processtun(uint8_t * buf, int len)
        // Else discard.
 }
 
-//
-// Maximum number of actions to complete.
-// This is to avoid sending out too many packets
-// at once.
-#define MAX_ACTIONS 500
-
-static int regular_cleanups(void)
+// Handle retries, timeouts.  Runs every 1/10th sec, want to ensure
+// that we look at the whole of the tunnel, radius and session tables
+// every second
+static void regular_cleanups(double period)
 {
-       static sessionidt s = 0;        // Next session to check for actions on.
-       tunnelidt t;
-       int count=0,i;
-       uint16_t r;
-       static clockt next_acct = 0;
-       static clockt next_shut_acct = 0;
+       // Next tunnel, radius and session to check for actions on.
+       static tunnelidt t = 0;
+       static int r = 0;
+       static sessionidt s = 0;
+
+       int t_actions = 0;
+       int r_actions = 0;
+       int s_actions = 0;
+
+       int t_slice;
+       int r_slice;
+       int s_slice;
+
+       int i;
        int a;
 
-       LOG(3, 0, 0, "Begin regular cleanup\n");
+       // divide up tables into slices based on the last run
+       t_slice = config->cluster_highest_tunnelid  * period;
+       r_slice = (MAXRADIUS - 1)                   * period;
+       s_slice = config->cluster_highest_sessionid * period;
 
-       for (r = 1; r < MAXRADIUS; r++)
-       {
-               if (!radius[r].state)
-                       continue;
-               if (radius[r].retry)
-               {
-                       if (radius[r].retry <= TIME)
-                               radiusretry(r);
-               } else
-                       radius[r].retry = backoff(radius[r].try+1);     // Is this really needed? --mo
-       }
-       for (t = 1; t <= config->cluster_highest_tunnelid; t++)
+       if (t_slice < 1)
+           t_slice = 1;
+       else if (t_slice > config->cluster_highest_tunnelid)
+           t_slice = config->cluster_highest_tunnelid;
+
+       if (r_slice < 1)
+           r_slice = 1;
+       else if (r_slice > (MAXRADIUS - 1))
+           r_slice = MAXRADIUS - 1;
+
+       if (s_slice < 1)
+           s_slice = 1;
+       else if (s_slice > config->cluster_highest_sessionid)
+           s_slice = config->cluster_highest_sessionid;
+
+       LOG(4, 0, 0, "Begin regular cleanup (last %f seconds ago)\n", period);
+
+       for (i = 0; i < t_slice; i++)
        {
+               t++;
+               if (t > config->cluster_highest_tunnelid)
+                       t = 1;
+
                // check for expired tunnels
                if (tunnel[t].die && tunnel[t].die <= TIME)
                {
                        STAT(tunnel_timeout);
                        tunnelkill(t, "Expired");
+                       t_actions++;
                        continue;
                }
                // check for message resend
@@ -2597,6 +2647,8 @@ static int regular_cleanups(void)
                                                tunnelsend(c->buf, c->length, t);
                                                c = c->next;
                                        }
+
+                               t_actions++;
                        }
                }
                // Send hello
@@ -2605,6 +2657,7 @@ static int regular_cleanups(void)
                        controlt *c = controlnew(6); // sending HELLO
                        controladd(c, t, 0); // send the message
                        LOG(3, 0, t, "Sending HELLO message\n");
+                       t_actions++;
                }
 
                // Check for tunnel changes requested from the CLI
@@ -2615,13 +2668,28 @@ static int regular_cleanups(void)
                        {
                                LOG(2, 0, t, "Dropping tunnel by CLI\n");
                                tunnelshutdown(t, "Requested by administrator", 1, 0, 0);
+                               t_actions++;
                        }
                }
+       }
+
+       for (i = 0; i < r_slice; i++)
+       {
+               r++;
+               if (r >= MAXRADIUS)
+                       r = 1;
 
+               if (!radius[r].state)
+                       continue;
+
+               if (radius[r].retry <= TIME)
+               {
+                       radiusretry(r);
+                       r_actions++;
+               }
        }
 
-       count = 0;
-       for (i = 1; i <= config->cluster_highest_sessionid; i++)
+       for (i = 0; i < s_slice; i++)
        {
                s++;
                if (s > config->cluster_highest_sessionid)
@@ -2636,7 +2704,7 @@ static int regular_cleanups(void)
                        if (session[s].die <= TIME)
                        {
                                sessionkill(s, "Expired");
-                               if (++count >= MAX_ACTIONS) break;
+                               s_actions++;
                        }
                        continue;
                }
@@ -2646,6 +2714,7 @@ static int regular_cleanups(void)
                        // IPCP has not completed yet. Resend
                        LOG(3, s, session[s].tunnel, "No ACK for initial IPCP ConfigReq... resending\n");
                        sendipcp(session[s].tunnel, s);
+                       s_actions++;
                }
 
                // Drop sessions who have not responded within IDLE_TIMEOUT seconds
@@ -2653,7 +2722,7 @@ static int regular_cleanups(void)
                {
                        sessionshutdown(s, "No response to LCP ECHO requests.", 3, 0);
                        STAT(session_timeout);
-                       if (++count >= MAX_ACTIONS) break;
+                       s_actions++;
                        continue;
                }
 
@@ -2673,7 +2742,7 @@ static int regular_cleanups(void)
                        LOG(4, s, session[s].tunnel, "No data in %d seconds, sending LCP ECHO\n",
                                        (int)(time_now - session[s].last_packet));
                        tunnelsend(b, 24, session[s].tunnel); // send it
-                       if (++count >= MAX_ACTIONS) break;
+                       s_actions++;
                }
 
                // Check for actions requested from the CLI
@@ -2687,6 +2756,7 @@ static int regular_cleanups(void)
                                LOG(2, s, session[s].tunnel, "Dropping session by CLI\n");
                                sessionshutdown(s, "Requested by administrator.", 3, 0);
                                a = 0; // dead, no need to check for other actions
+                               s_actions++;
                        }
 
                        if (a & CLI_SESS_NOSNOOP)
@@ -2694,6 +2764,7 @@ static int regular_cleanups(void)
                                LOG(2, s, session[s].tunnel, "Unsnooping session by CLI\n");
                                session[s].snoop_ip = 0;
                                session[s].snoop_port = 0;
+                               s_actions++;
                                send++;
                        }
                        else if (a & CLI_SESS_SNOOP)
@@ -2704,6 +2775,7 @@ static int regular_cleanups(void)
 
                                session[s].snoop_ip = cli_session_actions[s].snoop_ip;
                                session[s].snoop_port = cli_session_actions[s].snoop_port;
+                               s_actions++;
                                send++;
                        }
 
@@ -2711,6 +2783,7 @@ static int regular_cleanups(void)
                        {
                                LOG(2, s, session[s].tunnel, "Un-throttling session by CLI\n");
                                throttle_session(s, 0, 0);
+                               s_actions++;
                                send++;
                        }
                        else if (a & CLI_SESS_THROTTLE)
@@ -2720,6 +2793,7 @@ static int regular_cleanups(void)
                                    cli_session_actions[s].throttle_out);
 
                                throttle_session(s, cli_session_actions[s].throttle_in, cli_session_actions[s].throttle_out);
+                               s_actions++;
                                send++;
                        }
 
@@ -2727,6 +2801,7 @@ static int regular_cleanups(void)
                        {
                                LOG(2, s, session[s].tunnel, "Un-filtering session by CLI\n");
                                filter_session(s, 0, 0);
+                               s_actions++;
                                send++;
                        }
                        else if (a & CLI_SESS_FILTER)
@@ -2736,13 +2811,12 @@ static int regular_cleanups(void)
                                    cli_session_actions[s].filter_out);
 
                                filter_session(s, cli_session_actions[s].filter_in, cli_session_actions[s].filter_out);
+                               s_actions++;
                                send++;
                        }
 
                        if (send)
                                cluster_send_session(s);
-
-                       if (++count >= MAX_ACTIONS) break;
                }
 
                // RADIUS interim accounting
@@ -2763,38 +2837,14 @@ static int regular_cleanups(void)
 
                        radiussend(r, RADIUSINTERIM);
                        sess_local[s].last_interim = time_now;
-
-                       if (++count >= MAX_ACTIONS)
-                               break;
-               }
-       }
-
-       if (*config->accounting_dir)
-       {
-               if (next_acct <= TIME)
-               {
-                       // Dump accounting data
-                       next_acct = TIME + ACCT_TIME;
-                       next_shut_acct = TIME + ACCT_SHUT_TIME;
-                       dump_acct_info(1);
-               }
-               else if (next_shut_acct <= TIME)
-               {
-                       // Dump accounting data for shutdown sessions
-                       next_shut_acct = TIME + ACCT_SHUT_TIME;
-                       if (shut_acct_n)
-                               dump_acct_info(0);
+                       s_actions++;
                }
        }
 
-       if (count >= MAX_ACTIONS)
-               return 1;       // Didn't finish!
-
-       LOG(3, 0, 0, "End regular cleanup (%d actions), next in %d seconds\n", count, config->cleanup_interval);
-       return 0;
+       LOG(4, 0, 0, "End regular cleanup: checked %d/%d/%d tunnels/radius/sessions; %d/%d/%d actions\n",
+               t_slice, r_slice, s_slice, t_actions, r_actions, s_actions);
 }
 
-
 //
 // Are we in the middle of a tunnel update, or radius
 // requests??
@@ -2804,6 +2854,37 @@ static int still_busy(void)
        int i;
        static clockt last_talked = 0;
        static clockt start_busy_wait = 0;
+
+       if (!config->cluster_iam_master)
+       {
+#ifdef BGP
+               static time_t stopped_bgp = 0;
+               if (bgp_configured)
+               {
+                       if (!stopped_bgp)
+                       {
+                               LOG(1, 0, 0, "Shutting down in %d seconds, stopping BGP...\n", QUIT_DELAY);
+
+                               for (i = 0; i < BGP_NUM_PEERS; i++)
+                                       if (bgp_peers[i].state == Established)
+                                               bgp_stop(&bgp_peers[i]);
+
+                               stopped_bgp = time_now;
+
+                               // we don't want to become master
+                               cluster_send_ping(0);
+
+                               return 1;
+                       }
+
+                       if (time_now < (stopped_bgp + QUIT_DELAY))
+                               return 1;
+               }
+#endif /* BGP */
+
+               return 0;
+       }
+
        if (start_busy_wait == 0)
                start_busy_wait = TIME;
 
@@ -2845,80 +2926,104 @@ static int still_busy(void)
        return 0;
 }
 
-static fd_set readset;
-static int readset_n = 0;
+#ifdef HAVE_EPOLL
+# include <sys/epoll.h>
+#else
+# define FAKE_EPOLL_IMPLEMENTATION /* include the functions */
+# include "fake_epoll.h"
+#endif
+
+// the base set of fds polled: control, cli, udp, tun, cluster
+#define BASE_FDS       5
+
+// additional polled fds
+#ifdef BGP
+# define EXTRA_FDS     BGP_NUM_PEERS
+#else
+# define EXTRA_FDS     0
+#endif
 
 // main loop - gets packets on tun or udp and processes them
 static void mainloop(void)
 {
        int i;
        uint8_t buf[65536];
-       struct timeval to;
        clockt next_cluster_ping = 0;   // send initial ping immediately
-       time_t next_clean = time_now + config->cleanup_interval;
+       struct epoll_event events[BASE_FDS + RADIUS_FDS + EXTRA_FDS];
+       int maxevent = sizeof(events)/sizeof(*events);
+
+       if ((epollfd = epoll_create(maxevent)) < 0)
+       {
+               LOG(0, 0, 0, "epoll_create failed: %s\n", strerror(errno));
+               exit(1);
+       }
 
        LOG(4, 0, 0, "Beginning of main loop.  udpfd=%d, tunfd=%d, cluster_sockfd=%d, controlfd=%d\n",
                udpfd, tunfd, cluster_sockfd, controlfd);
 
-       FD_ZERO(&readset);
-       FD_SET(udpfd, &readset);
-       FD_SET(tunfd, &readset);
-       FD_SET(controlfd, &readset);
-       FD_SET(clifd, &readset);
-       if (cluster_sockfd) FD_SET(cluster_sockfd, &readset);
-       readset_n = udpfd;
-       if (tunfd > readset_n)          readset_n = tunfd;
-       if (controlfd > readset_n)      readset_n = controlfd;
-       if (clifd > readset_n)          readset_n = clifd;
-       if (cluster_sockfd > readset_n) readset_n = cluster_sockfd;
-
-       while (!main_quit || still_busy())
+       /* setup our fds to poll for input */
        {
-               fd_set r;
-               int n = readset_n;
+               static struct event_data d[BASE_FDS];
+               struct epoll_event e;
+
+               e.events = EPOLLIN;
+               i = 0;
+
+               d[i].type = FD_TYPE_CONTROL;
+               e.data.ptr = &d[i++];
+               epoll_ctl(epollfd, EPOLL_CTL_ADD, controlfd, &e);
+
+               d[i].type = FD_TYPE_CLI;
+               e.data.ptr = &d[i++];
+               epoll_ctl(epollfd, EPOLL_CTL_ADD, clifd, &e);
+
+               d[i].type = FD_TYPE_UDP;
+               e.data.ptr = &d[i++];
+               epoll_ctl(epollfd, EPOLL_CTL_ADD, udpfd, &e);
+
+               d[i].type = FD_TYPE_TUN;
+               e.data.ptr = &d[i++];
+               epoll_ctl(epollfd, EPOLL_CTL_ADD, tunfd, &e);
+
+               d[i].type = FD_TYPE_CLUSTER;
+               e.data.ptr = &d[i++];
+               epoll_ctl(epollfd, EPOLL_CTL_ADD, cluster_sockfd, &e);
+       }
+
 #ifdef BGP
-               fd_set w;
-               int bgp_set[BGP_NUM_PEERS];
+       signal(SIGPIPE, SIG_IGN);
+       bgp_setup(config->as_number);
+       if (config->bind_address)
+               bgp_add_route(config->bind_address, 0xffffffff);
+
+       for (i = 0; i < BGP_NUM_PEERS; i++)
+       {
+               if (config->neighbour[i].name[0])
+                       bgp_start(&bgp_peers[i], config->neighbour[i].name,
+                               config->neighbour[i].as, config->neighbour[i].keepalive,
+                               config->neighbour[i].hold, 0); /* 0 = routing disabled */
+       }
 #endif /* BGP */
 
+       while (!main_quit || still_busy())
+       {
+               int more = 0;
+               int n;
+
                if (config->reload_config)
                {
                        // Update the config state based on config settings
                        update_config();
                }
 
-               memcpy(&r, &readset, sizeof(fd_set));
-               to.tv_sec = 0;
-               to.tv_usec = 100000; // 1/10th of a second.
-
 #ifdef BGP
-               FD_ZERO(&w);
-               for (i = 0; i < BGP_NUM_PEERS; i++)
-               {
-                       bgp_set[i] = bgp_select_state(&bgp_peers[i]);
-                       if (bgp_set[i] & 1)
-                       {
-                               FD_SET(bgp_peers[i].sock, &r);
-                               if (bgp_peers[i].sock > n)
-                                       n = bgp_peers[i].sock;
-                       }
-
-                       if (bgp_set[i] & 2)
-                       {
-                               FD_SET(bgp_peers[i].sock, &w);
-                               if (bgp_peers[i].sock > n)
-                                       n = bgp_peers[i].sock;
-                       }
-               }
-
-               n = select(n + 1, &r, &w, 0, &to);
-#else /* BGP */
-               n = select(n + 1, &r, 0, 0, &to);
+               bgp_set_poll();
 #endif /* BGP */
 
+               n = epoll_wait(epollfd, events, maxevent, 100); // timeout 100ms (1/10th sec)
                STAT(select_called);
 
-               TIME = now();
+               TIME = now(NULL);
                if (n < 0)
                {
                        if (errno == EINTR ||
@@ -2929,67 +3034,83 @@ static void mainloop(void)
                        main_quit++;
                        break;
                }
-               else if (n)
+
+               if (n)
                {
                        struct sockaddr_in addr;
                        int alen, c, s;
+                       int udp_ready = 0;
+                       int tun_ready = 0;
+                       int cluster_ready = 0;
                        int udp_pkts = 0;
                        int tun_pkts = 0;
                        int cluster_pkts = 0;
+#ifdef BGP
+                       uint32_t bgp_events[BGP_NUM_PEERS];
+                       memset(bgp_events, 0, sizeof(bgp_events));
+#endif /* BGP */
 
-                       // nsctl commands
-                       if (FD_ISSET(controlfd, &r))
+                       for (c = n, i = 0; i < c; i++)
                        {
-                               alen = sizeof(addr);
-                               processcontrol(buf, recvfrom(controlfd, buf, sizeof(buf), MSG_WAITALL, (void *) &addr, &alen), &addr, alen);
-                               n--;
-                       }
+                               struct event_data *d = events[i].data.ptr;
+                               switch (d->type)
+                               {
+                               case FD_TYPE_CONTROL: // nsctl commands
+                                       alen = sizeof(addr);
+                                       processcontrol(buf, recvfrom(controlfd, buf, sizeof(buf), MSG_WAITALL, (void *) &addr, &alen), &addr, alen);
+                                       n--;
+                                       break;
 
-                       // RADIUS responses
-                       if (config->cluster_iam_master)
-                       {
-                               for (i = 0; i < config->num_radfds; i++)
+                               case FD_TYPE_CLI: // CLI connections
                                {
-                                       if (FD_ISSET(radfds[i], &r))
+                                       int cli;
+                                       
+                                       alen = sizeof(addr);
+                                       if ((cli = accept(clifd, (struct sockaddr *)&addr, &alen)) >= 0)
                                        {
-                                               processrad(buf, recv(radfds[i], buf, sizeof(buf), 0), i);
-                                               n--;
+                                               cli_do(cli);
+                                               close(cli);
                                        }
-                               }
-                       }
+                                       else
+                                               LOG(0, 0, 0, "accept error: %s\n", strerror(errno));
 
-                       // CLI connections
-                       if (FD_ISSET(clifd, &r))
-                       {
-                               int cli;
-                               
-                               alen = sizeof(addr);
-                               if ((cli = accept(clifd, (struct sockaddr *)&addr, &alen)) >= 0)
-                               {
-                                       cli_do(cli);
-                                       close(cli);
+                                       n--;
+                                       break;
                                }
-                               else
-                                       LOG(0, 0, 0, "accept error: %s\n", strerror(errno));
 
-                               n--;
-                       }
+                               // these are handled below, with multiple interleaved reads
+                               case FD_TYPE_UDP:       udp_ready++; break;
+                               case FD_TYPE_TUN:       tun_ready++; break;
+                               case FD_TYPE_CLUSTER:   cluster_ready++; break;
+
+                               case FD_TYPE_RADIUS: // RADIUS response
+                                       s = recv(radfds[d->index], buf, sizeof(buf), 0);
+                                       if (s >= 0 && config->cluster_iam_master)
+                                               processrad(buf, s, d->index);
+
+                                       n--;
+                                       break;
 
 #ifdef BGP
-                       for (i = 0; i < BGP_NUM_PEERS; i++)
-                       {
-                               int isr = bgp_set[i] ? FD_ISSET(bgp_peers[i].sock, &r) : 0;
-                               int isw = bgp_set[i] ? FD_ISSET(bgp_peers[i].sock, &w) : 0;
-                               bgp_process(&bgp_peers[i], isr, isw);
-                               if (isr) n--;
-                               if (isw) n--;
+                               case FD_TYPE_BGP:
+                                       bgp_events[d->index] = events[i].events;
+                                       n--;
+                                       break;
+#endif /* BGP */
+
+                               default:
+                                       LOG(0, 0, 0, "Unexpected fd type returned from epoll_wait: %d\n", d->type);
+                               }
                        }
+
+#ifdef BGP
+                       bgp_process(bgp_events);
 #endif /* BGP */
 
                        for (c = 0; n && c < config->multi_read_count; c++)
                        {
                                // L2TP
-                               if (FD_ISSET(udpfd, &r))
+                               if (udp_ready)
                                {
                                        alen = sizeof(addr);
                                        if ((s = recvfrom(udpfd, buf, sizeof(buf), 0, (void *) &addr, &alen)) > 0)
@@ -2999,13 +3120,13 @@ static void mainloop(void)
                                        }
                                        else
                                        {
-                                               FD_CLR(udpfd, &r);
+                                               udp_ready = 0;
                                                n--;
                                        }
                                }
 
                                // incoming IP
-                               if (FD_ISSET(tunfd, &r))
+                               if (tun_ready)
                                {
                                        if ((s = read(tunfd, buf, sizeof(buf))) > 0)
                                        {
@@ -3014,13 +3135,13 @@ static void mainloop(void)
                                        }
                                        else
                                        {
-                                               FD_CLR(tunfd, &r);
+                                               tun_ready = 0;
                                                n--;
                                        }
                                }
 
                                // cluster
-                               if (FD_ISSET(cluster_sockfd, &r))
+                               if (cluster_ready)
                                {
                                        alen = sizeof(addr);
                                        if ((s = recvfrom(cluster_sockfd, buf, sizeof(buf), MSG_WAITALL, (void *) &addr, &alen)) > 0)
@@ -3030,7 +3151,7 @@ static void mainloop(void)
                                        }
                                        else
                                        {
-                                               FD_CLR(cluster_sockfd, &r);
+                                               cluster_ready = 0;
                                                n--;
                                        }
                                }
@@ -3045,11 +3166,12 @@ static void mainloop(void)
                                        config->multi_read_count, udp_pkts, tun_pkts, cluster_pkts);
 
                                STAT(multi_read_exceeded);
+                               more++;
                        }
                }
 
                        // Runs on every machine (master and slaves).
-               if (cluster_sockfd && next_cluster_ping <= TIME)
+               if (next_cluster_ping <= TIME)
                {
                        // Check to see which of the cluster is still alive..
 
@@ -3067,9 +3189,11 @@ static void mainloop(void)
                                next_cluster_ping = TIME + config->cluster_hb_interval;
                }
 
+               if (!config->cluster_iam_master)
+                       continue;
+
                        // Run token bucket filtering queue..
                        // Only run it every 1/10th of a second.
-                       // Runs on all machines both master and slave.
                {
                        static clockt last_run = 0;
                        if (last_run != TIME)
@@ -3079,20 +3203,42 @@ static void mainloop(void)
                        }
                }
 
-               /* Handle timeouts. Make sure that this gets run anyway, even if there was
-                * something to read, else under load this will never actually run....
-                *
-                */
-               if (config->cluster_iam_master && next_clean <= time_now)
+                       // Handle timeouts, retries etc.
                {
-                       if (regular_cleanups())
+                       static double last_clean = 0;
+                       double this_clean;
+                       double diff;
+
+                       TIME = now(&this_clean);
+                       diff = this_clean - last_clean;
+
+                       // Run during idle time (after we've handled
+                       // all incoming packets) or every 1/10th sec
+                       if (!more || diff > 0.1)
                        {
-                               // Did it finish?
-                               next_clean = time_now + 1 ;     // Didn't finish. Check quickly.
+                               regular_cleanups(diff);
+                               last_clean = this_clean;
                        }
-                       else
+               }
+
+               if (*config->accounting_dir)
+               {
+                       static clockt next_acct = 0;
+                       static clockt next_shut_acct = 0;
+
+                       if (next_acct <= TIME)
+                       {
+                               // Dump accounting data
+                               next_acct = TIME + ACCT_TIME;
+                               next_shut_acct = TIME + ACCT_SHUT_TIME;
+                               dump_acct_info(1);
+                       }
+                       else if (next_shut_acct <= TIME)
                        {
-                               next_clean = time_now + config->cleanup_interval; // Did. Move to next interval.
+                               // Dump accounting data for shutdown sessions
+                               next_shut_acct = TIME + ACCT_SHUT_TIME;
+                               if (shut_acct_n)
+                                       dump_acct_info(0);
                        }
                }
        }
@@ -3107,6 +3253,7 @@ static void mainloop(void)
 
        //
        // Important!!! We MUST not process any packets past this point!
+       LOG(1, 0, 0, "Clean shutdown complete\n");
 }
 
 static void stripdomain(char *host)
@@ -3191,6 +3338,7 @@ static void initdata(int optdebug, char *optconfig)
        config->debug = optdebug;
        config->num_tbfs = MAXTBFS;
        config->rl_rate = 28; // 28kbps
+       config->cluster_master_min_adv = 1;
        strcpy(config->random_device, RANDOMDEVICE);
 
        log_stream = stderr;
@@ -3580,7 +3728,7 @@ void snoop_send_packet(char *packet, uint16_t size, in_addr_t destination, uint1
 
 static int dump_session(FILE **f, sessiont *s)
 {
-       if (!s->opened || !s->ip || !(s->cin || s->cout) || !*s->user || s->walled_garden)
+       if (!s->opened || !s->ip || !(s->cin_delta || s->cout_delta) || !*s->user || s->walled_garden) // nothing to dump: not open, no IP, both byte deltas zero, no username, or walled garden
                return 1;
 
        if (!*f)
@@ -3601,10 +3749,12 @@ static int dump_session(FILE **f, sessiont *s)
                LOG(3, 0, 0, "Dumping accounting information to %s\n", filename);
                fprintf(*f, "# dslwatch.pl dump file V1.01\n"
                        "# host: %s\n"
+                       "# endpoint: %s\n"
                        "# time: %ld\n"
                        "# uptime: %ld\n"
                        "# format: username ip qos uptxoctets downrxoctets\n",
                        hostname,
+                       fmtaddr(config->bind_address ? config->bind_address : my_address, 0), // endpoint: bind_address if set, else my_address
                        now,
                        now - basetime);
        }
@@ -3614,11 +3764,10 @@ static int dump_session(FILE **f, sessiont *s)
                s->user,                                                // username
                fmtaddr(htonl(s->ip), 0),                               // ip
                (s->throttle_in || s->throttle_out) ? 2 : 1,            // qos
-               (uint32_t) s->cin,                                      // uptxoctets
-               (uint32_t) s->cout);                                    // downrxoctets
+               (uint32_t) s->cin_delta,                                // uptxoctets
+               (uint32_t) s->cout_delta);                              // downrxoctets
 
-       s->pin = s->cin = 0;
-       s->pout = s->cout = 0;
+       s->cin_delta = s->cout_delta = 0; // clear the deltas just reported so they are not dumped again
 
        return 1;
 }
@@ -3746,19 +3895,6 @@ int main(int argc, char *argv[])
        if (cluster_init() < 0)
                exit(1);
 
-#ifdef BGP
-       signal(SIGPIPE, SIG_IGN);
-       bgp_setup(config->as_number);
-       bgp_add_route(config->bind_address, 0xffffffff);
-       for (i = 0; i < BGP_NUM_PEERS; i++)
-       {
-               if (config->neighbour[i].name[0])
-                       bgp_start(&bgp_peers[i], config->neighbour[i].name,
-                               config->neighbour[i].as, config->neighbour[i].keepalive,
-                               config->neighbour[i].hold, 0); /* 0 = routing disabled */
-       }
-#endif /* BGP */
-
        inittun();
        LOG(1, 0, 0, "Set up on interface %s\n", config->tundevice);
 
@@ -3789,14 +3925,6 @@ int main(int argc, char *argv[])
 
        mainloop();
 
-#ifdef BGP
-       /* try to shut BGP down cleanly; with luck the sockets will be
-          writable since we're out of the select */
-       for (i = 0; i < BGP_NUM_PEERS; i++)
-               if (bgp_peers[i].state == Established)
-                       bgp_stop(&bgp_peers[i]);
-#endif /* BGP */
-
        /* remove plugins (so cleanup code gets run) */
        plugins_done();
 
@@ -3997,8 +4125,6 @@ static void update_config()
        if (!config->numradiusservers)
                LOG(0, 0, 0, "No RADIUS servers defined!\n");
 
-       config->num_radfds = 2 << RADIUS_SHIFT;
-
        // parse radius_authtypes_s
        config->radius_authtypes = config->radius_authprefer = 0;
        p = config->radius_authtypes_s;
@@ -4073,7 +4199,6 @@ static void update_config()
        }
 
        memcpy(config->old_plugins, config->plugins, sizeof(config->plugins));
-       if (!config->cleanup_interval) config->cleanup_interval = 10;
        if (!config->multi_read_count) config->multi_read_count = 10;
        if (!config->cluster_address) config->cluster_address = inet_addr(DEFAULT_MCAST_ADDR);
        if (!*config->cluster_interface)
@@ -4180,8 +4305,16 @@ int sessionsetup(tunnelidt t, sessionidt s)
                for (i = 1; i <= config->cluster_highest_sessionid; i++)
                {
                        if (i == s) continue;
-                       if (ip == session[i].ip) sessionkill(i, "Duplicate IP address");
-                       if (!session[s].walled_garden && !session[i].walled_garden && strcasecmp(user, session[i].user) == 0)
+                       if (!session[s].opened) continue;
+                       if (ip == session[i].ip)
+                       {
+                               sessionkill(i, "Duplicate IP address");
+                               continue;
+                       }
+
+                       if (config->allow_duplicate_users) continue;
+                       if (session[s].walled_garden || session[i].walled_garden) continue;
+                       if (!strcasecmp(user, session[i].user))
                                sessionkill(i, "Duplicate session for users");
                }
        }
@@ -4428,6 +4561,7 @@ static int add_plugin(char *plugin_name)
                radiusnew,
                radiussend,
                getconfig,
+               sessionshutdown,
                sessionkill,
                throttle_session,
                cluster_send_session,
@@ -4748,6 +4882,9 @@ static tunnelidt new_tunnel()
 void become_master(void)
 {
        int s, i;
+       static struct event_data d[RADIUS_FDS]; // static: &d[i] is handed to epoll below and read back later by the event loop
+       struct epoll_event e;
+
        run_plugins(PLUGIN_BECOME_MASTER, NULL);
 
        // running a bunch of iptables commands is slow and can cause
@@ -4766,11 +4903,14 @@ void become_master(void)
        }
 
        // add radius fds
-       for (i = 0; i < config->num_radfds; i++)
+       e.events = EPOLLIN; // watch for readability only
+       for (i = 0; i < RADIUS_FDS; i++)
        {
-               FD_SET(radfds[i], &readset);
-               if (radfds[i] > readset_n)
-                       readset_n = radfds[i];
+               d[i].type = FD_TYPE_RADIUS; // tag used by the event loop's switch to route this fd
+               d[i].index = i; // slot in radfds[] that the event refers to
+               e.data.ptr = &d[i];
+
+               epoll_ctl(epollfd, EPOLL_CTL_ADD, radfds[i], &e); // NOTE(review): return value is not checked
        }
 }