Add handler for recovery of bad file descriptors
[citadel.git] / citadel / sysdep.c
index 8f71b7ea41ec3c4a09882d727ed3454ca417f308..f8ff6df46eb0b0c06aa7132e6c80ecdfd12c0bd7 100644 (file)
  * Copyright (c) 1987-2011 by the citadel.org team
  *
  * This program is open source software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 3 of the License, or
- * (at your option) any later version.
+ * it under the terms of the GNU General Public License, version 3.
  *
  * This program is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
  */
 
 #include "sysdep.h"
 #include <sys/select.h>
 #endif
 
-#ifndef HAVE_SNPRINTF
-#include "snprintf.h"
-#endif
-
 #include "ctdl_module.h"
 #include "threads.h"
 #include "user_ops.h"
@@ -918,7 +908,8 @@ int client_getln(char *buf, int bufsize)
 void close_masters (void)
 {
        struct ServiceFunctionHook *serviceptr;
-       
+       const char *Text;
+
        /*
         * close all protocol master sockets
         */
@@ -927,18 +918,37 @@ void close_masters (void)
 
                if (serviceptr->tcp_port > 0)
                {
-                       syslog(LOG_INFO, "Closing %d listener on port %d\n",
+                       if (serviceptr->msock == -1)
+                               Text = "not closing again";
+                       else
+                               Text = "Closing";
+                                       
+                       syslog(LOG_INFO, "%s %d listener on port %d\n",
+                              Text,
                               serviceptr->msock,
                               serviceptr->tcp_port);
                        serviceptr->tcp_port = 0;
                }
                
                if (serviceptr->sockpath != NULL)
-                       syslog(LOG_INFO, "Closing %d listener on '%s'\n",
+               {
+                       if (serviceptr->msock == -1)
+                               Text = "not closing again";
+                       else
+                               Text = "Closing";
+
+                       syslog(LOG_INFO, "%s %d listener on '%s'\n",
+                              Text,
                               serviceptr->msock,
                               serviceptr->sockpath);
+               }
+
                 if (serviceptr->msock != -1)
+               {
                        close(serviceptr->msock);
+                       serviceptr->msock = -1;
+               }
+
                /* If it's a Unix domain socket, remove the file. */
                if (serviceptr->sockpath != NULL) {
                        unlink(serviceptr->sockpath);
@@ -969,9 +979,11 @@ void sysdep_master_cleanup(void) {
        CtdlDestroyCleanupHooks();
        CtdlDestroyFixedOutputHooks();  
        CtdlDestroySessionHooks();
+       CtdlDestroyTDAPVetoHooks();
        CtdlDestroyServiceHook();
        CtdlDestroyRoomHooks();
        CtdlDestroySearchHooks();
+       CtdlDestroyDebugTable();
        #ifdef HAVE_BACKTRACE
 ///    eCrash_Uninit();
        #endif
@@ -1051,7 +1063,7 @@ void start_daemon(int unused) {
                        }
                        waitpid(current_child, &status, 0);
                }
-               do_restart = 0;
+
                nFireUpsNonRestart = nFireUps;
                
                /* Exit code 0 means the watcher should exit */
@@ -1095,7 +1107,7 @@ void checkcrash(void)
                        "factor.\n \n"
                        " You can obtain more information about this by enabling core dumps.\n \n"
                        " For more information, please see:\n \n"
-                       " http://citadel.org/doku.php/faq:mastering_your_os:gdb#how.do.i.make.my.system.produce.core-files"
+                       " http://citadel.org/doku.php?id=faq:mastering_your_os:gdb#how.do.i.make.my.system.produce.core-files"
                        "\n \n"
 
                        " If you have already done this, the core dump is likely to be found at %score.%d\n"
@@ -1130,6 +1142,87 @@ int convert_login(char NameToConvert[]) {
 
 
 
+/*
+ * Probe a single descriptor with a zero-timeout select().
+ *
+ * Returns nonzero if select() rejects the descriptor with EBADF, 0 otherwise.
+ * A private, freshly zeroed fd_set is used for every probe so that a stale
+ * descriptor from an earlier probe can never cause a false EBADF.
+ * Descriptors that may not be stored in an fd_set (negative, or at or above
+ * FD_SETSIZE) cannot be probed this way and are reported as 0.
+ */
+static int fd_is_invalid(int fd)
+{
+       fd_set probe;
+       struct timeval tv;
+
+       if ((fd < 0) || (fd >= FD_SETSIZE))
+               return 0;
+
+       FD_ZERO(&probe);
+       FD_SET(fd, &probe);
+
+       /* Zero timeout: poll the descriptor and return immediately. */
+       tv.tv_sec = 0;
+       tv.tv_usec = 0;
+
+       return ((select(fd + 1, &probe, NULL, NULL, &tv) < 0) &&
+               (errno == EBADF));
+}
+
+
+/*
+ * Called after a select() over many descriptors failed with EBADF: probe the
+ * descriptors one at a time to find the offender.
+ *
+ * Client sessions are checked first, inside the S_SESSION_TABLE critical
+ * section.  The first idle session whose socket is rejected gets kill_me set
+ * and its client_socket reset to -1, and the session scan stops there.
+ *
+ * The master (listening) sockets are checked afterwards.  If one of them is
+ * rejected, server_shutting_down is set and the scan stops.
+ */
+void HuntBadSession(void)
+{
+       CitContext *ptr;
+       struct ServiceFunctionHook *serviceptr;
+
+       /* First, probe each client socket on its own. */
+       begin_critical_section(S_SESSION_TABLE);
+       for (ptr = ContextList; ptr != NULL; ptr = ptr->next) {
+               /* Don't probe dead sessions, only truly idle ones */
+               if (    (ptr->state != CON_IDLE)
+                       || (ptr->kill_me != 0)
+                       || (ptr->client_socket <= 0)
+                       ) {
+                       continue;
+               }
+
+               if (fd_is_invalid(ptr->client_socket))
+               {
+                       /* Gotcha! */
+                       syslog(LOG_EMERG,
+                              "Killing Session CC[%d] bad FD: [%d:%d] User[%s] Host[%s:%s]\n",
+                              ptr->cs_pid,
+                              ptr->client_socket,
+                              ptr->is_local_socket,
+                              ptr->curr_user,
+                              ptr->cs_host,ptr->cs_addr);
+
+                       ptr->kill_me = 1;
+                       ptr->client_socket = -1;
+                       break;
+               }
+       }
+       end_critical_section(S_SESSION_TABLE);
+
+
+       /*
+        * Next, probe each master socket on its own.  A master socket that
+        * is already marked closed (-1) is skipped by fd_is_invalid().
+        */
+       for (serviceptr = ServiceHookTable; serviceptr != NULL; serviceptr = serviceptr->next ) {
+
+               if (fd_is_invalid(serviceptr->msock))
+               {
+                       /* Gotcha! server socket dead? commit suicide! */
+                       syslog(LOG_EMERG,
+                              "Found bad FD: %d and its a server socket! Shutting Down!\n",
+                              serviceptr->msock);
+
+                       server_shutting_down = 1;
+                       break;
+               }
+       }
+}
+
+
 /* 
  * This loop just keeps going and going and going...
  */
@@ -1146,7 +1239,9 @@ void *worker_thread(void *blah) {
        CitContext *con = NULL;         /* Temporary context pointer */
        int i;
 
+       pthread_mutex_lock(&ThreadCountMutex);
        ++num_workers;
+       pthread_mutex_unlock(&ThreadCountMutex);
 
        while (!server_shutting_down) {
 
@@ -1221,7 +1316,8 @@ do_select:        force_purge = 0;
                 */
                if (retval < 0) {
                        if (errno == EBADF) {
-                               syslog(LOG_NOTICE, "select() failed: (%s)\n", strerror(errno));
+                               syslog(LOG_EMERG, "select() failed: (%s)\n", strerror(errno));
+                               HuntBadSession ();
                                goto do_select;
                        }
                        if (errno != EINTR) {
@@ -1271,6 +1367,7 @@ do_select:        force_purge = 0;
                                        con = CreateNewContext();
 
                                        /* Assign our new socket number to it. */
+                                       con->tcp_port = serviceptr->tcp_port;
                                        con->client_socket = ssock;
                                        con->h_command_function = serviceptr->h_command_function;
                                        con->h_async_function = serviceptr->h_async_function;
@@ -1327,7 +1424,10 @@ do_select:       force_purge = 0;
 
 SKIP_SELECT:
                /* We're bound to a session */
+               pthread_mutex_lock(&ThreadCountMutex);
                ++active_workers;
+               pthread_mutex_unlock(&ThreadCountMutex);
+
                if (bind_me != NULL) {
                        become_session(bind_me);
 
@@ -1361,11 +1461,23 @@ SKIP_SELECT:
 
                dead_session_purge(force_purge);
                do_housekeeping();
+
+               pthread_mutex_lock(&ThreadCountMutex);
                --active_workers;
+               if ((active_workers + config.c_min_workers < num_workers) &&
+                   (num_workers > config.c_min_workers))
+               {
+                       num_workers--;
+                       pthread_mutex_unlock(&ThreadCountMutex);
+                       return (NULL);
+               }
+               pthread_mutex_unlock(&ThreadCountMutex);
        }
 
        /* If control reaches this point, the server is shutting down */
+       pthread_mutex_lock(&ThreadCountMutex);
        --num_workers;
+       pthread_mutex_unlock(&ThreadCountMutex);
        return(NULL);
 }