]> code.citadel.org Git - citadel.git/blobdiff - citadel/threads.c
Increased the load average before strangling the server. Its now 10.00
[citadel.git] / citadel / threads.c
index b95848f4e63dfc61e2d7a49ef345a8e6a1c56527..c13a2aed52c700e216e70b1d3095e8efe2879772 100644 (file)
@@ -1,18 +1,22 @@
 /*
- * $Id: sysdep.c 5882 2007-12-13 19:46:05Z davew $
+ * $Id$
  *
  * Citadel "system dependent" stuff.
- * See copyright.txt for copyright information.
+ * See COPYING for copyright information.
  *
  * Here's where we have the Citadel thread implimentation
  *
  */
 
+#include <stdlib.h>
+#include <unistd.h>
+#include <stdio.h>
 #include <sys/types.h>
 #include <errno.h>
 #include <sys/socket.h>
 #include <unistd.h>
 #include <fcntl.h>
+#include <signal.h>
 
 #if TIME_WITH_SYS_TIME
 # include <sys/time.h>
@@ -25,6 +29,8 @@
 # endif
 #endif
 
+#include <libcitadel.h>
+
 #include "threads.h"
 #include "ctdl_module.h"
 #include "modules_init.h"
@@ -32,6 +38,7 @@
 #include "config.h"
 #include "citserver.h"
 #include "sysdep_decls.h"
+#include "context.h"
 
 /*
  * define this to use the new worker_thread method of handling connections
 
 static int num_threads = 0;                    /* Current number of threads */
 static int num_workers = 0;                    /* Current number of worker threads */
+long statcount = 0;            /* are we doing a stats check? */
+static long stats_done = 0;
 
 CtdlThreadNode *CtdlThreadList = NULL;
 CtdlThreadNode *CtdlThreadSchedList = NULL;
 
-static citthread_t GC_thread;
+static CtdlThreadNode *GC_thread = NULL;
 static char *CtdlThreadStates[CTDL_THREAD_LAST_STATE];
 double CtdlThreadLoadAvg = 0;
 double CtdlThreadWorkerAvg = 0;
@@ -153,8 +162,7 @@ void ctdl_thread_internal_init_tsd(void)
        int ret;
        
        if ((ret = citthread_key_create(&ThreadKey, ctdl_thread_internal_dest_tsd))) {
-               lprintf(CTDL_EMERG, "citthread_key_create: %s\n",
-                       strerror(ret));
+               CtdlLogPrintf(CTDL_EMERG, "citthread_key_create: %s\n", strerror(ret));
                exit(CTDLEXIT_DB);
        }
 }
@@ -221,7 +229,6 @@ void ctdl_thread_internal_init(void)
        CtdlThreadNode *this_thread;
        int ret = 0;
        
-       GC_thread = citthread_self();
        CtdlThreadStates[CTDL_THREAD_INVALID] = strdup ("Invalid Thread");
        CtdlThreadStates[CTDL_THREAD_VALID] = strdup("Valid Thread");
        CtdlThreadStates[CTDL_THREAD_CREATE] = strdup("Thread being Created");
@@ -258,7 +265,8 @@ void ctdl_thread_internal_init(void)
 
        this_thread->name = "Garbage Collection Thread";
        
-       this_thread->tid = GC_thread;
+       this_thread->tid = citthread_self();
+       GC_thread = this_thread;
        CT = this_thread;
        
        num_threads++;  // Increase the count of threads in the system.
@@ -323,6 +331,9 @@ void ctdl_thread_internal_change_state (CtdlThreadNode *this_thread, enum CtdlTh
  */
 void CtdlThreadStopAll(void)
 {
+       /* First run any registered shutdown hooks.  This probably doesn't belong here. */
+       PerformSessionHooks(EVT_SHUTDOWN);
+
        //FIXME: The signalling of the condition should not be in the critical_section
        // We need to build a list of threads we are going to signal and then signal them afterwards
        
@@ -330,15 +341,20 @@ void CtdlThreadStopAll(void)
        
        begin_critical_section(S_THREAD_LIST);
        this_thread = CtdlThreadList;
+       // Ask the GC thread to stop first so everything knows we are shutting down.
+       GC_thread->state = CTDL_THREAD_STOP_REQ;
        while(this_thread)
        {
 #ifdef THREADS_USESIGNALS
-               citthread_killl(this_thread->tid, SIGHUP);
+               if (!citthread_equal(this_thread->tid, GC_thread->tid))
+                       citthread_kill(this_thread->tid, SIGHUP);
 #endif
                ctdl_thread_internal_change_state (this_thread, CTDL_THREAD_STOP_REQ);
                citthread_cond_signal(&this_thread->ThreadCond);
                citthread_cond_signal(&this_thread->SleepCond);
-               CtdlLogPrintf(CTDL_DEBUG, "Thread system stopping thread \"%s\" (%ld).\n", this_thread->name, this_thread->tid);
+               this_thread->stop_ticker = time(NULL);
+               CtdlLogPrintf(CTDL_DEBUG, "Thread system stopping thread \"%s\" (0x%08lx).\n",
+                       this_thread->name, this_thread->tid);
                this_thread = this_thread->next;
        }
        end_critical_section(S_THREAD_LIST);
@@ -394,12 +410,33 @@ double CtdlThreadGetWorkerAvg(void)
 
 double CtdlThreadGetLoadAvg(void)
 {
-       double ret;
-       
+       double load_avg[3] ;
+
+       int ret;
+       int smp_num_cpus;
+
+       /* Borrowed this straight from procps */
+       smp_num_cpus = sysconf(_SC_NPROCESSORS_ONLN);
+       if(smp_num_cpus<1) smp_num_cpus=1; /* SPARC glibc is buggy */
+
+       ret = getloadavg(load_avg, 3);
+       if (ret < 0)
+               return 0;
+       return load_avg[0] / smp_num_cpus;
+/*
+ * This old chunk of code returned a value that indicated the load on citserver.
+ * This value could easily reach 100% even when citserver was doing very little and
+ * the machine had much more spare capacity.
+ * Because this value was used to determine whether the machine was under heavy load
+ * from other processes in the system, citserver could be strangled unnecessarily.
+ * What we are actually trying to achieve is to strangle citserver if the machine is heavily loaded.
+ * So we have changed this.
+
        begin_critical_section(S_THREAD_LIST);
        ret =  CtdlThreadLoadAvg;
        end_critical_section(S_THREAD_LIST);
        return ret;
+*/
 }
 
 
@@ -471,9 +508,12 @@ int CtdlThreadCheckStop(void)
        
        state = CT->state;
 
-#ifdef THREADS_USERSIGNALS
+#ifdef THREADS_USESIGNALS
        if (CT->signal)
+       {
                CtdlLogPrintf(CTDL_DEBUG, "Thread \"%s\" caught signal %d.\n", CT->name, CT->signal);
+               CT->signal = 0;
+       }
 #endif
        if(state == CTDL_THREAD_STOP_REQ)
        {
@@ -505,11 +545,13 @@ void CtdlThreadStop(CtdlThreadNode *thread)
        if (!(this_thread->thread_func))
                return;         // Don't stop garbage collector
 #ifdef THREADS_USESIGNALS
-       citthread_kill(this_thread->tid, SIGHUP);       
+       if (!citthread_equal(this_thread->tid, GC_thread->tid))
+               citthread_kill(this_thread->tid, SIGHUP);
 #endif
        ctdl_thread_internal_change_state (this_thread, CTDL_THREAD_STOP_REQ);
        citthread_cond_signal(&this_thread->ThreadCond);
        citthread_cond_signal(&this_thread->SleepCond);
+       this_thread->stop_ticker = time(NULL);
 }
 
 /*
@@ -551,7 +593,7 @@ static void ctdl_internal_thread_cleanup(void *arg)
         * In here we were called by the current thread because it is exiting
         * NB. WE ARE THE CURRENT THREAD
         */
-       CtdlLogPrintf(CTDL_NOTICE, "Thread \"%s\" (%ld) exited.\n", CT->name, CT->tid);
+       CtdlLogPrintf(CTDL_NOTICE, "Thread \"%s\" (0x%08lx) exited.\n", CT->name, CT->tid);
        
        #ifdef HAVE_BACKTRACE
        eCrash_UnregisterThread();
@@ -605,7 +647,7 @@ void ctdl_thread_internal_calc_loadavg(void)
        CtdlThreadLoadAvg = load_avg/num_threads;
        CtdlThreadWorkerAvg = worker_avg/workers;
 #ifdef WITH_THREADLOG
-       CtdlLogPrintf(CTDL_INFO, "System load average %.2f, workers averag %.2f, threads %d, workers %d, sessions %d\n", CtdlThreadLoadAvg, CtdlThreadWorkerAvg, num_threads, num_workers, num_sessions);
+       CtdlLogPrintf(CTDL_INFO, "System load average %.2f, workers averag %.2f, threads %d, workers %d, sessions %d\n", CtdlThreadGetLoadAvg(), CtdlThreadWorkerAvg, num_threads, num_workers, num_sessions);
 #endif
 }
 
@@ -638,8 +680,31 @@ void CtdlThreadGC (void)
                that_thread = this_thread;
                this_thread = this_thread->next;
                
+               if ((that_thread->state == CTDL_THREAD_STOP_REQ || that_thread->state == CTDL_THREAD_STOPPING)
+                       && (!citthread_equal(that_thread->tid, citthread_self())))
+                               CtdlLogPrintf(CTDL_DEBUG, "Waiting for thread %s (0x%08lx) to exit.\n", that_thread->name, that_thread->tid);
+               else
+               {
+                       /**
+                        * Catch the situation where a worker was asked to stop but couldn't and we are not
+                        * shutting down.
+                        */
+                       that_thread->stop_ticker = 0;
+               }
+               
+               if (that_thread->stop_ticker + 5 == time(NULL))
+               {
+                       CtdlLogPrintf(CTDL_DEBUG, "Thread System: The thread \"%s\" (0x%08lx) failed to self terminate within 5 ticks. It would be cancelled now.\n", that_thread->name, that_thread->tid);
+                       if ((that_thread->flags & CTDLTHREAD_WORKER) == 0)
+                               CtdlLogPrintf(CTDL_INFO, "Thread System: A non worker thread would have been canceled this may cause message loss.\n");
+//                     that_thread->state = CTDL_THREAD_CANCELLED;
+                       that_thread->stop_ticker++;
+//                     citthread_cancel(that_thread->tid);
+//                     continue;
+               }
+               
                /* Do we need to clean up this thread? */
-               if (that_thread->state != CTDL_THREAD_EXITED)
+               if ((that_thread->state != CTDL_THREAD_EXITED) && (that_thread->state != CTDL_THREAD_CANCELLED))
                {
                        if(that_thread->flags & CTDLTHREAD_WORKER)
                                workers++;      /* Sanity check on number of worker threads */
@@ -690,11 +755,12 @@ void CtdlThreadGC (void)
                else if (ret == ESRCH)
                        CtdlLogPrintf(CTDL_DEBUG, "Garbage collection, no thread to join on.\n");
                else if (ret != 0)
-                       CtdlLogPrintf(CTDL_DEBUG, "Garbage collection, citthread_join returned an unknown error.\n");
+                       CtdlLogPrintf(CTDL_DEBUG, "Garbage collection, citthread_join returned an unknown error(%d).\n", ret);
                /*
                 * Now we own that thread entry
                 */
-               CtdlLogPrintf(CTDL_INFO, "Garbage Collection for thread \"%s\" (%ld).\n", that_thread->name, that_thread->tid);
+               CtdlLogPrintf(CTDL_INFO, "Garbage Collection for thread \"%s\" (0x%08lx).\n",
+                       that_thread->name, that_thread->tid);
                citthread_mutex_destroy(&that_thread->ThreadMutex);
                citthread_cond_destroy(&that_thread->ThreadCond);
                citthread_mutex_destroy(&that_thread->SleepMutex);
@@ -765,9 +831,8 @@ static void *ctdl_internal_thread_func (void *arg)
        #endif
        
        // Tell the world we are here
-       CtdlLogPrintf(CTDL_NOTICE, "Created a new thread \"%s\" (%ld). \n", this_thread->name, this_thread->tid);
-
-       
+       CtdlLogPrintf(CTDL_NOTICE, "Created a new thread \"%s\" (0x%08lx).\n",
+               this_thread->name, this_thread->tid);
        
        /*
         * run the thread to do the work but only if we haven't been asked to stop
@@ -1063,6 +1128,10 @@ void ctdl_thread_internal_check_scheduled(void)
        CtdlThreadNode *this_thread, *that_thread;
        time_t now;
        
+       /* Don't start scheduled threads if the system wants single user mode */
+       if (CtdlWantSingleUser())
+               return;
+       
        if (try_critical_section(S_SCHEDULE_LIST))
                return; /* If this list is locked we wait till the next chance */
        
@@ -1097,7 +1166,7 @@ void ctdl_thread_internal_check_scheduled(void)
                                if (ctdl_thread_internal_start_scheduled (that_thread))
                                {
 #ifdef WITH_THREADLOG
-                                       CtdlLogPrintf(CTDL_INFO, "Thread system, Started a scheduled thread \"%s\" (%ud).\n",
+                                       CtdlLogPrintf(CTDL_INFO, "Thread system, Started a scheduled thread \"%s\" (0x%08lx).\n",
                                                that_thread->name, that_thread->tid);
 #endif
                                }
@@ -1106,7 +1175,8 @@ void ctdl_thread_internal_check_scheduled(void)
 #ifdef WITH_THREADLOG
                else
                {
-                       CtdlLogPrintf(CTDL_DEBUG, "Thread \"%s\" will start in %ld seconds.\n", that_thread->name, that_thread->when - time(NULL));
+                       CtdlLogPrintf(CTDL_DEBUG, "Thread \"%s\" will start in %ld seconds.\n",
+                               that_thread->name, that_thread->when - time(NULL));
                }
 #endif
        }
@@ -1119,11 +1189,42 @@ void ctdl_thread_internal_check_scheduled(void)
  */
 int CtdlThreadSelect(int n, fd_set *readfds, fd_set *writefds, fd_set *exceptfds, struct timeval *timeout)
 {
-       int ret;
+       int ret = 0;
        
        ctdl_thread_internal_change_state(CT, CTDL_THREAD_BLOCKED);
-       ret = select(n, readfds, writefds, exceptfds, timeout);
+       if (!CtdlThreadCheckStop())
+               ret = select(n, readfds, writefds, exceptfds, timeout);
+       /**
+        * If the select returned <= 0 then it failed due to an error
+        * or timeout so this thread could stop if asked to do so.
+        * Anything else means it needs to continue unless the system is shutting down
+        */
+       if (ret > 0)
+       {
+               /**
+                * The select says this thread needs to do something useful.
+                * This thread was in an idle state so it may have been asked to stop
+                * but if the system isn't shutting down this thread is no longer
+                * idle and select has given it a task to do so it must not stop
+                * In this condition we need to force it into the running state.
+                * CtdlThreadGC will clear its ticker for us.
+                *
+                * FIXME: there is still a small hole here. It is possible for the sequence of locking
+                * to allow the state to get changed to STOP_REQ just after this code: if the other thread
+                * has decided to change the state before this lock, it therefore has to wait till the lock
+                * completes, but it will then continue to change the state. We need something a bit better here.
+                */
+               citthread_mutex_lock(&CT->ThreadMutex); /* To prevent race condition of a sleeping thread */
+               if (GC_thread->state > CTDL_THREAD_STOP_REQ && CT->state <= CTDL_THREAD_STOP_REQ)
+               {
+                       CtdlLogPrintf(CTDL_DEBUG, "Thread %s (0x%08lx) refused stop request.\n", CT->name, CT->tid);
+                       CT->state = CTDL_THREAD_RUNNING;
+               }
+               citthread_mutex_unlock(&CT->ThreadMutex);
+       }
+
        ctdl_thread_internal_change_state(CT, CTDL_THREAD_RUNNING);
+
        return ret;
 }
 
@@ -1133,19 +1234,54 @@ void *new_worker_thread(void *arg);
 extern void close_masters (void);
 
 
+void *simulation_worker (void*arg) {
+       struct CitContext *this;
+
+       this = CreateNewContext();
+       CtdlThreadSleep(1);
+       this->kill_me = 1;
+       this->state = CON_IDLE;
+       dead_session_purge(1);
+       begin_critical_section(S_SESSION_TABLE);
+       stats_done++;
+       end_critical_section(S_SESSION_TABLE);
+       return NULL;
+}
+
+
+void *simulation_thread (void *arg)
+{
+       long stats = statcount;
+
+       while(stats && !CtdlThreadCheckStop()) {
+               CtdlThreadCreate("Connection simulation worker", CTDLTHREAD_BIGSTACK, simulation_worker, NULL);
+               stats--;
+       }
+       CtdlThreadStopAll();
+       return NULL;
+}
 
 void go_threading(void)
 {
        int i;
        CtdlThreadNode *last_worker;
-       
+       struct timeval start, now, result;
+       double last_duration;
+
        /*
         * Initialise the thread system
         */
        ctdl_thread_internal_init();
 
        /* Second call to module init functions now that threading is up */
-       initialise_modules(1);
+       if (!statcount)
+               initialise_modules(1);
+       else {
+               CtdlLogPrintf(CTDL_EMERG, "Running connection simulation stats\n");
+               gettimeofday(&start, NULL);
+               CtdlThreadCreate("Connection simulation master", CTDLTHREAD_BIGSTACK, simulation_thread, NULL);
+       }
+
 
        /*
         * This thread is now used for garbage collection of other threads in the thread list
@@ -1199,7 +1335,7 @@ void go_threading(void)
                        if (last_worker)
                        {
 #ifdef WITH_THREADLOG
-                               CtdlLogPrintf(CTDL_DEBUG, "Thread system, stopping excess worker thread \"%s\" (%ld).\n",
+                               CtdlLogPrintf(CTDL_DEBUG, "Thread system, stopping excess worker thread \"%s\" (0x%08lx).\n",
                                        last_worker->name,
                                        last_worker->tid
                                        );
@@ -1215,44 +1351,63 @@ void go_threading(void)
                /* FIXME: come up with a better way to dynamically alter the number of threads
                 * based on the system load
                 */
+               if (!statcount) {
 #ifdef NEW_WORKER
                if ((((CtdlThreadGetWorkers() < config.c_max_workers) && (CtdlThreadGetWorkers() <= num_sessions) ) || CtdlThreadGetWorkers() < config.c_min_workers) && (CT->state > CTDL_THREAD_STOP_REQ))
 #else
-               if ((((CtdlThreadGetWorkers() < config.c_max_workers) && (CtdlThreadGetWorkerAvg() > 60) && (CtdlThreadGetLoadAvg() < 90) ) || CtdlThreadGetWorkers() < config.c_min_workers) && (CT->state > CTDL_THREAD_STOP_REQ))
+               if ((((CtdlThreadGetWorkers() < config.c_max_workers) && (CtdlThreadGetWorkerAvg() > 60)) || CtdlThreadGetWorkers() < config.c_min_workers) && (CT->state > CTDL_THREAD_STOP_REQ))
 #endif /* NEW_WORKER */
                {
-                       for (i=0; i<5 ; i++)
-                       {
+                       /* Only start new threads if we are not going to overload the machine */
+                       /* Temporarily set to 10, which should be enough to make sure we don't strangle the server
+                        * at least until we make this a config option */
+                       if (CtdlThreadGetLoadAvg() < ((double)10.00)) {
+                               for (i=0; i<5 ; i++) {
 #ifdef NEW_WORKER
-                               CtdlThreadCreate("Worker Thread (new)",
-                                       CTDLTHREAD_BIGSTACK + CTDLTHREAD_WORKER,
-                                       new_worker_thread,
-                                       NULL
-                                       );
+                                       CtdlThreadCreate("Worker Thread (new)",
+                                               CTDLTHREAD_BIGSTACK + CTDLTHREAD_WORKER,
+                                               new_worker_thread,
+                                               NULL
+                                               );
 #else
-                               CtdlThreadCreate("Worker Thread",
-                                       CTDLTHREAD_BIGSTACK + CTDLTHREAD_WORKER,
-                                       worker_thread,
-                                       NULL
-                                       );
+                                       CtdlThreadCreate("Worker Thread",
+                                               CTDLTHREAD_BIGSTACK + CTDLTHREAD_WORKER,
+                                               worker_thread,
+                                               NULL
+                                               );
 #endif /* NEW_WORKER */
+                               }
                        }
+                       else
+                               CtdlLogPrintf (CTDL_WARNING, "Server strangled due to machine load average too high.\n");
                }
-               
+               }
+
                CtdlThreadGC();
-               
+
                if (CtdlThreadGetCount() <= 1) // Shutting down clean up the garbage collector
                {
                        CtdlThreadGC();
                }
                
+#ifdef THREADS_USESIGNALS
+               if (CtdlThreadGetCount() && CT->state > CTDL_THREAD_STOP_REQ)
+#else
                if (CtdlThreadGetCount())
+#endif
                        CtdlThreadSleep(1);
        }
        /*
         * If the above loop exits we must be shutting down since we obviously have no threads
         */
        ctdl_thread_internal_cleanup();
+
+       if (statcount) {
+               gettimeofday(&now, NULL);
+               timersub(&now, &start, &result);
+               last_duration = (double)result.tv_sec + ((double)result.tv_usec / (double) 1000000);
+               CtdlLogPrintf(CTDL_EMERG, "Simulated %ld connections in %f seconds\n", stats_done, last_duration);
+       }
 }
 
 
@@ -1280,7 +1435,7 @@ void select_on_master(void)
         int m, i;
         int retval = 0;
         struct timeval tv;
-        struct CitContext *con;
+        CitContext *con;
         const char *old_name;
 
 
@@ -1366,7 +1521,7 @@ void select_on_master(void)
  * If the select succeeds the thread goes off to handle the client request.
  * If the list of client connections is empty the threads all sleep for one second
  */
-struct CitContext *select_on_client(void)
+CitContext *select_on_client(void)
 {
        fd_set readfds;
        struct timeval tv;
@@ -1433,7 +1588,7 @@ struct CitContext *select_on_client(void)
 /*
  * Do the worker threads work when needed
  */
-int execute_session(struct CitContext *bind_me)
+int execute_session(CitContext *bind_me)
 {
        int force_purge;
        
@@ -1463,7 +1618,6 @@ int execute_session(struct CitContext *bind_me)
 
 
 
-extern void dead_session_purge(int force);
 
 /*
  * A new worker_thread loop.
@@ -1471,7 +1625,7 @@ extern void dead_session_purge(int force);
  
 void *new_worker_thread(void *arg)
 {
-       struct CitContext *bind_me;
+       CitContext *bind_me;
        int force_purge;
        
        while (!CtdlThreadCheckStop()) {