/*
- * $Id$
- *
* This module handles fulltext indexing of the message base.
+ * Copyright (c) 2005-2021 by the citadel.org team
*
+ * This program is open source software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as published
+ * by the Free Software Foundation; either version 3 of the License, or
+ * (at your option) any later version.
*
- * Copyright (c) 2005-2009 by the citadel.org team
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 3 of the License, or
- * (at your option) any later version.
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
*
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*/
-
#include "sysdep.h"
#include <stdlib.h>
#include <unistd.h>
#include "ctdl_module.h"
-
-
long ft_newhighest = 0L;	/* upper bound on message numbers accepted by ft_index_msg(); set from the "MMhighest" config value at the start of a pass */
long *ft_newmsgs = NULL;	/* growable array of message numbers queued for indexing */
int ft_num_msgs = 0;	/* number of entries currently used in ft_newmsgs */
int ft_num_alloc = 0;	/* number of entries allocated for ft_newmsgs (grown in steps of 1024) */
-
int ftc_num_msgs[65536];	/* index cache, one slot per bucket -- presumably the entry count of ftc_msgs[i]; TODO confirm */
long *ftc_msgs[65536];	/* index cache, one slot per bucket; each slot is reset to NULL when the cache is flushed */
return(0);
}
+
/*
* Flush our index cache out to disk.
*/
for (i=0; i<65536; ++i) {
if ((time(NULL) - last_update) >= 10) {
- CtdlLogPrintf(CTDL_INFO,
- "Flushing index cache to disk (%d%% complete)\n",
+ syslog(LOG_INFO,
+ "fulltext: flushing index cache to disk (%d%% complete)",
(i * 100 / 65536)
);
last_update = time(NULL);
ftc_msgs[i] = NULL;
}
}
- CtdlLogPrintf(CTDL_INFO, "Flushed index cache to disk (100%% complete)\n");
+ syslog(LOG_INFO, "fulltext: flushed index cache to disk (100%% complete)");
}
int *tokens = NULL;
int i, j;
struct cdbdata *cdb_bucket;
- char *msgtext;
+ StrBuf *msgtext;
+ char *txt;
int tok;
+ struct CtdlMessage *msg = NULL;
+
+ msg = CtdlFetchMessage(msgnum, 1);
+ if (msg == NULL) {
+ syslog(LOG_ERR, "fulltext: ft_index_message() could not load msg %ld", msgnum);
+ return;
+ }
+
+ if (!CM_IsEmpty(msg, eSuppressIdx)) {
+ syslog(LOG_DEBUG, "fulltext: ft_index_message() excluded msg %ld", msgnum);
+ CM_Free(msg);
+ return;
+ }
- CtdlLogPrintf(CTDL_DEBUG, "ft_index_message() %s msg %ld\n",
- (op ? "adding" : "removing") , msgnum
- );
+ syslog(LOG_DEBUG, "fulltext: ft_index_message() %s msg %ld", (op ? "adding" : "removing") , msgnum);
/* Output the message as text before indexing it, so we don't end up
* indexing a bunch of encoded base64, etc.
*/
- CC->redirect_buffer = malloc(SIZ);
- CC->redirect_len = 0;
- CC->redirect_alloc = SIZ;
- CtdlOutputMsg(msgnum, MT_CITADEL, HEADERS_ALL, 0, 1, NULL, 0);
+ CC->redirect_buffer = NewStrBufPlain(NULL, SIZ);
+ CtdlOutputPreLoadedMsg(msg, MT_CITADEL, HEADERS_ALL, 0, 1, 0);
+ CM_Free(msg);
msgtext = CC->redirect_buffer;
CC->redirect_buffer = NULL;
- CC->redirect_len = 0;
- CC->redirect_alloc = 0;
- CtdlLogPrintf(CTDL_DEBUG, "Wordbreaking message %ld...\n", msgnum);
- wordbreaker(msgtext, &num_tokens, &tokens);
- free(msgtext);
+ if (msgtext != NULL) {
+ syslog(LOG_DEBUG, "fulltext: wordbreaking message %ld (%d bytes)", msgnum, StrLength(msgtext));
+ }
+ txt = SmashStrBuf(&msgtext);
+ wordbreaker(txt, &num_tokens, &tokens);
+ free(txt);
- CtdlLogPrintf(CTDL_DEBUG, "Indexing message %ld [%d tokens]\n", msgnum, num_tokens);
+ syslog(LOG_DEBUG, "fulltext: indexing message %ld [%d tokens]", msgnum, num_tokens);
if (num_tokens > 0) {
for (i=0; i<num_tokens; ++i) {
}
}
else {
- CtdlLogPrintf(CTDL_ALERT, "Invalid token %d !!\n", tok);
+ syslog(LOG_ALERT, "fulltext: invalid token %d !!", tok);
}
}
}
-
/*
* Add a message to the list of those to be indexed.
+ *
+ * msgnum	message number offered for indexing
+ * userdata	unused
+ *
+ * Only messages newer than the stored "MMfulltext" value and no newer than
+ * ft_newhighest are queued. The queue (ft_newmsgs) grows 1024 entries at
+ * a time. If it cannot be grown, the message is logged and skipped; the
+ * existing queue and its counters are left untouched.
*/
void ft_index_msg(long msgnum, void *userdata) {
- if ((msgnum > CitControl.MMfulltext) && (msgnum <= ft_newhighest)) {
+ if ((msgnum > CtdlGetConfigLong("MMfulltext")) && (msgnum <= ft_newhighest)) {
- ++ft_num_msgs;
- if (ft_num_msgs > ft_num_alloc) {
- ft_num_alloc += 1024;
- ft_newmsgs = realloc(ft_newmsgs,
- (ft_num_alloc * sizeof(long)));
+ if (ft_num_msgs >= ft_num_alloc) {
+ /* Grow through a temporary so a failed realloc() neither leaks
+ * the old array nor leaves ft_newmsgs NULL for the store below.
+ */
+ long *grown = realloc(ft_newmsgs, ((ft_num_alloc + 1024) * sizeof(long)));
+ if (grown == NULL) {
+ syslog(LOG_ERR, "fulltext: out of memory, cannot queue msg %ld for indexing", msgnum);
+ return;
+ }
+ ft_newmsgs = grown;
+ ft_num_alloc += 1024;
}
- ft_newmsgs[ft_num_msgs - 1] = msgnum;
+ ft_newmsgs[ft_num_msgs++] = msgnum;
}
}
+
/*
* Scan a room for messages to index.
*/
void ft_index_room(struct ctdlroom *qrbuf, void *data)
{
- if (CtdlThreadCheckStop())
+ if (server_shutting_down)
return;
CtdlGetRoom(&CC->room, qrbuf->QRname);
*/
void do_fulltext_indexing(void) {
int i;
- static time_t last_index = 0L;
static time_t last_progress = 0L;
- time_t run_time = 0L;
- time_t end_time = 0L;
-
+ static int is_running = 0;
+ if (is_running) return; /* Concurrency check - only one can run */
+ is_running = 1;
+
/*
* Don't do this if the site doesn't have it enabled.
*/
- if (!config.c_enable_fulltext) {
+ if (!CtdlGetConfigInt("c_enable_fulltext")) {
return;
}
/*
- * Make sure we don't run the indexer too frequently.
- * FIXME move the setting into config
+ * If we've switched wordbreaker modules, burn the index and start over.
*/
-/*
- * The thread sleeps for 300 seconds so we don't need this here any more
-
- if ( (time(NULL) - last_index) < 300L) {
- return;
+ begin_critical_section(S_CONTROL);
+ if (CtdlGetConfigInt("MM_fulltext_wordbreaker") != FT_WORDBREAKER_ID) {
+ syslog(LOG_DEBUG, "fulltext: wb ver on disk = %d, code ver = %d",
+ CtdlGetConfigInt("MM_fulltext_wordbreaker"), FT_WORDBREAKER_ID
+ );
+ syslog(LOG_INFO, "fulltext: (re)initializing index");
+ cdb_trunc(CDB_FULLTEXT);
+ CtdlSetConfigLong("MMfulltext", 0);
}
-*/
+ end_critical_section(S_CONTROL);
/*
- * Check to see whether the fulltext index is up to date; if there
- * are no messages to index, don't waste any more time trying.
+ * Silently return if our fulltext index is up to date with new messages.
*/
- if ((CitControl.MMfulltext >= CitControl.MMhighest) && (CitControl.fulltext_wordbreaker == FT_WORDBREAKER_ID)) {
+ if ((CtdlGetConfigLong("MMfulltext") >= CtdlGetConfigLong("MMhighest"))) {
return; /* nothing to do! */
}
-
- run_time = time(NULL);
- CtdlLogPrintf(CTDL_DEBUG, "do_fulltext_indexing() started (%ld)\n", run_time);
-
- /*
- * If we've switched wordbreaker modules, burn the index and start
- * over.
- */
- begin_critical_section(S_CONTROL);
- if (CitControl.fulltext_wordbreaker != FT_WORDBREAKER_ID) {
- CtdlLogPrintf(CTDL_DEBUG, "wb ver on disk = %d, code ver = %d\n",
- CitControl.fulltext_wordbreaker, FT_WORDBREAKER_ID);
- CtdlLogPrintf(CTDL_INFO, "(re)initializing full text index\n");
- cdb_trunc(CDB_FULLTEXT);
- CitControl.MMfulltext = 0L;
- put_control();
- }
- end_critical_section(S_CONTROL);
/*
* Now go through each room and find messages to index.
*/
- ft_newhighest = CitControl.MMhighest;
+ ft_newhighest = CtdlGetConfigLong("MMhighest");
CtdlForEachRoom(ft_index_room, NULL); /* load all msg pointers */
if (ft_num_msgs > 0) {
/* Here it is ... do each message! */
for (i=0; i<ft_num_msgs; ++i) {
if (time(NULL) != last_progress) {
- CtdlLogPrintf(CTDL_DEBUG,
- "Indexed %d of %d messages (%d%%)\n",
+ syslog(LOG_DEBUG,
+ "fulltext: indexed %d of %d messages (%d%%)",
i, ft_num_msgs,
((i*100) / ft_num_msgs)
);
ft_index_message(ft_newmsgs[i], 1);
/* Check to see if we need to quit early */
- if (CtdlThreadCheckStop()) {
- CtdlLogPrintf(CTDL_DEBUG, "Indexer quitting early\n");
+ if (server_shutting_down) {
+ syslog(LOG_DEBUG, "fulltext: indexer quitting early");
ft_newhighest = ft_newmsgs[i];
break;
}
/* Check to see if we have to maybe flush to disk */
if (i >= FT_MAX_CACHE) {
- CtdlLogPrintf(CTDL_DEBUG, "Time to flush.\n");
+ syslog(LOG_DEBUG, "fulltext: time to flush.");
ft_newhighest = ft_newmsgs[i];
break;
}
ft_num_alloc = 0;
ft_newmsgs = NULL;
}
- end_time = time(NULL);
- if (CtdlThreadCheckStop())
+ if (server_shutting_down) {
+ is_running = 0;
return;
+ }
- CtdlLogPrintf(CTDL_DEBUG, "do_fulltext_indexing() duration (%ld)\n", end_time - run_time);
-
/* Save our place so we don't have to do this again */
ft_flush_cache();
begin_critical_section(S_CONTROL);
- CitControl.MMfulltext = ft_newhighest;
- CitControl.fulltext_wordbreaker = FT_WORDBREAKER_ID;
- put_control();
+ CtdlSetConfigLong("MMfulltext", ft_newhighest);
+ CtdlSetConfigInt("MM_fulltext_wordbreaker", FT_WORDBREAKER_ID);
end_critical_section(S_CONTROL);
- last_index = time(NULL);
- CtdlLogPrintf(CTDL_DEBUG, "do_fulltext_indexing() finished\n");
+ syslog(LOG_DEBUG, "fulltext: indexing finished");
+ is_running = 0;
return;
}
-/*
- * Main loop for the indexer thread.
- */
-void *indexer_thread(void *arg) {
- struct CitContext indexerCC;
-
- CtdlLogPrintf(CTDL_DEBUG, "indexer_thread() initializing\n");
-
- CtdlFillSystemContext(&indexerCC, "indexer");
- citthread_setspecific(MyConKey, (void *)&indexerCC );
-
- while (!CtdlThreadCheckStop()) {
- do_fulltext_indexing();
- CtdlThreadSleep(300);
- }
-
- CtdlLogPrintf(CTDL_DEBUG, "indexer_thread() exiting\n");
- return NULL;
-}
-
-
/*
* API call to perform searches.
* (This one does the "all of these words" search.)
* Caller is responsible for freeing the message list.
*/
-void ft_search(int *fts_num_msgs, long **fts_msgs, char *search_string) {
+void ft_search(int *fts_num_msgs, long **fts_msgs, const char *search_string) {
int num_tokens = 0;
int *tokens = NULL;
int i, j;
}
free(tokens);
- qsort(all_msgs, num_all_msgs, sizeof(long), longcmp);
-
- /*
- * At this point, if a message appears num_tokens times in the
- * list, then it contains all of the search tokens.
- */
- if (num_all_msgs >= num_tokens)
- for (j=0; j<(num_all_msgs-num_tokens+1); ++j) {
- if (all_msgs[j] == all_msgs[j+num_tokens-1]) {
-
- ++num_ret_msgs;
- if (num_ret_msgs > num_ret_alloc) {
- num_ret_alloc += 64;
- ret_msgs = realloc(ret_msgs,
- (num_ret_alloc*sizeof(long)) );
+ if (all_msgs != NULL) {
+ qsort(all_msgs, num_all_msgs, sizeof(long), longcmp);
+
+ /*
+ * At this point, if a message appears num_tokens times in the
+ * list, then it contains all of the search tokens.
+ */
+ if (num_all_msgs >= num_tokens)
+ for (j=0; j<(num_all_msgs-num_tokens+1); ++j) {
+ if (all_msgs[j] == all_msgs[j+num_tokens-1]) {
+
+ ++num_ret_msgs;
+ if (num_ret_msgs > num_ret_alloc) {
+ num_ret_alloc += 64;
+ ret_msgs = realloc(ret_msgs,
+ (num_ret_alloc*sizeof(long)) );
+ }
+ ret_msgs[num_ret_msgs - 1] = all_msgs[j];
+
+ }
}
- ret_msgs[num_ret_msgs - 1] = all_msgs[j];
-
- }
+ free(all_msgs);
}
-
- free(all_msgs);
}
*fts_num_msgs = num_ret_msgs;
if (CtdlAccessCheck(ac_logged_in)) return;
- if (!config.c_enable_fulltext) {
+ if (!CtdlGetConfigInt("c_enable_fulltext")) {
cprintf("%d Full text index is not enabled on this server.\n",
ERROR + CMD_NOT_SUPPORTED);
return;
cprintf("000\n");
}
+
/*
* Zero out our index cache.
*/
if (room) return;
/* Remove from fulltext index */
- if (config.c_enable_fulltext) {
+ if (CtdlGetConfigInt("c_enable_fulltext")) {
ft_index_message(msgnum, 0);
}
}
+
/*****************************************************************************/
/*
 * Module initialization for the fulltext indexer.
 *
 * When `threading` is zero: initializes the index cache, registers the
 * "SRCH" protocol command, the message-delete hook and the "fulltext"
 * search function, and registers do_fulltext_indexing() as an EVT_TIMER
 * session hook.
 *
 * Returns the module name string used for the log.
 */
CTDL_MODULE_INIT(fulltext)
if (!threading)
{
initialize_ft_cache();
- initialize_noise_words();
CtdlRegisterProtoHook(cmd_srch, "SRCH", "Full text search");
CtdlRegisterDeleteHook(ft_delete_remove);	/* lets ft_delete_remove see message deletions */
CtdlRegisterSearchFuncHook(ft_search, "fulltext");
- CtdlRegisterCleanupHook(noise_word_cleanup);
- }
- else
- {
- CtdlThreadCreate("Indexer", CTDLTHREAD_BIGSTACK, indexer_thread, NULL);
+ CtdlRegisterSessionHook(do_fulltext_indexing, EVT_TIMER, PRIO_CLEANUP + 300);
}
- /* return our Subversion id for the Log */
- return "$Id$";
+ /* return our module name for the log */
+ return "fulltext";
}