This is an omnibus commit which moves the Citadel Server from crusty old GNU Autotool...
[citadel.git] / citadel / server / modules / fulltext / serv_fulltext.c
1 /*
2  * This module handles fulltext indexing of the message base.
3  * Copyright (c) 2005-2022 by the citadel.org team
4  *
5  * This program is open source software; you can redistribute it and/or
6  * modify it under the terms of the GNU General Public License as published
7  * by the Free Software Foundation; either version 3 of the License, or
8  * (at your option) any later version.
9  *
10  * This program is distributed in the hope that it will be useful,
11  * but WITHOUT ANY WARRANTY; without even the implied warranty of
12  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13  * GNU General Public License for more details.
14  *
15  * You should have received a copy of the GNU General Public License
16  * along with this program; if not, write to the Free Software
17  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
18  */
19
20 #include "../../sysdep.h"
21 #include <stdlib.h>
22 #include <unistd.h>
23 #include <stdio.h>
24 #include <fcntl.h>
25 #include <signal.h>
26 #include <pwd.h>
27 #include <errno.h>
28 #include <sys/types.h>
29 #include <time.h>
30 #include <sys/wait.h>
31 #include <string.h>
32 #include <limits.h>
33 #include <libcitadel.h>
34 #include "../../citadel.h"
35 #include "../../server.h"
36 #include "../../citserver.h"
37 #include "../../support.h"
38 #include "../../config.h"
39 #include "../../database.h"
40 #include "../../msgbase.h"
41 #include "../../control.h"
42 #include "serv_fulltext.h"
43 #include "ft_wordbreaker.h"
44 #include "../../threads.h"
45 #include "../../context.h"
46 #include "../../ctdl_module.h"
47
/*
 * Work list for the indexing pass being assembled.  Filled in by
 * ft_index_msg() and consumed (then released) by do_fulltext_indexing().
 * File-scope objects are zero-initialized, so no explicit initializers
 * are needed.
 */
long ft_newhighest;		/* upper bound on message numbers accepted into this pass */
long *ft_newmsgs;		/* growable array of message numbers awaiting indexing */
int ft_num_msgs;		/* slots of ft_newmsgs currently in use */
int ft_num_alloc;		/* slots of ft_newmsgs currently allocated */

/*
 * In-memory cache of index buckets, one slot per token value (0..65535).
 * A NULL pointer means the bucket has not been loaded into the cache.
 */
int ftc_num_msgs[65536];	/* number of message numbers held in each bucket */
long *ftc_msgs[65536];		/* heap array of message numbers for each bucket */
55
56
/*
 * qsort()-style comparator for arrays of long.
 *
 * rec1, rec2	pointers to the two long values being compared
 *
 * Returns 1 if *rec1 > *rec2, -1 if *rec1 < *rec2, and 0 if they are equal.
 */
int longcmp(const void *rec1, const void *rec2) {
	const long lhs = *(const long *)rec1;
	const long rhs = *(const long *)rec2;

	/* Each comparison yields 0 or 1, so the difference is exactly -1, 0 or 1
	 * and cannot overflow the way "lhs - rhs" could.
	 */
	return (lhs > rhs) - (lhs < rhs);
}
70
71
72 /*
73  * Flush our index cache out to disk.
74  */
75 void ft_flush_cache(void) {
76         int i;
77         time_t last_update = 0;
78
79         for (i=0; i<65536; ++i) {
80                 if ((time(NULL) - last_update) >= 10) {
81                         syslog(LOG_INFO,
82                                 "fulltext: flushing index cache to disk (%d%% complete)",
83                                 (i * 100 / 65536)
84                         );
85                         last_update = time(NULL);
86                 }
87                 if (ftc_msgs[i] != NULL) {
88                         cdb_store(CDB_FULLTEXT, &i, sizeof(int), ftc_msgs[i],
89                                 (ftc_num_msgs[i] * sizeof(long)));
90                         ftc_num_msgs[i] = 0;
91                         free(ftc_msgs[i]);
92                         ftc_msgs[i] = NULL;
93                 }
94         }
95         syslog(LOG_INFO, "fulltext: flushed index cache to disk (100%% complete)");
96 }
97
98
99 /*
100  * Index or de-index a message.  (op == 1 to index, 0 to de-index)
101  */
102 void ft_index_message(long msgnum, int op) {
103         int num_tokens = 0;
104         int *tokens = NULL;
105         int i, j;
106         struct cdbdata *cdb_bucket;
107         StrBuf *msgtext;
108         char *txt;
109         int tok;
110         struct CtdlMessage *msg = NULL;
111
112         msg = CtdlFetchMessage(msgnum, 1);
113         if (msg == NULL) {
114                 syslog(LOG_ERR, "fulltext: ft_index_message() could not load msg %ld", msgnum);
115                 return;
116         }
117
118         if (!CM_IsEmpty(msg, eSuppressIdx)) {
119                 syslog(LOG_DEBUG, "fulltext: ft_index_message() excluded msg %ld", msgnum);
120                 CM_Free(msg);
121                 return;
122         }
123
124         syslog(LOG_DEBUG, "fulltext: ft_index_message() %s msg %ld", (op ? "adding" : "removing") , msgnum);
125
126         /* Output the message as text before indexing it, so we don't end up
127          * indexing a bunch of encoded base64, etc.
128          */
129         CC->redirect_buffer = NewStrBufPlain(NULL, SIZ);
130         CtdlOutputPreLoadedMsg(msg, MT_CITADEL, HEADERS_ALL, 0, 1, 0);
131         CM_Free(msg);
132         msgtext = CC->redirect_buffer;
133         CC->redirect_buffer = NULL;
134         if (msgtext != NULL) {
135                 syslog(LOG_DEBUG, "fulltext: wordbreaking message %ld (%d bytes)", msgnum, StrLength(msgtext));
136         }
137         txt = SmashStrBuf(&msgtext);
138         wordbreaker(txt, &num_tokens, &tokens);
139         free(txt);
140
141         syslog(LOG_DEBUG, "fulltext: indexing message %ld [%d tokens]", msgnum, num_tokens);
142         if (num_tokens > 0) {
143                 for (i=0; i<num_tokens; ++i) {
144
145                         /* Add the message to the relevant token bucket */
146
147                         /* search for tokens[i] */
148                         tok = tokens[i];
149
150                         if ( (tok >= 0) && (tok <= 65535) ) {
151                                 /* fetch the bucket, Liza */
152                                 if (ftc_msgs[tok] == NULL) {
153                                         cdb_bucket = cdb_fetch(CDB_FULLTEXT, &tok, sizeof(int));
154                                         if (cdb_bucket != NULL) {
155                                                 ftc_num_msgs[tok] = cdb_bucket->len / sizeof(long);
156                                                 ftc_msgs[tok] = (long *)cdb_bucket->ptr;
157                                                 cdb_bucket->ptr = NULL;
158                                                 cdb_free(cdb_bucket);
159                                         }
160                                         else {
161                                                 ftc_num_msgs[tok] = 0;
162                                                 ftc_msgs[tok] = malloc(sizeof(long));
163                                         }
164                                 }
165         
166         
167                                 if (op == 1) {  /* add to index */
168                                         ++ftc_num_msgs[tok];
169                                         ftc_msgs[tok] = realloc(ftc_msgs[tok],
170                                                                 ftc_num_msgs[tok]*sizeof(long));
171                                         ftc_msgs[tok][ftc_num_msgs[tok] - 1] = msgnum;
172                                 }
173         
174                                 if (op == 0) {  /* remove from index */
175                                         if (ftc_num_msgs[tok] >= 1) {
176                                                 for (j=0; j<ftc_num_msgs[tok]; ++j) {
177                                                         if (ftc_msgs[tok][j] == msgnum) {
178                                                                 memmove(&ftc_msgs[tok][j], &ftc_msgs[tok][j+1], ((ftc_num_msgs[tok] - j - 1)*sizeof(long)));
179                                                                 --ftc_num_msgs[tok];
180                                                                 --j;
181                                                         }
182                                                 }
183                                         }
184                                 }
185                         }
186                         else {
187                                 syslog(LOG_ALERT, "fulltext: invalid token %d !!", tok);
188                         }
189                 }
190
191                 free(tokens);
192         }
193 }
194
195
196 /*
197  * Add a message to the list of those to be indexed.
198  */
199 void ft_index_msg(long msgnum, void *userdata) {
200
201         if ((msgnum > CtdlGetConfigLong("MMfulltext")) && (msgnum <= ft_newhighest)) {
202                 ++ft_num_msgs;
203                 if (ft_num_msgs > ft_num_alloc) {
204                         ft_num_alloc += 1024;
205                         ft_newmsgs = realloc(ft_newmsgs, (ft_num_alloc * sizeof(long)));
206                 }
207                 ft_newmsgs[ft_num_msgs - 1] = msgnum;
208         }
209
210 }
211
212
213 /*
214  * Scan a room for messages to index.
215  */
216 void ft_index_room(struct ctdlroom *qrbuf, void *data)
217 {
218         if (server_shutting_down)
219                 return;
220                 
221         CtdlGetRoom(&CC->room, qrbuf->QRname);
222         CtdlForEachMessage(MSGS_ALL, 0L, NULL, NULL, NULL, ft_index_msg, NULL);
223 }
224
225
226 /*
227  * Begin the fulltext indexing process.
228  */
229 void do_fulltext_indexing(void) {
230         int i;
231         static time_t last_progress = 0L;
232         static int is_running = 0;
233         if (is_running) return;         /* Concurrency check - only one can run */
234         is_running = 1;
235
236         /*
237          * Don't do this if the site doesn't have it enabled.
238          */
239         if (!CtdlGetConfigInt("c_enable_fulltext")) {
240                 return;
241         }
242
243         /*
244          * If we've switched wordbreaker modules, burn the index and start over.
245          */
246         begin_critical_section(S_CONTROL);
247         if (CtdlGetConfigInt("MM_fulltext_wordbreaker") != FT_WORDBREAKER_ID) {
248                 syslog(LOG_DEBUG, "fulltext: wb ver on disk = %d, code ver = %d",
249                         CtdlGetConfigInt("MM_fulltext_wordbreaker"), FT_WORDBREAKER_ID
250                 );
251                 syslog(LOG_INFO, "fulltext: (re)initializing index");
252                 cdb_trunc(CDB_FULLTEXT);
253                 CtdlSetConfigLong("MMfulltext", 0);
254         }
255         end_critical_section(S_CONTROL);
256
257         /*
258          * Silently return if our fulltext index is up to date with new messages.
259          */
260         if ((CtdlGetConfigLong("MMfulltext") >= CtdlGetConfigLong("MMhighest"))) {
261                 return;         /* nothing to do! */
262         }
263
264         /*
265          * Now go through each room and find messages to index.
266          */
267         ft_newhighest = CtdlGetConfigLong("MMhighest");
268         CtdlForEachRoom(ft_index_room, NULL);   /* load all msg pointers */
269
270         if (ft_num_msgs > 0) {
271                 qsort(ft_newmsgs, ft_num_msgs, sizeof(long), longcmp);
272                 for (i=0; i<(ft_num_msgs-1); ++i) { /* purge dups */
273                         if (ft_newmsgs[i] == ft_newmsgs[i+1]) {
274                                 memmove(&ft_newmsgs[i], &ft_newmsgs[i+1],
275                                         ((ft_num_msgs - i - 1)*sizeof(long)));
276                                 --ft_num_msgs;
277                                 --i;
278                         }
279                 }
280
281                 /* Here it is ... do each message! */
282                 for (i=0; i<ft_num_msgs; ++i) {
283                         if (time(NULL) != last_progress) {
284                                 syslog(LOG_DEBUG,
285                                         "fulltext: indexed %d of %d messages (%d%%)",
286                                                 i, ft_num_msgs,
287                                                 ((i*100) / ft_num_msgs)
288                                 );
289                                 last_progress = time(NULL);
290                         }
291                         ft_index_message(ft_newmsgs[i], 1);
292
293                         /* Check to see if we need to quit early */
294                         if (server_shutting_down) {
295                                 syslog(LOG_DEBUG, "fulltext: indexer quitting early");
296                                 ft_newhighest = ft_newmsgs[i];
297                                 break;
298                         }
299
300                         /* Check to see if we have to maybe flush to disk */
301                         if (i >= FT_MAX_CACHE) {
302                                 syslog(LOG_DEBUG, "fulltext: time to flush.");
303                                 ft_newhighest = ft_newmsgs[i];
304                                 break;
305                         }
306
307                 }
308
309                 free(ft_newmsgs);
310                 ft_num_msgs = 0;
311                 ft_num_alloc = 0;
312                 ft_newmsgs = NULL;
313         }
314
315         if (server_shutting_down) {
316                 is_running = 0;
317                 return;
318         }
319         
320         /* Save our place so we don't have to do this again */
321         ft_flush_cache();
322         begin_critical_section(S_CONTROL);
323         CtdlSetConfigLong("MMfulltext", ft_newhighest);
324         CtdlSetConfigInt("MM_fulltext_wordbreaker", FT_WORDBREAKER_ID);
325         end_critical_section(S_CONTROL);
326
327         syslog(LOG_DEBUG, "fulltext: indexing finished");
328         is_running = 0;
329         return;
330 }
331
332
333 /*
334  * API call to perform searches.
335  * (This one does the "all of these words" search.)
336  * Caller is responsible for freeing the message list.
337  */
338 void ft_search(int *fts_num_msgs, long **fts_msgs, const char *search_string) {
339         int num_tokens = 0;
340         int *tokens = NULL;
341         int i, j;
342         struct cdbdata *cdb_bucket;
343         int num_all_msgs = 0;
344         long *all_msgs = NULL;
345         int num_ret_msgs = 0;
346         int num_ret_alloc = 0;
347         long *ret_msgs = NULL;
348         int tok;
349
350         wordbreaker(search_string, &num_tokens, &tokens);
351         if (num_tokens > 0) {
352                 for (i=0; i<num_tokens; ++i) {
353
354                         /* search for tokens[i] */
355                         tok = tokens[i];
356
357                         /* fetch the bucket, Liza */
358                         if (ftc_msgs[tok] == NULL) {
359                                 cdb_bucket = cdb_fetch(CDB_FULLTEXT, &tok, sizeof(int));
360                                 if (cdb_bucket != NULL) {
361                                         ftc_num_msgs[tok] = cdb_bucket->len / sizeof(long);
362                                         ftc_msgs[tok] = (long *)cdb_bucket->ptr;
363                                         cdb_bucket->ptr = NULL;
364                                         cdb_free(cdb_bucket);
365                                 }
366                                 else {
367                                         ftc_num_msgs[tok] = 0;
368                                         ftc_msgs[tok] = malloc(sizeof(long));
369                                 }
370                         }
371
372                         num_all_msgs += ftc_num_msgs[tok];
373                         if (num_all_msgs > 0) {
374                                 all_msgs = realloc(all_msgs, num_all_msgs*sizeof(long) );
375                                 memcpy(&all_msgs[num_all_msgs-ftc_num_msgs[tok]],
376                                         ftc_msgs[tok], ftc_num_msgs[tok]*sizeof(long) );
377                         }
378
379                 }
380                 free(tokens);
381                 if (all_msgs != NULL) {
382                         qsort(all_msgs, num_all_msgs, sizeof(long), longcmp);
383
384                         /*
385                          * At this point, if a message appears num_tokens times in the
386                          * list, then it contains all of the search tokens.
387                          */
388                         if (num_all_msgs >= num_tokens)
389                                 for (j=0; j<(num_all_msgs-num_tokens+1); ++j) {
390                                         if (all_msgs[j] == all_msgs[j+num_tokens-1]) {
391                                                 
392                                                 ++num_ret_msgs;
393                                                 if (num_ret_msgs > num_ret_alloc) {
394                                                         num_ret_alloc += 64;
395                                                         ret_msgs = realloc(ret_msgs,
396                                                                            (num_ret_alloc*sizeof(long)) );
397                                                 }
398                                                 ret_msgs[num_ret_msgs - 1] = all_msgs[j];
399                                                 
400                                         }
401                                 }
402                         free(all_msgs);
403                 }
404         }
405
406         *fts_num_msgs = num_ret_msgs;
407         *fts_msgs = ret_msgs;
408 }
409
410
411 /*
412  * This search command is for diagnostic purposes and may be removed or replaced.
413  */
414 void cmd_srch(char *argbuf) {
415         int num_msgs = 0;
416         long *msgs = NULL;
417         int i;
418         char search_string[256];
419
420         if (CtdlAccessCheck(ac_logged_in)) return;
421
422         if (!CtdlGetConfigInt("c_enable_fulltext")) {
423                 cprintf("%d Full text index is not enabled on this server.\n",
424                         ERROR + CMD_NOT_SUPPORTED);
425                 return;
426         }
427
428         extract_token(search_string, argbuf, 0, '|', sizeof search_string);
429         ft_search(&num_msgs, &msgs, search_string);
430
431         cprintf("%d %d msgs match all search words:\n",
432                 LISTING_FOLLOWS, num_msgs);
433         if (num_msgs > 0) {
434                 for (i=0; i<num_msgs; ++i) {
435                         cprintf("%ld\n", msgs[i]);
436                 }
437         }
438         if (msgs != NULL) free(msgs);
439         cprintf("000\n");
440 }
441
442
443 /*
444  * Zero out our index cache.
445  */
446 void initialize_ft_cache(void) {
447         memset(ftc_num_msgs, 0, (65536 * sizeof(int)));
448         memset(ftc_msgs, 0, (65536 * sizeof(long *)));
449 }
450
451
/*
 * Delete hook: take a message out of the fulltext index.
 *
 * room		when non-NULL the call is ignored
 *		NOTE(review): presumably a NULL room means the message is gone
 *		altogether rather than removed from one room -- confirm
 *		against the code that invokes delete hooks
 * msgnum	number of the message to de-index
 *
 * The message is de-indexed only when "c_enable_fulltext" is set.  That
 * setting is not consulted at all when room is non-NULL.
 */
void ft_delete_remove(char *room, long msgnum)
{
	if ((room == NULL) && CtdlGetConfigInt("c_enable_fulltext")) {
		/* Remove from fulltext index */
		ft_index_message(msgnum, 0);
	}
}
461
462
463 /*****************************************************************************/
464
465 char *ctdl_module_init_fulltext(void) {
466         if (!threading) {
467                 initialize_ft_cache();
468                 CtdlRegisterProtoHook(cmd_srch, "SRCH", "Full text search");
469                 CtdlRegisterDeleteHook(ft_delete_remove);
470                 CtdlRegisterSearchFuncHook(ft_search, "fulltext");
471                 CtdlRegisterSessionHook(do_fulltext_indexing, EVT_TIMER, PRIO_CLEANUP + 300);
472         }
473         /* return our module name for the log */
474         return "fulltext";
475 }