Updated more modules to the new logging standard
[citadel.git] / citadel / modules / fulltext / ft_wordbreaker.c
1 /*
2  * Default wordbreaker module for full text indexing.
3  *
4  * Copyright (c) 2005-2017 by the citadel.org team
5  *
6  * This program is open source software; you can redistribute it and/or modify
7  * it under the terms of the GNU General Public License version 3.
8  *
9  * This program is distributed in the hope that it will be useful,
10  * but WITHOUT ANY WARRANTY; without even the implied warranty of
11  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
12  * GNU General Public License for more details.
13  */
14
15 #include "sysdep.h"
16 #include <stdlib.h>
17 #include <unistd.h>
18 #include <stdio.h>
19 #include <fcntl.h>
20 #include <signal.h>
21 #include <pwd.h>
22 #include <errno.h>
23 #include <sys/types.h>
24
25 #if TIME_WITH_SYS_TIME
26 # include <sys/time.h>
27 # include <time.h>
28 #else
29 # if HAVE_SYS_TIME_H
30 #  include <sys/time.h>
31 # else
32 #  include <time.h>
33 # endif
34 #endif
35
36 #include <sys/wait.h>
37 #include <ctype.h>
38 #include <string.h>
39 #include <limits.h>
40 #include <libcitadel.h>
41 #include "citadel.h"
42 #include "server.h"
43 #include "sysdep_decls.h"
44 #include "citserver.h"
45 #include "support.h"
46 #include "config.h"
47 #include "database.h"
48 #include "msgbase.h"
49 #include "control.h"
50 #include "ft_wordbreaker.h"
51 #include "crc16.h"
52 #include "ctdl_module.h"
53
54 /*
55  * Noise words are not included in search indices.
56  * NOTE: if the noise word list is altered in any way, the FT_WORDBREAKER_ID
57  * must also be changed, so that the index is rebuilt.
58  */
59
60 noise_word *noise_words[26];
61
/*
 * Source list for the noise word lookup table.
 *
 * The entries are string literals and are never modified, so both the
 * strings and the table itself are const.  initialize_noise_words() files
 * each entry under its first letter, so every word must be lowercase and
 * begin with 'a'..'z'.
 */
static const char *const noise_words_init[] = {
	"about",
	"after",
	"also",
	"another",
	"because",
	"been",
	"before",
	"being",
	"between",
	"both",
	"came",
	"come",
	"could",
	"each",
	"from",
	"have",
	"here",
	"himself",
	"into",
	"like",
	"make",
	"many",
	"might",
	"more",
	"most",
	"much",
	"must",
	"never",
	"only",
	"other",
	"over",
	"said",
	"same",
	"should",
	"since",
	"some",
	"still",
	"such",
	"take",
	"than",
	"that",
	"their",
	"them",
	"then",
	"there",
	"these",
	"they",
	"this",
	"those",
	"through",
	"under",
	"very",
	"well",
	"were",
	"what",
	"where",
	"which",
	"while",
	"with",
	"would",
	"your"
};
125
126
127 void initialize_noise_words(void)
128 {
129         int i;
130         int len;
131         int ch;
132         noise_word *next;
133         
134         memset (noise_words, 0, sizeof(noise_words));
135         
136         for (i=0; i<(sizeof(noise_words_init)/sizeof(char *)); ++i)
137         {
138                 ch = noise_words_init[i][0] - 'a';
139                 len = strlen(noise_words_init[i]);
140                 
141                 next = malloc(sizeof(noise_word));
142                 next->len = len;
143                 next->word = strdup(noise_words_init[i]);
144                 next->next = noise_words[ch];
145                 noise_words[ch] = next;
146         }
147 }
148
149
150 void noise_word_cleanup(void)
151 {
152         int i;
153         noise_word *cur, *next;
154         
155         syslog(LOG_INFO, "wordbreaker: cleaning up fulltext noise words");
156         
157         for (i = 0 ; i < 26 ; i++)
158         {
159                 cur = noise_words[i];
160                 while (cur)
161                 {
162                         next = cur->next;
163                         free(cur->word);
164                         free(cur);
165                         cur = next;
166                 }
167         }
168 }
169
/*
 * qsort()-style comparator for two int values.
 *
 * rec1 and rec2 each point to an int.  Returns 1 if the first is greater,
 * -1 if it is smaller, and 0 if they are equal.  The result is built from
 * the two comparisons instead of a subtraction, so it cannot overflow.
 */
int intcmp(const void *rec1, const void *rec2) {
	const int lhs = *(const int *) rec1;
	const int rhs = *(const int *) rec2;

	return (lhs > rhs) - (lhs < rhs);
}
183
184
185 void wordbreaker(const char *text, int *num_tokens, int **tokens) {
186
187         int wb_num_tokens = 0;
188         int wb_num_alloc = 0;
189         int *wb_tokens = NULL;
190
191         const char *ptr;
192         const char *word_start;
193         const char *word_end;
194         char ch;
195         int word_len;
196         char word[256];
197         int i;
198         int word_crc;
199         noise_word *noise;
200         
201         
202         if (text == NULL) {             /* no NULL text please */
203                 *num_tokens = 0;
204                 *tokens = NULL;
205                 return;
206         }
207
208         if (text[0] == 0) {             /* no empty text either */
209                 *num_tokens = 0;
210                 *tokens = NULL;
211                 return;
212         }
213
214         ptr = text;
215         word_start = NULL;
216         while (*ptr) {
217                 ch = *ptr;
218                 if (isalnum(ch)) {
219                         if (!word_start) {
220                                 word_start = ptr;
221                         }
222                 }
223                 ++ptr;
224                 ch = *ptr;
225                 if ( (!isalnum(ch)) && (word_start) ) {
226                         word_end = ptr;
227
228                         /* extract the word */
229                         word_len = word_end - word_start;
230                         if (word_len >= sizeof word) {
231                                 syslog(LOG_DEBUG, "wordbreaker: invalid word length: %d", word_len);
232                                 safestrncpy(word, word_start, sizeof word);
233                                 word[(sizeof word) - 1] = 0;
234                         }
235                         else {
236                                 safestrncpy(word, word_start, word_len+1);
237                                 word[word_len] = 0;
238                         }
239                         word_start = NULL;
240
241                         /* are we ok with the length? */
242                         if ( (word_len >= WB_MIN)
243                            && (word_len <= WB_MAX) ) {
244                                 for (i=0; i<word_len; ++i) {
245                                         word[i] = tolower(word[i]);
246                                 }
247                                 /* disqualify noise words */
248                                 noise = noise_words[(int) (word[0]-'a')];
249                                 while (noise)
250                                 {
251                                         if (noise->len == word_len)
252                                         {
253                                                 if (!strcmp(word, noise->word)) 
254                                                 {
255                                                         word_len = 0;
256                                                         break;
257                                                 }
258                                         }
259                                         noise = noise->next;
260                                 }
261                                 if (word_len == 0)
262                                         continue;
263
264                                 word_crc = (int)
265                                         CalcCRC16Bytes(word_len, word);
266
267                                 ++wb_num_tokens;
268                                 if (wb_num_tokens > wb_num_alloc) {
269                                         wb_num_alloc += 512;
270                                         wb_tokens = realloc(wb_tokens, (sizeof(int) * wb_num_alloc));
271                                 }
272                                 wb_tokens[wb_num_tokens - 1] = word_crc;
273                         }
274                 }
275         }
276
277         /* sort and purge dups */
278         if (wb_num_tokens > 1) {
279                 qsort(wb_tokens, wb_num_tokens, sizeof(int), intcmp);
280                 for (i=0; i<(wb_num_tokens-1); ++i) {
281                         if (wb_tokens[i] == wb_tokens[i+1]) {
282                                 memmove(&wb_tokens[i], &wb_tokens[i+1],
283                                         ((wb_num_tokens - i - 1)*sizeof(int)));
284                                 --wb_num_tokens;
285                                 --i;
286                         }
287                 }
288         }
289
290         *num_tokens = wb_num_tokens;
291         *tokens = wb_tokens;
292 }
293