4e1059a7f63ad982f2ffc186d5c5d5c987f59e3f
[citadel.git] / citadel / modules / fulltext / ft_wordbreaker.c
1 /*
2  * Default wordbreaker module for full text indexing.
3  *
4  * Copyright (c) 2005-2017 by the citadel.org team
5  *
6  * This program is open source software; you can redistribute it and/or modify
7  * it under the terms of the GNU General Public License version 3.
8  *
9  * This program is distributed in the hope that it will be useful,
10  * but WITHOUT ANY WARRANTY; without even the implied warranty of
11  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
12  * GNU General Public License for more details.
13  */
14
15 #include "sysdep.h"
16 #include <stdlib.h>
17 #include <unistd.h>
18 #include <stdio.h>
19 #include <fcntl.h>
20 #include <signal.h>
21 #include <pwd.h>
22 #include <errno.h>
23 #include <sys/types.h>
24
25 #if TIME_WITH_SYS_TIME
26 # include <sys/time.h>
27 # include <time.h>
28 #else
29 # if HAVE_SYS_TIME_H
30 #  include <sys/time.h>
31 # else
32 #  include <time.h>
33 # endif
34 #endif
35
36 #include <sys/wait.h>
37 #include <ctype.h>
38 #include <string.h>
39 #include <limits.h>
40 #include <libcitadel.h>
41 #include "citadel.h"
42 #include "server.h"
43 #include "sysdep_decls.h"
44 #include "citserver.h"
45 #include "support.h"
46 #include "config.h"
47 #include "database.h"
48 #include "msgbase.h"
49 #include "control.h"
50 #include "ft_wordbreaker.h"
51 #include "crc16.h"
52 #include "ctdl_module.h"
53
54 /*
55  * Noise words are not included in search indices.
56  * NOTE: if the noise word list is altered in any way, the FT_WORDBREAKER_ID
57  * must also be changed, so that the index is rebuilt.
58  */
59 static char *noise_words[] = {
60         "about",
61         "after",
62         "also",
63         "another",
64         "because",
65         "been",
66         "before",
67         "being",
68         "between",
69         "both",
70         "came",
71         "come",
72         "could",
73         "each",
74         "from",
75         "have",
76         "here",
77         "himself",
78         "into",
79         "like",
80         "make",
81         "many",
82         "might",
83         "more",
84         "most",
85         "much",
86         "must",
87         "never",
88         "only",
89         "other",
90         "over",
91         "said",
92         "same",
93         "should",
94         "since",
95         "some",
96         "still",
97         "such",
98         "take",
99         "than",
100         "that",
101         "their",
102         "them",
103         "then",
104         "there",
105         "these",
106         "they",
107         "this",
108         "those",
109         "through",
110         "under",
111         "very",
112         "well",
113         "were",
114         "what",
115         "where",
116         "which",
117         "while",
118         "with",
119         "would",
120         "your"
121 };
122 #define NUM_NOISE (sizeof(noise_words) / sizeof(char *))
123
124
125 /*
126  * Compare function
127  */
128 int intcmp(const void *rec1, const void *rec2) {
129         int i1, i2;
130
131         i1 = *(const int *)rec1;
132         i2 = *(const int *)rec2;
133
134         if (i1 > i2) return(1);
135         if (i1 < i2) return(-1);
136         return(0);
137 }
138
139
140 void wordbreaker(const char *text, int *num_tokens, int **tokens) {
141
142         int wb_num_tokens = 0;
143         int wb_num_alloc = 0;
144         int *wb_tokens = NULL;
145
146         const char *ptr;
147         const char *word_start;
148         const char *word_end;
149         char ch;
150         int word_len;
151         char word[256];
152         int i;
153         int word_crc;
154         
155         if (text == NULL) {             /* no NULL text please */
156                 *num_tokens = 0;
157                 *tokens = NULL;
158                 return;
159         }
160
161         if (text[0] == 0) {             /* no empty text either */
162                 *num_tokens = 0;
163                 *tokens = NULL;
164                 return;
165         }
166
167         ptr = text;
168         word_start = NULL;
169         while (*ptr) {
170                 ch = *ptr;
171                 if (isalnum(ch)) {
172                         if (!word_start) {
173                                 word_start = ptr;
174                         }
175                 }
176                 ++ptr;
177                 ch = *ptr;
178                 if ( (!isalnum(ch)) && (word_start) ) {
179                         word_end = ptr;
180
181                         /* extract the word */
182                         word_len = word_end - word_start;
183                         if (word_len >= sizeof word) {
184                                 syslog(LOG_DEBUG, "wordbreaker: invalid word length: %d", word_len);
185                                 safestrncpy(word, word_start, sizeof word);
186                                 word[(sizeof word) - 1] = 0;
187                         }
188                         else {
189                                 safestrncpy(word, word_start, word_len+1);
190                                 word[word_len] = 0;
191                         }
192                         word_start = NULL;
193
194                         /* are we ok with the length? */
195                         if ( (word_len >= WB_MIN) && (word_len <= WB_MAX) ) {
196                                 for (i=0; i<word_len; ++i) {
197                                         word[i] = tolower(word[i]);
198                                 }
199                                 /* disqualify noise words */
200                                 for (i=0; i<NUM_NOISE; ++i) {
201                                         if (!strcmp(word, noise_words[i])) {
202                                                 word_len = 0;
203                                                 break;
204                                         }
205                                 }
206
207                                 if (word_len == 0)
208                                         continue;
209
210                                 word_crc = (int) CalcCRC16Bytes(word_len, word);
211
212                                 ++wb_num_tokens;
213                                 if (wb_num_tokens > wb_num_alloc) {
214                                         wb_num_alloc += 512;
215                                         wb_tokens = realloc(wb_tokens, (sizeof(int) * wb_num_alloc));
216                                 }
217                                 wb_tokens[wb_num_tokens - 1] = word_crc;
218                         }
219                 }
220         }
221
222         /* sort and purge dups */
223         if (wb_num_tokens > 1) {
224                 qsort(wb_tokens, wb_num_tokens, sizeof(int), intcmp);
225                 for (i=0; i<(wb_num_tokens-1); ++i) {
226                         if (wb_tokens[i] == wb_tokens[i+1]) {
227                                 memmove(&wb_tokens[i], &wb_tokens[i+1],
228                                         ((wb_num_tokens - i - 1)*sizeof(int)));
229                                 --wb_num_tokens;
230                                 --i;
231                         }
232                 }
233         }
234
235         *num_tokens = wb_num_tokens;
236         *tokens = wb_tokens;
237 }
238