misc style cleanup
[citadel.git] / citadel / server / modules / fulltext / ft_wordbreaker.c
1 // Default wordbreaker module for full text indexing.
2 //
3 // Copyright (c) 2005-2024 by the citadel.org team
4 // This program is open source software.  Use, duplication, or disclosure is subject to the GNU General Public License v3.
5
6 #include "../../sysdep.h"
7 #include <stdlib.h>
8 #include <unistd.h>
9 #include <stdio.h>
10 #include <fcntl.h>
11 #include <signal.h>
12 #include <pwd.h>
13 #include <errno.h>
14 #include <sys/types.h>
15 #include <time.h>
16 #include <sys/wait.h>
17 #include <ctype.h>
18 #include <string.h>
19 #include <limits.h>
20 #include <libcitadel.h>
21 #include "../../citadel_defs.h"
22 #include "../../server.h"
23 #include "../../sysdep_decls.h"
24 #include "../../citserver.h"
25 #include "../../support.h"
26 #include "../../config.h"
27 #include "../../database.h"
28 #include "../../msgbase.h"
29 #include "../../control.h"
30 #include "ft_wordbreaker.h"
31 #include "crc16.h"
32 #include "../../ctdl_module.h"
33
// Noise words are not included in search indices.
// NOTE: if the noise word list is altered in any way, the FT_WORDBREAKER_ID must also be changed, so that the index is rebuilt.
// The table is read-only, so both the strings and the pointers to them are declared const.
static const char *const noise_words[] = {
        "about",
        "after",
        "also",
        "another",
        "because",
        "been",
        "before",
        "being",
        "between",
        "both",
        "came",
        "come",
        "could",
        "each",
        "from",
        "have",
        "here",
        "himself",
        "into",
        "like",
        "make",
        "many",
        "might",
        "more",
        "most",
        "much",
        "must",
        "never",
        "only",
        "other",
        "over",
        "said",
        "same",
        "should",
        "since",
        "some",
        "still",
        "such",
        "take",
        "than",
        "that",
        "their",
        "them",
        "then",
        "there",
        "these",
        "they",
        "this",
        "those",
        "through",
        "under",
        "very",
        "well",
        "were",
        "what",
        "where",
        "which",
        "while",
        "with",
        "would",
        "your"
};
// Number of entries in noise_words[]; derived from the array itself so it tracks the element type.
#define NUM_NOISE (sizeof(noise_words) / sizeof(noise_words[0]))
100
101
102 // Compare function
// Three-way comparison callback for arrays of int (qsort-style signature).
// rec1 and rec2 each point to an int.  Returns 1 if the first is greater, -1 if it is smaller, 0 if they are equal.
int intcmp(const void *rec1, const void *rec2) {
        const int lhs = *(const int *)rec1;
        const int rhs = *(const int *)rec2;

        // Each relational test yields 0 or 1, so the difference is exactly -1, 0 or 1.
        return (lhs > rhs) - (lhs < rhs);
}
113
114
115 Array *wordbreaker(const char *text) {
116         const char *ptr;
117         const char *word_start;
118         const char *word_end;
119         char ch;
120         int word_len;
121         char word[256];
122         int i;
123         int word_crc;
124
125         if (text == NULL) {             // no NULL text please
126                 return(NULL);
127         }
128
129         if (text[0] == 0) {             // no empty text either
130                 return(NULL);
131         }
132
133         Array *found_tokens = array_new(sizeof(int));
134         if (found_tokens == NULL) {
135                 return(NULL);
136         }
137
138         ptr = text;
139         word_start = NULL;
140         while (*ptr) {
141                 ch = *ptr;
142                 if (isalnum(ch)) {
143                         if (!word_start) {
144                                 word_start = ptr;
145                         }
146                 }
147                 ++ptr;
148                 ch = *ptr;
149                 if ( (!isalnum(ch)) && (word_start) ) {
150                         word_end = ptr;
151
152                         // extract the word
153                         word_len = word_end - word_start;
154                         if (word_len >= sizeof word) {
155                                 syslog(LOG_DEBUG, "wordbreaker: invalid word length: %d", word_len);
156                                 safestrncpy(word, word_start, sizeof word);
157                                 word[(sizeof word) - 1] = 0;
158                         }
159                         else {
160                                 safestrncpy(word, word_start, word_len+1);
161                                 word[word_len] = 0;
162                         }
163                         word_start = NULL;
164
165                         // are we ok with the length?
166                         if ( (word_len >= WB_MIN) && (word_len <= WB_MAX) ) {
167                                 for (i=0; i<word_len; ++i) {
168                                         word[i] = tolower(word[i]);
169                                 }
170                                 // disqualify noise words
171                                 for (i=0; i<NUM_NOISE; ++i) {
172                                         if (!strcasecmp(word, noise_words[i])) {
173                                                 word_len = 0;
174                                                 break;
175                                         }
176                                 }
177                                 // add it to the array (FIXME make this case insensitive)
178                                 if (word_len > 0) {
179                                         word_crc = (int) CalcCRC16Bytes(word_len, word);
180                                         array_append(found_tokens, &word_crc);
181                                 }
182                         }
183                 }
184         }
185
186         // sort and purge dups
187         if (array_len(found_tokens) > 1) {
188                 array_sort(found_tokens, intcmp);
189                 for (i=0; i<(array_len(found_tokens)); ++i) {
190                         if (array_get_element_at(found_tokens, i) == array_get_element_at(found_tokens, i+1)) {
191                                 array_delete_element_at(found_tokens, i);
192                                 --i;
193                         }
194                 }
195         }
196         return(found_tokens);
197 }