]> code.citadel.org Git - citadel.git/blob - citadel/ft_wordbreaker.c
* Finished the indexer *and* deindexer! The search API is now working
[citadel.git] / citadel / ft_wordbreaker.c
1 /*
2  * $Id$
3  *
4  * Default wordbreaker module for full text indexing.
5  *
6  */
7
8
9 #include "sysdep.h"
10 #include <stdlib.h>
11 #include <unistd.h>
12 #include <stdio.h>
13 #include <fcntl.h>
14 #include <signal.h>
15 #include <pwd.h>
16 #include <errno.h>
17 #include <sys/types.h>
18
19 #if TIME_WITH_SYS_TIME
20 # include <sys/time.h>
21 # include <time.h>
22 #else
23 # if HAVE_SYS_TIME_H
24 #  include <sys/time.h>
25 # else
26 #  include <time.h>
27 # endif
28 #endif
29
30 #include <sys/wait.h>
31 #include <ctype.h>
32 #include <string.h>
33 #include <limits.h>
34 #include "citadel.h"
35 #include "server.h"
36 #include "sysdep_decls.h"
37 #include "citserver.h"
38 #include "support.h"
39 #include "config.h"
40 #include "serv_extensions.h"
41 #include "database.h"
42 #include "msgbase.h"
43 #include "control.h"
44 #include "tools.h"
45 #include "ft_wordbreaker.h"
46 #include "crc16.h"
47
/*
 * Ordering callback for two int values, usable with qsort().
 *
 * rec1, rec2  pointers to the ints being compared.
 *
 * Returns 1 if the first value is greater than the second, -1 if it is
 * smaller, and 0 if both are equal.
 */
int intcmp(const void *rec1, const void *rec2) {
	const int lhs = *(const int *)rec1;
	const int rhs = *(const int *)rec2;

	/* Each comparison yields 0 or 1, so the difference is -1, 0 or 1 */
	return (lhs > rhs) - (lhs < rhs);
}
61
62
63 void wordbreaker(char *text, int *num_tokens, int **tokens) {
64
65         int wb_num_tokens = 0;
66         int wb_num_alloc = 0;
67         int *wb_tokens = NULL;
68
69         char *ptr;
70         char *word_start;
71         char *word_end;
72         char ch;
73         int word_len;
74         char word[256];
75         int i;
76         int word_crc;
77
78         if (text == NULL) {             /* no NULL text please */
79                 *num_tokens = 0;
80                 *tokens = NULL;
81                 return;
82         }
83
84         if (text[0] == 0) {             /* no empty text either */
85                 *num_tokens = 0;
86                 *tokens = NULL;
87                 return;
88         }
89
90         ptr = text;
91         word_start = NULL;
92         while (*ptr) {
93                 ch = *ptr;
94                 if (isalnum(ch)) {
95                         if (!word_start) {
96                                 word_start = ptr;
97                         }
98                 }
99                 ++ptr;
100                 ch = *ptr;
101                 if ( (!isalnum(ch)) && (word_start) ) {
102                         word_end = ptr;
103                         --word_end;
104
105                         /* extract the word */
106                         word_len = word_end - word_start + 1;
107                         safestrncpy(word, word_start, sizeof word);
108                         word[word_len] = 0;
109                         word_start = NULL;
110
111                         /* are we ok with the length? */
112                         if ( (word_len >= WB_MIN)
113                            && (word_len <= WB_MAX) ) {
114                                 for (i=0; i<word_len; ++i) {
115                                         word[i] = tolower(word[i]);
116                                 }
117                                 word_crc = (int)
118                                         CalcCRC16Bytes(word_len, word);
119
120                                 ++wb_num_tokens;
121                                 if (wb_num_tokens > wb_num_alloc) {
122                                         wb_num_alloc += 512;
123                                         wb_tokens = realloc(wb_tokens, (sizeof(int) * wb_num_alloc));
124                                 }
125                                 wb_tokens[wb_num_tokens - 1] = word_crc;
126                         }
127                 }
128         }
129
130         /* sort and purge dups */
131         if (wb_num_tokens > 1) {
132                 qsort(wb_tokens, wb_num_tokens, sizeof(int), intcmp);
133                 for (i=0; i<(wb_num_tokens-1); ++i) {
134                         if (wb_tokens[i] == wb_tokens[i+1]) {
135                                 memmove(&wb_tokens[i], &wb_tokens[i+1],
136                                         ((wb_num_tokens - i - 1)*sizeof(int)));
137                                 --wb_num_tokens;
138                         }
139                 }
140         }
141
142         *num_tokens = wb_num_tokens;
143         *tokens = wb_tokens;
144 }
145