]> code.citadel.org Git - citadel.git/blob - citadel/ft_wordbreaker.c
* Completed the wordbreaker for the fulltext indexer.
[citadel.git] / citadel / ft_wordbreaker.c
1 /*
2  * $Id$
3  *
4  * Default wordbreaker module for full text indexing.
5  *
6  */
7
8
9 #include "sysdep.h"
10 #include <stdlib.h>
11 #include <unistd.h>
12 #include <stdio.h>
13 #include <fcntl.h>
14 #include <signal.h>
15 #include <pwd.h>
16 #include <errno.h>
17 #include <sys/types.h>
18
19 #if TIME_WITH_SYS_TIME
20 # include <sys/time.h>
21 # include <time.h>
22 #else
23 # if HAVE_SYS_TIME_H
24 #  include <sys/time.h>
25 # else
26 #  include <time.h>
27 # endif
28 #endif
29
30 #include <sys/wait.h>
31 #include <ctype.h>
32 #include <string.h>
33 #include <limits.h>
34 #include "citadel.h"
35 #include "server.h"
36 #include "sysdep_decls.h"
37 #include "citserver.h"
38 #include "support.h"
39 #include "config.h"
40 #include "serv_extensions.h"
41 #include "database.h"
42 #include "msgbase.h"
43 #include "control.h"
44 #include "tools.h"
45 #include "ft_wordbreaker.h"
46 #include "crc16.h"
47
48
49 void wordbreaker(char *text, int *num_tokens, int **tokens) {
50
51         int wb_num_tokens = 0;
52         int wb_num_alloc = 0;
53         int *wb_tokens = NULL;
54
55         char *ptr;
56         char *word_start;
57         char *word_end;
58         char ch;
59         int word_len;
60         char word[256];
61         int i;
62         int word_crc;
63
64         if (text == NULL) {             /* no NULL text please */
65                 *num_tokens = 0;
66                 *tokens = NULL;
67                 return;
68         }
69
70         if (text[0] == 0) {             /* no empty text either */
71                 *num_tokens = 0;
72                 *tokens = NULL;
73                 return;
74         }
75
76         ptr = text;
77         word_start = NULL;
78         while (ptr++, *ptr) {
79                 ch = *ptr;
80                 if (isalnum(ch)) {
81                         if (!word_start) {
82                                 word_start = ptr;
83                         }
84                 }
85                 else {
86                         if (word_start) {
87                                 word_end = ptr;
88                                 --word_end;
89
90                                 /* extract the word */
91                                 word_len = word_end - word_start + 1;
92                                 safestrncpy(word, word_start, sizeof word);
93                                 word[word_len] = 0;
94                                 word_start = NULL;
95
96                                 /* are we ok with the length? */
97                                 if ( (word_len >= WB_MIN)
98                                    && (word_len <= WB_MAX) ) {
99                                         for (i=0; i<word_len; ++i) {
100                                                 word[i] = tolower(word[i]);
101                                         }
102                                         word_crc = (int)
103                                                 CalcCRC16Bytes(word_len, word);
104
105                                         ++wb_num_tokens;
106                                         if (wb_num_tokens > wb_num_alloc) {
107                                                 wb_num_alloc += 512;
108                                                 wb_tokens = realloc(wb_tokens, (sizeof(int) * wb_num_alloc));
109                                         }
110                                         wb_tokens[wb_num_tokens - 1] = word_crc;
111                                 }
112                         }
113                 }
114         }
115
116         *num_tokens = wb_num_tokens;
117         *tokens = wb_tokens;
118 }
119