citadel/modules/fulltext/ft_wordbreaker.c

   1 /*
   2  * $Id$
   3  *
   4  * Default wordbreaker module for full text indexing.
   5  *
   6  */
   7
   8
   9 #include "sysdep.h"
  10 #include <stdlib.h>
  11 #include <unistd.h>
  12 #include <stdio.h>
  13 #include <fcntl.h>
  14 #include <signal.h>
  15 #include <pwd.h>
  16 #include <errno.h>
  17 #include <sys/types.h>
  18
  19 #if TIME_WITH_SYS_TIME
  20 # include <sys/time.h>
  21 # include <time.h>
  22 #else
  23 # if HAVE_SYS_TIME_H
  24 #  include <sys/time.h>
  25 # else
  26 #  include <time.h>
  27 # endif
  28 #endif
  29
  30 #include <sys/wait.h>
  31 #include <ctype.h>
  32 #include <string.h>
  33 #include <limits.h>
  34 #include <libcitadel.h>
  35 #include "citadel.h"
  36 #include "server.h"
  37 #include "sysdep_decls.h"
  38 #include "citserver.h"
  39 #include "support.h"
  40 #include "config.h"
  41 #include "database.h"
  42 #include "msgbase.h"
  43 #include "control.h"
  44 #include "ft_wordbreaker.h"
  45 #include "crc16.h"
  46
  47 /*
  48  * Noise words are not included in search indices.
  49  * NOTE: if the noise word list is altered in any way, the FT_WORDBREAKER_ID
  50  * must also be changed, so that the index is rebuilt.
  51  */
  52 static char *noise_words[] = {
  53         "about",
  54         "after",
  55         "also",
  56         "another",
  57         "because",
  58         "been",
  59         "before",
  60         "being",
  61         "between",
  62         "both",
  63         "came",
  64         "come",
  65         "could",
  66         "each",
  67         "from",
  68         "have",
  69         "here",
  70         "himself",
  71         "into",
  72         "like",
  73         "make",
  74         "many",
  75         "might",
  76         "more",
  77         "most",
  78         "much",
  79         "must",
  80         "never",
  81         "only",
  82         "other",
  83         "over",
  84         "said",
  85         "same",
  86         "should",
  87         "since",
  88         "some",
  89         "still",
  90         "such",
  91         "take",
  92         "than",
  93         "that",
  94         "their",
  95         "them",
  96         "then",
  97         "there",
  98         "these",
  99         "they",
 100         "this",
 101         "those",
 102         "through",
 103         "under",
 104         "very",
 105         "well",
 106         "were",
 107         "what",
 108         "where",
 109         "which",
 110         "while",
 111         "with",
 112         "would",
 113         "your"
 114 };
 115
 116 /*
 117  * Compare function
 118  */
 119 int intcmp(const void *rec1, const void *rec2) {
 120         int i1, i2;
 121
 122         i1 = *(const int *)rec1;
 123         i2 = *(const int *)rec2;
 124
 125         if (i1 > i2) return(1);
 126         if (i1 < i2) return(-1);
 127         return(0);
 128 }
 129
 130
 131 void wordbreaker(char *text, int *num_tokens, int **tokens) {
 132
 133         int wb_num_tokens = 0;
 134         int wb_num_alloc = 0;
 135         int *wb_tokens = NULL;
 136
 137         char *ptr;
 138         char *word_start;
 139         char *word_end;
 140         char ch;
 141         int word_len;
 142         char word[256];
 143         int i;
 144         int word_crc;
 145
 146         if (text == NULL) {             /* no NULL text please */
 147                 *num_tokens = 0;
 148                 *tokens = NULL;
 149                 return;
 150         }
 151
 152         if (text[0] == 0) {             /* no empty text either */
 153                 *num_tokens = 0;
 154                 *tokens = NULL;
 155                 return;
 156         }
 157
 158         ptr = text;
 159         word_start = NULL;
 160         while (*ptr) {
 161                 ch = *ptr;
 162                 if (isalnum(ch)) {
 163                         if (!word_start) {
 164                                 word_start = ptr;
 165                         }
 166                 }
 167                 ++ptr;
 168                 ch = *ptr;
 169                 if ( (!isalnum(ch)) && (word_start) ) {
 170                         word_end = ptr;
 171 //                      --word_end;
 172
 173                         /* extract the word */
 174                         word_len = word_end - word_start;
 175                         if (word_len >= sizeof word) {
 176                                 lprintf(CTDL_DEBUG, "Invalid word length: %d\n", word_len);
 177                                 safestrncpy(word, word_start, sizeof word);
 178                                 word[(sizeof word) - 1] = 0;
 179                         }
 180                         else {
 181                                 safestrncpy(word, word_start, word_len+1);
 182                                 word[word_len] = 0;
 183                         }
 184                         word_start = NULL;
 185
 186                         /* are we ok with the length? */
 187                         if ( (word_len >= WB_MIN)
 188                            && (word_len <= WB_MAX) ) {
 189                                 for (i=0; i<word_len; ++i) {
 190                                         word[i] = tolower(word[i]);
 191                                 }
 192                                 /* disqualify noise words */
 193                                 for (i=0; i<(sizeof(noise_words)/sizeof(char *)); ++i) {
 194                                         if (!strcmp(word, noise_words[i])) {
 195                                                 word_len = 0;
 196                                                 break;
 197                                         }
 198                                 }
 199                                 if (word_len == 0)
 200                                         continue;
 201
 202                                 word_crc = (int)
 203                                         CalcCRC16Bytes(word_len, word);
 204
 205                                 ++wb_num_tokens;
 206                                 if (wb_num_tokens > wb_num_alloc) {
 207                                         wb_num_alloc += 512;
 208                                         wb_tokens = realloc(wb_tokens, (sizeof(int) * wb_num_alloc));
 209                                 }
 210                                 wb_tokens[wb_num_tokens - 1] = word_crc;
 211                         }
 212                 }
 213         }
 214
 215         /* sort and purge dups */
 216         if (wb_num_tokens > 1) {
 217                 qsort(wb_tokens, wb_num_tokens, sizeof(int), intcmp);
 218                 for (i=0; i<(wb_num_tokens-1); ++i) {
 219                         if (wb_tokens[i] == wb_tokens[i+1]) {
 220                                 memmove(&wb_tokens[i], &wb_tokens[i+1],
 221                                         ((wb_num_tokens - i - 1)*sizeof(int)));
 222                                 --wb_num_tokens;
 223                                 --i;
 224                         }
 225                 }
 226         }
 227
 228         *num_tokens = wb_num_tokens;
 229         *tokens = wb_tokens;
 230 }
 231