citadel/ft_wordbreaker.c

   1 /*
   2  * $Id$
   3  *
   4  * Default wordbreaker module for full text indexing.
   5  *
   6  */
   7
   8
   9 #include "sysdep.h"
  10 #include <stdlib.h>
  11 #include <unistd.h>
  12 #include <stdio.h>
  13 #include <fcntl.h>
  14 #include <signal.h>
  15 #include <pwd.h>
  16 #include <errno.h>
  17 #include <sys/types.h>
  18
  19 #if TIME_WITH_SYS_TIME
  20 # include <sys/time.h>
  21 # include <time.h>
  22 #else
  23 # if HAVE_SYS_TIME_H
  24 #  include <sys/time.h>
  25 # else
  26 #  include <time.h>
  27 # endif
  28 #endif
  29
  30 #include <sys/wait.h>
  31 #include <ctype.h>
  32 #include <string.h>
  33 #include <limits.h>
  34 #include "citadel.h"
  35 #include "server.h"
  36 #include "sysdep_decls.h"
  37 #include "citserver.h"
  38 #include "support.h"
  39 #include "config.h"
  40 #include "database.h"
  41 #include "msgbase.h"
  42 #include "control.h"
  43 #include "tools.h"
  44 #include "ft_wordbreaker.h"
  45 #include "crc16.h"
  46
  47 /*
  48  * Noise words are not included in search indices.
  49  * NOTE: if the noise word list is altered in any way, the FT_WORDBREAKER_ID
  50  * must also be changed, so that the index is rebuilt.
  51  */
  52 static char *noise_words[] = {
  53         "about",
  54         "after",
  55         "all",
  56         "also",
  57         "an",
  58         "and",
  59         "another",
  60         "any",
  61         "are",
  62         "as",
  63         "at",
  64         "be",
  65         "because",
  66         "been",
  67         "before",
  68         "being",
  69         "between",
  70         "both",
  71         "but",
  72         "by",
  73         "came",
  74         "can",
  75         "come",
  76         "could",
  77         "did",
  78         "do",
  79         "each",
  80         "for",
  81         "from",
  82         "get",
  83         "got",
  84         "had",
  85         "has",
  86         "have",
  87         "he",
  88         "her",
  89         "here",
  90         "him",
  91         "himself",
  92         "his",
  93         "how",
  94         "if",
  95         "in",
  96         "into",
  97         "is",
  98         "it",
  99         "like",
 100         "make",
 101         "many",
 102         "me",
 103         "might",
 104         "more",
 105         "most",
 106         "much",
 107         "must",
 108         "my",
 109         "never",
 110         "now",
 111         "of",
 112         "on",
 113         "only",
 114         "or",
 115         "other",
 116         "our",
 117         "out",
 118         "over",
 119         "said",
 120         "same",
 121         "see",
 122         "should",
 123         "since",
 124         "some",
 125         "still",
 126         "such",
 127         "take",
 128         "than",
 129         "that",
 130         "the",
 131         "their",
 132         "them",
 133         "then",
 134         "there",
 135         "these",
 136         "they",
 137         "this",
 138         "those",
 139         "through",
 140         "to",
 141         "too",
 142         "under",
 143         "up",
 144         "very",
 145         "was",
 146         "way",
 147         "we",
 148         "well",
 149         "were",
 150         "what",
 151         "where",
 152         "which",
 153         "while",
 154         "with",
 155         "would",
 156         "you",
 157         "your"
 158 };
 159
 160 /*
 161  * Compare function
 162  */
 163 int intcmp(const void *rec1, const void *rec2) {
 164         int i1, i2;
 165
 166         i1 = *(const int *)rec1;
 167         i2 = *(const int *)rec2;
 168
 169         if (i1 > i2) return(1);
 170         if (i1 < i2) return(-1);
 171         return(0);
 172 }
 173
 174
 175 void wordbreaker(char *text, int *num_tokens, int **tokens) {
 176
 177         int wb_num_tokens = 0;
 178         int wb_num_alloc = 0;
 179         int *wb_tokens = NULL;
 180
 181         char *ptr;
 182         char *word_start;
 183         char *word_end;
 184         char ch;
 185         int word_len;
 186         char word[256];
 187         int i;
 188         int word_crc;
 189
 190         if (text == NULL) {             /* no NULL text please */
 191                 *num_tokens = 0;
 192                 *tokens = NULL;
 193                 return;
 194         }
 195
 196         if (text[0] == 0) {             /* no empty text either */
 197                 *num_tokens = 0;
 198                 *tokens = NULL;
 199                 return;
 200         }
 201
 202         ptr = text;
 203         word_start = NULL;
 204         while (*ptr) {
 205                 ch = *ptr;
 206                 if (isalnum(ch)) {
 207                         if (!word_start) {
 208                                 word_start = ptr;
 209                         }
 210                 }
 211                 ++ptr;
 212                 ch = *ptr;
 213                 if ( (!isalnum(ch)) && (word_start) ) {
 214                         word_end = ptr;
 215                         --word_end;
 216
 217                         /* extract the word */
 218                         word_len = word_end - word_start + 1;
 219                         safestrncpy(word, word_start, sizeof word);
 220                         if (word_len >= sizeof word) {
 221                                 lprintf(CTDL_DEBUG, "Invalid word length: %d\n", word_len);
 222                                 word[(sizeof word_len) - 1] = 0;
 223                         }
 224                         else {
 225                                 word[word_len] = 0;
 226                         }
 227                         word_start = NULL;
 228
 229                         /* disqualify noise words */
 230                         for (i=0; i<(sizeof(noise_words)/sizeof(char *)); ++i) {
 231                                 if (!strcasecmp(word, noise_words[i])) {
 232                                         word_len = 0;
 233                                         break;
 234                                 }
 235                         }
 236
 237                         /* are we ok with the length? */
 238                         if ( (word_len >= WB_MIN)
 239                            && (word_len <= WB_MAX) ) {
 240                                 for (i=0; i<word_len; ++i) {
 241                                         word[i] = tolower(word[i]);
 242                                 }
 243                                 word_crc = (int)
 244                                         CalcCRC16Bytes(word_len, word);
 245
 246                                 ++wb_num_tokens;
 247                                 if (wb_num_tokens > wb_num_alloc) {
 248                                         wb_num_alloc += 512;
 249                                         wb_tokens = realloc(wb_tokens, (sizeof(int) * wb_num_alloc));
 250                                 }
 251                                 wb_tokens[wb_num_tokens - 1] = word_crc;
 252                         }
 253                 }
 254         }
 255
 256         /* sort and purge dups */
 257         if (wb_num_tokens > 1) {
 258                 qsort(wb_tokens, wb_num_tokens, sizeof(int), intcmp);
 259                 for (i=0; i<(wb_num_tokens-1); ++i) {
 260                         if (wb_tokens[i] == wb_tokens[i+1]) {
 261                                 memmove(&wb_tokens[i], &wb_tokens[i+1],
 262                                         ((wb_num_tokens - i - 1)*sizeof(int)));
 263                                 --wb_num_tokens;
 264                                 --i;
 265                         }
 266                 }
 267         }
 268
 269         *num_tokens = wb_num_tokens;
 270         *tokens = wb_tokens;
 271 }
 272