citadel/modules/fulltext/ft_wordbreaker.c

   1 /*
   2  * $Id$
   3  *
   4  * Default wordbreaker module for full text indexing.
   5  *
   6  */
   7
   8
   9 #include "sysdep.h"
  10 #include <stdlib.h>
  11 #include <unistd.h>
  12 #include <stdio.h>
  13 #include <fcntl.h>
  14 #include <signal.h>
  15 #include <pwd.h>
  16 #include <errno.h>
  17 #include <sys/types.h>
  18
  19 #if TIME_WITH_SYS_TIME
  20 # include <sys/time.h>
  21 # include <time.h>
  22 #else
  23 # if HAVE_SYS_TIME_H
  24 #  include <sys/time.h>
  25 # else
  26 #  include <time.h>
  27 # endif
  28 #endif
  29
  30 #include <sys/wait.h>
  31 #include <ctype.h>
  32 #include <string.h>
  33 #include <limits.h>
  34 #include <libcitadel.h>
  35 #include "citadel.h"
  36 #include "server.h"
  37 #include "sysdep_decls.h"
  38 #include "citserver.h"
  39 #include "support.h"
  40 #include "config.h"
  41 #include "database.h"
  42 #include "msgbase.h"
  43 #include "control.h"
  44 #include "ft_wordbreaker.h"
  45 #include "crc16.h"
  46 #include "ctdl_module.h"
  47
  48 /*
  49  * Noise words are not included in search indices.
  50  * NOTE: if the noise word list is altered in any way, the FT_WORDBREAKER_ID
  51  * must also be changed, so that the index is rebuilt.
  52  */
  53 static char *noise_words[] = {
  54         "about",
  55         "after",
  56         "also",
  57         "another",
  58         "because",
  59         "been",
  60         "before",
  61         "being",
  62         "between",
  63         "both",
  64         "came",
  65         "come",
  66         "could",
  67         "each",
  68         "from",
  69         "have",
  70         "here",
  71         "himself",
  72         "into",
  73         "like",
  74         "make",
  75         "many",
  76         "might",
  77         "more",
  78         "most",
  79         "much",
  80         "must",
  81         "never",
  82         "only",
  83         "other",
  84         "over",
  85         "said",
  86         "same",
  87         "should",
  88         "since",
  89         "some",
  90         "still",
  91         "such",
  92         "take",
  93         "than",
  94         "that",
  95         "their",
  96         "them",
  97         "then",
  98         "there",
  99         "these",
 100         "they",
 101         "this",
 102         "those",
 103         "through",
 104         "under",
 105         "very",
 106         "well",
 107         "were",
 108         "what",
 109         "where",
 110         "which",
 111         "while",
 112         "with",
 113         "would",
 114         "your"
 115 };
 116
 117 /*
 118  * Compare function
 119  */
 120 int intcmp(const void *rec1, const void *rec2) {
 121         int i1, i2;
 122
 123         i1 = *(const int *)rec1;
 124         i2 = *(const int *)rec2;
 125
 126         if (i1 > i2) return(1);
 127         if (i1 < i2) return(-1);
 128         return(0);
 129 }
 130
 131
 132 void wordbreaker(char *text, int *num_tokens, int **tokens) {
 133
 134         int wb_num_tokens = 0;
 135         int wb_num_alloc = 0;
 136         int *wb_tokens = NULL;
 137
 138         char *ptr;
 139         char *word_start;
 140         char *word_end;
 141         char ch;
 142         int word_len;
 143         char word[256];
 144         int i;
 145         int word_crc;
 146
 147         if (text == NULL) {             /* no NULL text please */
 148                 *num_tokens = 0;
 149                 *tokens = NULL;
 150                 return;
 151         }
 152
 153         if (text[0] == 0) {             /* no empty text either */
 154                 *num_tokens = 0;
 155                 *tokens = NULL;
 156                 return;
 157         }
 158
 159         ptr = text;
 160         word_start = NULL;
 161         while (*ptr) {
 162                 ch = *ptr;
 163                 if (isalnum(ch)) {
 164                         if (!word_start) {
 165                                 word_start = ptr;
 166                         }
 167                 }
 168                 ++ptr;
 169                 ch = *ptr;
 170                 if ( (!isalnum(ch)) && (word_start) ) {
 171                         word_end = ptr;
 172 //                      --word_end;
 173
 174                         /* extract the word */
 175                         word_len = word_end - word_start;
 176                         if (word_len >= sizeof word) {
 177                                 CtdlLogPrintf(CTDL_DEBUG, "Invalid word length: %d\n", word_len);
 178                                 safestrncpy(word, word_start, sizeof word);
 179                                 word[(sizeof word) - 1] = 0;
 180                         }
 181                         else {
 182                                 safestrncpy(word, word_start, word_len+1);
 183                                 word[word_len] = 0;
 184                         }
 185                         word_start = NULL;
 186
 187                         /* are we ok with the length? */
 188                         if ( (word_len >= WB_MIN)
 189                            && (word_len <= WB_MAX) ) {
 190                                 for (i=0; i<word_len; ++i) {
 191                                         word[i] = tolower(word[i]);
 192                                 }
 193                                 /* disqualify noise words */
 194                                 for (i=0; i<(sizeof(noise_words)/sizeof(char *)); ++i) {
 195                                         if (!strcmp(word, noise_words[i])) {
 196                                                 word_len = 0;
 197                                                 break;
 198                                         }
 199                                 }
 200                                 if (word_len == 0)
 201                                         continue;
 202
 203                                 word_crc = (int)
 204                                         CalcCRC16Bytes(word_len, word);
 205
 206                                 ++wb_num_tokens;
 207                                 if (wb_num_tokens > wb_num_alloc) {
 208                                         wb_num_alloc += 512;
 209                                         wb_tokens = realloc(wb_tokens, (sizeof(int) * wb_num_alloc));
 210                                 }
 211                                 wb_tokens[wb_num_tokens - 1] = word_crc;
 212                         }
 213                 }
 214         }
 215
 216         /* sort and purge dups */
 217         if (wb_num_tokens > 1) {
 218                 qsort(wb_tokens, wb_num_tokens, sizeof(int), intcmp);
 219                 for (i=0; i<(wb_num_tokens-1); ++i) {
 220                         if (wb_tokens[i] == wb_tokens[i+1]) {
 221                                 memmove(&wb_tokens[i], &wb_tokens[i+1],
 222                                         ((wb_num_tokens - i - 1)*sizeof(int)));
 223                                 --wb_num_tokens;
 224                                 --i;
 225                         }
 226                 }
 227         }
 228
 229         *num_tokens = wb_num_tokens;
 230         *tokens = wb_tokens;
 231 }
 232