citadel/modules/fulltext/ft_wordbreaker.c

   1 /*
   2  * Default wordbreaker module for full text indexing.
   3  *
   4  * Copyright (c) 2005-2017 by the citadel.org team
   5  *
   6  * This program is open source software; you can redistribute it and/or modify
   7  * it under the terms of the GNU General Public License version 3.
   8  *
   9  * This program is distributed in the hope that it will be useful,
  10  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  11  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  12  * GNU General Public License for more details.
  13  */
  14
  15 #include "sysdep.h"
  16 #include <stdlib.h>
  17 #include <unistd.h>
  18 #include <stdio.h>
  19 #include <fcntl.h>
  20 #include <signal.h>
  21 #include <pwd.h>
  22 #include <errno.h>
  23 #include <sys/types.h>
  24
  25 #if TIME_WITH_SYS_TIME
  26 # include <sys/time.h>
  27 # include <time.h>
  28 #else
  29 # if HAVE_SYS_TIME_H
  30 #  include <sys/time.h>
  31 # else
  32 #  include <time.h>
  33 # endif
  34 #endif
  35
  36 #include <sys/wait.h>
  37 #include <ctype.h>
  38 #include <string.h>
  39 #include <limits.h>
  40 #include <libcitadel.h>
  41 #include "citadel.h"
  42 #include "server.h"
  43 #include "sysdep_decls.h"
  44 #include "citserver.h"
  45 #include "support.h"
  46 #include "config.h"
  47 #include "database.h"
  48 #include "msgbase.h"
  49 #include "control.h"
  50 #include "ft_wordbreaker.h"
  51 #include "crc16.h"
  52 #include "ctdl_module.h"
  53
  54 /*
  55  * Noise words are not included in search indices.
  56  * NOTE: if the noise word list is altered in any way, the FT_WORDBREAKER_ID
  57  * must also be changed, so that the index is rebuilt.
  58  */
  59
  60 noise_word *noise_words[26];
  61
  62 static char *noise_words_init[] = {
  63         "about",
  64         "after",
  65         "also",
  66         "another",
  67         "because",
  68         "been",
  69         "before",
  70         "being",
  71         "between",
  72         "both",
  73         "came",
  74         "come",
  75         "could",
  76         "each",
  77         "from",
  78         "have",
  79         "here",
  80         "himself",
  81         "into",
  82         "like",
  83         "make",
  84         "many",
  85         "might",
  86         "more",
  87         "most",
  88         "much",
  89         "must",
  90         "never",
  91         "only",
  92         "other",
  93         "over",
  94         "said",
  95         "same",
  96         "should",
  97         "since",
  98         "some",
  99         "still",
 100         "such",
 101         "take",
 102         "than",
 103         "that",
 104         "their",
 105         "them",
 106         "then",
 107         "there",
 108         "these",
 109         "they",
 110         "this",
 111         "those",
 112         "through",
 113         "under",
 114         "very",
 115         "well",
 116         "were",
 117         "what",
 118         "where",
 119         "which",
 120         "while",
 121         "with",
 122         "would",
 123         "your"
 124 };
 125
 126
 127 void initialize_noise_words(void)
 128 {
 129         int i;
 130         int len;
 131         int ch;
 132         noise_word *next;
 133
 134         memset (noise_words, 0, sizeof(noise_words));
 135
 136         for (i=0; i<(sizeof(noise_words_init)/sizeof(char *)); ++i)
 137         {
 138                 ch = noise_words_init[i][0] - 'a';
 139                 len = strlen(noise_words_init[i]);
 140
 141                 next = malloc(sizeof(noise_word));
 142                 next->len = len;
 143                 next->word = strdup(noise_words_init[i]);
 144                 next->next = noise_words[ch];
 145                 noise_words[ch] = next;
 146         }
 147 }
 148
 149
 150 /*
 151  * Compare function
 152  */
 153 int intcmp(const void *rec1, const void *rec2) {
 154         int i1, i2;
 155
 156         i1 = *(const int *)rec1;
 157         i2 = *(const int *)rec2;
 158
 159         if (i1 > i2) return(1);
 160         if (i1 < i2) return(-1);
 161         return(0);
 162 }
 163
 164
 165 void wordbreaker(const char *text, int *num_tokens, int **tokens) {
 166
 167         int wb_num_tokens = 0;
 168         int wb_num_alloc = 0;
 169         int *wb_tokens = NULL;
 170
 171         const char *ptr;
 172         const char *word_start;
 173         const char *word_end;
 174         char ch;
 175         int word_len;
 176         char word[256];
 177         int i;
 178         int word_crc;
 179         noise_word *noise;
 180
 181
 182         if (text == NULL) {             /* no NULL text please */
 183                 *num_tokens = 0;
 184                 *tokens = NULL;
 185                 return;
 186         }
 187
 188         if (text[0] == 0) {             /* no empty text either */
 189                 *num_tokens = 0;
 190                 *tokens = NULL;
 191                 return;
 192         }
 193
 194         ptr = text;
 195         word_start = NULL;
 196         while (*ptr) {
 197                 ch = *ptr;
 198                 if (isalnum(ch)) {
 199                         if (!word_start) {
 200                                 word_start = ptr;
 201                         }
 202                 }
 203                 ++ptr;
 204                 ch = *ptr;
 205                 if ( (!isalnum(ch)) && (word_start) ) {
 206                         word_end = ptr;
 207
 208                         /* extract the word */
 209                         word_len = word_end - word_start;
 210                         if (word_len >= sizeof word) {
 211                                 syslog(LOG_DEBUG, "wordbreaker: invalid word length: %d", word_len);
 212                                 safestrncpy(word, word_start, sizeof word);
 213                                 word[(sizeof word) - 1] = 0;
 214                         }
 215                         else {
 216                                 safestrncpy(word, word_start, word_len+1);
 217                                 word[word_len] = 0;
 218                         }
 219                         word_start = NULL;
 220
 221                         /* are we ok with the length? */
 222                         if ( (word_len >= WB_MIN)
 223                            && (word_len <= WB_MAX) ) {
 224                                 for (i=0; i<word_len; ++i) {
 225                                         word[i] = tolower(word[i]);
 226                                 }
 227                                 /* disqualify noise words */
 228                                 noise = noise_words[(int) (word[0]-'a')];
 229                                 while (noise)
 230                                 {
 231                                         if (noise->len == word_len)
 232                                         {
 233                                                 if (!strcmp(word, noise->word))
 234                                                 {
 235                                                         word_len = 0;
 236                                                         break;
 237                                                 }
 238                                         }
 239                                         noise = noise->next;
 240                                 }
 241                                 if (word_len == 0)
 242                                         continue;
 243
 244                                 word_crc = (int)
 245                                         CalcCRC16Bytes(word_len, word);
 246
 247                                 ++wb_num_tokens;
 248                                 if (wb_num_tokens > wb_num_alloc) {
 249                                         wb_num_alloc += 512;
 250                                         wb_tokens = realloc(wb_tokens, (sizeof(int) * wb_num_alloc));
 251                                 }
 252                                 wb_tokens[wb_num_tokens - 1] = word_crc;
 253                         }
 254                 }
 255         }
 256
 257         /* sort and purge dups */
 258         if (wb_num_tokens > 1) {
 259                 qsort(wb_tokens, wb_num_tokens, sizeof(int), intcmp);
 260                 for (i=0; i<(wb_num_tokens-1); ++i) {
 261                         if (wb_tokens[i] == wb_tokens[i+1]) {
 262                                 memmove(&wb_tokens[i], &wb_tokens[i+1],
 263                                         ((wb_num_tokens - i - 1)*sizeof(int)));
 264                                 --wb_num_tokens;
 265                                 --i;
 266                         }
 267                 }
 268         }
 269
 270         *num_tokens = wb_num_tokens;
 271         *tokens = wb_tokens;
 272 }
 273