citadel/modules/fulltext/ft_wordbreaker.c

   1 /*
   2  * Default wordbreaker module for full text indexing.
   3  *
   4  * Copyright (c) 2005-2017 by the citadel.org team
   5  *
   6  * This program is open source software; you can redistribute it and/or modify
   7  * it under the terms of the GNU General Public License version 3.
   8  *
   9  * This program is distributed in the hope that it will be useful,
  10  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  11  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  12  * GNU General Public License for more details.
  13  */
  14
  15 #include "sysdep.h"
  16 #include <stdlib.h>
  17 #include <unistd.h>
  18 #include <stdio.h>
  19 #include <fcntl.h>
  20 #include <signal.h>
  21 #include <pwd.h>
  22 #include <errno.h>
  23 #include <sys/types.h>
  24 #include <time.h>
  25 #include <sys/wait.h>
  26 #include <ctype.h>
  27 #include <string.h>
  28 #include <limits.h>
  29 #include <libcitadel.h>
  30 #include "citadel.h"
  31 #include "server.h"
  32 #include "sysdep_decls.h"
  33 #include "citserver.h"
  34 #include "support.h"
  35 #include "config.h"
  36 #include "database.h"
  37 #include "msgbase.h"
  38 #include "control.h"
  39 #include "ft_wordbreaker.h"
  40 #include "crc16.h"
  41 #include "ctdl_module.h"
  42
  43 /*
  44  * Noise words are not included in search indices.
  45  * NOTE: if the noise word list is altered in any way, the FT_WORDBREAKER_ID
  46  * must also be changed, so that the index is rebuilt.
  47  */
  48 static char *noise_words[] = {
  49         "about",
  50         "after",
  51         "also",
  52         "another",
  53         "because",
  54         "been",
  55         "before",
  56         "being",
  57         "between",
  58         "both",
  59         "came",
  60         "come",
  61         "could",
  62         "each",
  63         "from",
  64         "have",
  65         "here",
  66         "himself",
  67         "into",
  68         "like",
  69         "make",
  70         "many",
  71         "might",
  72         "more",
  73         "most",
  74         "much",
  75         "must",
  76         "never",
  77         "only",
  78         "other",
  79         "over",
  80         "said",
  81         "same",
  82         "should",
  83         "since",
  84         "some",
  85         "still",
  86         "such",
  87         "take",
  88         "than",
  89         "that",
  90         "their",
  91         "them",
  92         "then",
  93         "there",
  94         "these",
  95         "they",
  96         "this",
  97         "those",
  98         "through",
  99         "under",
 100         "very",
 101         "well",
 102         "were",
 103         "what",
 104         "where",
 105         "which",
 106         "while",
 107         "with",
 108         "would",
 109         "your"
 110 };
 111 #define NUM_NOISE (sizeof(noise_words) / sizeof(char *))
 112
 113
 114 /*
 115  * Compare function
 116  */
 117 int intcmp(const void *rec1, const void *rec2) {
 118         int i1, i2;
 119
 120         i1 = *(const int *)rec1;
 121         i2 = *(const int *)rec2;
 122
 123         if (i1 > i2) return(1);
 124         if (i1 < i2) return(-1);
 125         return(0);
 126 }
 127
 128
 129 void wordbreaker(const char *text, int *num_tokens, int **tokens) {
 130
 131         int wb_num_tokens = 0;
 132         int wb_num_alloc = 0;
 133         int *wb_tokens = NULL;
 134
 135         const char *ptr;
 136         const char *word_start;
 137         const char *word_end;
 138         char ch;
 139         int word_len;
 140         char word[256];
 141         int i;
 142         int word_crc;
 143
 144         if (text == NULL) {             /* no NULL text please */
 145                 *num_tokens = 0;
 146                 *tokens = NULL;
 147                 return;
 148         }
 149
 150         if (text[0] == 0) {             /* no empty text either */
 151                 *num_tokens = 0;
 152                 *tokens = NULL;
 153                 return;
 154         }
 155
 156         ptr = text;
 157         word_start = NULL;
 158         while (*ptr) {
 159                 ch = *ptr;
 160                 if (isalnum(ch)) {
 161                         if (!word_start) {
 162                                 word_start = ptr;
 163                         }
 164                 }
 165                 ++ptr;
 166                 ch = *ptr;
 167                 if ( (!isalnum(ch)) && (word_start) ) {
 168                         word_end = ptr;
 169
 170                         /* extract the word */
 171                         word_len = word_end - word_start;
 172                         if (word_len >= sizeof word) {
 173                                 syslog(LOG_DEBUG, "wordbreaker: invalid word length: %d", word_len);
 174                                 safestrncpy(word, word_start, sizeof word);
 175                                 word[(sizeof word) - 1] = 0;
 176                         }
 177                         else {
 178                                 safestrncpy(word, word_start, word_len+1);
 179                                 word[word_len] = 0;
 180                         }
 181                         word_start = NULL;
 182
 183                         /* are we ok with the length? */
 184                         if ( (word_len >= WB_MIN) && (word_len <= WB_MAX) ) {
 185                                 for (i=0; i<word_len; ++i) {
 186                                         word[i] = tolower(word[i]);
 187                                 }
 188                                 /* disqualify noise words */
 189                                 for (i=0; i<NUM_NOISE; ++i) {
 190                                         if (!strcmp(word, noise_words[i])) {
 191                                                 word_len = 0;
 192                                                 break;
 193                                         }
 194                                 }
 195
 196                                 if (word_len == 0)
 197                                         continue;
 198
 199                                 word_crc = (int) CalcCRC16Bytes(word_len, word);
 200
 201                                 ++wb_num_tokens;
 202                                 if (wb_num_tokens > wb_num_alloc) {
 203                                         wb_num_alloc += 512;
 204                                         wb_tokens = realloc(wb_tokens, (sizeof(int) * wb_num_alloc));
 205                                 }
 206                                 wb_tokens[wb_num_tokens - 1] = word_crc;
 207                         }
 208                 }
 209         }
 210
 211         /* sort and purge dups */
 212         if (wb_num_tokens > 1) {
 213                 qsort(wb_tokens, wb_num_tokens, sizeof(int), intcmp);
 214                 for (i=0; i<(wb_num_tokens-1); ++i) {
 215                         if (wb_tokens[i] == wb_tokens[i+1]) {
 216                                 memmove(&wb_tokens[i], &wb_tokens[i+1],
 217                                         ((wb_num_tokens - i - 1)*sizeof(int)));
 218                                 --wb_num_tokens;
 219                                 --i;
 220                         }
 221                 }
 222         }
 223
 224         *num_tokens = wb_num_tokens;
 225         *tokens = wb_tokens;
 226 }
 227