citadel/server/modules/fulltext/ft_wordbreaker.c

   1 // Default wordbreaker module for full text indexing.
   2 //
   3 // Copyright (c) 2005-2024 by the citadel.org team
   4 // This program is open source software.  Use, duplication, or disclosure is subject to the GNU General Public License v3.
   5
   6 #include "../../sysdep.h"
   7 #include <stdlib.h>
   8 #include <unistd.h>
   9 #include <stdio.h>
  10 #include <fcntl.h>
  11 #include <signal.h>
  12 #include <pwd.h>
  13 #include <errno.h>
  14 #include <sys/types.h>
  15 #include <time.h>
  16 #include <sys/wait.h>
  17 #include <ctype.h>
  18 #include <string.h>
  19 #include <limits.h>
  20 #include <libcitadel.h>
  21 #include "../../citadel_defs.h"
  22 #include "../../server.h"
  23 #include "../../sysdep_decls.h"
  24 #include "../../citserver.h"
  25 #include "../../support.h"
  26 #include "../../config.h"
  27 #include "../../database.h"
  28 #include "../../msgbase.h"
  29 #include "../../control.h"
  30 #include "ft_wordbreaker.h"
  31 #include "crc16.h"
  32 #include "../../ctdl_module.h"
  33
  34 // Noise words are not included in search indices.
  35 // NOTE: if the noise word list is altered in any way, the FT_WORDBREAKER_ID must also be changed, so that the index is rebuilt.
  36 static char *noise_words[] = {
  37         "about",
  38         "after",
  39         "also",
  40         "another",
  41         "because",
  42         "been",
  43         "before",
  44         "being",
  45         "between",
  46         "both",
  47         "came",
  48         "come",
  49         "could",
  50         "each",
  51         "from",
  52         "have",
  53         "here",
  54         "himself",
  55         "into",
  56         "like",
  57         "make",
  58         "many",
  59         "might",
  60         "more",
  61         "most",
  62         "much",
  63         "must",
  64         "never",
  65         "only",
  66         "other",
  67         "over",
  68         "said",
  69         "same",
  70         "should",
  71         "since",
  72         "some",
  73         "still",
  74         "such",
  75         "take",
  76         "than",
  77         "that",
  78         "their",
  79         "them",
  80         "then",
  81         "there",
  82         "these",
  83         "they",
  84         "this",
  85         "those",
  86         "through",
  87         "under",
  88         "very",
  89         "well",
  90         "were",
  91         "what",
  92         "where",
  93         "which",
  94         "while",
  95         "with",
  96         "would",
  97         "your"
  98 };
  99 #define NUM_NOISE (sizeof(noise_words) / sizeof(char *))
 100
 101
 102 // Compare function
 103 int intcmp(const void *rec1, const void *rec2) {
 104         int i1, i2;
 105
 106         i1 = *(const int *)rec1;
 107         i2 = *(const int *)rec2;
 108
 109         if (i1 > i2) return(1);
 110         if (i1 < i2) return(-1);
 111         return(0);
 112 }
 113
 114
 115 Array *wordbreaker(const char *text) {
 116         const char *ptr;
 117         const char *word_start;
 118         const char *word_end;
 119         char ch;
 120         int word_len;
 121         char word[256];
 122         int i;
 123         int word_crc;
 124
 125         if (text == NULL) {             // no NULL text please
 126                 return(NULL);
 127         }
 128
 129         if (text[0] == 0) {             // no empty text either
 130                 return(NULL);
 131         }
 132
 133         Array *found_tokens = array_new(sizeof(int));
 134         if (found_tokens == NULL) {
 135                 return(NULL);
 136         }
 137
 138         ptr = text;
 139         word_start = NULL;
 140         while (*ptr) {
 141                 ch = *ptr;
 142                 if (isalnum(ch)) {
 143                         if (!word_start) {
 144                                 word_start = ptr;
 145                         }
 146                 }
 147                 ++ptr;
 148                 ch = *ptr;
 149                 if ( (!isalnum(ch)) && (word_start) ) {
 150                         word_end = ptr;
 151
 152                         // extract the word
 153                         word_len = word_end - word_start;
 154                         if (word_len >= sizeof word) {
 155                                 syslog(LOG_DEBUG, "wordbreaker: invalid word length: %d", word_len);
 156                                 safestrncpy(word, word_start, sizeof word);
 157                                 word[(sizeof word) - 1] = 0;
 158                         }
 159                         else {
 160                                 safestrncpy(word, word_start, word_len+1);
 161                                 word[word_len] = 0;
 162                         }
 163                         word_start = NULL;
 164
 165                         // are we ok with the length?
 166                         if ( (word_len >= WB_MIN) && (word_len <= WB_MAX) ) {
 167                                 for (i=0; i<word_len; ++i) {
 168                                         word[i] = tolower(word[i]);
 169                                 }
 170                                 // disqualify noise words
 171                                 for (i=0; i<NUM_NOISE; ++i) {
 172                                         if (!strcasecmp(word, noise_words[i])) {
 173                                                 word_len = 0;
 174                                                 break;
 175                                         }
 176                                 }
 177                                 // add it to the array (FIXME make this case insensitive)
 178                                 if (word_len > 0) {
 179                                         word_crc = (int) CalcCRC16Bytes(word_len, word);
 180                                         array_append(found_tokens, &word_crc);
 181                                 }
 182                         }
 183                 }
 184         }
 185
 186         // sort and purge dups
 187         if (array_len(found_tokens) > 1) {
 188                 array_sort(found_tokens, intcmp);
 189                 for (i=0; i<(array_len(found_tokens)); ++i) {
 190                         if (array_get_element_at(found_tokens, i) == array_get_element_at(found_tokens, i+1)) {
 191                                 array_delete_element_at(found_tokens, i);
 192                                 --i;
 193                         }
 194                 }
 195         }
 196         return(found_tokens);
 197 }