Don't run this; it's broken.
[citadel.git] / citadel / modules / fulltext / ft_wordbreaker.c
index 6b9fb2d243b54122f4c8a6453ba77c6888b73777..4e1059a7f63ad982f2ffc186d5c5d5c987f59e3f 100644 (file)
@@ -1,11 +1,17 @@
 /*
- * $Id$
- *
  * Default wordbreaker module for full text indexing.
  *
+ * Copyright (c) 2005-2017 by the citadel.org team
+ *
+ * This program is open source software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 3.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
  */
 
-
 #include "sysdep.h"
 #include <stdlib.h>
 #include <unistd.h>
@@ -31,6 +37,7 @@
 #include <ctype.h>
 #include <string.h>
 #include <limits.h>
+#include <libcitadel.h>
 #include "citadel.h"
 #include "server.h"
 #include "sysdep_decls.h"
@@ -40,9 +47,9 @@
 #include "database.h"
 #include "msgbase.h"
 #include "control.h"
-#include "tools.h"
 #include "ft_wordbreaker.h"
 #include "crc16.h"
+#include "ctdl_module.h"
 
 /*
  * Noise words are not included in search indices.
 static char *noise_words[] = {
        "about",
        "after",
-       "all",
        "also",
-       "an",
-       "and",
        "another",
-       "any",
-       "are",
-       "as",
-       "at",
-       "be",
        "because",
        "been",
        "before",
        "being",
        "between",
        "both",
-       "but",
-       "by",
        "came",
-       "can",
        "come",
        "could",
-       "did",
-       "do",
        "each",
-       "for",
        "from",
-       "get",
-       "got",
-       "had",
-       "has",
        "have",
-       "he",
-       "her",
        "here",
-       "him",
        "himself",
-       "his",
-       "how",
-       "if",
-       "in",
        "into",
-       "is",
-       "it",
        "like",
        "make",
        "many",
-       "me",
        "might",
        "more",
        "most",
        "much",
        "must",
-       "my",
        "never",
-       "now",
-       "of",
-       "on",
        "only",
-       "or",
        "other",
-       "our",
-       "out",
        "over",
        "said",
        "same",
-       "see",
        "should",
        "since",
        "some",
@@ -127,7 +98,6 @@ static char *noise_words[] = {
        "take",
        "than",
        "that",
-       "the",
        "their",
        "them",
        "then",
@@ -137,14 +107,8 @@ static char *noise_words[] = {
        "this",
        "those",
        "through",
-       "to",
-       "too",
        "under",
-       "up",
        "very",
-       "was",
-       "way",
-       "we",
        "well",
        "were",
        "what",
@@ -153,9 +117,10 @@ static char *noise_words[] = {
        "while",
        "with",
        "would",
-       "you",
        "your"
 };
+#define NUM_NOISE (sizeof(noise_words) / sizeof(char *))
+
 
 /*
  * Compare function
@@ -172,21 +137,21 @@ int intcmp(const void *rec1, const void *rec2) {
 }
 
 
-void wordbreaker(char *text, int *num_tokens, int **tokens) {
+void wordbreaker(const char *text, int *num_tokens, int **tokens) {
 
        int wb_num_tokens = 0;
        int wb_num_alloc = 0;
        int *wb_tokens = NULL;
 
-       char *ptr;
-       char *word_start;
-       char *word_end;
+       const char *ptr;
+       const char *word_start;
+       const char *word_end;
        char ch;
        int word_len;
        char word[256];
        int i;
        int word_crc;
-
+       
        if (text == NULL) {             /* no NULL text please */
                *num_tokens = 0;
                *tokens = NULL;
@@ -212,36 +177,37 @@ void wordbreaker(char *text, int *num_tokens, int **tokens) {
                ch = *ptr;
                if ( (!isalnum(ch)) && (word_start) ) {
                        word_end = ptr;
-                       --word_end;
 
                        /* extract the word */
-                       word_len = word_end - word_start + 1;
-                       safestrncpy(word, word_start, sizeof word);
+                       word_len = word_end - word_start;
                        if (word_len >= sizeof word) {
-                               lprintf(CTDL_DEBUG, "Invalid word length: %d\n", word_len);
-                               word[(sizeof word_len) - 1] = 0;
+                               syslog(LOG_DEBUG, "wordbreaker: invalid word length: %d", word_len);
+                               safestrncpy(word, word_start, sizeof word);
+                               word[(sizeof word) - 1] = 0;
                        }
                        else {
+                               safestrncpy(word, word_start, word_len+1);
                                word[word_len] = 0;
                        }
                        word_start = NULL;
 
-                       /* disqualify noise words */
-                       for (i=0; i<(sizeof(noise_words)/sizeof(char *)); ++i) {
-                               if (!strcasecmp(word, noise_words[i])) {
-                                       word_len = 0;
-                                       break;
-                               }
-                       }
-
                        /* are we ok with the length? */
-                       if ( (word_len >= WB_MIN)
-                          && (word_len <= WB_MAX) ) {
+                       if ( (word_len >= WB_MIN) && (word_len <= WB_MAX) ) {
                                for (i=0; i<word_len; ++i) {
                                        word[i] = tolower(word[i]);
                                }
-                               word_crc = (int)
-                                       CalcCRC16Bytes(word_len, word);
+                               /* disqualify noise words */
+                               for (i=0; i<NUM_NOISE; ++i) {
+                                       if (!strcmp(word, noise_words[i])) {
+                                               word_len = 0;
+                                               break;
+                                       }
+                               }
+
+                               if (word_len == 0)
+                                       continue;
+
+                               word_crc = (int) CalcCRC16Bytes(word_len, word);
 
                                ++wb_num_tokens;
                                if (wb_num_tokens > wb_num_alloc) {