X-Git-Url: https://code.citadel.org/?a=blobdiff_plain;f=citadel%2Fmodules%2Ffulltext%2Fft_wordbreaker.c;h=4e1059a7f63ad982f2ffc186d5c5d5c987f59e3f;hb=a520d904d0069654c57d2b69618bee1225ee3067;hp=6b9fb2d243b54122f4c8a6453ba77c6888b73777;hpb=84aa84fdd0a02f703c5e836f258e33f950c66355;p=citadel.git diff --git a/citadel/modules/fulltext/ft_wordbreaker.c b/citadel/modules/fulltext/ft_wordbreaker.c index 6b9fb2d24..4e1059a7f 100644 --- a/citadel/modules/fulltext/ft_wordbreaker.c +++ b/citadel/modules/fulltext/ft_wordbreaker.c @@ -1,11 +1,17 @@ /* - * $Id$ - * * Default wordbreaker module for full text indexing. * + * Copyright (c) 2005-2017 by the citadel.org team + * + * This program is open source software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 3. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. */ - #include "sysdep.h" #include #include @@ -31,6 +37,7 @@ #include #include #include +#include #include "citadel.h" #include "server.h" #include "sysdep_decls.h" @@ -40,9 +47,9 @@ #include "database.h" #include "msgbase.h" #include "control.h" -#include "tools.h" #include "ft_wordbreaker.h" #include "crc16.h" +#include "ctdl_module.h" /* * Noise words are not included in search indices. @@ -52,73 +59,37 @@ static char *noise_words[] = { "about", "after", - "all", "also", - "an", - "and", "another", - "any", - "are", - "as", - "at", - "be", "because", "been", "before", "being", "between", "both", - "but", - "by", "came", - "can", "come", "could", - "did", - "do", "each", - "for", "from", - "get", - "got", - "had", - "has", "have", - "he", - "her", "here", - "him", "himself", - "his", - "how", - "if", - "in", "into", - "is", - "it", "like", "make", "many", - "me", "might", "more", "most", "much", "must", - "my", "never", - "now", - "of", - "on", "only", - "or", "other", - "our", - "out", "over", "said", "same", - "see", "should", "since", "some", @@ -127,7 +98,6 @@ static char *noise_words[] = { "take", "than", "that", - "the", "their", "them", "then", @@ -137,14 +107,8 @@ static char *noise_words[] = { "this", "those", "through", - "to", - "too", "under", - "up", "very", - "was", - "way", - "we", "well", "were", "what", @@ -153,9 +117,10 @@ static char *noise_words[] = { "while", "with", "would", - "you", "your" }; +#define NUM_NOISE (sizeof(noise_words) / sizeof(char *)) + /* * Compare function @@ -172,21 +137,21 @@ int intcmp(const void *rec1, const void *rec2) { } -void wordbreaker(char *text, int *num_tokens, int **tokens) { +void wordbreaker(const char *text, int *num_tokens, int **tokens) { int wb_num_tokens = 0; int wb_num_alloc = 0; int *wb_tokens = NULL; - char *ptr; - char *word_start; - char *word_end; + const char *ptr; + const char *word_start; + const char *word_end; char ch; int word_len; char word[256]; int i; int word_crc; - + if (text == NULL) { /* no NULL text please */ *num_tokens = 0; *tokens = NULL; @@ -212,36 +177,37 @@ void wordbreaker(char *text, int *num_tokens, int **tokens) { ch = *ptr; if ( (!isalnum(ch)) && (word_start) ) { word_end = ptr; - --word_end; /* extract the word */ - word_len = word_end - word_start + 1; - safestrncpy(word, word_start, sizeof word); + word_len = word_end - word_start; if (word_len >= sizeof word) { - lprintf(CTDL_DEBUG, "Invalid word length: %d\n", word_len); - word[(sizeof word_len) - 1] = 0; + syslog(LOG_DEBUG, "wordbreaker: invalid word length: %d", word_len); + safestrncpy(word, word_start, sizeof word); + word[(sizeof word) - 1] = 0; } else { + safestrncpy(word, word_start, word_len+1); word[word_len] = 0; } word_start = NULL; - /* disqualify noise words */ - for (i=0; i<(sizeof(noise_words)/sizeof(char *)); ++i) { - if (!strcasecmp(word, noise_words[i])) { - word_len = 0; - break; - } - } - /* are we ok with the length? */ - if ( (word_len >= WB_MIN) - && (word_len <= WB_MAX) ) { + if ( (word_len >= WB_MIN) && (word_len <= WB_MAX) ) { for (i=0; i wb_num_alloc) {