X-Git-Url: https://code.citadel.org/?a=blobdiff_plain;f=citadel%2Fmodules%2Ffulltext%2Fft_wordbreaker.c;h=968fa9f02b0aa2bf21ddf87890a0e19887d8a56a;hb=8e944083763c9ddcb32d763cf8f19c966d01f873;hp=4a03e1b77c6289877ba42815a439bd0e925d4b11;hpb=1e656d277fe91b7c4f5d73eab4a0dd0b7a173145;p=citadel.git diff --git a/citadel/modules/fulltext/ft_wordbreaker.c b/citadel/modules/fulltext/ft_wordbreaker.c index 4a03e1b77..968fa9f02 100644 --- a/citadel/modules/fulltext/ft_wordbreaker.c +++ b/citadel/modules/fulltext/ft_wordbreaker.c @@ -1,11 +1,17 @@ /* - * $Id$ - * * Default wordbreaker module for full text indexing. * + * Copyright (c) 2005-2017 by the citadel.org team + * + * This program is open source software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 3. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. */ - #include "sysdep.h" #include #include @@ -43,13 +49,17 @@ #include "control.h" #include "ft_wordbreaker.h" #include "crc16.h" +#include "ctdl_module.h" /* * Noise words are not included in search indices. * NOTE: if the noise word list is altered in any way, the FT_WORDBREAKER_ID * must also be changed, so that the index is rebuilt. */ -static char *noise_words[] = { + +noise_word *noise_words[26]; + +static char *noise_words_init[] = { "about", "after", "also", @@ -113,6 +123,30 @@ static char *noise_words[] = { "your" }; + +void initialize_noise_words(void) +{ + int i; + int len; + int ch; + noise_word *next; + + memset (noise_words, 0, sizeof(noise_words)); + + for (i=0; i<(sizeof(noise_words_init)/sizeof(char *)); ++i) + { + ch = noise_words_init[i][0] - 'a'; + len = strlen(noise_words_init[i]); + + next = malloc(sizeof(noise_word)); + next->len = len; + next->word = strdup(noise_words_init[i]); + next->next = noise_words[ch]; + noise_words[ch] = next; + } +} + + /* * Compare function */ @@ -128,21 +162,23 @@ int intcmp(const void *rec1, const void *rec2) { } -void wordbreaker(char *text, int *num_tokens, int **tokens) { +void wordbreaker(const char *text, int *num_tokens, int **tokens) { int wb_num_tokens = 0; int wb_num_alloc = 0; int *wb_tokens = NULL; - char *ptr; - char *word_start; - char *word_end; + const char *ptr; + const char *word_start; + const char *word_end; char ch; int word_len; char word[256]; int i; int word_crc; - + noise_word *noise; + + if (text == NULL) { /* no NULL text please */ *num_tokens = 0; *tokens = NULL; @@ -168,12 +204,11 @@ void wordbreaker(char *text, int *num_tokens, int **tokens) { ch = *ptr; if ( (!isalnum(ch)) && (word_start) ) { word_end = ptr; -// --word_end; /* extract the word */ word_len = word_end - word_start; if (word_len >= sizeof word) { - lprintf(CTDL_DEBUG, "Invalid word length: %d\n", word_len); + syslog(LOG_DEBUG, "wordbreaker: invalid word length: %d", word_len); safestrncpy(word, word_start, sizeof word); word[(sizeof word) - 1] = 0; } @@ -190,11 +225,18 @@ void wordbreaker(char *text, int *num_tokens, int **tokens) { word[i] = tolower(word[i]); } /* disqualify noise words */ - for (i=0; i<(sizeof(noise_words)/sizeof(char *)); ++i) { - if (!strcmp(word, noise_words[i])) { - word_len = 0; - break; + noise = noise_words[(int) (word[0]-'a')]; + while (noise) + { + if (noise->len == word_len) + { + if (!strcmp(word, noise->word)) + { + word_len = 0; + break; + } } + noise = noise->next; } if (word_len == 0) continue;