From 2e76d1431d857054387a4f22b74a96c86e80f958 Mon Sep 17 00:00:00 2001 From: Dave West Date: Tue, 13 Nov 2007 02:37:45 +0000 Subject: [PATCH] A little bit of speedup in the wordbreaker for full text indexing. Also fixed a small bug that made some of our index tokens bad. Maybe this will fix some of the text search problems some people are seeing. We also need to alter the indexer to index the message headers as well as the body; at the moment we only index the body. --- citadel/modules/fulltext/ft_wordbreaker.c | 71 +++++------------------ citadel/modules/fulltext/ft_wordbreaker.h | 4 +- citadel/modules/fulltext/serv_fulltext.c | 15 +++-- 3 files changed, 27 insertions(+), 63 deletions(-) diff --git a/citadel/modules/fulltext/ft_wordbreaker.c b/citadel/modules/fulltext/ft_wordbreaker.c index 6b9fb2d24..12b0f667c 100644 --- a/citadel/modules/fulltext/ft_wordbreaker.c +++ b/citadel/modules/fulltext/ft_wordbreaker.c @@ -52,73 +52,37 @@ static char *noise_words[] = { "about", "after", - "all", "also", - "an", - "and", "another", - "any", - "are", - "as", - "at", - "be", "because", "been", "before", "being", "between", "both", - "but", - "by", "came", - "can", "come", "could", - "did", - "do", "each", - "for", "from", - "get", - "got", - "had", - "has", "have", - "he", - "her", "here", - "him", "himself", - "his", - "how", - "if", - "in", "into", - "is", - "it", "like", "make", "many", - "me", "might", "more", "most", "much", "must", - "my", "never", - "now", - "of", - "on", "only", - "or", "other", - "our", - "out", "over", "said", "same", - "see", "should", "since", "some", @@ -127,7 +91,6 @@ static char *noise_words[] = { "take", "than", "that", - "the", "their", "them", "then", @@ -137,14 +100,8 @@ static char *noise_words[] = { "this", "those", "through", - "to", - "too", "under", - "up", "very", - "was", - "way", - "we", "well", "were", "what", @@ -153,7 +110,6 @@ static char *noise_words[] = { "while", "with", "would", - "you", "your" }; @@ -212,34 +168,37 @@ 
void wordbreaker(char *text, int *num_tokens, int **tokens) { ch = *ptr; if ( (!isalnum(ch)) && (word_start) ) { word_end = ptr; - --word_end; +// --word_end; /* extract the word */ - word_len = word_end - word_start + 1; - safestrncpy(word, word_start, sizeof word); + word_len = word_end - word_start; if (word_len >= sizeof word) { lprintf(CTDL_DEBUG, "Invalid word length: %d\n", word_len); - word[(sizeof word_len) - 1] = 0; + safestrncpy(word, word_start, sizeof word); + word[(sizeof word) - 1] = 0; } else { + safestrncpy(word, word_start, word_len+1); word[word_len] = 0; } word_start = NULL; - /* disqualify noise words */ - for (i=0; i<(sizeof(noise_words)/sizeof(char *)); ++i) { - if (!strcasecmp(word, noise_words[i])) { - word_len = 0; - break; - } - } - /* are we ok with the length? */ if ( (word_len >= WB_MIN) && (word_len <= WB_MAX) ) { for (i=0; i= CitControl.MMhighest) { return; /* nothing to do! */ } - - lprintf(CTDL_DEBUG, "do_fulltext_indexing() started\n"); + + run_time = time(NULL); + lprintf(CTDL_DEBUG, "do_fulltext_indexing() started (%ld)\n", run_time); /* * If we've switched wordbreaker modules, burn the index and start * over. */ begin_critical_section(S_CONTROL); - lprintf(CTDL_DEBUG, "wb ver on disk = %d, code ver = %d\n", - CitControl.fulltext_wordbreaker, FT_WORDBREAKER_ID); if (CitControl.fulltext_wordbreaker != FT_WORDBREAKER_ID) { + lprintf(CTDL_DEBUG, "wb ver on disk = %d, code ver = %d\n", + CitControl.fulltext_wordbreaker, FT_WORDBREAKER_ID); lprintf(CTDL_INFO, "(re)initializing full text index\n"); cdb_trunc(CDB_FULLTEXT); CitControl.MMfulltext = 0L; @@ -310,6 +313,8 @@ void do_fulltext_indexing(void) { ft_num_alloc = 0; ft_newmsgs = NULL; } + end_time = time(NULL); + lprintf(CTDL_DEBUG, "do_fulltext_indexing() duration (%ld)\n", end_time - run_time); /* Save our place so we don't have to do this again */ ft_flush_cache(); -- 2.30.2