From: Dave West Date: Tue, 13 Nov 2007 02:37:45 +0000 (+0000) Subject: A little bit of speed up in the wordbreaker for full text indexing. X-Git-Tag: v7.86~2792 X-Git-Url: https://code.citadel.org/?p=citadel.git;a=commitdiff_plain;h=2e76d1431d857054387a4f22b74a96c86e80f958 A little bit of speed up in the wordbreaker for full text indexing. Also fixed a small bug that made some of our index tokens bad. Maybe this will fix some of the text search problems some people are seeing. We also need to alter the indexer to index the message headers as well as the body; at the moment we only index the body. --- diff --git a/citadel/modules/fulltext/ft_wordbreaker.c b/citadel/modules/fulltext/ft_wordbreaker.c index 6b9fb2d24..12b0f667c 100644 --- a/citadel/modules/fulltext/ft_wordbreaker.c +++ b/citadel/modules/fulltext/ft_wordbreaker.c @@ -52,73 +52,37 @@ static char *noise_words[] = { "about", "after", - "all", "also", - "an", - "and", "another", - "any", - "are", - "as", - "at", - "be", "because", "been", "before", "being", "between", "both", - "but", - "by", "came", - "can", "come", "could", - "did", - "do", "each", - "for", "from", - "get", - "got", - "had", - "has", "have", - "he", - "her", "here", - "him", "himself", - "his", - "how", - "if", - "in", "into", - "is", - "it", "like", "make", "many", - "me", "might", "more", "most", "much", "must", - "my", "never", - "now", - "of", - "on", "only", - "or", "other", - "our", - "out", "over", "said", "same", - "see", "should", "since", "some", @@ -127,7 +91,6 @@ static char *noise_words[] = { "take", "than", "that", - "the", "their", "them", "then", @@ -137,14 +100,8 @@ static char *noise_words[] = { "this", "those", "through", - "to", - "too", "under", - "up", "very", - "was", - "way", - "we", "well", "were", "what", @@ -153,7 +110,6 @@ static char *noise_words[] = { "while", "with", "would", - "you", "your" }; @@ -212,34 +168,37 @@ void wordbreaker(char *text, int *num_tokens, int **tokens) { ch = *ptr; if ( (!isalnum(ch)) 
&& (word_start) ) { word_end = ptr; - --word_end; +// --word_end; /* extract the word */ - word_len = word_end - word_start + 1; - safestrncpy(word, word_start, sizeof word); + word_len = word_end - word_start; if (word_len >= sizeof word) { lprintf(CTDL_DEBUG, "Invalid word length: %d\n", word_len); - word[(sizeof word_len) - 1] = 0; + safestrncpy(word, word_start, sizeof word); + word[(sizeof word) - 1] = 0; } else { + safestrncpy(word, word_start, word_len+1); word[word_len] = 0; } word_start = NULL; - /* disqualify noise words */ - for (i=0; i<(sizeof(noise_words)/sizeof(char *)); ++i) { - if (!strcasecmp(word, noise_words[i])) { - word_len = 0; - break; - } - } - /* are we ok with the length? */ if ( (word_len >= WB_MIN) && (word_len <= WB_MAX) ) { for (i=0; i= CitControl.MMhighest) { return; /* nothing to do! */ } - - lprintf(CTDL_DEBUG, "do_fulltext_indexing() started\n"); + + run_time = time(NULL); + lprintf(CTDL_DEBUG, "do_fulltext_indexing() started (%ld)\n", run_time); /* * If we've switched wordbreaker modules, burn the index and start * over. */ begin_critical_section(S_CONTROL); - lprintf(CTDL_DEBUG, "wb ver on disk = %d, code ver = %d\n", - CitControl.fulltext_wordbreaker, FT_WORDBREAKER_ID); if (CitControl.fulltext_wordbreaker != FT_WORDBREAKER_ID) { + lprintf(CTDL_DEBUG, "wb ver on disk = %d, code ver = %d\n", + CitControl.fulltext_wordbreaker, FT_WORDBREAKER_ID); lprintf(CTDL_INFO, "(re)initializing full text index\n"); cdb_trunc(CDB_FULLTEXT); CitControl.MMfulltext = 0L; @@ -310,6 +313,8 @@ void do_fulltext_indexing(void) { ft_num_alloc = 0; ft_newmsgs = NULL; } + end_time = time(NULL); + lprintf(CTDL_DEBUG, "do_fulltext_indexing() duration (%ld)\n", end_time - run_time); /* Save our place so we don't have to do this again */ ft_flush_cache();