From 15d8ce26425bebfe4a4df12e25f6387714ee5500 Mon Sep 17 00:00:00 2001
From: Art Cancro
Date: Thu, 20 Oct 2005 21:56:42 +0000
Subject: [PATCH] * ft_wordbreaker.c: added a list of 'noise words' to ignore. This is admittedly EN/US specific, so if anyone wants to contribute noise words for other languages...

---
 citadel/ChangeLog        |   5 ++
 citadel/ft_wordbreaker.c | 121 +++++++++++++++++++++++++++++++++++++++
 citadel/ft_wordbreaker.h |   2 +-
 3 files changed, 127 insertions(+), 1 deletion(-)

diff --git a/citadel/ChangeLog b/citadel/ChangeLog
index 39866b966..1f7e1ac53 100644
--- a/citadel/ChangeLog
+++ b/citadel/ChangeLog
@@ -1,3 +1,8 @@
+Thu Oct 20 17:55:12 EDT 2005 ajc
+* ft_wordbreaker.c: added a list of "noise words" to ignore. This is
+  admittedly EN/US specific, so if anyone wants to contribute noise words
+  for other languages...
+
 Wed Oct 19 22:55:19 EDT 2005 ajc
 * serv_calendar.c: registered a fixed output hook for text/calendar.
 
diff --git a/citadel/ft_wordbreaker.c b/citadel/ft_wordbreaker.c
index e45fd3641..64d075f22 100644
--- a/citadel/ft_wordbreaker.c
+++ b/citadel/ft_wordbreaker.c
@@ -45,6 +45,119 @@
 #include "ft_wordbreaker.h"
 #include "crc16.h"
 
+/*
+ * Noise words are not included in search indices.
+ * NOTE: if the noise word list is altered in any way, the FT_WORDBREAKER_ID
+ * must also be changed, so that the index is rebuilt.
+ */
+static char *noise_words[] = {
+	"about",
+	"after",
+	"all",
+	"also",
+	"an",
+	"and",
+	"another",
+	"any",
+	"are",
+	"as",
+	"at",
+	"be",
+	"because",
+	"been",
+	"before",
+	"being",
+	"between",
+	"both",
+	"but",
+	"by",
+	"came",
+	"can",
+	"come",
+	"could",
+	"did",
+	"do",
+	"each",
+	"for",
+	"from",
+	"get",
+	"got",
+	"had",
+	"has",
+	"have",
+	"he",
+	"her",
+	"here",
+	"him",
+	"himself",
+	"his",
+	"how",
+	"if",
+	"in",
+	"into",
+	"is",
+	"it",
+	"like",
+	"make",
+	"many",
+	"me",
+	"might",
+	"more",
+	"most",
+	"much",
+	"must",
+	"my",
+	"never",
+	"now",
+	"of",
+	"on",
+	"only",
+	"or",
+	"other",
+	"our",
+	"out",
+	"over",
+	"said",
+	"same",
+	"see",
+	"should",
+	"since",
+	"some",
+	"still",
+	"such",
+	"take",
+	"than",
+	"that",
+	"the",
+	"their",
+	"them",
+	"then",
+	"there",
+	"these",
+	"they",
+	"this",
+	"those",
+	"through",
+	"to",
+	"too",
+	"under",
+	"up",
+	"very",
+	"was",
+	"way",
+	"we",
+	"well",
+	"were",
+	"what",
+	"where",
+	"which",
+	"while",
+	"with",
+	"would",
+	"you",
+	"your"
+};
+
 /*
  * Compare function
  */
@@ -108,6 +221,14 @@ void wordbreaker(char *text, int *num_tokens, int **tokens) {
 				word[word_len] = 0;
 				word_start = NULL;
 
+				/* disqualify noise words */
+				for (i=0; i<(sizeof(noise_words)/sizeof(char *)); ++i) {
+					if (!strcasecmp(word, noise_words[i])) {
+						word_len = 0;
+						break;
+					}
+				}
+
 				/* are we ok with the length? */
 				if ( (word_len >= WB_MIN)
 				   && (word_len <= WB_MAX) ) {
diff --git a/citadel/ft_wordbreaker.h b/citadel/ft_wordbreaker.h
index caca77d29..c40cf869f 100644
--- a/citadel/ft_wordbreaker.h
+++ b/citadel/ft_wordbreaker.h
@@ -9,7 +9,7 @@
  * later on, or even if we update this one, we can use a different ID so the
  * system knows it needs to throw away the existing index and rebuild it.
  */
-#define FT_WORDBREAKER_ID 0x001c
+#define FT_WORDBREAKER_ID 0x001d
 
 /*
  * Minimum and maximum length of words to index
-- 
2.39.2