From 15d8ce26425bebfe4a4df12e25f6387714ee5500 Mon Sep 17 00:00:00 2001
From: Art Cancro <ajc@citadel.org>
Date: Thu, 20 Oct 2005 21:56:42 +0000
Subject: [PATCH] * ft_wordbreaker.c: added a list of 'noise words' to ignore.
 This is admittedly EN/US specific, so if anyone wants to contribute noise
 words for other languages...

---
 citadel/ChangeLog        |   5 ++
 citadel/ft_wordbreaker.c | 121 +++++++++++++++++++++++++++++++++++++++
 citadel/ft_wordbreaker.h |   2 +-
 3 files changed, 127 insertions(+), 1 deletion(-)

diff --git a/citadel/ChangeLog b/citadel/ChangeLog
index 39866b966..1f7e1ac53 100644
--- a/citadel/ChangeLog
+++ b/citadel/ChangeLog
@@ -1,3 +1,8 @@
+Thu Oct 20 17:55:12 EDT 2005 ajc
+* ft_wordbreaker.c: added a list of "noise words" to ignore.  This is
+  admittedly EN/US specific, so if anyone wants to contribute noise words
+  for other languages...
+
 Wed Oct 19 22:55:19 EDT 2005 ajc
 * serv_calendar.c: registered a fixed output hook for text/calendar.
 
diff --git a/citadel/ft_wordbreaker.c b/citadel/ft_wordbreaker.c
index e45fd3641..64d075f22 100644
--- a/citadel/ft_wordbreaker.c
+++ b/citadel/ft_wordbreaker.c
@@ -45,6 +45,119 @@
 #include "ft_wordbreaker.h"
 #include "crc16.h"
 
+/*
+ * Noise words (common English words) are excluded from the search index.
+ * NOTE: whenever this list changes in any way, FT_WORDBREAKER_ID must be
+ * changed as well, so that existing indices are discarded and rebuilt.
+ */
+static char *noise_words[] = {
+	"about",
+	"after",
+	"all",
+	"also",
+	"an",
+	"and",
+	"another",
+	"any",
+	"are",
+	"as",
+	"at",
+	"be",
+	"because",
+	"been",
+	"before",
+	"being",
+	"between",
+	"both",
+	"but",
+	"by",
+	"came",
+	"can",
+	"come",
+	"could",
+	"did",
+	"do",
+	"each",
+	"for",
+	"from",
+	"get",
+	"got",
+	"had",
+	"has",
+	"have",
+	"he",
+	"her",
+	"here",
+	"him",
+	"himself",
+	"his",
+	"how",
+	"if",
+	"in",
+	"into",
+	"is",
+	"it",
+	"like",
+	"make",
+	"many",
+	"me",
+	"might",
+	"more",
+	"most",
+	"much",
+	"must",
+	"my",
+	"never",
+	"now",
+	"of",
+	"on",
+	"only",
+	"or",
+	"other",
+	"our",
+	"out",
+	"over",
+	"said",
+	"same",
+	"see",
+	"should",
+	"since",
+	"some",
+	"still",
+	"such",
+	"take",
+	"than",
+	"that",
+	"the",
+	"their",
+	"them",
+	"then",
+	"there",
+	"these",
+	"they",
+	"this",
+	"those",
+	"through",
+	"to",
+	"too",
+	"under",
+	"up",
+	"very",
+	"was",
+	"way",
+	"we",
+	"well",
+	"were",
+	"what",
+	"where",
+	"which",
+	"while",
+	"with",
+	"would",
+	"you",
+	"your"
+};
+
 /*
  * Compare function
  */
@@ -108,6 +221,14 @@ void wordbreaker(char *text, int *num_tokens, int **tokens) {
 			word[word_len] = 0;
 			word_start = NULL;
 
+			/* disqualify noise words */
+			for (i=0; i<(sizeof(noise_words)/sizeof(char *)); ++i) {
+				if (!strcasecmp(word, noise_words[i])) {
+					word_len = 0;
+					break;
+				}
+			}
+
 			/* are we ok with the length? */
 			if ( (word_len >= WB_MIN)
 			   && (word_len <= WB_MAX) ) {
diff --git a/citadel/ft_wordbreaker.h b/citadel/ft_wordbreaker.h
index caca77d29..c40cf869f 100644
--- a/citadel/ft_wordbreaker.h
+++ b/citadel/ft_wordbreaker.h
@@ -9,7 +9,7 @@
  * later on, or even if we update this one, we can use a different ID so the
  * system knows it needs to throw away the existing index and rebuild it.
  */
-#define	FT_WORDBREAKER_ID	0x001c
+#define	FT_WORDBREAKER_ID	0x001d
 
 /*
  * Minimum and maximum length of words to index
-- 
2.39.2