* ft_wordbreaker.c: added a list of "noise words" to ignore. This is admittedly EN/US specific, so if anyone wants to contribute noise words for other languages...

author Art Cancro <ajc@citadel.org>

Thu, 20 Oct 2005 21:56:42 +0000 (21:56 +0000)

committer Art Cancro <ajc@citadel.org>

Thu, 20 Oct 2005 21:56:42 +0000 (21:56 +0000)
author Art Cancro <ajc@citadel.org>
Thu, 20 Oct 2005 21:56:42 +0000 (21:56 +0000)
committer Art Cancro <ajc@citadel.org>
Thu, 20 Oct 2005 21:56:42 +0000 (21:56 +0000)
diff --git a/citadel/ChangeLog b/citadel/ChangeLog

index 39866b96676ea03f0ad81f5dd3995991a2f79441..1f7e1ac53cf32b05e3b50284b1960c1940872e4b 100644 (file)
--- a/citadel/ChangeLog
+++ b/citadel/ChangeLog
@@ -1,3 +1,8 @@
+Thu Oct 20 17:55:12 EDT 2005 ajc
+* ft_wordbreaker.c: added a list of "noise words" to ignore.  This is
+  admittedly EN/US specific, so if anyone wants to contribute noise words
+  for other languages...
+
  Wed Oct 19 22:55:19 EDT 2005 ajc
  * serv_calendar.c: registered a fixed output hook for text/calendar.
  
diff --git a/citadel/ft_wordbreaker.c b/citadel/ft_wordbreaker.c

index e45fd3641a640b2a04dc30d90cb887098efd0855..64d075f22bcf884d633472cd16b0bd1a29ef27c0 100644 (file)
--- a/citadel/ft_wordbreaker.c
+++ b/citadel/ft_wordbreaker.c
@@ -45,6 +45,119 @@
  #include "ft_wordbreaker.h"
  #include "crc16.h"
  
+/*
+ * Noise words (EN/US only, matched case-insensitively) are not indexed.
+ * NOTE: if this list is altered in any way, FT_WORDBREAKER_ID must also
+ * be changed, so that the existing index is thrown away and rebuilt.
+ */
+static char *noise_words[] = {
+       "about",
+       "after",
+       "all",
+       "also",
+       "an",
+       "and",
+       "another",
+       "any",
+       "are",
+       "as",
+       "at",
+       "be",
+       "because",
+       "been",
+       "before",
+       "being",
+       "between",
+       "both",
+       "but",
+       "by",
+       "came",
+       "can",
+       "come",
+       "could",
+       "did",
+       "do",
+       "each",
+       "for",
+       "from",
+       "get",
+       "got",
+       "had",
+       "has",
+       "have",
+       "he",
+       "her",
+       "here",
+       "him",
+       "himself",
+       "his",
+       "how",
+       "if",
+       "in",
+       "into",
+       "is",
+       "it",
+       "like",
+       "make",
+       "many",
+       "me",
+       "might",
+       "more",
+       "most",
+       "much",
+       "must",
+       "my",
+       "never",
+       "now",
+       "of",
+       "on",
+       "only",
+       "or",
+       "other",
+       "our",
+       "out",
+       "over",
+       "said",
+       "same",
+       "see",
+       "should",
+       "since",
+       "some",
+       "still",
+       "such",
+       "take",
+       "than",
+       "that",
+       "the",
+       "their",
+       "them",
+       "then",
+       "there",
+       "these",
+       "they",
+       "this",
+       "those",
+       "through",
+       "to",
+       "too",
+       "under",
+       "up",
+       "very",
+       "was",
+       "way",
+       "we",
+       "well",
+       "were",
+       "what",
+       "where",
+       "which",
+       "while",
+       "with",
+       "would",
+       "you",
+       "your"
+};
+
  /*
   * Compare function
   */
@@ -108,6 +221,14 @@ void wordbreaker(char *text, int *num_tokens, int **tokens) {
                         word[word_len] = 0;
                         word_start = NULL;
  
+                       /* disqualify noise words */
+                       for (i=0; i<(sizeof(noise_words)/sizeof(char *)); ++i) {
+                               if (!strcasecmp(word, noise_words[i])) {
+                                       word_len = 0;
+                                       break;
+                               }
+                       }
+
                         /* are we ok with the length? */
                         if ( (word_len >= WB_MIN)
                            && (word_len <= WB_MAX) ) {
diff --git a/citadel/ft_wordbreaker.h b/citadel/ft_wordbreaker.h

index caca77d298be53cd13d8927e67b54eb46ac54221..c40cf869f196bc2dc8ae344d8d95ff9d06e38a81 100644 (file)
--- a/citadel/ft_wordbreaker.h
+++ b/citadel/ft_wordbreaker.h
@@ -9,7 +9,7 @@
   * later on, or even if we update this one, we can use a different ID so the
   * system knows it needs to throw away the existing index and rebuild it.
   */
-#define        FT_WORDBREAKER_ID       0x001c
+#define        FT_WORDBREAKER_ID       0x001d
  
  /*
   * Minimum and maximum length of words to index
author	Art Cancro <ajc@citadel.org>
	Thu, 20 Oct 2005 21:56:42 +0000 (21:56 +0000)
committer	Art Cancro <ajc@citadel.org>
	Thu, 20 Oct 2005 21:56:42 +0000 (21:56 +0000)
citadel/ChangeLog		patch \| blob \| history
citadel/ft_wordbreaker.c		patch \| blob \| history
citadel/ft_wordbreaker.h		patch \| blob \| history