]> code.citadel.org Git - citadel.git/commitdiff
* ft_wordbreaker.c: added a list of 'noise words' to ignore. This is
authorArt Cancro <ajc@citadel.org>
Thu, 20 Oct 2005 21:56:42 +0000 (21:56 +0000)
committerArt Cancro <ajc@citadel.org>
Thu, 20 Oct 2005 21:56:42 +0000 (21:56 +0000)
  admittedly EN/US specific, so if anyone wants to contribute noise words
  for other languages...

citadel/ChangeLog
citadel/ft_wordbreaker.c
citadel/ft_wordbreaker.h

index 39866b96676ea03f0ad81f5dd3995991a2f79441..1f7e1ac53cf32b05e3b50284b1960c1940872e4b 100644 (file)
@@ -1,3 +1,8 @@
+Thu Oct 20 17:55:12 EDT 2005 ajc
+* ft_wordbreaker.c: added a list of "noise words" to ignore.  This is
+  admittedly EN/US specific, so if anyone wants to contribute noise words
+  for other languages...
+
 Wed Oct 19 22:55:19 EDT 2005 ajc
 * serv_calendar.c: registered a fixed output hook for text/calendar.
 
index e45fd3641a640b2a04dc30d90cb887098efd0855..64d075f22bcf884d633472cd16b0bd1a29ef27c0 100644 (file)
 #include "ft_wordbreaker.h"
 #include "crc16.h"
 
+/*
+ * Noise words are not included in search indices.
+ *
+ * This list is admittedly EN/US specific.  wordbreaker() walks it with a
+ * linear scan and compares each candidate word using strcasecmp(), so
+ * matching is case-insensitive and the alphabetical ordering below is not
+ * required by the lookup.
+ *
+ * NOTE: if the noise word list is altered in any way, the FT_WORDBREAKER_ID
+ * must also be changed, so that the index is rebuilt.
+ */
+static char *noise_words[] = {
+       "about",
+       "after",
+       "all",
+       "also",
+       "an",
+       "and",
+       "another",
+       "any",
+       "are",
+       "as",
+       "at",
+       "be",
+       "because",
+       "been",
+       "before",
+       "being",
+       "between",
+       "both",
+       "but",
+       "by",
+       "came",
+       "can",
+       "come",
+       "could",
+       "did",
+       "do",
+       "each",
+       "for",
+       "from",
+       "get",
+       "got",
+       "had",
+       "has",
+       "have",
+       "he",
+       "her",
+       "here",
+       "him",
+       "himself",
+       "his",
+       "how",
+       "if",
+       "in",
+       "into",
+       "is",
+       "it",
+       "like",
+       "make",
+       "many",
+       "me",
+       "might",
+       "more",
+       "most",
+       "much",
+       "must",
+       "my",
+       "never",
+       "now",
+       "of",
+       "on",
+       "only",
+       "or",
+       "other",
+       "our",
+       "out",
+       "over",
+       "said",
+       "same",
+       "see",
+       "should",
+       "since",
+       "some",
+       "still",
+       "such",
+       "take",
+       "than",
+       "that",
+       "the",
+       "their",
+       "them",
+       "then",
+       "there",
+       "these",
+       "they",
+       "this",
+       "those",
+       "through",
+       "to",
+       "too",
+       "under",
+       "up",
+       "very",
+       "was",
+       "way",
+       "we",
+       "well",
+       "were",
+       "what",
+       "where",
+       "which",
+       "while",
+       "with",
+       "would",
+       "you",
+       "your"
+};
+
 /*
  * Compare function
  */
@@ -108,6 +221,14 @@ void wordbreaker(char *text, int *num_tokens, int **tokens) {
                        word[word_len] = 0;
                        word_start = NULL;
 
+                       /* disqualify noise words */
+                       for (i=0; i<(sizeof(noise_words)/sizeof(char *)); ++i) {
+                               if (!strcasecmp(word, noise_words[i])) {
+                                       word_len = 0;
+                                       break;
+                               }
+                       }
+
                        /* are we ok with the length? */
                        if ( (word_len >= WB_MIN)
                           && (word_len <= WB_MAX) ) {
index caca77d298be53cd13d8927e67b54eb46ac54221..c40cf869f196bc2dc8ae344d8d95ff9d06e38a81 100644 (file)
@@ -9,7 +9,7 @@
  * later on, or even if we update this one, we can use a different ID so the
  * system knows it needs to throw away the existing index and rebuild it.
  */
-#define        FT_WORDBREAKER_ID       0x001c
+#define        FT_WORDBREAKER_ID       0x001d
 
 /*
  * Minimum and maximum length of words to index