}
-void wordbreaker(const char *text, int *num_tokens, int **tokens) {
-
- int wb_num_tokens = 0;
- int wb_num_alloc = 0;
- int *wb_tokens = NULL;
-
+Array *wordbreaker(const char *text) {
const char *ptr;
const char *word_start;
const char *word_end;
char word[256];
int i;
int word_crc;
-
+
if (text == NULL) { /* no NULL text please */
- *num_tokens = 0;
- *tokens = NULL;
- return;
+ return(NULL);
}
if (text[0] == 0) { /* no empty text either */
- *num_tokens = 0;
- *tokens = NULL;
- return;
+ return(NULL);
+ }
+
+ Array *found_tokens = array_new(sizeof(int));
+ if (found_tokens == NULL) {
+ return(NULL);
}
ptr = text;
}
/* disqualify noise words */
for (i=0; i<NUM_NOISE; ++i) {
- if (!strcmp(word, noise_words[i])) {
+ if (!strcasecmp(word, noise_words[i])) {
word_len = 0;
break;
}
}
-
- if (word_len == 0)
- continue;
-
- word_crc = (int) CalcCRC16Bytes(word_len, word);
-
- ++wb_num_tokens;
- if (wb_num_tokens > wb_num_alloc) {
- wb_num_alloc += 512;
- wb_tokens = realloc(wb_tokens, (sizeof(int) * wb_num_alloc));
+ /* FIXME make this case insensitive */
+ /* add it to the array */
+ if (word_len > 0) {
+ word_crc = (int) CalcCRC16Bytes(word_len, word);
+ array_append(found_tokens, &word_crc);
}
- wb_tokens[wb_num_tokens - 1] = word_crc;
}
}
}
/* sort and purge dups */
- if (wb_num_tokens > 1) {
- qsort(wb_tokens, wb_num_tokens, sizeof(int), intcmp);
- for (i=0; i<(wb_num_tokens-1); ++i) {
- if (wb_tokens[i] == wb_tokens[i+1]) {
- memmove(&wb_tokens[i], &wb_tokens[i+1],
- ((wb_num_tokens - i - 1)*sizeof(int)));
- --wb_num_tokens;
+ if (array_len(found_tokens) > 1) {
+ array_sort(found_tokens, intcmp);
+ /*
+  * After sorting, equal tokens are adjacent.  Walk pairs (i, i+1) and
+  * drop the first of each equal pair.  The loop stops one short of the
+  * end so that index i+1 is always a valid element.
+  *
+  * NOTE(review): the array was created with sizeof(int) and filled via
+  * &word_crc, so array_get_element_at() is assumed to return a pointer
+  * to the stored int -- confirm against the Array API.  Comparing the
+  * returned pointers themselves would never match for distinct slots,
+  * so the pointed-to values are compared instead.
+  */
+ for (i=0; (i+1)<array_len(found_tokens); ++i) {
+ if (*(int *)array_get_element_at(found_tokens, i) == *(int *)array_get_element_at(found_tokens, i+1)) {
+ array_delete_element_at(found_tokens, i);
+ /* re-examine the same index: the next element has shifted into slot i */
+ --i;
}
}
}
-
- *num_tokens = wb_num_tokens;
- *tokens = wb_tokens;
+ return(found_tokens);
}