]> code.citadel.org Git - citadel.git/blob - citadel/html.c
* html.c: added. This is an overly simplistic HTML-to-text converter.
[citadel.git] / citadel / html.c
1 /*
2  * html.c -- Functions which handle translation between HTML and plain text
3  * $Id$
4  */
5
#include <ctype.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <strings.h>
#include <unistd.h>
11
12
/* Size limits for the converter's small fixed buffers. */
enum {
        H2A_TAG_MAX = 1024,     /* bytes kept of the text between '<' and '>' */
        H2A_ENTITY_MAX = 8      /* room for "&nbsp;" / "&quot;" plus a NUL */
};

/* Conversion state carried from one input character to the next. */
struct h2a_state {
        FILE *out;                      /* destination for the plain text */
        int nest;                       /* Bracket nesting level */
        int prev_space;                 /* last input character kept was a space */
        size_t tag_len;                 /* bytes currently held in tag[] */
        size_t entity_len;              /* bytes currently held in entity[] */
        char tag[H2A_TAG_MAX];          /* text of the tag being read */
        char entity[H2A_ENTITY_MAX];    /* "&..." sequence not yet resolved */
};

/* Write a pending, unresolved "&..." sequence to the output unchanged. */
static void h2a_flush_entity(struct h2a_state *st) {
        if (st->entity_len > 0) {
                fwrite(st->entity, 1, st->entity_len, st->out);
                st->entity_len = 0;
        }
}

/*
 * Emit one character of text, decoding the known "&name;" entities
 * (matched without regard to case) on the way out.
 */
static void h2a_text_putc(struct h2a_state *st, int ch) {
        static const struct {
                const char *name;
                char ch;
        } known[] = {
                { "&nbsp;", ' '  },
                { "&lt;",   '<'  },
                { "&gt;",   '>'  },
                { "&amp;",  '&'  },
                { "&quot;", '"'  }
        };
        size_t k;

        if (ch == '&') {
                /* An earlier '&' never completed; it was literal text. */
                h2a_flush_entity(st);
                st->entity[0] = '&';
                st->entity_len = 1;
                return;
        }

        if (st->entity_len == 0) {
                fputc(ch, st->out);
                return;
        }

        st->entity[st->entity_len++] = (char) ch;

        if (ch == ';') {
                st->entity[st->entity_len] = '\0';
                for (k = 0; k < sizeof known / sizeof known[0]; ++k) {
                        if (!strcasecmp(st->entity, known[k].name)) {
                                /*
                                 * Written directly rather than fed back in,
                                 * so "&amp;lt;" yields "&lt;" and is not
                                 * decoded a second time.
                                 */
                                fputc(known[k].ch, st->out);
                                st->entity_len = 0;
                                return;
                        }
                }
                h2a_flush_entity(st);   /* unknown entity: leave it alone */
                return;
        }

        /* Not an entity after all, or too long to be one we know. */
        if (!isalnum((unsigned char) ch)
            || st->entity_len >= H2A_ENTITY_MAX - 1) {
                h2a_flush_entity(st);
        }
}

/* Emit a whole string through h2a_text_putc(). */
static void h2a_text_puts(struct h2a_state *st, const char *s) {
        while (*s != '\0') {
                h2a_text_putc(st, *s++);
        }
}

/*
 * Handle the '>' that closes a tag: emit the line breaks belonging to
 * the tag's element name, then forget the tag.
 */
static void h2a_end_tag(struct h2a_state *st) {
        static const struct {
                const char *name;
                const char *text;
        } breaks[] = {
                { "P",      "\n\n"        },
                { "HR",     "\n ----- \n" },
                { "BR",     "\n"          },
                { "TR",     "\n"          },
                { "/TABLE", "\n"          }
        };
        size_t len = 0;
        size_t k;

        /*
         * Only the element name matters: cut at the first space so that
         * attributes are ignored (input whitespace has already been
         * folded to single spaces), and drop a self-closing slash so
         * that "BR/" is treated as "BR".
         */
        while (len < st->tag_len && st->tag[len] != ' ') {
                ++len;
        }
        if (len > 1 && st->tag[len - 1] == '/') {
                --len;
        }
        st->tag[len] = '\0';
        st->tag_len = 0;        /* never let a stale tag fire twice */

        for (k = 0; k < sizeof breaks / sizeof breaks[0]; ++k) {
                if (!strcasecmp(st->tag, breaks[k].name)) {
                        h2a_text_puts(st, breaks[k].text);
                        break;
                }
        }
}

/*
 * Convert HTML to plain text.
 *
 * Reads HTML from stdin and writes plain ASCII to stdout, one character
 * at a time, so no input or output length can overrun a buffer.
 *
 *  - LF, CR and TAB become spaces; runs of spaces in the input collapse
 *    to a single space; any other byte outside 32..126 is dropped.
 *  - Everything between '<' and '>' is removed.  The tags P, HR, BR, TR
 *    and /TABLE (any case, attributes ignored) are replaced by line
 *    breaks; HR also draws a short rule.
 *  - A '>' that is not closing a tag is ordinary text.
 *  - &nbsp; &lt; &gt; &amp; and &quot; (any case) are decoded; any other
 *    "&...;" sequence is passed through untouched.
 *
 * screenwidth is accepted for the caller's benefit but is not used yet;
 * no line wrapping is performed.
 */
void html_to_ascii(int screenwidth) {
        struct h2a_state st = { 0 };
        int ch;

        (void) screenwidth;     /* reserved for future line wrapping */
        st.out = stdout;        /* FIX ... genericize this */

        /* FIX ... genericize this (input source) */
        while ((ch = getc(stdin)) != EOF) {

                /* Fold in all the spacing */
                if (ch == '\n' || ch == '\r' || ch == '\t') {
                        ch = ' ';
                }
                if (ch < 32 || ch > 126) {
                        continue;
                }
                if (ch == ' ') {
                        if (st.prev_space) {
                                continue;
                        }
                        st.prev_space = 1;
                } else {
                        st.prev_space = 0;
                }

                /* Do some parsing */
                if (ch == '<') {
                        ++st.nest;
                        st.tag_len = 0;
                } else if (ch == '>' && st.nest > 0) {
                        --st.nest;
                        h2a_end_tag(&st);
                } else if (st.nest > 0) {
                        /* Inside a tag: remember as much as fits. */
                        if (st.tag_len < H2A_TAG_MAX - 1) {
                                st.tag[st.tag_len++] = (char) ch;
                        }
                } else {
                        h2a_text_putc(&st, ch);
                }
        }

        /* A trailing "&..." that never saw its ';' is literal text. */
        h2a_flush_entity(&st);
}
133
134
135 /*
136  * Temporary main loop for testing
137  */
int main() {
        /* Width handed to the converter by this throwaway test driver. */
        enum { TEST_SCREEN_WIDTH = 80 };

        /* Filter stdin to stdout, then report success to the caller. */
        html_to_ascii(TEST_SCREEN_WIDTH);

        return 0;
}
141 }