dammit, learn to spell
[citadel.git] / citadel / server / modules / rssclient / serv_rssclient.c
1 // Bring external RSS and/or Atom feeds into rooms.  This module implements a
2 // very loose parser that scrapes both kinds of feeds and is not picky about
3 // the standards compliance of the source data.
4 //
5 // Copyright (c) 2007-2023 by the citadel.org team
6 //
7 // This program is open source software.  Use, duplication, or disclosure
8 // is subject to the terms of the GNU General Public License, version 3.
9
10 #include <stdlib.h>
11 #include <unistd.h>
12 #include <stdio.h>
13 #include <time.h>
14 #include <ctype.h>
15 #include <string.h>
16 #include <errno.h>
17 #include <sys/types.h>
18 #include <sys/stat.h>
19 #include <expat.h>
20 #include <curl/curl.h>
21 #include <libcitadel.h>
22 #include "../../citadel_defs.h"
23 #include "../../server.h"
24 #include "../../citserver.h"
25 #include "../../support.h"
26 #include "../../config.h"
27 #include "../../threads.h"
28 #include "../../ctdl_module.h"
29 #include "../../msgbase.h"
30 #include "../../parsedate.h"
31 #include "../../database.h"
32 #include "../../citadel_dirs.h"
33 #include "../../context.h"
34 #include "../../internet_addressing.h"
35
36 struct rssfeed {
37         char url[SIZ];                  // string containing the URL of an RSS or Atom feed
38         char room[ROOMNAMELEN];         // the name of the room which is pulling this feed
39 };
40
41 struct rssparser {
42         char url[SIZ];
43         char room[ROOMNAMELEN];
44         StrBuf *CData;
45         struct CtdlMessage *msg;
46         char *link;
47         char *description;
48         char *item_id;
49 };
50
// Time at which the most recent feed scan finished; zero until a scan has completed.
time_t last_run = 0;
52
53
54 // This handler is called whenever an XML tag opens.
55 void rss_start_element(void *data, const char *el, const char **attribute) {
56         struct rssparser *r = (struct rssparser *)data;
57         int i;
58
59         if (server_shutting_down) return;                       // shunt the whole operation if we're exiting
60
61         if (
62                 (!strcasecmp(el, "entry"))
63                 || (!strcasecmp(el, "item"))
64         ) {
65                 // this is the start of a new item(rss) or entry(atom)
66                 if (r->msg != NULL) {
67                         CM_Free(r->msg);
68                         r->msg = NULL;
69                 }
70                 r->msg = malloc(sizeof(struct CtdlMessage));
71                 memset(r->msg, 0, sizeof(struct CtdlMessage));
72                 r->msg->cm_magic = CTDLMESSAGE_MAGIC;
73                 r->msg->cm_anon_type = MES_NORMAL;
74                 r->msg->cm_format_type = FMT_RFC822;
75         }
76
77         else if (!strcasecmp(el, "link")) {                     // atom feeds have the link as an attribute
78                 for(i = 0; attribute[i]; i += 2) {
79                         if (!strcasecmp(attribute[i], "href")) {
80                                 if (r->link != NULL) {
81                                         free(r->link);
82                                         r->link = NULL;
83                                 }
84                                 r->link = strdup(attribute[i+1]);
85                                 string_trim(r->link);
86                         }
87                 }
88         }
89 }
90
91
92 // This handler is called whenever an XML tag closes.
93 void rss_end_element(void *data, const char *el) {
94         struct rssparser *r = (struct rssparser *)data;
95         StrBuf *encoded_field;
96         long msgnum;
97
98         if (server_shutting_down) return;                                       // shunt the whole operation if we're exiting
99
100         if (StrLength(r->CData) > 0) {                                          // strip leading/trailing whitespace from field
101                 StrBufTrim(r->CData);
102         }
103
104         if ((!strcasecmp(el, "entry")) || (!strcasecmp(el, "item"))) {          // end of a new item(rss) or entry(atom)
105                 if (r->msg != NULL) {                                           // Save the message to the room
106
107                         // use the link as an item id if nothing else is available
108                         if ((r->item_id == NULL) && (r->link != NULL)) {
109                                 r->item_id = strdup(r->link);
110                         }
111
112                         // check the use table
113                         StrBuf *u = NewStrBuf();
114                         StrBufAppendPrintf(u, "rss/%s", r->item_id);
115                         int already_seen = CheckIfAlreadySeen(u);
116                         FreeStrBuf(&u);
117
118                         if (already_seen == 0) {
119
120                                 // Compose the message text
121
122                                 StrBuf *TheMessage = NewStrBuf();
123                                 StrBufAppendPrintf(TheMessage, "<html><head></head><body>");
124
125                                 if (r->description != NULL) {
126                                         StrBufAppendPrintf(TheMessage, "%s<br><br>\r\n", r->description);
127                                         free(r->description);
128                                         r->description = NULL;
129                                 }
130
131                                 if (r->link != NULL) {
132                                         StrBufAppendPrintf(TheMessage, "<a href=\"%s\">%s</a>\r\n", r->link, r->link);
133                                         free(r->link);
134                                         r->link = NULL;
135                                 }
136
137                                 StrBufAppendPrintf(TheMessage, "</body></html>\r\n");
138
139                                 // Quoted-Printable encode the HTML message, because RSS and Atom make no guarantee of line length limits.
140                                 StrBuf *TheMessage_Encoded = StrBufQuotedPrintableEncode(TheMessage);
141
142                                 // Now we reuse TheMessage -- this time it will contain the MIME headers concatenated with the encoded message.
143                                 FlushStrBuf(TheMessage);
144                                 StrBufAppendBufPlain(TheMessage, HKEY(
145                                         "Content-type: text/html; charset=UTF-8\r\n"
146                                         "Content-Transfer-Encoding: quoted-printable\r\n"
147                                         "\r\n"
148                                         ), 0
149                                 );
150                                 StrBufAppendBuf(TheMessage, TheMessage_Encoded, 0);
151                                 FreeStrBuf(&TheMessage_Encoded);
152
153                                 CM_SetField(r->msg, eMessageText, ChrPtr(TheMessage));
154                                 FreeStrBuf(&TheMessage);
155
156                                 if (CM_IsEmpty(r->msg, eAuthor)) {
157                                         CM_SetField(r->msg, eAuthor, "rss");
158                                 }
159
160                                 if (CM_IsEmpty(r->msg, eTimestamp)) {
161                                         CM_SetFieldLONG(r->msg, eTimestamp, time(NULL));
162                                 }
163
164                                 CtdlSubmitMsg(r->msg, NULL, r->room);
165                         }
166                         else {
167                                 syslog(LOG_DEBUG, "rssclient: already seen %s", r->item_id);
168                         }
169
170                         CM_Free(r->msg);
171                         r->msg = NULL;
172                 }
173
174                 if (r->item_id != NULL) {
175                         free(r->item_id);
176                         r->item_id = NULL;
177                 }
178         }
179
180         else if (!strcasecmp(el, "title")) {                    // item subject (rss and atom)
181                 if ((r->msg != NULL) && (CM_IsEmpty(r->msg, eMsgSubject))) {
182                         encoded_field = NewStrBuf();
183                         StrBufRFC2047encode(&encoded_field, r->CData);
184                         CM_SetAsFieldSB(r->msg, eMsgSubject, &encoded_field);
185                 }
186         }
187
188         else if (!strcasecmp(el, "creator")) {                  // <creator> can be used if <author> is not present
189                 if ((r->msg != NULL) && (CM_IsEmpty(r->msg, eAuthor))) {
190                         encoded_field = NewStrBuf();
191                         StrBufRFC2047encode(&encoded_field, r->CData);
192                         CM_SetAsFieldSB(r->msg, eAuthor, &encoded_field);
193                 }
194         }
195
196         else if (!strcasecmp(el, "author")) {                   // <author> supercedes <creator> if both are present
197                 if (r->msg != NULL) {
198                         encoded_field = NewStrBuf();
199                         StrBufRFC2047encode(&encoded_field, r->CData);
200                         CM_SetAsFieldSB(r->msg, eAuthor, &encoded_field);
201                 }
202         }
203
204         else if (!strcasecmp(el, "pubdate")) {                  // date/time stamp (rss) Sat, 25 Feb 2017 14:28:01 EST
205                 if ((r->msg)&&(r->msg->cm_fields[eTimestamp]==NULL)) {
206                         CM_SetFieldLONG(r->msg, eTimestamp, parsedate(ChrPtr(r->CData)));
207                 }
208         }
209
210         else if (!strcasecmp(el, "updated")) {                  // date/time stamp (atom) 2003-12-13T18:30:02Z
211                 if ((r->msg)&&(r->msg->cm_fields[eTimestamp]==NULL)) {
212                         struct tm t;
213                         char zulu;
214                         memset(&t, 0, sizeof t);
215                         sscanf(ChrPtr(r->CData), "%d-%d-%dT%d:%d:%d%c", &t.tm_year, &t.tm_mon, &t.tm_mday, &t.tm_hour, &t.tm_min, &t.tm_sec, &zulu);
216                         t.tm_year -= 1900;
217                         t.tm_mon -= 1;
218                         CM_SetFieldLONG(r->msg, eTimestamp, mktime(&t));
219                 }
220         }
221
222         else if (!strcasecmp(el, "link")) {                     // link to story (rss)
223                 if (r->link != NULL) {
224                         free(r->link);
225                         r->link = NULL;
226                 }
227                 r->link = strdup(ChrPtr(r->CData));
228         }
229
230         else if (
231                 (!strcasecmp(el, "guid"))                       // unique item id (rss)
232                 || (!strcasecmp(el, "id"))                      // unique item id (atom)
233         ) {
234                 if (r->item_id != NULL) {
235                         free(r->item_id);
236                         r->item_id = NULL;
237                 }
238                 r->item_id = strdup(ChrPtr(r->CData));
239         }
240
241         else if (
242                 (!strcasecmp(el, "description"))                // message text (rss)
243                 || (!strcasecmp(el, "summary"))                 // message text (atom)
244                 || (!strcasecmp(el, "content"))                 // message text (atom)
245         ) {
246                 if (r->description != NULL) {
247                         free(r->description);
248                         r->description = NULL;
249                 }
250                 r->description = strdup(ChrPtr(r->CData));
251         }
252
253         if (r->CData != NULL) {
254                 FreeStrBuf(&r->CData);
255                 r->CData = NULL;
256         }
257 }
258
259
260 // This handler is called whenever data appears between opening and closing tags.
261 void rss_handle_data(void *data, const char *content, int length) {
262         struct rssparser *r = (struct rssparser *)data;
263
264         if (r->CData == NULL) {
265                 r->CData = NewStrBuf();
266         }
267
268         StrBufAppendBufPlain(r->CData, content, length, 0);
269 }
270
271
272 // Feed has been downloaded, now parse it.
273 // `Feed` is the actual RSS downloaded from the site.
274 // `url` is a string containing the feed URL
275 void rss_parse_feed(StrBuf *Feed, char *url, char *room) {
276         struct rssparser r;
277
278         memset(&r, 0, sizeof r);
279         strcpy(r.url, url);
280         strcpy(r.room, room);
281         XML_Parser p = XML_ParserCreate("UTF-8");
282         XML_SetElementHandler(p, rss_start_element, rss_end_element);
283         XML_SetCharacterDataHandler(p, rss_handle_data);
284         XML_SetUserData(p, (void *)&r);
285         XML_Parse(p, ChrPtr(Feed), StrLength(Feed), XML_TRUE);
286         XML_ParserFree(p);
287 }
288
289
290 // pull one RSS feed and save it to a room
291 void rss_pull_one_feed(char *url, char *room) {
292         CURL *curl;
293         CURLcode res;
294         StrBuf *Downloaded = NULL;
295
296         curl = curl_easy_init();
297         if (!curl) {
298                 return;
299         }
300
301         Downloaded = NewStrBuf();
302
303         syslog(LOG_DEBUG, "rssclient: fetching %s", url);
304         curl_easy_setopt(curl, CURLOPT_URL, url);
305         curl_easy_setopt(curl, CURLOPT_SSL_VERIFYPEER, 0L);
306         curl_easy_setopt(curl, CURLOPT_SSL_VERIFYHOST, 0L);
307         curl_easy_setopt(curl, CURLOPT_FOLLOWLOCATION, 1L);                     // Follow redirects
308         curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, CurlFillStrBuf_callback); // What to do with downloaded data
309         curl_easy_setopt(curl, CURLOPT_WRITEDATA, Downloaded);                  // Give it our StrBuf to work with
310         curl_easy_setopt(curl, CURLOPT_TIMEOUT, 20L);                           // Time out after 20 seconds
311         res = curl_easy_perform(curl);                                          // Perform the request
312         if (res != CURLE_OK) {
313                 syslog(LOG_WARNING, "rssclient: failed to load feed: %s", curl_easy_strerror(res));
314         }
315         curl_easy_cleanup(curl);
316
317         rss_parse_feed(Downloaded, url, room);
318         FreeStrBuf(&Downloaded);                                                // free the downloaded feed data
319 }
320
321
322 // Scan a room's netconfig looking for RSS feed parsing requests
323 void rssclient_scan_room(struct ctdlroom *qrbuf, void *data) {
324         char *serialized_config = NULL;
325         int num_configs = 0;
326         char cfgline[SIZ];
327         struct rssfeed one_feed;
328         int i = 0;
329         Array *feeds = (Array *)data;
330
331         if (server_shutting_down) return;
332
333         serialized_config = LoadRoomNetConfigFile(qrbuf->QRnumber);
334         if (!serialized_config) {
335                 return;
336         }
337
338         num_configs = num_tokens(serialized_config, '\n');
339         for (i=0; i<num_configs; ++i) {
340                 extract_token(cfgline, serialized_config, i, '\n', sizeof cfgline);
341                 if (!strncasecmp(cfgline, HKEY("rssclient|"))) {
342                         strcpy(cfgline, &cfgline[10]);
343                         char *vbar = strchr(cfgline, '|');
344                         if (vbar != NULL) {
345                                 *vbar = 0;
346                         }
347                         safestrncpy(one_feed.url, cfgline, SIZ);
348                         safestrncpy(one_feed.room, qrbuf->QRname, ROOMNAMELEN);
349                         array_append(feeds, &one_feed);
350                 }
351         }
352
353         free(serialized_config);
354 }
355
356
357 // Scan for rooms that have RSS client requests configured
358 void rssclient_scan(void) {
359         time_t now = time(NULL);
360
361         // Run no more than once every 15 minutes.
362         if ((now - last_run) < 900) {
363                 syslog(LOG_DEBUG,
364                         "rssclient: polling interval not yet reached; last run was %ldm%lds ago",
365                         ((now - last_run) / 60),
366                         ((now - last_run) % 60)
367                 );
368                 return;
369         }
370
371         syslog(LOG_DEBUG, "rssclient: started");
372         Array *feeds = array_new(sizeof(struct rssfeed));
373         if (feeds == NULL) {
374                 syslog(LOG_DEBUG, "rssclient: cannot allocate memory for feed list");
375                 return;
376         }
377         CtdlForEachRoom(rssclient_scan_room, feeds);
378
379         for (int i=0; i<array_len(feeds); ++i) {
380                 struct rssfeed *r = (struct rssfeed *) array_get_element_at(feeds, i);
381                 rss_pull_one_feed(r->url, r->room);
382         }
383
384         array_free(feeds);
385         syslog(LOG_DEBUG, "rssclient: ended");
386         last_run = time(NULL);
387         return;
388 }
389
390
391 // Initialization function, called from modules_init.c
392 char *ctdl_module_init_rssclient(void) {
393         if (!threading) {
394                 syslog(LOG_INFO, "rssclient: using %s", curl_version());
395                 CtdlRegisterSessionHook(rssclient_scan, EVT_TIMER, PRIO_AGGR + 300);
396         }
397         return "rssclient";
398 }