/* We used to wait for all threads to exit. Fuck that. The only thing important...
[citadel.git] / citadel / modules / rssclient / serv_rssclient.c
1 /*
2  * Bring external RSS and/or Atom feeds into rooms.  This module implements a
3  * very loose parser that scrapes both kinds of feeds and is not picky about
4  * the standards compliance of the source data.
5  *
6  * Copyright (c) 2007-2021 by the citadel.org team
7  *
8  * This program is open source software; you can redistribute it and/or
9  * modify it under the terms of the GNU General Public License version 3.
10  *
11  * This program is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14  * GNU General Public License for more details.
15  */
16
17 #include <stdlib.h>
18 #include <unistd.h>
19 #include <stdio.h>
20 #include <time.h>
21 #include <ctype.h>
22 #include <string.h>
23 #include <errno.h>
24 #include <sys/types.h>
25 #include <sys/stat.h>
26 #include <expat.h>
27 #include <curl/curl.h>
28 #include <libcitadel.h>
29 #include "citadel.h"
30 #include "server.h"
31 #include "citserver.h"
32 #include "support.h"
33 #include "config.h"
34 #include "threads.h"
35 #include "ctdl_module.h"
36 #include "msgbase.h"
37 #include "parsedate.h"
38 #include "database.h"
39 #include "citadel_dirs.h"
40 #include "context.h"
41 #include "internet_addressing.h"
42
// One destination room for a feed.  Nodes are chained into a singly linked
// list hanging off a struct rssurl.
struct rssroom {
        struct rssroom *next;           // following node, or NULL at the end of the list
        char *room;                     // room name; a heap copy made by rssclient_push_todo()
};
47
// One feed waiting to be fetched, together with every room that wants its
// items.  Nodes are chained into the global to-do list.
struct rssurl {
        struct rssurl *next;            // following feed, or NULL at the end of the list
        char *url;                      // feed URL; a heap copy made by rssclient_push_todo()
        struct rssroom *rooms;          // rooms that receive this feed
};
53
54 struct rssparser {
55         StrBuf *CData;
56         struct CtdlMessage *msg;
57         char *link;
58         char *description;
59         char *item_id;
60         struct rssroom *rooms;
61 };
62
// Time at which rssclient_scan() last finished a polling pass (0 = never).
time_t last_run = 0L;

// Head of the list of feeds queued for download by rssclient_push_todo().
struct rssurl *rsstodo = NULL;
65
66
67 // This handler is called whenever an XML tag opens.
68 //
69 void rss_start_element(void *data, const char *el, const char **attribute) {
70         struct rssparser *r = (struct rssparser *)data;
71         int i;
72
73         if (server_shutting_down) return;                       // shunt the whole operation if we're exiting
74
75         if (
76                 (!strcasecmp(el, "entry"))
77                 || (!strcasecmp(el, "item"))
78         ) {
79                 // this is the start of a new item(rss) or entry(atom)
80                 if (r->msg != NULL) {
81                         CM_Free(r->msg);
82                         r->msg = NULL;
83                 }
84                 r->msg = malloc(sizeof(struct CtdlMessage));
85                 memset(r->msg, 0, sizeof(struct CtdlMessage));
86                 r->msg->cm_magic = CTDLMESSAGE_MAGIC;
87                 r->msg->cm_anon_type = MES_NORMAL;
88                 r->msg->cm_format_type = FMT_RFC822;
89         }
90
91         else if (!strcasecmp(el, "link")) {                     // atom feeds have the link as an attribute
92                 for(i = 0; attribute[i]; i += 2) {
93                         if (!strcasecmp(attribute[i], "href")) {
94                                 if (r->link != NULL) {
95                                         free(r->link);
96                                         r->link = NULL;
97                                 }
98                                 r->link = strdup(attribute[i+1]);
99                                 striplt(r->link);
100                         }
101                 }
102         }
103 }
104
105
106 // This handler is called whenever an XML tag closes.
107 //
108 void rss_end_element(void *data, const char *el) {
109         struct rssparser *r = (struct rssparser *)data;
110         StrBuf *encoded_field;
111
112         if (server_shutting_down) return;                       // shunt the whole operation if we're exiting
113
114         if (StrLength(r->CData) > 0) {                          // strip leading/trailing whitespace from field
115                 StrBufTrim(r->CData);
116         }
117
118         if (                                                    // end of a new item(rss) or entry(atom)
119                 (!strcasecmp(el, "entry"))
120                 || (!strcasecmp(el, "item"))
121         ) {
122                 if (r->msg != NULL) {                           // Save the message to the rooms
123
124                         // use the link as an item id if nothing else is available
125                         if ((r->item_id == NULL) && (r->link != NULL)) {
126                                 r->item_id = strdup(r->link);
127                         }
128
129                         // check the use table
130                         StrBuf *u = NewStrBuf();
131                         StrBufAppendPrintf(u, "rss/%s", r->item_id);
132                         int already_seen = CheckIfAlreadySeen(u);
133                         FreeStrBuf(&u);
134
135                         if (already_seen == 0) {
136
137                                 // Compose the message text
138                                 StrBuf *TheMessage = NewStrBuf();
139                                 StrBufAppendPrintf(TheMessage,
140                                         "Content-type: text/html\n\n"
141                                         "\n\n"
142                                         "<html><head></head><body>"
143                                 );
144                 
145                                 if (r->description != NULL) {
146                                         StrBufAppendPrintf(TheMessage, "%s<br><br>\r\n", r->description);
147                                         free(r->description);
148                                         r->description = NULL;
149                                 }
150                 
151                                 if (r->link != NULL) {
152                                         StrBufAppendPrintf(TheMessage, "<a href=\"%s\">%s</a>\r\n", r->link, r->link);
153                                         free(r->link);
154                                         r->link = NULL;
155                                 }
156         
157                                 StrBufAppendPrintf(TheMessage, "</body></html>\r\n");
158                                 CM_SetField(r->msg, eMesageText, ChrPtr(TheMessage), StrLength(TheMessage));
159                                 FreeStrBuf(&TheMessage);
160         
161                                 if (CM_IsEmpty(r->msg, eAuthor)) {
162                                         CM_SetField(r->msg, eAuthor, HKEY("rss"));
163                                 }
164         
165                                 if (CM_IsEmpty(r->msg, eTimestamp)) {
166                                         CM_SetFieldLONG(r->msg, eTimestamp, time(NULL));
167                                 }
168         
169                                 // Save it to the room(s)
170                                 struct rssroom *rr = NULL;
171                                 long msgnum = (-1);
172                                 for (rr=r->rooms; rr!=NULL; rr=rr->next) {
173                                         if (rr == r->rooms) {
174                                                 msgnum = CtdlSubmitMsg(r->msg, NULL, rr->room);         // in first room, save msg
175                                         }
176                                         else {
177                                                 CtdlSaveMsgPointerInRoom(rr->room, msgnum, 0, NULL);    // elsewhere, save a pointer
178                                         }
179                                         syslog(LOG_DEBUG, "rssclient: saved message %ld to %s", msgnum, rr->room);
180                                 }
181                         }
182                         else {
183                                 syslog(LOG_DEBUG, "rssclient: already seen %s", r->item_id);
184                         }
185         
186                         CM_Free(r->msg);
187                         r->msg = NULL;
188                 }
189
190                 if (r->item_id != NULL) {
191                         free(r->item_id);
192                         r->item_id = NULL;
193                 }
194         }
195
196         else if (!strcasecmp(el, "title")) {                    // item subject (rss and atom)
197                 if ((r->msg != NULL) && (CM_IsEmpty(r->msg, eMsgSubject))) {
198                         encoded_field = NewStrBuf();
199                         StrBufRFC2047encode(&encoded_field, r->CData);
200                         CM_SetAsFieldSB(r->msg, eMsgSubject, &encoded_field);
201                 }
202         }
203
204         else if (!strcasecmp(el, "creator")) {                  // <creator> can be used if <author> is not present
205                 if ((r->msg != NULL) && (CM_IsEmpty(r->msg, eAuthor))) {
206                         encoded_field = NewStrBuf();
207                         StrBufRFC2047encode(&encoded_field, r->CData);
208                         CM_SetAsFieldSB(r->msg, eAuthor, &encoded_field);
209                 }
210         }
211
212         else if (!strcasecmp(el, "author")) {                   // <author> supercedes <creator> if both are present
213                 if (r->msg != NULL) {
214                         encoded_field = NewStrBuf();
215                         StrBufRFC2047encode(&encoded_field, r->CData);
216                         CM_SetAsFieldSB(r->msg, eAuthor, &encoded_field);
217                 }
218         }
219
220         else if (!strcasecmp(el, "pubdate")) {                  // date/time stamp (rss) Sat, 25 Feb 2017 14:28:01 EST
221                 if ((r->msg)&&(r->msg->cm_fields[eTimestamp]==NULL)) {
222                         CM_SetFieldLONG(r->msg, eTimestamp, parsedate(ChrPtr(r->CData)));
223                 }
224         }
225
226         else if (!strcasecmp(el, "updated")) {                  // date/time stamp (atom) 2003-12-13T18:30:02Z
227                 if ((r->msg)&&(r->msg->cm_fields[eTimestamp]==NULL)) {
228                         struct tm t;
229                         char zulu;
230                         memset(&t, 0, sizeof t);
231                         sscanf(ChrPtr(r->CData), "%d-%d-%dT%d:%d:%d%c", &t.tm_year, &t.tm_mon, &t.tm_mday, &t.tm_hour, &t.tm_min, &t.tm_sec, &zulu);
232                         t.tm_year -= 1900;
233                         t.tm_mon -= 1;
234                         CM_SetFieldLONG(r->msg, eTimestamp, mktime(&t));
235                 }
236         }
237
238         else if (!strcasecmp(el, "link")) {                     // link to story (rss)
239                 if (r->link != NULL) {
240                         free(r->link);
241                         r->link = NULL;
242                 }
243                 r->link = strdup(ChrPtr(r->CData));
244         }
245
246         else if (
247                 (!strcasecmp(el, "guid"))                       // unique item id (rss)
248                 || (!strcasecmp(el, "id"))                      // unique item id (atom)
249         ) {
250                 if (r->item_id != NULL) {
251                         free(r->item_id);
252                         r->item_id = NULL;
253                 }
254                 r->item_id = strdup(ChrPtr(r->CData));
255         }
256
257         else if (
258                 (!strcasecmp(el, "description"))                // message text (rss)
259                 || (!strcasecmp(el, "summary"))                 // message text (atom)
260                 || (!strcasecmp(el, "content"))                 // message text (atom)
261         ) {
262                 if (r->description != NULL) {
263                         free(r->description);
264                         r->description = NULL;
265                 }
266                 r->description = strdup(ChrPtr(r->CData));
267         }
268
269         if (r->CData != NULL) {
270                 FreeStrBuf(&r->CData);
271                 r->CData = NULL;
272         }
273 }
274
275
276 // This handler is called whenever data appears between opening and closing tags.
277 //
278 void rss_handle_data(void *data, const char *content, int length)
279 {
280         struct rssparser *r = (struct rssparser *)data;
281
282         if (r->CData == NULL) {
283                 r->CData = NewStrBuf();
284         }
285
286         StrBufAppendBufPlain(r->CData, content, length, 0);
287 }
288
289
290 // Feed has been downloaded, now parse it.
291 //
292 void rss_parse_feed(StrBuf *Feed, struct rssroom *rooms)
293 {
294         struct rssparser r;
295
296         memset(&r, 0, sizeof r);
297         r.rooms = rooms;
298         XML_Parser p = XML_ParserCreate("UTF-8");
299         XML_SetElementHandler(p, rss_start_element, rss_end_element);
300         XML_SetCharacterDataHandler(p, rss_handle_data);
301         XML_SetUserData(p, (void *)&r);
302         XML_Parse(p, ChrPtr(Feed), StrLength(Feed), XML_TRUE);
303         XML_ParserFree(p);
304 }
305
306
307 // Add a feed/room pair into the todo list
308 //
309 void rssclient_push_todo(char *rssurl, char *roomname)
310 {
311         struct rssurl *r = NULL;
312         struct rssurl *thisone = NULL;
313         struct rssroom *newroom = NULL;
314
315         syslog(LOG_DEBUG, "rssclient: will fetch %s to %s", rssurl, roomname);
316
317         for (r=rsstodo; r!=NULL; r=r->next) {
318                 if (!strcasecmp(r->url, rssurl)) {
319                         thisone = r;
320                 }
321         }
322
323         if (thisone == NULL) {
324                 thisone = malloc(sizeof(struct rssurl));
325                 thisone->url = strdup(rssurl);
326                 thisone->rooms = NULL;
327                 thisone->next = rsstodo;
328                 rsstodo = thisone;
329         }
330
331         newroom = malloc(sizeof(struct rssroom));
332         newroom->room = strdup(roomname);
333         newroom->next = thisone->rooms;
334         thisone->rooms = newroom;
335 }
336
337
338 // pull one feed (possibly multiple rooms)
339 //
340 void rss_pull_one_feed(struct rssurl *url)
341 {
342         CURL *curl;
343         CURLcode res;
344         StrBuf *Downloaded = NULL;
345
346         syslog(LOG_DEBUG, "rssclient: fetching %s", url->url);
347
348         curl = curl_easy_init();
349         if (!curl) {
350                 return;
351         }
352
353         Downloaded = NewStrBuf();
354
355         curl_easy_setopt(curl, CURLOPT_URL, url->url);
356         curl_easy_setopt(curl, CURLOPT_SSL_VERIFYPEER, 0L);
357         curl_easy_setopt(curl, CURLOPT_SSL_VERIFYHOST, 0L);
358         curl_easy_setopt(curl, CURLOPT_FOLLOWLOCATION, 1L);                     // Follow redirects
359         curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, CurlFillStrBuf_callback); // What to do with downloaded data
360         curl_easy_setopt(curl, CURLOPT_WRITEDATA, Downloaded);                  // Give it our StrBuf to work with
361         curl_easy_setopt(curl, CURLOPT_TIMEOUT, 20L);                           // Time out after 20 seconds
362         res = curl_easy_perform(curl);                                          // Perform the request
363         if (res != CURLE_OK) {
364                 syslog(LOG_WARNING, "rssclient: failed to load feed: %s", curl_easy_strerror(res));
365         }
366         curl_easy_cleanup(curl);
367
368         rss_parse_feed(Downloaded, url->rooms);                                 // parse the feed
369         FreeStrBuf(&Downloaded);                                                // free the downloaded feed data
370 }
371
372
373 // We have a list, now download the feeds
374 //
375 void rss_pull_feeds(void)
376 {
377         struct rssurl *r;
378         struct rssroom *rr;
379
380         while ((rsstodo != NULL) && (!server_shutting_down)) {
381                 rss_pull_one_feed(rsstodo);
382                 r = rsstodo;
383                 rsstodo = rsstodo->next;
384                 while (r->rooms != NULL) {
385                         rr = r->rooms;
386                         r->rooms = r->rooms->next;
387                         free(rr->room);
388                         free(rr);
389                 }
390                 free(r->url);
391                 free(r);
392         }
393 }
394
395
396 // Scan a room's netconfig looking for RSS feed parsing requests
397 //
398 void rssclient_scan_room(struct ctdlroom *qrbuf, void *data)
399 {
400         char *serialized_config = NULL;
401         int num_configs = 0;
402         char cfgline[SIZ];
403         int i = 0;
404
405         if (server_shutting_down) return;
406
407         serialized_config = LoadRoomNetConfigFile(qrbuf->QRnumber);
408         if (!serialized_config) {
409                 return;
410         }
411
412         num_configs = num_tokens(serialized_config, '\n');
413         for (i=0; i<num_configs; ++i) {
414                 extract_token(cfgline, serialized_config, i, '\n', sizeof cfgline);
415                 if (!strncasecmp(cfgline, HKEY("rssclient|"))) {
416                         strcpy(cfgline, &cfgline[10]);
417                         char *vbar = strchr(cfgline, '|');
418                         if (vbar != NULL) {
419                                 *vbar = 0;
420                         }
421                         rssclient_push_todo(cfgline, qrbuf->QRname);
422                 }
423         }
424
425         free(serialized_config);
426 }
427
428
429 /*
430  * Scan for rooms that have RSS client requests configured
431  */
432 void rssclient_scan(void) {
433         time_t now = time(NULL);
434
435         /* Run no more than once every 15 minutes. */
436         if ((now - last_run) < 900) {
437                 syslog(LOG_DEBUG,
438                         "rssclient: polling interval not yet reached; last run was %ldm%lds ago",
439                         ((now - last_run) / 60),
440                         ((now - last_run) % 60)
441                 );
442                 return;
443         }
444
445         syslog(LOG_DEBUG, "rssclient: started");
446         CtdlForEachRoom(rssclient_scan_room, NULL);
447         rss_pull_feeds();
448         syslog(LOG_DEBUG, "rssclient: ended");
449         last_run = time(NULL);
450         return;
451 }
452
453
454 CTDL_MODULE_INIT(rssclient)
455 {
456         if (!threading)
457         {
458                 syslog(LOG_INFO, "rssclient: using %s", curl_version());
459                 CtdlRegisterSessionHook(rssclient_scan, EVT_TIMER, PRIO_AGGR + 300);
460         }
461         return "rssclient";
462 }