86346296244eb3504d1f7b5bccaa532b9b805a50
[citadel.git] / citadel / modules / rssclient / rss_atom_parser.c
1 /*
2  * Bring external RSS feeds into rooms.
3  *
4  * Copyright (c) 2007-2012 by the citadel.org team
5  *
6  * This program is open source software; you can redistribute it and/or modify
7  * it under the terms of the GNU General Public License version 3.
8  * 
9  * 
10  *
11  * This program is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14  * GNU General Public License for more details.
15  *
16  * 
17  * 
18  * 
19  */
20
21 #include <stdlib.h>
22 #include <unistd.h>
23 #include <stdio.h>
24
25 #if TIME_WITH_SYS_TIME
26 # include <sys/time.h>
27 # include <time.h>
28 #else
29 # if HAVE_SYS_TIME_H
30 #  include <sys/time.h>
31 # else
32 #  include <time.h>
33 # endif
34 #endif
35
36 #include <ctype.h>
37 #include <string.h>
38 #include <errno.h>
39 #include <sys/types.h>
40 #include <sys/stat.h>
41 #include <expat.h>
42 #include <curl/curl.h>
43 #include <libcitadel.h>
44 #include "citadel.h"
45 #include "server.h"
46 #include "citserver.h"
47 #include "support.h"
48 #include "config.h"
49 #include "threads.h"
50 #include "ctdl_module.h"
51 #include "clientsocket.h"
52 #include "msgbase.h"
53 #include "parsedate.h"
54 #include "database.h"
55 #include "citadel_dirs.h"
56 #include "md5.h"
57 #include "context.h"
58 #include "event_client.h"
59 #include "rss_atom_parser.h"
60
61 void rss_save_item(rss_item *ri, rss_aggregator *Cfg);
62
/* When zero, LOG_DEBUG messages issued through the macros below are dropped. */
int RSSAtomParserDebugEnabled = 0;

/*
 * Number printed in every log prefix: QRnumber of the aggregator that
 * IO->Data points to.  Requires a variable named IO in the expanding scope.
 */
#define N ((rss_aggregator*)IO->Data)->QRnumber

/* Guard: true unless LEVEL is LOG_DEBUG and debug logging is switched off. */
#define DBGLOG(LEVEL) if ((LEVEL != LOG_DEBUG) || (RSSAtomParserDebugEnabled != 0))

/*
 * Logging helpers.  All of them need `IO` in scope; the first two also
 * need `CCID`.  The ...M variants take a format without arguments.
 *
 * Each macro is wrapped in do { } while (0) so that it behaves as one
 * statement: without the wrapper an `else` following a call would attach
 * to the hidden `if` of DBGLOG instead of the caller's own `if`.
 */
#define EVRSSATOM_syslog(LEVEL, FORMAT, ...)				\
	do {								\
		DBGLOG(LEVEL) syslog(LEVEL,				\
				     "IO[%ld]CC[%d][%ld]RSSP" FORMAT,	\
				     IO->ID, CCID, N, __VA_ARGS__);	\
	} while (0)

#define EVRSSATOMM_syslog(LEVEL, FORMAT)				\
	do {								\
		DBGLOG(LEVEL) syslog(LEVEL,				\
				     "IO[%ld]CC[%d][%ld]RSSP" FORMAT,	\
				     IO->ID, CCID, N);			\
	} while (0)

#define EVRSSATOMCS_syslog(LEVEL, FORMAT, ...)				\
	do {								\
		DBGLOG(LEVEL) syslog(LEVEL, "IO[%ld][%ld]RSSP" FORMAT,	\
				     IO->ID, N, __VA_ARGS__);		\
	} while (0)

#define EVRSSATOMSM_syslog(LEVEL, FORMAT)				\
	do {								\
		DBGLOG(LEVEL) syslog(LEVEL, "IO[%ld][%ld]RSSP" FORMAT,	\
				     IO->ID, N);			\
	} while (0)
86
/*
 * Convert an RDF/RSS datestamp into a time_t.
 *
 * p: NUL-terminated date string, may be NULL.
 *
 * Returns 0 for NULL or strings shorter than 10 characters.  Strings that
 * look like a W3C datetime ("YYYY-MM-DD", optionally followed by "THH:MM")
 * are converted with mktime(); seconds and any zone designator are not
 * evaluated.  Everything else is handed to parsedate(); if that yields
 * nothing positive the current time is returned.
 */
time_t rdf_parsedate(const char *p)
{
	struct tm tm;
	time_t t = 0;
	size_t len;

	if (!p) return 0L;
	len = strlen(p);
	if (len < 10) return 0L;

	memset(&tm, 0, sizeof tm);

	/*
	 * If the timestamp appears to be in W3C datetime format, try to
	 * parse it.  See also: http://www.w3.org/TR/NOTE-datetime
	 *
	 * This code, along with parsedate.c, is a potential candidate for
	 * moving into libcitadel.
	 */
	if ( (p[4] == '-') && (p[7] == '-') ) {
		tm.tm_year = atoi(&p[0]) - 1900;
		tm.tm_mon = atoi(&p[5]) - 1;
		tm.tm_mday = atoi(&p[8]);
		/*
		 * Only the first 10 characters are known to exist; make sure
		 * p[13] lies inside the string before looking at it.
		 */
		if ( (len > 13) && (p[10] == 'T') && (p[13] == ':') ) {
			tm.tm_hour = atoi(&p[11]);
			tm.tm_min = atoi(&p[14]);
		}
		return mktime(&tm);
	}

	/* hmm... try RFC822 date stamp format */

	t = parsedate(p);
	if (t > 0) return(t);

	/* yeesh.  ok, just return the current date and time. */
	return(time(NULL));
}
126
127 void flush_rss_item(rss_item *ri)
128 {
129         /* Initialize the feed item data structure */
130         FreeStrBuf(&ri->guid);
131         FreeStrBuf(&ri->title);
132         FreeStrBuf(&ri->link);
133         FreeStrBuf(&ri->author_or_creator);
134         FreeStrBuf(&ri->author_email);
135         FreeStrBuf(&ri->author_url);
136         FreeStrBuf(&ri->description);
137
138         FreeStrBuf(&ri->linkTitle);
139         FreeStrBuf(&ri->reLink);
140         FreeStrBuf(&ri->reLinkTitle);
141         FreeStrBuf(&ri->channel_title);
142 }
143
144
145 /******************************************************************************
146  *                              XML-Handler                                   *
147  ******************************************************************************/
148
149
150 void RSS_item_rss_start (StrBuf *CData,
151                          rss_item *ri,
152                          rss_aggregator *RSSAggr,
153                          const char** Attr)
154 {
155         AsyncIO         *IO = &RSSAggr->IO;
156         EVRSSATOMM_syslog(LOG_DEBUG, "RSS: This is an RSS feed.\n");
157         RSSAggr->ItemType = RSS_RSS;
158 }
159
160 void RSS_item_rdf_start(StrBuf *CData,
161                         rss_item *ri,
162                         rss_aggregator *RSSAggr,
163                         const char** Attr)
164 {
165         AsyncIO         *IO = &RSSAggr->IO;
166         EVRSSATOMM_syslog(LOG_DEBUG, "RSS: This is an RDF feed.\n");
167         RSSAggr->ItemType = RSS_RSS;
168 }
169
170 void ATOM_item_feed_start(StrBuf *CData,
171                           rss_item *ri,
172                           rss_aggregator *RSSAggr,
173                           const char** Attr)
174 {
175         AsyncIO         *IO = &RSSAggr->IO;
176         EVRSSATOMM_syslog(LOG_DEBUG, "RSS: This is an ATOM feed.\n");
177         RSSAggr->ItemType = RSS_ATOM;
178 }
179
180
181 void RSS_item_item_start(StrBuf *CData,
182                          rss_item *ri,
183                          rss_aggregator *RSSAggr,
184                          const char** Attr)
185 {
186         ri->item_tag_nesting ++;
187         flush_rss_item(ri);
188 }
189
190 void ATOM_item_entry_start(StrBuf *CData,
191                            rss_item *ri,
192                            rss_aggregator *RSSAggr,
193                            const char** Attr)
194 {
195 /* Atom feed... */
196         ri->item_tag_nesting ++;
197         flush_rss_item(ri);
198 }
199
200 void ATOM_item_link_start (StrBuf *CData,
201                            rss_item *ri,
202                            rss_aggregator *RSSAggr,
203                            const char** Attr)
204 {
205         int i;
206         const char *pHref = NULL;
207         const char *pType = NULL;
208         const char *pRel = NULL;
209         const char *pTitle = NULL;
210
211         for (i = 0; Attr[i] != NULL; i+=2)
212         {
213                 if (!strcmp(Attr[i], "href"))
214                 {
215                         pHref = Attr[i+1];
216                 }
217                 else if (!strcmp(Attr[i], "rel"))
218                 {
219                         pRel = Attr[i+1];
220                 }
221                 else if (!strcmp(Attr[i], "type"))
222                 {
223                         pType = Attr[i+1];
224                 }
225                 else if (!strcmp(Attr[i], "title"))
226                 {
227                         pTitle = Attr[i+1];
228                 }
229         }
230         if (pHref == NULL)
231                 return; /* WHUT? Pointing... where? */
232         if ((pType != NULL) && !strcasecmp(pType, "application/atom+xml"))
233                 return;
234         /* these just point to other rss resources,
235            we're not interested in them. */
236         if (pRel != NULL)
237         {
238                 if (!strcasecmp (pRel, "replies"))
239                 {
240                         NewStrBufDupAppendFlush(&ri->reLink, NULL, pHref, -1);
241                         StrBufTrim(ri->link);
242                         NewStrBufDupAppendFlush(&ri->reLinkTitle,
243                                                 NULL,
244                                                 pTitle,
245                                                 -1);
246                 }
247                 else if (!strcasecmp(pRel, "alternate"))
248                 { /* Alternative representation of this Item... */
249                         NewStrBufDupAppendFlush(&ri->link, NULL, pHref, -1);
250                         StrBufTrim(ri->link);
251                         NewStrBufDupAppendFlush(&ri->linkTitle,
252                                                 NULL,
253                                                 pTitle,
254                                                 -1);
255
256                 }
257 #if 0 /* these are also defined, but dunno what to do with them.. */
258                 else if (!strcasecmp(pRel, "related"))
259                 {
260                 }
261                 else if (!strcasecmp(pRel, "self"))
262                 {
263                 }
264                 else if (!strcasecmp(pRel, "enclosure"))
265                 {/*...reference can get big, and is probably the full article*/
266                 }
267                 else if (!strcasecmp(pRel, "via"))
268                 {/* this article was provided via... */
269                 }
270 #endif
271         }
272         else if (StrLength(ri->link) == 0)
273         {
274                 NewStrBufDupAppendFlush(&ri->link, NULL, pHref, -1);
275                 StrBufTrim(ri->link);
276                 NewStrBufDupAppendFlush(&ri->linkTitle, NULL, pTitle, -1);
277         }
278 }
279
280
281
282
283 void ATOMRSS_item_title_end(StrBuf *CData,
284                             rss_item *ri,
285                             rss_aggregator *RSSAggr,
286                             const char** Attr)
287 {
288         if ((ri->item_tag_nesting == 0) && (StrLength(CData) > 0)) {
289                 NewStrBufDupAppendFlush(&ri->channel_title, CData, NULL, 0);
290                 StrBufTrim(ri->channel_title);
291         }
292 }
293
294 void RSS_item_guid_end(StrBuf *CData,
295                        rss_item *ri,
296                        rss_aggregator *RSSAggr,
297                        const char** Attr)
298 {
299         if (StrLength(CData) > 0) {
300                 NewStrBufDupAppendFlush(&ri->guid, CData, NULL, 0);
301         }
302 }
303
304 void ATOM_item_id_end(StrBuf *CData,
305                       rss_item *ri, rss_aggregator *RSSAggr, const char** Attr)
306 {
307         if (StrLength(CData) > 0) {
308                 NewStrBufDupAppendFlush(&ri->guid, CData, NULL, 0);
309         }
310 }
311
312
313 void RSS_item_link_end (StrBuf *CData,
314                         rss_item *ri,
315                         rss_aggregator *RSSAggr,
316                         const char** Attr)
317 {
318         if (StrLength(CData) > 0) {
319                 NewStrBufDupAppendFlush(&ri->link, CData, NULL, 0);
320                 StrBufTrim(ri->link);
321         }
322 }
323 void RSS_item_relink_end(StrBuf *CData,
324                          rss_item *ri,
325                          rss_aggregator *RSSAggr,
326                          const char** Attr)
327 {
328         if (StrLength(CData) > 0) {
329                 NewStrBufDupAppendFlush(&ri->reLink, CData, NULL, 0);
330                 StrBufTrim(ri->reLink);
331         }
332 }
333
334 void RSSATOM_item_title_end (StrBuf *CData,
335                              rss_item *ri,
336                              rss_aggregator *RSSAggr,
337                              const char** Attr)
338 {
339         if (StrLength(CData) > 0) {
340                 NewStrBufDupAppendFlush(&ri->title, CData, NULL, 0);
341                 StrBufTrim(ri->title);
342         }
343 }
344
345 void ATOM_item_content_end (StrBuf *CData,
346                             rss_item *ri,
347                             rss_aggregator *RSSAggr,
348                             const char** Attr)
349 {
350         long olen = StrLength (ri->description);
351         long clen = StrLength (CData);
352         if (clen > 0)
353         {
354                 if (olen == 0) {
355                         NewStrBufDupAppendFlush(&ri->description,
356                                                 CData,
357                                                 NULL,
358                                                 0);
359                         StrBufTrim(ri->description);
360                 }
361                 else if (olen < clen) {
362                         FlushStrBuf(ri->description);
363                         NewStrBufDupAppendFlush(&ri->description,
364                                                 CData,
365                                                 NULL,
366                                                 0);
367
368                         StrBufTrim(ri->description);
369                 }
370         }
371 }
372 void ATOM_item_summary_end (StrBuf *CData,
373                             rss_item *ri,
374                             rss_aggregator *RSSAggr,
375                             const char** Attr)
376 {
377         /*
378          * this can contain an abstract of the article.
379          * but we don't want to verwrite a full document if we already have it.
380          */
381         if ((StrLength(CData) > 0) && (StrLength(ri->description) == 0))
382         {
383                 NewStrBufDupAppendFlush(&ri->description, CData, NULL, 0);
384                 StrBufTrim(ri->description);
385         }
386 }
387
388 void RSS_item_description_end (StrBuf *CData,
389                                rss_item *ri,
390                                rss_aggregator *RSSAggr,
391                                const char** Attr)
392 {
393         long olen = StrLength (ri->description);
394         long clen = StrLength (CData);
395         if (clen > 0)
396         {
397                 if (olen == 0) {
398                         NewStrBufDupAppendFlush(&ri->description,
399                                                 CData,
400                                                 NULL,
401                                                 0);
402                         StrBufTrim(ri->description);
403                 }
404                 else if (olen < clen) {
405                         FlushStrBuf(ri->description);
406                         NewStrBufDupAppendFlush(&ri->description,
407                                                 CData,
408                                                 NULL,
409                                                 0);
410                         StrBufTrim(ri->description);
411                 }
412         }
413 }
414
415 void ATOM_item_published_end (StrBuf *CData,
416                               rss_item *ri,
417                               rss_aggregator *RSSAggr,
418                               const char** Attr)
419 {
420         if (StrLength(CData) > 0) {
421                 StrBufTrim(CData);
422                 ri->pubdate = rdf_parsedate(ChrPtr(CData));
423         }
424 }
425
426 void ATOM_item_updated_end (StrBuf *CData,
427                             rss_item *ri,
428                             rss_aggregator *RSSAggr,
429                             const char** Attr)
430 {
431         if (StrLength(CData) > 0) {
432                 StrBufTrim(CData);
433                 ri->pubdate = rdf_parsedate(ChrPtr(CData));
434         }
435 }
436
437 void RSS_item_pubdate_end (StrBuf *CData,
438                            rss_item *ri,
439                            rss_aggregator *RSSAggr,
440                            const char** Attr)
441 {
442         if (StrLength(CData) > 0) {
443                 StrBufTrim(CData);
444                 ri->pubdate = rdf_parsedate(ChrPtr(CData));
445         }
446 }
447
448
449 void RSS_item_date_end (StrBuf *CData,
450                         rss_item *ri,
451                         rss_aggregator *RSSAggr,
452                         const char** Attr)
453 {
454         if (StrLength(CData) > 0) {
455                 StrBufTrim(CData);
456                 ri->pubdate = rdf_parsedate(ChrPtr(CData));
457         }
458 }
459
460
461
462 void RSS_item_author_end(StrBuf *CData,
463                          rss_item *ri,
464                          rss_aggregator *RSSAggr,
465                          const char** Attr)
466 {
467         if (StrLength(CData) > 0) {
468                 NewStrBufDupAppendFlush(&ri->author_or_creator, CData, NULL, 0);
469                 StrBufTrim(ri->author_or_creator);
470         }
471 }
472
473
474 void ATOM_item_name_end(StrBuf *CData,
475                         rss_item *ri,
476                         rss_aggregator *RSSAggr,
477                         const char** Attr)
478 {
479         if (StrLength(CData) > 0) {
480                 NewStrBufDupAppendFlush(&ri->author_or_creator, CData, NULL, 0);
481                 StrBufTrim(ri->author_or_creator);
482         }
483 }
484
485 void ATOM_item_email_end(StrBuf *CData,
486                          rss_item *ri,
487                          rss_aggregator *RSSAggr,
488                          const char** Attr)
489 {
490         if (StrLength(CData) > 0) {
491                 NewStrBufDupAppendFlush(&ri->author_email, CData, NULL, 0);
492                 StrBufTrim(ri->author_email);
493         }
494 }
495
496 void RSS_item_creator_end(StrBuf *CData,
497                           rss_item *ri,
498                           rss_aggregator *RSSAggr,
499                           const char** Attr)
500 {
501         if ((StrLength(CData) > 0) &&
502             (StrLength(ri->author_or_creator) == 0))
503         {
504                 NewStrBufDupAppendFlush(&ri->author_or_creator, CData, NULL, 0);
505                 StrBufTrim(ri->author_or_creator);
506         }
507 }
508
509
510 void ATOM_item_uri_end(StrBuf *CData,
511                        rss_item *ri,
512                        rss_aggregator *RSSAggr,
513                        const char** Attr)
514 {
515         if (StrLength(CData) > 0) {
516                 NewStrBufDupAppendFlush(&ri->author_url, CData, NULL, 0);
517                 StrBufTrim(ri->author_url);
518         }
519 }
520
521 void RSS_item_item_end(StrBuf *CData,
522                        rss_item *ri,
523                        rss_aggregator *RSSAggr,
524                        const char** Attr)
525 {
526         --ri->item_tag_nesting;
527         rss_save_item(ri, RSSAggr);
528 }
529
530
531 void ATOM_item_entry_end(StrBuf *CData,
532                          rss_item *ri,
533                          rss_aggregator *RSSAggr,
534                          const char** Attr)
535 {
536         --ri->item_tag_nesting;
537         rss_save_item(ri, RSSAggr);
538 }
539
540 void RSS_item_rss_end(StrBuf *CData,
541                       rss_item *ri,
542                       rss_aggregator *RSSAggr,
543                       const char** Attr)
544 {
545         AsyncIO         *IO = &RSSAggr->IO;
546         EVRSSATOMM_syslog(LOG_DEBUG, "End of feed detected.  Closing parser.\n");
547         ri->done_parsing = 1;
548 }
549
550 void RSS_item_rdf_end(StrBuf *CData,
551                       rss_item *ri,
552                       rss_aggregator *RSSAggr,
553                       const char** Attr)
554 {
555         AsyncIO         *IO = &RSSAggr->IO;
556         EVRSSATOMM_syslog(LOG_DEBUG, "End of feed detected.  Closing parser.\n");
557         ri->done_parsing = 1;
558 }
559
560
561 void RSSATOM_item_ignore(StrBuf *CData,
562                          rss_item *ri,
563                          rss_aggregator *RSSAggr,
564                          const char** Attr)
565 {
566 }
567
568
569
570 /*
571  * This callback stores up the data which appears in between tags.
572  */
573 void rss_xml_cdata_start(void *data)
574 {
575         rss_aggregator *RSSAggr = (rss_aggregator*) data;
576
577         FlushStrBuf(RSSAggr->CData);
578 }
579
/* CDATA-section end callback: nothing to do. */
void rss_xml_cdata_end(void *data)
{
	(void) data;
}
583 void rss_xml_chardata(void *data, const XML_Char *s, int len)
584 {
585         rss_aggregator *RSSAggr = (rss_aggregator*) data;
586
587         StrBufAppendBufPlain (RSSAggr->CData, s, len, 0);
588 }
589
590
591 /******************************************************************************
592  *                            RSS parser logic                                *
593  ******************************************************************************/
594
595 extern pthread_mutex_t RSSQueueMutex;
596
597 HashList *StartHandlers = NULL;
598 HashList *EndHandlers = NULL;
599 HashList *KnownNameSpaces = NULL;
600
601 void FreeNetworkSaveMessage (void *vMsg)
602 {
603         networker_save_message *Msg = (networker_save_message *) vMsg;
604
605         CtdlFreeMessageContents(&Msg->Msg);
606         FreeStrBuf(&Msg->Message);
607         FreeStrBuf(&Msg->MsgGUID);
608         free(Msg);
609 }
610
611
612 void AppendLink(StrBuf *Message,
613                 StrBuf *link,
614                 StrBuf *LinkTitle,
615                 const char *Title)
616 {
617         if (StrLength(link) > 0)
618         {
619                 StrBufAppendBufPlain(Message, HKEY("<a href=\""), 0);
620                 StrBufAppendBuf(Message, link, 0);
621                 StrBufAppendBufPlain(Message, HKEY("\">"), 0);
622                 if (StrLength(LinkTitle) > 0)
623                         StrBufAppendBuf(Message, LinkTitle, 0);
624                 else if ((Title != NULL) && !IsEmptyStr(Title))
625                         StrBufAppendBufPlain(Message, Title, -1, 0);
626                 else
627                         StrBufAppendBuf(Message, link, 0);
628                 StrBufAppendBufPlain(Message, HKEY("</a><br>\n"), 0);
629         }
630 }
631
632 /*
633  * Commit a fetched and parsed RSS item to disk
634  */
635 void rss_save_item(rss_item *ri, rss_aggregator *RSSAggr)
636 {
637         networker_save_message *SaveMsg;
638         struct MD5Context md5context;
639         u_char rawdigest[MD5_DIGEST_LEN];
640         int msglen = 0;
641         StrBuf *Message;
642         StrBuf *guid;
643         AsyncIO *IO = &RSSAggr->IO;
644         int n;
645
646
647         SaveMsg = (networker_save_message *) malloc(
648                 sizeof(networker_save_message));
649         memset(SaveMsg, 0, sizeof(networker_save_message));
650
651         /* Construct a GUID to use in the S_USETABLE table.
652          * If one is not present in the item itself, make one up.
653          */
654         if (ri->guid != NULL) {
655                 StrBufSpaceToBlank(ri->guid);
656                 StrBufTrim(ri->guid);
657                 guid = NewStrBufPlain(HKEY("rss/"));
658                 StrBufAppendBuf(guid, ri->guid, 0);
659         }
660         else {
661                 MD5Init(&md5context);
662                 if (ri->title != NULL) {
663                         MD5Update(&md5context,
664                                   (const unsigned char*)SKEY(ri->title));
665                 }
666                 if (ri->link != NULL) {
667                         MD5Update(&md5context,
668                                   (const unsigned char*)SKEY(ri->link));
669                 }
670                 MD5Final(rawdigest, &md5context);
671                 guid = NewStrBufPlain(NULL,
672                                       MD5_DIGEST_LEN * 2 + 12 /* _rss2ctdl*/);
673                 StrBufHexEscAppend(guid, NULL, rawdigest, MD5_DIGEST_LEN);
674                 StrBufAppendBufPlain(guid, HKEY("_rss2ctdl"), 0);
675         }
676
677         /* translate Item into message. */
678         EVRSSATOMM_syslog(LOG_DEBUG, "RSS: translating item...\n");
679         if (ri->description == NULL) ri->description = NewStrBufPlain(HKEY(""));
680         StrBufSpaceToBlank(ri->description);
681         SaveMsg->Msg.cm_magic = CTDLMESSAGE_MAGIC;
682         SaveMsg->Msg.cm_anon_type = MES_NORMAL;
683         SaveMsg->Msg.cm_format_type = FMT_RFC822;
684
685         if (ri->guid != NULL) {
686                 SaveMsg->Msg.cm_fields['E'] = strdup(ChrPtr(ri->guid));
687         }
688
689         if (ri->author_or_creator != NULL) {
690                 char *From;
691                 StrBuf *Encoded = NULL;
692                 int FromAt;
693
694                 From = html_to_ascii(ChrPtr(ri->author_or_creator),
695                                      StrLength(ri->author_or_creator),
696                                      512, 0);
697                 StrBufPlain(ri->author_or_creator, From, -1);
698                 StrBufTrim(ri->author_or_creator);
699                 free(From);
700
701                 FromAt = strchr(ChrPtr(ri->author_or_creator), '@') != NULL;
702                 if (!FromAt && StrLength (ri->author_email) > 0)
703                 {
704                         StrBufRFC2047encode(&Encoded, ri->author_or_creator);
705                         SaveMsg->Msg.cm_fields['A'] = SmashStrBuf(&Encoded);
706                         SaveMsg->Msg.cm_fields['P'] =
707                                 SmashStrBuf(&ri->author_email);
708                 }
709                 else
710                 {
711                         if (FromAt)
712                         {
713                                 SaveMsg->Msg.cm_fields['A'] =
714                                         SmashStrBuf(&ri->author_or_creator);
715                                 SaveMsg->Msg.cm_fields['P'] =
716                                         strdup(SaveMsg->Msg.cm_fields['A']);
717                         }
718                         else
719                         {
720                                 StrBufRFC2047encode(&Encoded,
721                                                     ri->author_or_creator);
722                                 SaveMsg->Msg.cm_fields['A'] =
723                                         SmashStrBuf(&Encoded);
724                                 SaveMsg->Msg.cm_fields['P'] =
725                                         strdup("rss@localhost");
726
727                         }
728                         if (ri->pubdate <= 0) {
729                                 ri->pubdate = time(NULL);
730                         }
731                 }
732         }
733         else {
734                 SaveMsg->Msg.cm_fields['A'] = strdup("rss");
735         }
736
737         SaveMsg->Msg.cm_fields['N'] = strdup(NODENAME);
738         if (ri->title != NULL) {
739                 long len;
740                 char *Sbj;
741                 StrBuf *Encoded, *QPEncoded;
742
743                 QPEncoded = NULL;
744                 StrBufSpaceToBlank(ri->title);
745                 len = StrLength(ri->title);
746                 Sbj = html_to_ascii(ChrPtr(ri->title), len, 512, 0);
747                 len = strlen(Sbj);
748                 if ((len > 0) && (Sbj[len - 1] == '\n'))
749                 {
750                         len --;
751                         Sbj[len] = '\0';
752                 }
753                 Encoded = NewStrBufPlain(Sbj, len);
754                 free(Sbj);
755
756                 StrBufTrim(Encoded);
757                 StrBufRFC2047encode(&QPEncoded, Encoded);
758
759                 SaveMsg->Msg.cm_fields['U'] = SmashStrBuf(&QPEncoded);
760                 FreeStrBuf(&Encoded);
761         }
762         SaveMsg->Msg.cm_fields['T'] = malloc(64);
763         snprintf(SaveMsg->Msg.cm_fields['T'], 64, "%ld", ri->pubdate);
764         if (ri->channel_title != NULL) {
765                 if (StrLength(ri->channel_title) > 0) {
766                         SaveMsg->Msg.cm_fields['O'] =
767                                 strdup(ChrPtr(ri->channel_title));
768                 }
769         }
770         if (ri->link == NULL)
771                 ri->link = NewStrBufPlain(HKEY(""));
772
773 #if 0 /* temporarily disable shorter urls. */
774         SaveMsg->Msg.cm_fields[TMP_SHORTER_URLS] =
775                 GetShorterUrls(ri->description);
776 #endif
777
778         msglen += 1024 + StrLength(ri->link) + StrLength(ri->description) ;
779
780         Message = NewStrBufPlain(NULL, msglen);
781
782         StrBufPlain(Message, HKEY(
783                             "Content-type: text/html; charset=\"UTF-8\"\r\n\r\n"
784                             "<html><body>\n"));
785 #if 0 /* disable shorter url for now. */
786         SaveMsg->Msg.cm_fields[TMP_SHORTER_URL_OFFSET] = StrLength(Message);
787 #endif
788         StrBufAppendBuf(Message, ri->description, 0);
789         StrBufAppendBufPlain(Message, HKEY("<br><br>\n"), 0);
790
791         AppendLink(Message, ri->link, ri->linkTitle, NULL);
792         AppendLink(Message, ri->reLink, ri->reLinkTitle, "Reply to this");
793         StrBufAppendBufPlain(Message, HKEY("</body></html>\n"), 0);
794
795         SaveMsg->MsgGUID = guid;
796         SaveMsg->Message = Message;
797
798         n = GetCount(RSSAggr->Messages) + 1;
799         Put(RSSAggr->Messages, IKEY(n), SaveMsg, FreeNetworkSaveMessage);
800 }
801
802
803 void rss_xml_start(void *data, const char *supplied_el, const char **attr)
804 {
805         rss_xml_handler *h;
806         rss_aggregator  *RSSAggr = (rss_aggregator*) data;
807         AsyncIO         *IO = &RSSAggr->IO;
808         rss_item        *ri = RSSAggr->Item;
809         void            *pv;
810         const char      *pel;
811         char            *sep = NULL;
812
813         /* Axe the namespace, we don't care about it */
814         /*
815           syslog(LOG_DEBUG,
816           "RSS: supplied el %d: %s\n", RSSAggr->RSSAggr->ItemType, supplied_el);
817         */
818         pel = supplied_el;
819         while (sep = strchr(pel, ':'), sep) {
820                 pel = sep + 1;
821         }
822
823         if (pel != supplied_el)
824         {
825                 void *v;
826
827                 if (!GetHash(KnownNameSpaces,
828                              supplied_el,
829                              pel - supplied_el - 1,
830                              &v))
831                 {
832                         EVRSSATOM_syslog(LOG_DEBUG,
833                                          "RSS: START ignoring "
834                                          "because of wrong namespace [%s]\n",
835                                          supplied_el);
836                         return;
837                 }
838         }
839
840         StrBufPlain(RSSAggr->Key, pel, -1);
841         StrBufLowerCase(RSSAggr->Key);
842         if (GetHash(StartHandlers, SKEY(RSSAggr->Key), &pv))
843         {
844                 h = (rss_xml_handler*) pv;
845
846                 if (((h->Flags & RSS_UNSET) != 0) &&
847                     (RSSAggr->ItemType == RSS_UNSET))
848                 {
849                         h->Handler(RSSAggr->CData, ri, RSSAggr, attr);
850                 }
851                 else if (((h->Flags & RSS_RSS) != 0) &&
852                     (RSSAggr->ItemType == RSS_RSS))
853                 {
854                         h->Handler(RSSAggr->CData, ri, RSSAggr, attr);
855                 }
856                 else if (((h->Flags & RSS_ATOM) != 0) &&
857                          (RSSAggr->ItemType == RSS_ATOM))
858                 {
859                         h->Handler(RSSAggr->CData,
860                                    ri,
861                                    RSSAggr,
862                                    attr);
863                 }
864                 else
865                         EVRSSATOM_syslog(LOG_DEBUG,
866                                           "RSS: START unhandled: [%s] [%s]...\n",
867                                          pel,
868                                          supplied_el);
869         }
870         else
871                 EVRSSATOM_syslog(LOG_DEBUG,
872                                  "RSS: START unhandled: [%s] [%s]...\n",
873                                  pel,
874                                  supplied_el);
875 }
876
877 void rss_xml_end(void *data, const char *supplied_el)
878 {
879         rss_xml_handler *h;
880         rss_aggregator  *RSSAggr = (rss_aggregator*) data;
881         AsyncIO         *IO = &RSSAggr->IO;
882         rss_item        *ri = RSSAggr->Item;
883         const char      *pel;
884         char            *sep = NULL;
885         void            *pv;
886
887         /* Axe the namespace, we don't care about it */
888         pel = supplied_el;
889         while (sep = strchr(pel, ':'), sep) {
890                 pel = sep + 1;
891         }
892         EVRSSATOM_syslog(LOG_DEBUG, "RSS: END %s...\n", supplied_el);
893         if (pel != supplied_el)
894         {
895                 void *v;
896
897                 if (!GetHash(KnownNameSpaces,
898                              supplied_el,
899                              pel - supplied_el - 1,
900                              &v))
901                 {
902                         EVRSSATOM_syslog(LOG_DEBUG,
903                                          "RSS: END ignoring because of wrong namespace"
904                                          "[%s] = [%s]\n",
905                                          supplied_el,
906                                          ChrPtr(RSSAggr->CData));
907                         FlushStrBuf(RSSAggr->CData);
908                         return;
909                 }
910         }
911
912         StrBufPlain(RSSAggr->Key, pel, -1);
913         StrBufLowerCase(RSSAggr->Key);
914         if (GetHash(EndHandlers, SKEY(RSSAggr->Key), &pv))
915         {
916                 h = (rss_xml_handler*) pv;
917
918                 if (((h->Flags & RSS_UNSET) != 0) &&
919                     (RSSAggr->ItemType == RSS_UNSET))
920                 {
921                         h->Handler(RSSAggr->CData, ri, RSSAggr, NULL);
922                 }
923                 else if (((h->Flags & RSS_RSS) != 0) &&
924                     (RSSAggr->ItemType == RSS_RSS))
925                 {
926                         h->Handler(RSSAggr->CData, ri, RSSAggr, NULL);
927                 }
928                 else if (((h->Flags & RSS_ATOM) != 0) &&
929                          (RSSAggr->ItemType == RSS_ATOM))
930                 {
931                         h->Handler(RSSAggr->CData, ri, RSSAggr, NULL);
932                 }
933                 else
934                         EVRSSATOM_syslog(LOG_DEBUG,
935                                          "RSS: END   unhandled: [%s]  [%s] = [%s]...\n",
936                                          pel,
937                                          supplied_el,
938                                          ChrPtr(RSSAggr->CData));
939         }
940         else
941                 EVRSSATOM_syslog(LOG_DEBUG,
942                                  "RSS: END   unhandled: [%s]  [%s] = [%s]...\n",
943                                  pel,
944                                  supplied_el,
945                                  ChrPtr(RSSAggr->CData));
946         FlushStrBuf(RSSAggr->CData);
947 }
948
/*
 * Disabled: callback function for passing libcurl's output to expat for
 * parsing. We don't do streamed parsing, so that expat can handle non-UTF-8
 * documents.
 *
size_t rss_libcurl_callback(void *ptr, size_t size, size_t nmemb, void *stream)
{
        XML_Parse((XML_Parser)stream, ptr, (size * nmemb), 0);
        return (size*nmemb);
}
 */
958
959 eNextState RSSAggregator_ParseReply(AsyncIO *IO)
960 {
961         StrBuf *Buf;
962         rss_aggregator *RSSAggr;
963         rss_item *ri;
964         const char *at;
965         char *ptr;
966         long len;
967         const char *Key;
968
969
970         if (IO->HttpReq.httpcode != 200)
971         {
972
973                 EVRSSATOM_syslog(LOG_ALERT, "need a 200, got a %ld !\n",
974                                  IO->HttpReq.httpcode);
975 // TODO: aide error message with rate limit
976                 return eAbort;
977         }
978
979         RSSAggr = IO->Data;
980         ri = RSSAggr->Item;
981         RSSAggr->CData = NewStrBufPlain(NULL, SIZ);
982         RSSAggr->Key = NewStrBuf();
983         at = NULL;
984         StrBufSipLine(RSSAggr->Key, IO->HttpReq.ReplyData, &at);
985         ptr = NULL;
986
987 #define encoding "encoding=\""
988         ptr = strstr(ChrPtr(RSSAggr->Key), encoding);
989         if (ptr != NULL)
990         {
991                 char *pche;
992
993                 ptr += sizeof (encoding) - 1;
994                 pche = strchr(ptr, '"');
995                 if (pche != NULL)
996                         StrBufCutAt(RSSAggr->Key, -1, pche);
997                 else
998                         ptr = "UTF-8";
999         }
1000         else
1001                 ptr = "UTF-8";
1002
1003         EVRSSATOM_syslog(LOG_DEBUG, "RSS: Now parsing [%s] \n", ChrPtr(RSSAggr->Url));
1004
1005         RSSAggr->xp = XML_ParserCreateNS(ptr, ':');
1006         if (!RSSAggr->xp) {
1007                 EVRSSATOMM_syslog(LOG_ALERT, "Cannot create XML parser!\n");
1008                 return eAbort;
1009         }
1010         FlushStrBuf(RSSAggr->Key);
1011
1012         RSSAggr->Messages = NewHash(1, Flathash);
1013         XML_SetElementHandler(RSSAggr->xp, rss_xml_start, rss_xml_end);
1014         XML_SetCharacterDataHandler(RSSAggr->xp, rss_xml_chardata);
1015         XML_SetUserData(RSSAggr->xp, RSSAggr);
1016         XML_SetCdataSectionHandler(RSSAggr->xp,
1017                                    rss_xml_cdata_start,
1018                                    rss_xml_cdata_end);
1019
1020
1021         len = StrLength(IO->HttpReq.ReplyData);
1022         ptr = SmashStrBuf(&IO->HttpReq.ReplyData);
1023         XML_Parse(RSSAggr->xp, ptr, len, 0);
1024         free (ptr);
1025         if (ri->done_parsing == 0)
1026                 XML_Parse(RSSAggr->xp, "", 0, 1);
1027
1028
1029         EVRSSATOM_syslog(LOG_DEBUG, "RSS: XML Status [%s] \n",
1030                          XML_ErrorString(XML_GetErrorCode(RSSAggr->xp)));
1031
1032         XML_ParserFree(RSSAggr->xp);
1033         flush_rss_item(ri);
1034
1035         Buf = NewStrBufDup(RSSAggr->rooms);
1036         RSSAggr->recp.recp_room = SmashStrBuf(&Buf);
1037         RSSAggr->recp.num_room = RSSAggr->roomlist_parts;
1038         RSSAggr->recp.recptypes_magic = RECPTYPES_MAGIC;
1039
1040         RSSAggr->Pos = GetNewHashPos(RSSAggr->Messages, 1);
1041
1042 //RSSAggr->next_poll = time(NULL) + config.c_net_freq;
1043         if (GetNextHashPos(RSSAggr->Messages,
1044                            RSSAggr->Pos,
1045                            &len,
1046                            &Key,
1047                            (void**) &RSSAggr->ThisMsg))
1048                 return QueueDBOperation(IO, RSS_FetchNetworkUsetableEntry);
1049         else
1050                 return eAbort;
1051 }
1052
1053
1054 /******************************************************************************
1055  *                    RSS handler registering logic                           *
1056  ******************************************************************************/
1057
1058 void AddRSSStartHandler(rss_handler_func Handler,
1059                         int Flags,
1060                         const char *key,
1061                         long len)
1062 {
1063         rss_xml_handler *h;
1064         h = (rss_xml_handler*) malloc(sizeof (rss_xml_handler));
1065         h->Flags = Flags;
1066         h->Handler = Handler;
1067         Put(StartHandlers, key, len, h, NULL);
1068 }
1069
1070 void AddRSSEndHandler(rss_handler_func Handler,
1071                       int Flags,
1072                       const char *key,
1073                       long len)
1074 {
1075         rss_xml_handler *h;
1076         h = (rss_xml_handler*) malloc(sizeof (rss_xml_handler));
1077         h->Flags = Flags;
1078         h->Handler = Handler;
1079         Put(EndHandlers, key, len, h, NULL);
1080 }
1081
1082 void rss_parser_cleanup(void)
1083 {
1084         DeleteHash(&StartHandlers);
1085         DeleteHash(&EndHandlers);
1086         DeleteHash(&KnownNameSpaces);
1087 }
1088
1089 void LogDebugEnableRSSATOMParser(const int n)
1090 {
1091         RSSAtomParserDebugEnabled = n;
1092 }
1093
1094 CTDL_MODULE_INIT(rssparser)
1095 {
1096         if (!threading)
1097         {
1098                 StartHandlers = NewHash(1, NULL);
1099                 EndHandlers = NewHash(1, NULL);
1100
1101                 AddRSSStartHandler(RSS_item_rss_start,     RSS_UNSET, HKEY("rss"));
1102                 AddRSSStartHandler(RSS_item_rdf_start,     RSS_UNSET, HKEY("rdf"));
1103                 AddRSSStartHandler(ATOM_item_feed_start,   RSS_UNSET, HKEY("feed"));
1104                 AddRSSStartHandler(RSS_item_item_start,    RSS_RSS, HKEY("item"));
1105                 AddRSSStartHandler(ATOM_item_entry_start,  RSS_ATOM, HKEY("entry"));
1106                 AddRSSStartHandler(ATOM_item_link_start,   RSS_ATOM, HKEY("link"));
1107
1108                 AddRSSEndHandler(ATOMRSS_item_title_end,   RSS_ATOM|RSS_RSS|RSS_REQUIRE_BUF, HKEY("title"));
1109                 AddRSSEndHandler(RSS_item_guid_end,        RSS_RSS|RSS_REQUIRE_BUF, HKEY("guid"));
1110                 AddRSSEndHandler(ATOM_item_id_end,         RSS_ATOM|RSS_REQUIRE_BUF, HKEY("id"));
1111                 AddRSSEndHandler(RSS_item_link_end,        RSS_RSS|RSS_REQUIRE_BUF, HKEY("link"));
1112 #if 0
1113 // hm, rss to the comments of that blog, might be interesting in future, but...
1114                 AddRSSEndHandler(RSS_item_relink_end,      RSS_RSS|RSS_REQUIRE_BUF, HKEY("commentrss"));
1115 // comment count...
1116                 AddRSSEndHandler(RSS_item_relink_end,      RSS_RSS|RSS_REQUIRE_BUF, HKEY("comments"));
1117 #endif
1118                 AddRSSEndHandler(RSSATOM_item_title_end,   RSS_ATOM|RSS_RSS|RSS_REQUIRE_BUF, HKEY("title"));
1119                 AddRSSEndHandler(ATOM_item_content_end,    RSS_ATOM|RSS_REQUIRE_BUF, HKEY("content"));
1120                 AddRSSEndHandler(RSS_item_description_end, RSS_RSS|RSS_ATOM|RSS_REQUIRE_BUF, HKEY("encoded"));
1121                 AddRSSEndHandler(ATOM_item_summary_end,    RSS_ATOM|RSS_REQUIRE_BUF, HKEY("summary"));
1122                 AddRSSEndHandler(RSS_item_description_end, RSS_RSS|RSS_REQUIRE_BUF, HKEY("description"));
1123                 AddRSSEndHandler(ATOM_item_published_end,  RSS_ATOM|RSS_REQUIRE_BUF, HKEY("published"));
1124                 AddRSSEndHandler(ATOM_item_updated_end,    RSS_ATOM|RSS_REQUIRE_BUF, HKEY("updated"));
1125                 AddRSSEndHandler(RSS_item_pubdate_end,     RSS_RSS|RSS_REQUIRE_BUF, HKEY("pubdate"));
1126                 AddRSSEndHandler(RSS_item_date_end,        RSS_RSS|RSS_REQUIRE_BUF, HKEY("date"));
1127                 AddRSSEndHandler(RSS_item_author_end,      RSS_RSS|RSS_REQUIRE_BUF, HKEY("author"));
1128                 AddRSSEndHandler(RSS_item_creator_end,     RSS_RSS|RSS_REQUIRE_BUF, HKEY("creator"));
1129 /* <author> */
1130                 AddRSSEndHandler(ATOM_item_email_end,      RSS_ATOM|RSS_REQUIRE_BUF, HKEY("email"));
1131                 AddRSSEndHandler(ATOM_item_name_end,       RSS_ATOM|RSS_REQUIRE_BUF, HKEY("name"));
1132                 AddRSSEndHandler(ATOM_item_uri_end,        RSS_ATOM|RSS_REQUIRE_BUF, HKEY("uri"));
1133 /* </author> */
1134                 AddRSSEndHandler(RSS_item_item_end,        RSS_RSS, HKEY("item"));
1135                 AddRSSEndHandler(RSS_item_rss_end,         RSS_RSS, HKEY("rss"));
1136                 AddRSSEndHandler(RSS_item_rdf_end,         RSS_RSS, HKEY("rdf"));
1137                 AddRSSEndHandler(ATOM_item_entry_end,      RSS_ATOM, HKEY("entry"));
1138
1139
1140 /* at the start of atoms: <seq> <li>link to resource</li></seq> ignore them. */
1141                 AddRSSStartHandler(RSSATOM_item_ignore,      RSS_RSS|RSS_ATOM, HKEY("seq"));
1142                 AddRSSEndHandler  (RSSATOM_item_ignore,      RSS_RSS|RSS_ATOM, HKEY("seq"));
1143                 AddRSSStartHandler(RSSATOM_item_ignore,      RSS_RSS|RSS_ATOM, HKEY("li"));
1144                 AddRSSEndHandler  (RSSATOM_item_ignore,      RSS_RSS|RSS_ATOM, HKEY("li"));
1145
1146 /* links to other feed generators... */
1147                 AddRSSStartHandler(RSSATOM_item_ignore,      RSS_RSS|RSS_ATOM, HKEY("feedflare"));
1148                 AddRSSEndHandler  (RSSATOM_item_ignore,      RSS_RSS|RSS_ATOM, HKEY("feedflare"));
1149                 AddRSSStartHandler(RSSATOM_item_ignore,      RSS_RSS|RSS_ATOM, HKEY("browserfriendly"));
1150                 AddRSSEndHandler  (RSSATOM_item_ignore,      RSS_RSS|RSS_ATOM, HKEY("browserfriendly"));
1151
1152                 KnownNameSpaces = NewHash(1, NULL);
1153                 Put(KnownNameSpaces, HKEY("http://a9.com/-/spec/opensearch/1.1/"), NULL, reference_free_handler);
1154                 Put(KnownNameSpaces, HKEY("http://a9.com/-/spec/opensearchrss/1.0/"), NULL, reference_free_handler);
1155                 Put(KnownNameSpaces, HKEY("http://backend.userland.com/creativeCommonsRssModule"), NULL, reference_free_handler);
1156                 Put(KnownNameSpaces, HKEY("http://purl.org/atom/ns#"), NULL, reference_free_handler);
1157                 Put(KnownNameSpaces, HKEY("http://purl.org/dc/elements/1.1/"), NULL, reference_free_handler);
1158                 Put(KnownNameSpaces, HKEY("http://purl.org/rss/1.0/"), NULL, reference_free_handler);
1159                 Put(KnownNameSpaces, HKEY("http://purl.org/rss/1.0/modules/content/"), NULL, reference_free_handler);
1160                 Put(KnownNameSpaces, HKEY("http://purl.org/rss/1.0/modules/slash/"), NULL, reference_free_handler);
1161                 Put(KnownNameSpaces, HKEY("http://purl.org/rss/1.0/modules/syndication/"), NULL, reference_free_handler);
1162                 Put(KnownNameSpaces, HKEY("http://purl.org/rss/1.0/"), NULL, reference_free_handler);
1163                 Put(KnownNameSpaces, HKEY("http://purl.org/syndication/thread/1.0"), NULL, reference_free_handler);
1164                 Put(KnownNameSpaces, HKEY("http://rssnamespace.org/feedburner/ext/1.0"), NULL, reference_free_handler);
1165                 Put(KnownNameSpaces, HKEY("http://schemas.google.com/g/2005"), NULL, reference_free_handler);
1166                 Put(KnownNameSpaces, HKEY("http://webns.net/mvcb/"), NULL, reference_free_handler);
1167                 Put(KnownNameSpaces, HKEY("http://web.resource.org/cc/"), NULL, reference_free_handler);
1168                 Put(KnownNameSpaces, HKEY("http://wellformedweb.org/CommentAPI/"), NULL, reference_free_handler);
1169                 Put(KnownNameSpaces, HKEY("http://www.georss.org/georss"), NULL, reference_free_handler);
1170                 Put(KnownNameSpaces, HKEY("http://www.w3.org/1999/xhtml"), NULL, reference_free_handler);
1171                 Put(KnownNameSpaces, HKEY("http://www.w3.org/1999/02/22-rdf-syntax-ns#"), NULL, reference_free_handler);
1172                 Put(KnownNameSpaces, HKEY("http://www.w3.org/1999/02/22-rdf-syntax-ns#"), NULL, reference_free_handler);
1173                 Put(KnownNameSpaces, HKEY("http://www.w3.org/2003/01/geo/wgs84_pos#"), NULL, reference_free_handler);
1174                 Put(KnownNameSpaces, HKEY("http://www.w3.org/2005/Atom"), NULL, reference_free_handler);
1175                 Put(KnownNameSpaces, HKEY("urn:flickr:"), NULL, reference_free_handler);
1176 #if 0
1177                 /* we don't like these namespaces because of they shadow our usefull parameters. */
1178                 Put(KnownNameSpaces, HKEY("http://search.yahoo.com/mrss/"), NULL, reference_free_handler);
1179 #endif
1180                 CtdlRegisterDebugFlagHook(HKEY("RSSAtomParser"), LogDebugEnableRSSATOMParser, &RSSAtomParserDebugEnabled);
1181                 CtdlRegisterCleanupHook(rss_parser_cleanup);
1182         }
1183         return "rssparser";
1184 }