cd4121d5d1172bb52392e553fc49a2feb720e726
[citadel.git] / citadel / modules / rssclient / rss_atom_parser.c
1 /*
2  * Bring external RSS feeds into rooms.
3  *
4  * Copyright (c) 2007-2012 by the citadel.org team
5  *
6  * This program is open source software; you can redistribute it and/or modify
7  * it under the terms of the GNU General Public License version 3.
8  * 
9  * 
10  *
11  * This program is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14  * GNU General Public License for more details.
15  *
16  * 
17  * 
18  * 
19  */
20
21 #include <stdlib.h>
22 #include <unistd.h>
23 #include <stdio.h>
24
25 #if TIME_WITH_SYS_TIME
26 # include <sys/time.h>
27 # include <time.h>
28 #else
29 # if HAVE_SYS_TIME_H
30 #  include <sys/time.h>
31 # else
32 #  include <time.h>
33 # endif
34 #endif
35
36 #include <ctype.h>
37 #include <string.h>
38 #include <errno.h>
39 #include <sys/types.h>
40 #include <sys/stat.h>
41 #include <expat.h>
42 #include <curl/curl.h>
43 #include <libcitadel.h>
44 #include "citadel.h"
45 #include "server.h"
46 #include "citserver.h"
47 #include "support.h"
48 #include "config.h"
49 #include "threads.h"
50 #include "ctdl_module.h"
51 #include "clientsocket.h"
52 #include "msgbase.h"
53 #include "parsedate.h"
54 #include "database.h"
55 #include "citadel_dirs.h"
56 #include "md5.h"
57 #include "context.h"
58 #include "event_client.h"
59 #include "rss_atom_parser.h"
60
61 void rss_remember_item(rss_item *ri, rss_aggregator *Cfg);
62
63 int RSSAtomParserDebugEnabled = 0;
64
/*
 * Shorthand for Cfg.QRnumber of the rss_aggregator hanging off the current
 * IO.  Requires a variable named `IO` to be in scope at the point of use.
 */
#define N (((rss_aggregator*)IO->Data)->Cfg.QRnumber)

/*
 * Guard prefix for a log statement: LOG_DEBUG messages pass only while
 * RSSAtomParserDebugEnabled is non-zero, every other level always passes.
 * This expands to a bare `if`, so use it only through the wrappers below
 * (or inside braces) to avoid capturing a following `else`.
 */
#define DBGLOG(LEVEL) if (((LEVEL) != LOG_DEBUG) || (RSSAtomParserDebugEnabled != 0))

/*
 * Logging wrappers.  All of them need `IO` in scope; the two CC variants
 * additionally print CCID.  Each one is wrapped in do { } while (0) so that
 * it behaves as a single statement, e.g. in `if (x) LOG(...); else ...`.
 */

/* IO id + CCID + N prefix, with printf-style arguments */
#define EVRSSATOM_syslog(LEVEL, FORMAT, ...)                            \
        do {                                                            \
                DBGLOG(LEVEL) syslog(LEVEL,                             \
                             "%s[%ld]CC[%d][%ld]RSSP" FORMAT,           \
                             IOSTR, IO->ID, CCID, N, __VA_ARGS__);      \
        } while (0)

/* IO id + CCID + N prefix, fixed message without arguments */
#define EVRSSATOMM_syslog(LEVEL, FORMAT)                                \
        do {                                                            \
                DBGLOG(LEVEL) syslog(LEVEL,                             \
                             "%s[%ld]CC[%d][%ld]RSSP" FORMAT,           \
                             IOSTR, IO->ID, CCID, N);                   \
        } while (0)

/* IO id + N prefix (no CCID), with printf-style arguments */
#define EVRSSATOMCS_syslog(LEVEL, FORMAT, ...)                          \
        do {                                                            \
                DBGLOG(LEVEL) syslog(LEVEL, "%s[%ld][%ld]RSSP" FORMAT,  \
                             IOSTR, IO->ID, N, __VA_ARGS__);            \
        } while (0)

/* IO id + N prefix (no CCID), fixed message without arguments */
#define EVRSSATOMSM_syslog(LEVEL, FORMAT)                               \
        do {                                                            \
                DBGLOG(LEVEL) syslog(LEVEL, "%s[%ld][%ld]RSSP" FORMAT,  \
                             IOSTR, IO->ID, N);                         \
        } while (0)
86
/*
 * Convert an RDF/RSS datestamp into a time_t.
 *
 * p    NUL-terminated date string; may be NULL.
 *
 * Returns 0 when p is NULL or shorter than 10 characters.
 *
 * A string shaped like a W3C datetime ("YYYY-MM-DD", optionally followed by
 * "THH:MM...", see http://www.w3.org/TR/NOTE-datetime) is converted with
 * mktime(), i.e. the fields are taken as local time; seconds and any
 * timezone designator are not evaluated.
 *
 * Anything else is handed to parsedate() (RFC822 style).  If that does not
 * produce a positive value either, the current time is returned.
 *
 * This code, along with parsedate.c, is a potential candidate for moving
 * into libcitadel.
 */
time_t rdf_parsedate(const char *p)
{
        struct tm tm;
        size_t len;
        time_t t = 0;

        if (!p) return 0L;
        len = strlen(p);
        if (len < 10) return 0L;

        memset(&tm, 0, sizeof tm);

        if ( (p[4] == '-') && (p[7] == '-') ) {
                tm.tm_year = atoi(&p[0]) - 1900;
                tm.tm_mon = atoi(&p[5]) - 1;
                tm.tm_mday = atoi(&p[8]);
                /*
                 * Only touch the time part when p[13] really lies inside
                 * the string; with len > 13, p[14] is at worst the
                 * terminating NUL, which atoi() turns into 0.
                 */
                if ( (len > 13) && (p[10] == 'T') && (p[13] == ':') ) {
                        tm.tm_hour = atoi(&p[11]);
                        tm.tm_min = atoi(&p[14]);
                }
                return mktime(&tm);
        }

        /* hmm... try RFC822 date stamp format */

        t = parsedate(p);
        if (t > 0) return(t);

        /* yeesh.  ok, just return the current date and time. */
        return(time(NULL));
}
126
127 void flush_rss_item(rss_item *ri)
128 {
129         /* Initialize the feed item data structure */
130         FreeStrBuf(&ri->guid);
131         FreeStrBuf(&ri->title);
132         FreeStrBuf(&ri->link);
133         FreeStrBuf(&ri->author_or_creator);
134         FreeStrBuf(&ri->author_email);
135         FreeStrBuf(&ri->author_url);
136         FreeStrBuf(&ri->description);
137
138         FreeStrBuf(&ri->linkTitle);
139         FreeStrBuf(&ri->reLink);
140         FreeStrBuf(&ri->reLinkTitle);
141         FreeStrBuf(&ri->channel_title);
142 }
143
144
145 /******************************************************************************
146  *                              XML-Handler                                   *
147  ******************************************************************************/
148
149
150 void RSS_item_rss_start (StrBuf *CData,
151                          rss_item *ri,
152                          rss_aggregator *RSSAggr,
153                          const char** Attr)
154 {
155         AsyncIO         *IO = &RSSAggr->IO;
156         EVRSSATOMM_syslog(LOG_DEBUG, "RSS: This is an RSS feed.\n");
157         RSSAggr->ItemType = RSS_RSS;
158 }
159
160 void RSS_item_rdf_start(StrBuf *CData,
161                         rss_item *ri,
162                         rss_aggregator *RSSAggr,
163                         const char** Attr)
164 {
165         AsyncIO         *IO = &RSSAggr->IO;
166         EVRSSATOMM_syslog(LOG_DEBUG, "RSS: This is an RDF feed.\n");
167         RSSAggr->ItemType = RSS_RSS;
168 }
169
170 void ATOM_item_feed_start(StrBuf *CData,
171                           rss_item *ri,
172                           rss_aggregator *RSSAggr,
173                           const char** Attr)
174 {
175         AsyncIO         *IO = &RSSAggr->IO;
176         EVRSSATOMM_syslog(LOG_DEBUG, "RSS: This is an ATOM feed.\n");
177         RSSAggr->ItemType = RSS_ATOM;
178 }
179
180
181 void RSS_item_item_start(StrBuf *CData,
182                          rss_item *ri,
183                          rss_aggregator *RSSAggr,
184                          const char** Attr)
185 {
186         ri->item_tag_nesting ++;
187         flush_rss_item(ri);
188 }
189
190 void ATOM_item_entry_start(StrBuf *CData,
191                            rss_item *ri,
192                            rss_aggregator *RSSAggr,
193                            const char** Attr)
194 {
195 /* Atom feed... */
196         ri->item_tag_nesting ++;
197         flush_rss_item(ri);
198 }
199
200 void ATOM_item_link_start (StrBuf *CData,
201                            rss_item *ri,
202                            rss_aggregator *RSSAggr,
203                            const char** Attr)
204 {
205         int i;
206         const char *pHref = NULL;
207         const char *pType = NULL;
208         const char *pRel = NULL;
209         const char *pTitle = NULL;
210
211         for (i = 0; Attr[i] != NULL; i+=2)
212         {
213                 if (!strcmp(Attr[i], "href"))
214                 {
215                         pHref = Attr[i+1];
216                 }
217                 else if (!strcmp(Attr[i], "rel"))
218                 {
219                         pRel = Attr[i+1];
220                 }
221                 else if (!strcmp(Attr[i], "type"))
222                 {
223                         pType = Attr[i+1];
224                 }
225                 else if (!strcmp(Attr[i], "title"))
226                 {
227                         pTitle = Attr[i+1];
228                 }
229         }
230         if (pHref == NULL)
231                 return; /* WHUT? Pointing... where? */
232         if ((pType != NULL) && !strcasecmp(pType, "application/atom+xml"))
233                 return;
234         /* these just point to other rss resources,
235            we're not interested in them. */
236         if (pRel != NULL)
237         {
238                 if (!strcasecmp (pRel, "replies"))
239                 {
240                         NewStrBufDupAppendFlush(&ri->reLink, NULL, pHref, -1);
241                         StrBufTrim(ri->link);
242                         NewStrBufDupAppendFlush(&ri->reLinkTitle,
243                                                 NULL,
244                                                 pTitle,
245                                                 -1);
246                 }
247                 else if (!strcasecmp(pRel, "alternate"))
248                 { /* Alternative representation of this Item... */
249                         NewStrBufDupAppendFlush(&ri->link, NULL, pHref, -1);
250                         StrBufTrim(ri->link);
251                         NewStrBufDupAppendFlush(&ri->linkTitle,
252                                                 NULL,
253                                                 pTitle,
254                                                 -1);
255
256                 }
257 #if 0 /* these are also defined, but dunno what to do with them.. */
258                 else if (!strcasecmp(pRel, "related"))
259                 {
260                 }
261                 else if (!strcasecmp(pRel, "self"))
262                 {
263                 }
264                 else if (!strcasecmp(pRel, "enclosure"))
265                 {/*...reference can get big, and is probably the full article*/
266                 }
267                 else if (!strcasecmp(pRel, "via"))
268                 {/* this article was provided via... */
269                 }
270 #endif
271         }
272         else if (StrLength(ri->link) == 0)
273         {
274                 NewStrBufDupAppendFlush(&ri->link, NULL, pHref, -1);
275                 StrBufTrim(ri->link);
276                 NewStrBufDupAppendFlush(&ri->linkTitle, NULL, pTitle, -1);
277         }
278 }
279
280
281
282
283 void ATOMRSS_item_title_end(StrBuf *CData,
284                             rss_item *ri,
285                             rss_aggregator *RSSAggr,
286                             const char** Attr)
287 {
288         if ((ri->item_tag_nesting == 0) && (StrLength(CData) > 0)) {
289                 NewStrBufDupAppendFlush(&ri->channel_title, CData, NULL, 0);
290                 StrBufTrim(ri->channel_title);
291         }
292 }
293
294 void RSS_item_guid_end(StrBuf *CData,
295                        rss_item *ri,
296                        rss_aggregator *RSSAggr,
297                        const char** Attr)
298 {
299         if (StrLength(CData) > 0) {
300                 NewStrBufDupAppendFlush(&ri->guid, CData, NULL, 0);
301         }
302 }
303
304 void ATOM_item_id_end(StrBuf *CData,
305                       rss_item *ri, rss_aggregator *RSSAggr, const char** Attr)
306 {
307         if (StrLength(CData) > 0) {
308                 NewStrBufDupAppendFlush(&ri->guid, CData, NULL, 0);
309         }
310 }
311
312
313 void RSS_item_link_end (StrBuf *CData,
314                         rss_item *ri,
315                         rss_aggregator *RSSAggr,
316                         const char** Attr)
317 {
318         if (StrLength(CData) > 0) {
319                 NewStrBufDupAppendFlush(&ri->link, CData, NULL, 0);
320                 StrBufTrim(ri->link);
321         }
322 }
323 void RSS_item_relink_end(StrBuf *CData,
324                          rss_item *ri,
325                          rss_aggregator *RSSAggr,
326                          const char** Attr)
327 {
328         if (StrLength(CData) > 0) {
329                 NewStrBufDupAppendFlush(&ri->reLink, CData, NULL, 0);
330                 StrBufTrim(ri->reLink);
331         }
332 }
333
334 void RSSATOM_item_title_end (StrBuf *CData,
335                              rss_item *ri,
336                              rss_aggregator *RSSAggr,
337                              const char** Attr)
338 {
339         if (StrLength(CData) > 0) {
340                 NewStrBufDupAppendFlush(&ri->title, CData, NULL, 0);
341                 StrBufTrim(ri->title);
342         }
343 }
344
345 void ATOM_item_content_end (StrBuf *CData,
346                             rss_item *ri,
347                             rss_aggregator *RSSAggr,
348                             const char** Attr)
349 {
350         long olen = StrLength (ri->description);
351         long clen = StrLength (CData);
352         if (clen > 0)
353         {
354                 if (olen == 0) {
355                         NewStrBufDupAppendFlush(&ri->description,
356                                                 CData,
357                                                 NULL,
358                                                 0);
359                         StrBufTrim(ri->description);
360                 }
361                 else if (olen < clen) {
362                         FlushStrBuf(ri->description);
363                         NewStrBufDupAppendFlush(&ri->description,
364                                                 CData,
365                                                 NULL,
366                                                 0);
367
368                         StrBufTrim(ri->description);
369                 }
370         }
371 }
372 void ATOM_item_summary_end (StrBuf *CData,
373                             rss_item *ri,
374                             rss_aggregator *RSSAggr,
375                             const char** Attr)
376 {
377         /*
378          * this can contain an abstract of the article.
379          * but we don't want to verwrite a full document if we already have it.
380          */
381         if ((StrLength(CData) > 0) && (StrLength(ri->description) == 0))
382         {
383                 NewStrBufDupAppendFlush(&ri->description, CData, NULL, 0);
384                 StrBufTrim(ri->description);
385         }
386 }
387
388 void RSS_item_description_end (StrBuf *CData,
389                                rss_item *ri,
390                                rss_aggregator *RSSAggr,
391                                const char** Attr)
392 {
393         long olen = StrLength (ri->description);
394         long clen = StrLength (CData);
395         if (clen > 0)
396         {
397                 if (olen == 0) {
398                         NewStrBufDupAppendFlush(&ri->description,
399                                                 CData,
400                                                 NULL,
401                                                 0);
402                         StrBufTrim(ri->description);
403                 }
404                 else if (olen < clen) {
405                         FlushStrBuf(ri->description);
406                         NewStrBufDupAppendFlush(&ri->description,
407                                                 CData,
408                                                 NULL,
409                                                 0);
410                         StrBufTrim(ri->description);
411                 }
412         }
413 }
414
415 void ATOM_item_published_end (StrBuf *CData,
416                               rss_item *ri,
417                               rss_aggregator *RSSAggr,
418                               const char** Attr)
419 {
420         if (StrLength(CData) > 0) {
421                 StrBufTrim(CData);
422                 ri->pubdate = rdf_parsedate(ChrPtr(CData));
423         }
424 }
425
426 void ATOM_item_updated_end (StrBuf *CData,
427                             rss_item *ri,
428                             rss_aggregator *RSSAggr,
429                             const char** Attr)
430 {
431         if (StrLength(CData) > 0) {
432                 StrBufTrim(CData);
433                 ri->pubdate = rdf_parsedate(ChrPtr(CData));
434         }
435 }
436
437 void RSS_item_pubdate_end (StrBuf *CData,
438                            rss_item *ri,
439                            rss_aggregator *RSSAggr,
440                            const char** Attr)
441 {
442         if (StrLength(CData) > 0) {
443                 StrBufTrim(CData);
444                 ri->pubdate = rdf_parsedate(ChrPtr(CData));
445         }
446 }
447
448
449 void RSS_item_date_end (StrBuf *CData,
450                         rss_item *ri,
451                         rss_aggregator *RSSAggr,
452                         const char** Attr)
453 {
454         if (StrLength(CData) > 0) {
455                 StrBufTrim(CData);
456                 ri->pubdate = rdf_parsedate(ChrPtr(CData));
457         }
458 }
459
460
461
462 void RSS_item_author_end(StrBuf *CData,
463                          rss_item *ri,
464                          rss_aggregator *RSSAggr,
465                          const char** Attr)
466 {
467         if (StrLength(CData) > 0) {
468                 NewStrBufDupAppendFlush(&ri->author_or_creator, CData, NULL, 0);
469                 StrBufTrim(ri->author_or_creator);
470         }
471 }
472
473
474 void ATOM_item_name_end(StrBuf *CData,
475                         rss_item *ri,
476                         rss_aggregator *RSSAggr,
477                         const char** Attr)
478 {
479         if (StrLength(CData) > 0) {
480                 NewStrBufDupAppendFlush(&ri->author_or_creator, CData, NULL, 0);
481                 StrBufTrim(ri->author_or_creator);
482         }
483 }
484
485 void ATOM_item_email_end(StrBuf *CData,
486                          rss_item *ri,
487                          rss_aggregator *RSSAggr,
488                          const char** Attr)
489 {
490         if (StrLength(CData) > 0) {
491                 NewStrBufDupAppendFlush(&ri->author_email, CData, NULL, 0);
492                 StrBufTrim(ri->author_email);
493         }
494 }
495
496 void RSS_item_creator_end(StrBuf *CData,
497                           rss_item *ri,
498                           rss_aggregator *RSSAggr,
499                           const char** Attr)
500 {
501         if ((StrLength(CData) > 0) &&
502             (StrLength(ri->author_or_creator) == 0))
503         {
504                 NewStrBufDupAppendFlush(&ri->author_or_creator, CData, NULL, 0);
505                 StrBufTrim(ri->author_or_creator);
506         }
507 }
508
509
510 void ATOM_item_uri_end(StrBuf *CData,
511                        rss_item *ri,
512                        rss_aggregator *RSSAggr,
513                        const char** Attr)
514 {
515         if (StrLength(CData) > 0) {
516                 NewStrBufDupAppendFlush(&ri->author_url, CData, NULL, 0);
517                 StrBufTrim(ri->author_url);
518         }
519 }
520
521 void RSS_item_item_end(StrBuf *CData,
522                        rss_item *ri,
523                        rss_aggregator *RSSAggr,
524                        const char** Attr)
525 {
526         --ri->item_tag_nesting;
527         rss_remember_item(ri, RSSAggr);
528 }
529
530
531 void ATOM_item_entry_end(StrBuf *CData,
532                          rss_item *ri,
533                          rss_aggregator *RSSAggr,
534                          const char** Attr)
535 {
536         --ri->item_tag_nesting;
537         rss_remember_item(ri, RSSAggr);
538 }
539
540 void RSS_item_rss_end(StrBuf *CData,
541                       rss_item *ri,
542                       rss_aggregator *RSSAggr,
543                       const char** Attr)
544 {
545         AsyncIO         *IO = &RSSAggr->IO;
546         EVRSSATOMM_syslog(LOG_DEBUG, "End of feed detected.  Closing parser.\n");
547         ri->done_parsing = 1;
548 }
549
550 void RSS_item_rdf_end(StrBuf *CData,
551                       rss_item *ri,
552                       rss_aggregator *RSSAggr,
553                       const char** Attr)
554 {
555         AsyncIO         *IO = &RSSAggr->IO;
556         EVRSSATOMM_syslog(LOG_DEBUG, "End of feed detected.  Closing parser.\n");
557         ri->done_parsing = 1;
558 }
559
560
561 void RSSATOM_item_ignore(StrBuf *CData,
562                          rss_item *ri,
563                          rss_aggregator *RSSAggr,
564                          const char** Attr)
565 {
566 }
567
568
569
570 /*
571  * This callback stores up the data which appears in between tags.
572  */
573 void rss_xml_cdata_start(void *data)
574 {
575         rss_aggregator *RSSAggr = (rss_aggregator*) data;
576
577         FlushStrBuf(RSSAggr->CData);
578 }
579
/*
 * Callback for the end of a CDATA section: nothing to do.
 */
void rss_xml_cdata_end(void *data)
{
        (void)data;
}
583 void rss_xml_chardata(void *data, const XML_Char *s, int len)
584 {
585         rss_aggregator *RSSAggr = (rss_aggregator*) data;
586
587         StrBufAppendBufPlain (RSSAggr->CData, s, len, 0);
588 }
589
590
591 /******************************************************************************
592  *                            RSS parser logic                                *
593  ******************************************************************************/
594
595 extern pthread_mutex_t RSSQueueMutex;
596
597 HashList *StartHandlers = NULL;
598 HashList *EndHandlers = NULL;
599 HashList *KnownNameSpaces = NULL;
600
601 void FreeNetworkSaveMessage (void *vMsg)
602 {
603         networker_save_message *Msg = (networker_save_message *) vMsg;
604
605         CM_FreeContents(&Msg->Msg);
606         FreeStrBuf(&Msg->Message);
607         FreeStrBuf(&Msg->MsgGUID);
608
609         FreeStrBuf(&Msg->author_email);
610         FreeStrBuf(&Msg->author_or_creator);
611         FreeStrBuf(&Msg->title);
612         FreeStrBuf(&Msg->description);
613
614         FreeStrBuf(&Msg->link);
615         FreeStrBuf(&Msg->linkTitle);
616
617         FreeStrBuf(&Msg->reLink);
618         FreeStrBuf(&Msg->reLinkTitle);
619
620         free(Msg);
621 }
622
623
624 /*
625  * Commit a fetched and parsed RSS item to disk
626  */
627 void rss_remember_item(rss_item *ri, rss_aggregator *RSSAggr)
628 {
629         networker_save_message *SaveMsg;
630         struct MD5Context md5context;
631         u_char rawdigest[MD5_DIGEST_LEN];
632         StrBuf *guid;
633         AsyncIO *IO = &RSSAggr->IO;
634         int n;
635
636
637         SaveMsg = (networker_save_message *) malloc(
638                 sizeof(networker_save_message));
639         memset(SaveMsg, 0, sizeof(networker_save_message));
640
641         /* Construct a GUID to use in the S_USETABLE table.
642          * If one is not present in the item itself, make one up.
643          */
644         if (ri->guid != NULL) {
645                 StrBufSpaceToBlank(ri->guid);
646                 StrBufTrim(ri->guid);
647                 guid = NewStrBufPlain(HKEY("rss/"));
648                 StrBufAppendBuf(guid, ri->guid, 0);
649         }
650         else {
651                 MD5Init(&md5context);
652                 if (ri->title != NULL) {
653                         MD5Update(&md5context,
654                                   (const unsigned char*)SKEY(ri->title));
655                 }
656                 if (ri->link != NULL) {
657                         MD5Update(&md5context,
658                                   (const unsigned char*)SKEY(ri->link));
659                 }
660                 MD5Final(rawdigest, &md5context);
661                 guid = NewStrBufPlain(NULL,
662                                       MD5_DIGEST_LEN * 2 + 12 /* _rss2ctdl*/);
663                 StrBufHexEscAppend(guid, NULL, rawdigest, MD5_DIGEST_LEN);
664                 StrBufAppendBufPlain(guid, HKEY("_rss2ctdl"), 0);
665         }
666
667         /* translate Item into message. */
668         EVRSSATOMM_syslog(LOG_DEBUG, "RSS: translating item...\n");
669         if (ri->description == NULL) ri->description = NewStrBufPlain(HKEY(""));
670         StrBufSpaceToBlank(ri->description);
671         SaveMsg->Msg.cm_magic = CTDLMESSAGE_MAGIC;
672         SaveMsg->Msg.cm_anon_type = MES_NORMAL;
673         SaveMsg->Msg.cm_format_type = FMT_RFC822;
674
675         /* gather the cheaply computed information now... */
676
677         if (ri->guid != NULL) {
678                 CM_SetField(&SaveMsg->Msg, eExclusiveID, SKEY(ri->guid));
679         }
680
681         SaveMsg->MsgGUID = guid;
682
683         if (ri->pubdate <= 0) {
684                 ri->pubdate = time(NULL); /// TODO: use event time!
685         }
686         CM_SetFieldLONG(&SaveMsg->Msg, eTimestamp, ri->pubdate);
687         if (ri->channel_title != NULL) {
688                 if (StrLength(ri->channel_title) > 0) {
689                         CM_SetField(&SaveMsg->Msg, eOriginalRoom, SKEY(ri->channel_title));
690                 }
691         }
692
693         /* remember the ones for defferred processing to save computing power after we know if we realy need it. */
694
695         SaveMsg->author_or_creator = ri->author_or_creator;
696         ri->author_or_creator = NULL;
697
698         SaveMsg->author_email = ri->author_email;
699         ri->author_email = NULL;
700
701         SaveMsg->title = ri->title;
702         ri->title = NULL;
703
704         SaveMsg->link = ri->link;
705         ri->link = NULL;
706
707         SaveMsg->description = ri->description;
708         ri->description = NULL;
709
710         SaveMsg->linkTitle = ri->linkTitle;
711         ri->linkTitle = NULL;
712
713         SaveMsg->reLink = ri->reLink;
714         ri->reLink = NULL;
715
716         SaveMsg->reLinkTitle = ri->reLinkTitle;
717         ri->reLinkTitle = NULL;
718
719         n = GetCount(RSSAggr->Messages) + 1;
720         Put(RSSAggr->Messages, IKEY(n), SaveMsg, FreeNetworkSaveMessage);
721 }
722
723
724
725 void rss_xml_start(void *data, const char *supplied_el, const char **attr)
726 {
727         rss_xml_handler *h;
728         rss_aggregator  *RSSAggr = (rss_aggregator*) data;
729         AsyncIO         *IO = &RSSAggr->IO;
730         rss_item        *ri = RSSAggr->Item;
731         void            *pv;
732         const char      *pel;
733         char            *sep = NULL;
734
735         /* Axe the namespace, we don't care about it */
736         /*
737           syslog(LOG_DEBUG,
738           "RSS: supplied el %d: %s\n", RSSAggr->RSSAggr->ItemType, supplied_el);
739         */
740         pel = supplied_el;
741         while (sep = strchr(pel, ':'), sep) {
742                 pel = sep + 1;
743         }
744
745         if (pel != supplied_el)
746         {
747                 void *v;
748
749                 if (!GetHash(KnownNameSpaces,
750                              supplied_el,
751                              pel - supplied_el - 1,
752                              &v))
753                 {
754                         EVRSSATOM_syslog(LOG_DEBUG,
755                                          "RSS: START ignoring "
756                                          "because of wrong namespace [%s]\n",
757                                          supplied_el);
758                         return;
759                 }
760         }
761
762         StrBufPlain(RSSAggr->Key, pel, -1);
763         StrBufLowerCase(RSSAggr->Key);
764         if (GetHash(StartHandlers, SKEY(RSSAggr->Key), &pv))
765         {
766                 h = (rss_xml_handler*) pv;
767
768                 if (((h->Flags & RSS_UNSET) != 0) &&
769                     (RSSAggr->ItemType == RSS_UNSET))
770                 {
771                         h->Handler(RSSAggr->CData, ri, RSSAggr, attr);
772                 }
773                 else if (((h->Flags & RSS_RSS) != 0) &&
774                     (RSSAggr->ItemType == RSS_RSS))
775                 {
776                         h->Handler(RSSAggr->CData, ri, RSSAggr, attr);
777                 }
778                 else if (((h->Flags & RSS_ATOM) != 0) &&
779                          (RSSAggr->ItemType == RSS_ATOM))
780                 {
781                         h->Handler(RSSAggr->CData,
782                                    ri,
783                                    RSSAggr,
784                                    attr);
785                 }
786                 else
787                         EVRSSATOM_syslog(LOG_DEBUG,
788                                           "RSS: START unhandled: [%s] [%s]...\n",
789                                          pel,
790                                          supplied_el);
791         }
792         else
793                 EVRSSATOM_syslog(LOG_DEBUG,
794                                  "RSS: START unhandled: [%s] [%s]...\n",
795                                  pel,
796                                  supplied_el);
797 }
798
799 void rss_xml_end(void *data, const char *supplied_el)
800 {
801         rss_xml_handler *h;
802         rss_aggregator  *RSSAggr = (rss_aggregator*) data;
803         AsyncIO         *IO = &RSSAggr->IO;
804         rss_item        *ri = RSSAggr->Item;
805         const char      *pel;
806         char            *sep = NULL;
807         void            *pv;
808
809         /* Axe the namespace, we don't care about it */
810         pel = supplied_el;
811         while (sep = strchr(pel, ':'), sep) {
812                 pel = sep + 1;
813         }
814         EVRSSATOM_syslog(LOG_DEBUG, "RSS: END %s...\n", supplied_el);
815         if (pel != supplied_el)
816         {
817                 void *v;
818
819                 if (!GetHash(KnownNameSpaces,
820                              supplied_el,
821                              pel - supplied_el - 1,
822                              &v))
823                 {
824                         EVRSSATOM_syslog(LOG_DEBUG,
825                                          "RSS: END ignoring because of wrong namespace"
826                                          "[%s] = [%s]\n",
827                                          supplied_el,
828                                          ChrPtr(RSSAggr->CData));
829                         FlushStrBuf(RSSAggr->CData);
830                         return;
831                 }
832         }
833
834         StrBufPlain(RSSAggr->Key, pel, -1);
835         StrBufLowerCase(RSSAggr->Key);
836         if (GetHash(EndHandlers, SKEY(RSSAggr->Key), &pv))
837         {
838                 h = (rss_xml_handler*) pv;
839
840                 if (((h->Flags & RSS_UNSET) != 0) &&
841                     (RSSAggr->ItemType == RSS_UNSET))
842                 {
843                         h->Handler(RSSAggr->CData, ri, RSSAggr, NULL);
844                 }
845                 else if (((h->Flags & RSS_RSS) != 0) &&
846                     (RSSAggr->ItemType == RSS_RSS))
847                 {
848                         h->Handler(RSSAggr->CData, ri, RSSAggr, NULL);
849                 }
850                 else if (((h->Flags & RSS_ATOM) != 0) &&
851                          (RSSAggr->ItemType == RSS_ATOM))
852                 {
853                         h->Handler(RSSAggr->CData, ri, RSSAggr, NULL);
854                 }
855                 else
856                         EVRSSATOM_syslog(LOG_DEBUG,
857                                          "RSS: END   unhandled: [%s]  [%s] = [%s]...\n",
858                                          pel,
859                                          supplied_el,
860                                          ChrPtr(RSSAggr->CData));
861         }
862         else
863                 EVRSSATOM_syslog(LOG_DEBUG,
864                                  "RSS: END   unhandled: [%s]  [%s] = [%s]...\n",
865                                  pel,
866                                  supplied_el,
867                                  ChrPtr(RSSAggr->CData));
868         FlushStrBuf(RSSAggr->CData);
869 }
870
871
872
873 /*
874  * Callback function for passing libcurl's output to expat for parsing
875  * we don't do streamed parsing so expat can handle non-utf8 documents
876 size_t rss_libcurl_callback(void *ptr, size_t size, size_t nmemb, void *stream)
877 {
878         XML_Parse((XML_Parser)stream, ptr, (size * nmemb), 0);
879         return (size*nmemb);
880 }
881  */
882
883
884
885 eNextState RSSAggregator_ParseReply(AsyncIO *IO)
886 {
887         StrBuf *Buf;
888         rss_aggregator *RSSAggr;
889         rss_item *ri;
890         const char *at;
891         char *ptr;
892         long len;
893         const char *Key;
894
895         RSSAggr = IO->Data;
896         ri = RSSAggr->Item;
897         RSSAggr->CData = NewStrBufPlain(NULL, SIZ);
898         RSSAggr->Key = NewStrBuf();
899         at = NULL;
900         StrBufSipLine(RSSAggr->Key, IO->HttpReq.ReplyData, &at);
901         ptr = NULL;
902
903 #define encoding "encoding=\""
904         ptr = strstr(ChrPtr(RSSAggr->Key), encoding);
905         if (ptr != NULL)
906         {
907                 char *pche;
908
909                 ptr += sizeof (encoding) - 1;
910                 pche = strchr(ptr, '"');
911                 if (pche != NULL)
912                         StrBufCutAt(RSSAggr->Key, -1, pche);
913                 else
914                         ptr = "UTF-8";
915         }
916         else
917                 ptr = "UTF-8";
918
919         EVRSSATOM_syslog(LOG_DEBUG, "RSS: Now parsing [%s] \n", ChrPtr(RSSAggr->Url));
920
921         RSSAggr->xp = XML_ParserCreateNS(ptr, ':');
922         if (!RSSAggr->xp) {
923                 EVRSSATOMM_syslog(LOG_ALERT, "Cannot create XML parser!\n");
924                 return eAbort;
925         }
926         FlushStrBuf(RSSAggr->Key);
927
928         RSSAggr->Messages = NewHash(1, Flathash);
929         XML_SetElementHandler(RSSAggr->xp, rss_xml_start, rss_xml_end);
930         XML_SetCharacterDataHandler(RSSAggr->xp, rss_xml_chardata);
931         XML_SetUserData(RSSAggr->xp, RSSAggr);
932         XML_SetCdataSectionHandler(RSSAggr->xp,
933                                    rss_xml_cdata_start,
934                                    rss_xml_cdata_end);
935
936
937         len = StrLength(IO->HttpReq.ReplyData);
938         ptr = SmashStrBuf(&IO->HttpReq.ReplyData);
939         XML_Parse(RSSAggr->xp, ptr, len, 0);
940         free (ptr);
941         if (ri->done_parsing == 0)
942                 XML_Parse(RSSAggr->xp, "", 0, 1);
943
944
945         EVRSSATOM_syslog(LOG_DEBUG, "RSS: XML Status [%s] \n",
946                          XML_ErrorString(XML_GetErrorCode(RSSAggr->xp)));
947
948         XML_ParserFree(RSSAggr->xp);
949         flush_rss_item(ri);
950
951         Buf = NewStrBufDup(RSSAggr->rooms);
952         RSSAggr->recp.recp_room = SmashStrBuf(&Buf);
953         RSSAggr->recp.num_room = RSSAggr->roomlist_parts;
954         RSSAggr->recp.recptypes_magic = RECPTYPES_MAGIC;
955
956         RSSAggr->Pos = GetNewHashPos(RSSAggr->Messages, 1);
957
958 //RSSAggr->next_poll = time(NULL) + config.c_net_freq;
959         if (GetNextHashPos(RSSAggr->Messages,
960                            RSSAggr->Pos,
961                            &len,
962                            &Key,
963                            (void**) &RSSAggr->ThisMsg))
964                 return NextDBOperation(IO, RSS_FetchNetworkUsetableEntry);
965         else
966                 return eAbort;
967 }
968
969
970 /******************************************************************************
971  *                    RSS handler registering logic                           *
972  ******************************************************************************/
973
974 void AddRSSStartHandler(rss_handler_func Handler,
975                         int Flags,
976                         const char *key,
977                         long len)
978 {
979         rss_xml_handler *h;
980         h = (rss_xml_handler*) malloc(sizeof (rss_xml_handler));
981         h->Flags = Flags;
982         h->Handler = Handler;
983         Put(StartHandlers, key, len, h, NULL);
984 }
985
986 void AddRSSEndHandler(rss_handler_func Handler,
987                       int Flags,
988                       const char *key,
989                       long len)
990 {
991         rss_xml_handler *h;
992         h = (rss_xml_handler*) malloc(sizeof (rss_xml_handler));
993         h->Flags = Flags;
994         h->Handler = Handler;
995         Put(EndHandlers, key, len, h, NULL);
996 }
997
998 void rss_parser_cleanup(void)
999 {
1000         DeleteHash(&StartHandlers);
1001         DeleteHash(&EndHandlers);
1002         DeleteHash(&KnownNameSpaces);
1003 }
1004
1005 void LogDebugEnableRSSATOMParser(const int n)
1006 {
1007         RSSAtomParserDebugEnabled = n;
1008 }
1009
1010 CTDL_MODULE_INIT(rssparser)
1011 {
1012         if (!threading)
1013         {
1014                 StartHandlers = NewHash(1, NULL);
1015                 EndHandlers = NewHash(1, NULL);
1016
1017                 AddRSSStartHandler(RSS_item_rss_start,     RSS_UNSET, HKEY("rss"));
1018                 AddRSSStartHandler(RSS_item_rdf_start,     RSS_UNSET, HKEY("rdf"));
1019                 AddRSSStartHandler(ATOM_item_feed_start,   RSS_UNSET, HKEY("feed"));
1020                 AddRSSStartHandler(RSS_item_item_start,    RSS_RSS, HKEY("item"));
1021                 AddRSSStartHandler(ATOM_item_entry_start,  RSS_ATOM, HKEY("entry"));
1022                 AddRSSStartHandler(ATOM_item_link_start,   RSS_ATOM, HKEY("link"));
1023
1024                 AddRSSEndHandler(ATOMRSS_item_title_end,   RSS_ATOM|RSS_RSS|RSS_REQUIRE_BUF, HKEY("title"));
1025                 AddRSSEndHandler(RSS_item_guid_end,        RSS_RSS|RSS_REQUIRE_BUF, HKEY("guid"));
1026                 AddRSSEndHandler(ATOM_item_id_end,         RSS_ATOM|RSS_REQUIRE_BUF, HKEY("id"));
1027                 AddRSSEndHandler(RSS_item_link_end,        RSS_RSS|RSS_REQUIRE_BUF, HKEY("link"));
1028 #if 0
1029 // hm, rss to the comments of that blog, might be interesting in future, but...
1030                 AddRSSEndHandler(RSS_item_relink_end,      RSS_RSS|RSS_REQUIRE_BUF, HKEY("commentrss"));
1031 // comment count...
1032                 AddRSSEndHandler(RSS_item_relink_end,      RSS_RSS|RSS_REQUIRE_BUF, HKEY("comments"));
1033 #endif
1034                 AddRSSEndHandler(RSSATOM_item_title_end,   RSS_ATOM|RSS_RSS|RSS_REQUIRE_BUF, HKEY("title"));
1035                 AddRSSEndHandler(ATOM_item_content_end,    RSS_ATOM|RSS_REQUIRE_BUF, HKEY("content"));
1036                 AddRSSEndHandler(RSS_item_description_end, RSS_RSS|RSS_ATOM|RSS_REQUIRE_BUF, HKEY("encoded"));
1037                 AddRSSEndHandler(ATOM_item_summary_end,    RSS_ATOM|RSS_REQUIRE_BUF, HKEY("summary"));
1038                 AddRSSEndHandler(RSS_item_description_end, RSS_RSS|RSS_REQUIRE_BUF, HKEY("description"));
1039                 AddRSSEndHandler(ATOM_item_published_end,  RSS_ATOM|RSS_REQUIRE_BUF, HKEY("published"));
1040                 AddRSSEndHandler(ATOM_item_updated_end,    RSS_ATOM|RSS_REQUIRE_BUF, HKEY("updated"));
1041                 AddRSSEndHandler(RSS_item_pubdate_end,     RSS_RSS|RSS_REQUIRE_BUF, HKEY("pubdate"));
1042                 AddRSSEndHandler(RSS_item_date_end,        RSS_RSS|RSS_REQUIRE_BUF, HKEY("date"));
1043                 AddRSSEndHandler(RSS_item_author_end,      RSS_RSS|RSS_REQUIRE_BUF, HKEY("author"));
1044                 AddRSSEndHandler(RSS_item_creator_end,     RSS_RSS|RSS_REQUIRE_BUF, HKEY("creator"));
1045 /* <author> */
1046                 AddRSSEndHandler(ATOM_item_email_end,      RSS_ATOM|RSS_REQUIRE_BUF, HKEY("email"));
1047                 AddRSSEndHandler(ATOM_item_name_end,       RSS_ATOM|RSS_REQUIRE_BUF, HKEY("name"));
1048                 AddRSSEndHandler(ATOM_item_uri_end,        RSS_ATOM|RSS_REQUIRE_BUF, HKEY("uri"));
1049 /* </author> */
1050                 AddRSSEndHandler(RSS_item_item_end,        RSS_RSS, HKEY("item"));
1051                 AddRSSEndHandler(RSS_item_rss_end,         RSS_RSS, HKEY("rss"));
1052                 AddRSSEndHandler(RSS_item_rdf_end,         RSS_RSS, HKEY("rdf"));
1053                 AddRSSEndHandler(ATOM_item_entry_end,      RSS_ATOM, HKEY("entry"));
1054
1055
1056 /* at the start of atoms: <seq> <li>link to resource</li></seq> ignore them. */
1057                 AddRSSStartHandler(RSSATOM_item_ignore,      RSS_RSS|RSS_ATOM, HKEY("seq"));
1058                 AddRSSEndHandler  (RSSATOM_item_ignore,      RSS_RSS|RSS_ATOM, HKEY("seq"));
1059                 AddRSSStartHandler(RSSATOM_item_ignore,      RSS_RSS|RSS_ATOM, HKEY("li"));
1060                 AddRSSEndHandler  (RSSATOM_item_ignore,      RSS_RSS|RSS_ATOM, HKEY("li"));
1061
1062 /* links to other feed generators... */
1063                 AddRSSStartHandler(RSSATOM_item_ignore,      RSS_RSS|RSS_ATOM, HKEY("feedflare"));
1064                 AddRSSEndHandler  (RSSATOM_item_ignore,      RSS_RSS|RSS_ATOM, HKEY("feedflare"));
1065                 AddRSSStartHandler(RSSATOM_item_ignore,      RSS_RSS|RSS_ATOM, HKEY("browserfriendly"));
1066                 AddRSSEndHandler  (RSSATOM_item_ignore,      RSS_RSS|RSS_ATOM, HKEY("browserfriendly"));
1067
1068                 KnownNameSpaces = NewHash(1, NULL);
1069                 Put(KnownNameSpaces, HKEY("http://a9.com/-/spec/opensearch/1.1/"), NULL, reference_free_handler);
1070                 Put(KnownNameSpaces, HKEY("http://a9.com/-/spec/opensearchrss/1.0/"), NULL, reference_free_handler);
1071                 Put(KnownNameSpaces, HKEY("http://backend.userland.com/creativeCommonsRssModule"), NULL, reference_free_handler);
1072                 Put(KnownNameSpaces, HKEY("http://purl.org/atom/ns#"), NULL, reference_free_handler);
1073                 Put(KnownNameSpaces, HKEY("http://purl.org/dc/elements/1.1/"), NULL, reference_free_handler);
1074                 Put(KnownNameSpaces, HKEY("http://purl.org/rss/1.0/"), NULL, reference_free_handler);
1075                 Put(KnownNameSpaces, HKEY("http://purl.org/rss/1.0/modules/content/"), NULL, reference_free_handler);
1076                 Put(KnownNameSpaces, HKEY("http://purl.org/rss/1.0/modules/slash/"), NULL, reference_free_handler);
1077                 Put(KnownNameSpaces, HKEY("http://purl.org/rss/1.0/modules/syndication/"), NULL, reference_free_handler);
1078                 Put(KnownNameSpaces, HKEY("http://purl.org/rss/1.0/"), NULL, reference_free_handler);
1079                 Put(KnownNameSpaces, HKEY("http://purl.org/syndication/thread/1.0"), NULL, reference_free_handler);
1080                 Put(KnownNameSpaces, HKEY("http://rssnamespace.org/feedburner/ext/1.0"), NULL, reference_free_handler);
1081                 Put(KnownNameSpaces, HKEY("http://schemas.google.com/g/2005"), NULL, reference_free_handler);
1082                 Put(KnownNameSpaces, HKEY("http://webns.net/mvcb/"), NULL, reference_free_handler);
1083                 Put(KnownNameSpaces, HKEY("http://web.resource.org/cc/"), NULL, reference_free_handler);
1084                 Put(KnownNameSpaces, HKEY("http://wellformedweb.org/CommentAPI/"), NULL, reference_free_handler);
1085                 Put(KnownNameSpaces, HKEY("http://www.georss.org/georss"), NULL, reference_free_handler);
1086                 Put(KnownNameSpaces, HKEY("http://www.w3.org/1999/xhtml"), NULL, reference_free_handler);
1087                 Put(KnownNameSpaces, HKEY("http://www.w3.org/1999/02/22-rdf-syntax-ns#"), NULL, reference_free_handler);
1088                 Put(KnownNameSpaces, HKEY("http://www.w3.org/1999/02/22-rdf-syntax-ns#"), NULL, reference_free_handler);
1089                 Put(KnownNameSpaces, HKEY("http://www.w3.org/2003/01/geo/wgs84_pos#"), NULL, reference_free_handler);
1090                 Put(KnownNameSpaces, HKEY("http://www.w3.org/2005/Atom"), NULL, reference_free_handler);
1091                 Put(KnownNameSpaces, HKEY("urn:flickr:"), NULL, reference_free_handler);
1092 #if 0
1093                 /* we don't like these namespaces because of they shadow our usefull parameters. */
1094                 Put(KnownNameSpaces, HKEY("http://search.yahoo.com/mrss/"), NULL, reference_free_handler);
1095 #endif
1096                 CtdlRegisterDebugFlagHook(HKEY("RSSAtomParser"), LogDebugEnableRSSATOMParser, &RSSAtomParserDebugEnabled);
1097                 CtdlRegisterCleanupHook(rss_parser_cleanup);
1098         }
1099         return "rssparser";
1100 }