Move back to single threaded structure for rss feed puller
[citadel.git] / citadel / modules / rssclient / rss_atom_parser.c
1 /*
2  * Bring external RSS feeds into rooms.
3  *
4  * Copyright (c) 2007-2015 by the citadel.org team
5  *
6  * This program is open source software; you can redistribute it and/or modify
7  * it under the terms of the GNU General Public License version 3.
8  * 
9  * This program is distributed in the hope that it will be useful,
10  * but WITHOUT ANY WARRANTY; without even the implied warranty of
11  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
12  * GNU General Public License for more details.
13  */
14
15 #include <stdlib.h>
16 #include <unistd.h>
17 #include <stdio.h>
18
19 #if TIME_WITH_SYS_TIME
20 # include <sys/time.h>
21 # include <time.h>
22 #else
23 # if HAVE_SYS_TIME_H
24 #  include <sys/time.h>
25 # else
26 #  include <time.h>
27 # endif
28 #endif
29
30 #include <ctype.h>
31 #include <string.h>
32 #include <errno.h>
33 #include <sys/types.h>
34 #include <sys/stat.h>
35 #include <expat.h>
36 #include <curl/curl.h>
37 #include <libcitadel.h>
38 #include "citadel.h"
39 #include "server.h"
40 #include "citserver.h"
41 #include "support.h"
42 #include "config.h"
43 #include "threads.h"
44 #include "ctdl_module.h"
45 #include "clientsocket.h"
46 #include "msgbase.h"
47 #include "parsedate.h"
48 #include "database.h"
49 #include "citadel_dirs.h"
50 #include "md5.h"
51 #include "context.h"
52 #include "event_client.h"
53 #include "rss_atom_parser.h"
54
55 void rss_remember_item(rss_item *ri, rss_aggregator *Cfg);
56
57 int RSSAtomParserDebugEnabled = 0;
58
59 #define N ((rss_aggregator*)IO->Data)->Cfg.QRnumber
60
/*
 * Convert an RDF/RSS datestamp into a time_t.
 *
 * p: NUL-terminated datestamp text; may be NULL.
 *
 * Returns 0 when p is NULL or shorter than 10 characters.  A string shaped
 * like "YYYY-MM-DD" (optionally followed by "Thh:mm") is converted with
 * mktime().  Anything else is handed to parsedate(); if that yields no
 * positive value the current time is returned instead.
 */
time_t rdf_parsedate(const char *p)
{
	struct tm tm;
	time_t t = 0;
	size_t len;

	if (!p) return 0L;
	len = strlen(p);
	if (len < 10) return 0L;

	memset(&tm, 0, sizeof tm);

	/*
	 * If the timestamp appears to be in W3C datetime format, try to
	 * parse it.  See also: http://www.w3.org/TR/NOTE-datetime
	 *
	 * This code, along with parsedate.c, is a potential candidate for
	 * moving into libcitadel.
	 */
	if ( (p[4] == '-') && (p[7] == '-') ) {
		tm.tm_year = atoi(&p[0]) - 1900;
		tm.tm_mon = atoi(&p[5]) - 1;
		tm.tm_mday = atoi(&p[8]);
		/*
		 * Only 10 characters are guaranteed so far.  Require the
		 * string to actually reach index 13 before looking at it;
		 * otherwise something like "2015-01-01T" would make us read
		 * beyond the terminating NUL.
		 */
		if ( (len >= 14) && (p[10] == 'T') && (p[13] == ':') ) {
			tm.tm_hour = atoi(&p[11]);
			tm.tm_min = atoi(&p[14]);
		}
		return mktime(&tm);
	}

	/* hmm... try RFC822 date stamp format */

	t = parsedate(p);
	if (t > 0) return(t);

	/* yeesh.  ok, just return the current date and time. */
	return(time(NULL));
}
100
101 void flush_rss_item(rss_item *ri)
102 {
103         /* Initialize the feed item data structure */
104         FreeStrBuf(&ri->guid);
105         FreeStrBuf(&ri->title);
106         FreeStrBuf(&ri->link);
107         FreeStrBuf(&ri->author_or_creator);
108         FreeStrBuf(&ri->author_email);
109         FreeStrBuf(&ri->author_url);
110         FreeStrBuf(&ri->description);
111
112         FreeStrBuf(&ri->linkTitle);
113         FreeStrBuf(&ri->reLink);
114         FreeStrBuf(&ri->reLinkTitle);
115         FreeStrBuf(&ri->channel_title);
116 }
117
118
119 /******************************************************************************
120  *                              XML-Handler                                   *
121  ******************************************************************************/
122
123
124 void RSS_item_rss_start (StrBuf *CData,
125                          rss_item *ri,
126                          rss_aggregator *RSSAggr,
127                          const char** Attr)
128 {
129         syslog(LOG_DEBUG, "RSS: This is an RSS feed.");
130         RSSAggr->ItemType = RSS_RSS;
131 }
132
133 void RSS_item_rdf_start(StrBuf *CData,
134                         rss_item *ri,
135                         rss_aggregator *RSSAggr,
136                         const char** Attr)
137 {
138         syslog(LOG_DEBUG, "RSS: This is an RDF feed.");
139         RSSAggr->ItemType = RSS_RSS;
140 }
141
142 void ATOM_item_feed_start(StrBuf *CData,
143                           rss_item *ri,
144                           rss_aggregator *RSSAggr,
145                           const char** Attr)
146 {
147         syslog(LOG_DEBUG, "RSS: This is an ATOM feed.");
148         RSSAggr->ItemType = RSS_ATOM;
149 }
150
151
152 void RSS_item_item_start(StrBuf *CData,
153                          rss_item *ri,
154                          rss_aggregator *RSSAggr,
155                          const char** Attr)
156 {
157         ri->item_tag_nesting ++;
158         flush_rss_item(ri);
159 }
160
161 void ATOM_item_entry_start(StrBuf *CData,
162                            rss_item *ri,
163                            rss_aggregator *RSSAggr,
164                            const char** Attr)
165 {
166 /* Atom feed... */
167         ri->item_tag_nesting ++;
168         flush_rss_item(ri);
169 }
170
171 void ATOM_item_link_start (StrBuf *CData,
172                            rss_item *ri,
173                            rss_aggregator *RSSAggr,
174                            const char** Attr)
175 {
176         int i;
177         const char *pHref = NULL;
178         const char *pType = NULL;
179         const char *pRel = NULL;
180         const char *pTitle = NULL;
181
182         for (i = 0; Attr[i] != NULL; i+=2)
183         {
184                 if (!strcmp(Attr[i], "href"))
185                 {
186                         pHref = Attr[i+1];
187                 }
188                 else if (!strcmp(Attr[i], "rel"))
189                 {
190                         pRel = Attr[i+1];
191                 }
192                 else if (!strcmp(Attr[i], "type"))
193                 {
194                         pType = Attr[i+1];
195                 }
196                 else if (!strcmp(Attr[i], "title"))
197                 {
198                         pTitle = Attr[i+1];
199                 }
200         }
201         if (pHref == NULL)
202                 return; /* WHUT? Pointing... where? */
203         if ((pType != NULL) && !strcasecmp(pType, "application/atom+xml"))
204                 return;
205         /* these just point to other rss resources,
206            we're not interested in them. */
207         if (pRel != NULL)
208         {
209                 if (!strcasecmp (pRel, "replies"))
210                 {
211                         NewStrBufDupAppendFlush(&ri->reLink, NULL, pHref, -1);
212                         StrBufTrim(ri->link);
213                         NewStrBufDupAppendFlush(&ri->reLinkTitle,
214                                                 NULL,
215                                                 pTitle,
216                                                 -1);
217                 }
218                 else if (!strcasecmp(pRel, "alternate"))
219                 { /* Alternative representation of this Item... */
220                         NewStrBufDupAppendFlush(&ri->link, NULL, pHref, -1);
221                         StrBufTrim(ri->link);
222                         NewStrBufDupAppendFlush(&ri->linkTitle,
223                                                 NULL,
224                                                 pTitle,
225                                                 -1);
226
227                 }
228 #if 0 /* these are also defined, but dunno what to do with them.. */
229                 else if (!strcasecmp(pRel, "related"))
230                 {
231                 }
232                 else if (!strcasecmp(pRel, "self"))
233                 {
234                 }
235                 else if (!strcasecmp(pRel, "enclosure"))
236                 {/*...reference can get big, and is probably the full article*/
237                 }
238                 else if (!strcasecmp(pRel, "via"))
239                 {/* this article was provided via... */
240                 }
241 #endif
242         }
243         else if (StrLength(ri->link) == 0)
244         {
245                 NewStrBufDupAppendFlush(&ri->link, NULL, pHref, -1);
246                 StrBufTrim(ri->link);
247                 NewStrBufDupAppendFlush(&ri->linkTitle, NULL, pTitle, -1);
248         }
249 }
250
251
252
253
254 void ATOMRSS_item_title_end(StrBuf *CData,
255                             rss_item *ri,
256                             rss_aggregator *RSSAggr,
257                             const char** Attr)
258 {
259         if ((ri->item_tag_nesting == 0) && (StrLength(CData) > 0)) {
260                 NewStrBufDupAppendFlush(&ri->channel_title, CData, NULL, 0);
261                 StrBufTrim(ri->channel_title);
262         }
263 }
264
265 void RSS_item_guid_end(StrBuf *CData,
266                        rss_item *ri,
267                        rss_aggregator *RSSAggr,
268                        const char** Attr)
269 {
270         if (StrLength(CData) > 0) {
271                 NewStrBufDupAppendFlush(&ri->guid, CData, NULL, 0);
272         }
273 }
274
275 void ATOM_item_id_end(StrBuf *CData,
276                       rss_item *ri, rss_aggregator *RSSAggr, const char** Attr)
277 {
278         if (StrLength(CData) > 0) {
279                 NewStrBufDupAppendFlush(&ri->guid, CData, NULL, 0);
280         }
281 }
282
283
284 void RSS_item_link_end (StrBuf *CData,
285                         rss_item *ri,
286                         rss_aggregator *RSSAggr,
287                         const char** Attr)
288 {
289         if (StrLength(CData) > 0) {
290                 NewStrBufDupAppendFlush(&ri->link, CData, NULL, 0);
291                 StrBufTrim(ri->link);
292         }
293 }
294 void RSS_item_relink_end(StrBuf *CData,
295                          rss_item *ri,
296                          rss_aggregator *RSSAggr,
297                          const char** Attr)
298 {
299         if (StrLength(CData) > 0) {
300                 NewStrBufDupAppendFlush(&ri->reLink, CData, NULL, 0);
301                 StrBufTrim(ri->reLink);
302         }
303 }
304
305 void RSSATOM_item_title_end (StrBuf *CData,
306                              rss_item *ri,
307                              rss_aggregator *RSSAggr,
308                              const char** Attr)
309 {
310         if (StrLength(CData) > 0) {
311                 NewStrBufDupAppendFlush(&ri->title, CData, NULL, 0);
312                 StrBufTrim(ri->title);
313         }
314 }
315
316 void ATOM_item_content_end (StrBuf *CData,
317                             rss_item *ri,
318                             rss_aggregator *RSSAggr,
319                             const char** Attr)
320 {
321         long olen = StrLength (ri->description);
322         long clen = StrLength (CData);
323         if (clen > 0)
324         {
325                 if (olen == 0) {
326                         NewStrBufDupAppendFlush(&ri->description,
327                                                 CData,
328                                                 NULL,
329                                                 0);
330                         StrBufTrim(ri->description);
331                 }
332                 else if (olen < clen) {
333                         FlushStrBuf(ri->description);
334                         NewStrBufDupAppendFlush(&ri->description,
335                                                 CData,
336                                                 NULL,
337                                                 0);
338
339                         StrBufTrim(ri->description);
340                 }
341         }
342 }
343 void ATOM_item_summary_end (StrBuf *CData,
344                             rss_item *ri,
345                             rss_aggregator *RSSAggr,
346                             const char** Attr)
347 {
348         /*
349          * this can contain an abstract of the article.
350          * but we don't want to verwrite a full document if we already have it.
351          */
352         if ((StrLength(CData) > 0) && (StrLength(ri->description) == 0))
353         {
354                 NewStrBufDupAppendFlush(&ri->description, CData, NULL, 0);
355                 StrBufTrim(ri->description);
356         }
357 }
358
359 void RSS_item_description_end (StrBuf *CData,
360                                rss_item *ri,
361                                rss_aggregator *RSSAggr,
362                                const char** Attr)
363 {
364         long olen = StrLength (ri->description);
365         long clen = StrLength (CData);
366         if (clen > 0)
367         {
368                 if (olen == 0) {
369                         NewStrBufDupAppendFlush(&ri->description,
370                                                 CData,
371                                                 NULL,
372                                                 0);
373                         StrBufTrim(ri->description);
374                 }
375                 else if (olen < clen) {
376                         FlushStrBuf(ri->description);
377                         NewStrBufDupAppendFlush(&ri->description,
378                                                 CData,
379                                                 NULL,
380                                                 0);
381                         StrBufTrim(ri->description);
382                 }
383         }
384 }
385
386 void ATOM_item_published_end (StrBuf *CData,
387                               rss_item *ri,
388                               rss_aggregator *RSSAggr,
389                               const char** Attr)
390 {
391         if (StrLength(CData) > 0) {
392                 StrBufTrim(CData);
393                 ri->pubdate = rdf_parsedate(ChrPtr(CData));
394         }
395 }
396
397 void ATOM_item_updated_end (StrBuf *CData,
398                             rss_item *ri,
399                             rss_aggregator *RSSAggr,
400                             const char** Attr)
401 {
402         if (StrLength(CData) > 0) {
403                 StrBufTrim(CData);
404                 ri->pubdate = rdf_parsedate(ChrPtr(CData));
405         }
406 }
407
408 void RSS_item_pubdate_end (StrBuf *CData,
409                            rss_item *ri,
410                            rss_aggregator *RSSAggr,
411                            const char** Attr)
412 {
413         if (StrLength(CData) > 0) {
414                 StrBufTrim(CData);
415                 ri->pubdate = rdf_parsedate(ChrPtr(CData));
416         }
417 }
418
419
420 void RSS_item_date_end (StrBuf *CData,
421                         rss_item *ri,
422                         rss_aggregator *RSSAggr,
423                         const char** Attr)
424 {
425         if (StrLength(CData) > 0) {
426                 StrBufTrim(CData);
427                 ri->pubdate = rdf_parsedate(ChrPtr(CData));
428         }
429 }
430
431
432
433 void RSS_item_author_end(StrBuf *CData,
434                          rss_item *ri,
435                          rss_aggregator *RSSAggr,
436                          const char** Attr)
437 {
438         if (StrLength(CData) > 0) {
439                 NewStrBufDupAppendFlush(&ri->author_or_creator, CData, NULL, 0);
440                 StrBufTrim(ri->author_or_creator);
441         }
442 }
443
444
445 void ATOM_item_name_end(StrBuf *CData,
446                         rss_item *ri,
447                         rss_aggregator *RSSAggr,
448                         const char** Attr)
449 {
450         if (StrLength(CData) > 0) {
451                 NewStrBufDupAppendFlush(&ri->author_or_creator, CData, NULL, 0);
452                 StrBufTrim(ri->author_or_creator);
453         }
454 }
455
456 void ATOM_item_email_end(StrBuf *CData,
457                          rss_item *ri,
458                          rss_aggregator *RSSAggr,
459                          const char** Attr)
460 {
461         if (StrLength(CData) > 0) {
462                 NewStrBufDupAppendFlush(&ri->author_email, CData, NULL, 0);
463                 StrBufTrim(ri->author_email);
464         }
465 }
466
467 void RSS_item_creator_end(StrBuf *CData,
468                           rss_item *ri,
469                           rss_aggregator *RSSAggr,
470                           const char** Attr)
471 {
472         if ((StrLength(CData) > 0) &&
473             (StrLength(ri->author_or_creator) == 0))
474         {
475                 NewStrBufDupAppendFlush(&ri->author_or_creator, CData, NULL, 0);
476                 StrBufTrim(ri->author_or_creator);
477         }
478 }
479
480
481 void ATOM_item_uri_end(StrBuf *CData,
482                        rss_item *ri,
483                        rss_aggregator *RSSAggr,
484                        const char** Attr)
485 {
486         if (StrLength(CData) > 0) {
487                 NewStrBufDupAppendFlush(&ri->author_url, CData, NULL, 0);
488                 StrBufTrim(ri->author_url);
489         }
490 }
491
492 void RSS_item_item_end(StrBuf *CData,
493                        rss_item *ri,
494                        rss_aggregator *RSSAggr,
495                        const char** Attr)
496 {
497         --ri->item_tag_nesting;
498         rss_remember_item(ri, RSSAggr);
499 }
500
501
502 void ATOM_item_entry_end(StrBuf *CData,
503                          rss_item *ri,
504                          rss_aggregator *RSSAggr,
505                          const char** Attr)
506 {
507         --ri->item_tag_nesting;
508         rss_remember_item(ri, RSSAggr);
509 }
510
511 void RSS_item_rss_end(StrBuf *CData,
512                       rss_item *ri,
513                       rss_aggregator *RSSAggr,
514                       const char** Attr)
515 {
516         syslog(LOG_DEBUG, "End of feed detected.  Closing parser.");
517         ri->done_parsing = 1;
518 }
519
520 void RSS_item_rdf_end(StrBuf *CData,
521                       rss_item *ri,
522                       rss_aggregator *RSSAggr,
523                       const char** Attr)
524 {
525         syslog(LOG_DEBUG, "End of feed detected.  Closing parser.");
526         ri->done_parsing = 1;
527 }
528
529
530 void RSSATOM_item_ignore(StrBuf *CData,
531                          rss_item *ri,
532                          rss_aggregator *RSSAggr,
533                          const char** Attr)
534 {
535 }
536
537
538
539 /*
540  * This callback stores up the data which appears in between tags.
541  */
542 void rss_xml_cdata_start(void *data)
543 {
544         rss_aggregator *RSSAggr = (rss_aggregator*) data;
545
546         FlushStrBuf(RSSAggr->CData);
547 }
548
/*
 * CDATA section end callback; nothing needs to happen here.
 */
void rss_xml_cdata_end(void *data)
{
	(void) data;
}
552 void rss_xml_chardata(void *data, const XML_Char *s, int len)
553 {
554         rss_aggregator *RSSAggr = (rss_aggregator*) data;
555
556         StrBufAppendBufPlain (RSSAggr->CData, s, len, 0);
557 }
558
559
560 /******************************************************************************
561  *                            RSS parser logic                                *
562  ******************************************************************************/
563
/* Defined in another translation unit; no code visible in this file uses it. */
extern pthread_mutex_t RSSQueueMutex;

/* Lower-cased element name -> rss_xml_handler, consulted by rss_xml_start(). */
HashList *StartHandlers = NULL;
/* Lower-cased element name -> rss_xml_handler, consulted by rss_xml_end(). */
HashList *EndHandlers = NULL;
/* Namespace part of element names that the start/end callbacks accept. */
HashList *KnownNameSpaces = NULL;
569
570 void FreeNetworkSaveMessage (void *vMsg)
571 {
572         networker_save_message *Msg = (networker_save_message *) vMsg;
573
574         CM_FreeContents(&Msg->Msg);
575         FreeStrBuf(&Msg->Message);
576         FreeStrBuf(&Msg->MsgGUID);
577
578         FreeStrBuf(&Msg->author_email);
579         FreeStrBuf(&Msg->author_or_creator);
580         FreeStrBuf(&Msg->title);
581         FreeStrBuf(&Msg->description);
582
583         FreeStrBuf(&Msg->link);
584         FreeStrBuf(&Msg->linkTitle);
585
586         FreeStrBuf(&Msg->reLink);
587         FreeStrBuf(&Msg->reLinkTitle);
588
589         free(Msg);
590 }
591
592
593 /*
594  * Commit a fetched and parsed RSS item to disk
595  */
596 void rss_remember_item(rss_item *ri, rss_aggregator *RSSAggr)
597 {
598         networker_save_message *SaveMsg;
599         struct MD5Context md5context;
600         u_char rawdigest[MD5_DIGEST_LEN];
601         StrBuf *guid;
602         int n;
603
604         SaveMsg = (networker_save_message *) malloc(sizeof(networker_save_message));
605         memset(SaveMsg, 0, sizeof(networker_save_message));
606
607         /* Construct a GUID to use in the S_USETABLE table.
608          * If one is not present in the item itself, make one up.
609          */
610         if (ri->guid != NULL) {
611                 StrBufSpaceToBlank(ri->guid);
612                 StrBufTrim(ri->guid);
613                 guid = NewStrBufPlain(HKEY("rss/"));
614                 StrBufAppendBuf(guid, ri->guid, 0);
615         }
616         else {
617                 MD5Init(&md5context);
618                 if (ri->title != NULL) {
619                         MD5Update(&md5context, (const unsigned char*)SKEY(ri->title));
620                 }
621                 if (ri->link != NULL) {
622                         MD5Update(&md5context, (const unsigned char*)SKEY(ri->link));
623                 }
624                 MD5Final(rawdigest, &md5context);
625                 guid = NewStrBufPlain(NULL, MD5_DIGEST_LEN * 2 + 12 /* _rss2ctdl*/);
626                 StrBufHexEscAppend(guid, NULL, rawdigest, MD5_DIGEST_LEN);
627                 StrBufAppendBufPlain(guid, HKEY("_rss2ctdl"), 0);
628         }
629
630         /* translate Item into message. */
631         syslog(LOG_DEBUG, "RSS: translating item...");
632         if (ri->description == NULL) ri->description = NewStrBufPlain(HKEY(""));
633         StrBufSpaceToBlank(ri->description);
634         SaveMsg->Msg.cm_magic = CTDLMESSAGE_MAGIC;
635         SaveMsg->Msg.cm_anon_type = MES_NORMAL;
636         SaveMsg->Msg.cm_format_type = FMT_RFC822;
637
638         /* gather the cheaply computed information now... */
639
640         if (ri->guid != NULL) {
641                 CM_SetField(&SaveMsg->Msg, eExclusiveID, SKEY(ri->guid));
642         }
643
644         SaveMsg->MsgGUID = guid;
645
646         if (ri->pubdate <= 0) {
647                 ri->pubdate = time(NULL);
648         }
649         CM_SetFieldLONG(&SaveMsg->Msg, eTimestamp, ri->pubdate);
650         if (ri->channel_title != NULL) {
651                 if (StrLength(ri->channel_title) > 0) {
652                         CM_SetField(&SaveMsg->Msg, eOriginalRoom, SKEY(ri->channel_title));
653                 }
654         }
655
656         /* remember the ones for defferred processing to save computing power after we know if we realy need it. */
657
658         SaveMsg->author_or_creator = ri->author_or_creator;
659         ri->author_or_creator = NULL;
660
661         SaveMsg->author_email = ri->author_email;
662         ri->author_email = NULL;
663
664         SaveMsg->title = ri->title;
665         ri->title = NULL;
666
667         SaveMsg->link = ri->link;
668         ri->link = NULL;
669
670         SaveMsg->description = ri->description;
671         ri->description = NULL;
672
673         SaveMsg->linkTitle = ri->linkTitle;
674         ri->linkTitle = NULL;
675
676         SaveMsg->reLink = ri->reLink;
677         ri->reLink = NULL;
678
679         SaveMsg->reLinkTitle = ri->reLinkTitle;
680         ri->reLinkTitle = NULL;
681
682         n = GetCount(RSSAggr->Messages) + 1;
683         Put(RSSAggr->Messages, IKEY(n), SaveMsg, FreeNetworkSaveMessage);
684 }
685
686
687
688 void rss_xml_start(void *data, const char *supplied_el, const char **attr)
689 {
690         rss_xml_handler *h;
691         rss_aggregator  *RSSAggr = (rss_aggregator*) data;
692         rss_item        *ri = RSSAggr->Item;
693         void            *pv;
694         const char      *pel;
695         char            *sep = NULL;
696
697         /* Axe the namespace, we don't care about it */
698         /*
699           syslog(LOG_DEBUG,
700           "RSS: supplied el %d: %s\n", RSSAggr->RSSAggr->ItemType, supplied_el);
701         */
702         pel = supplied_el;
703         while (sep = strchr(pel, ':'), sep) {
704                 pel = sep + 1;
705         }
706
707         if (pel != supplied_el)
708         {
709                 void *v;
710
711                 if (!GetHash(KnownNameSpaces,
712                              supplied_el,
713                              pel - supplied_el - 1,
714                              &v))
715                 {
716                         syslog(LOG_DEBUG,
717                                          "RSS: START ignoring "
718                                          "because of wrong namespace [%s]",
719                                          supplied_el
720                         );
721                         return;
722                 }
723         }
724
725         StrBufPlain(RSSAggr->Key, pel, -1);
726         StrBufLowerCase(RSSAggr->Key);
727         if (GetHash(StartHandlers, SKEY(RSSAggr->Key), &pv))
728         {
729                 h = (rss_xml_handler*) pv;
730
731                 if (((h->Flags & RSS_UNSET) != 0) &&
732                     (RSSAggr->ItemType == RSS_UNSET))
733                 {
734                         h->Handler(RSSAggr->CData, ri, RSSAggr, attr);
735                 }
736                 else if (((h->Flags & RSS_RSS) != 0) &&
737                     (RSSAggr->ItemType == RSS_RSS))
738                 {
739                         h->Handler(RSSAggr->CData, ri, RSSAggr, attr);
740                 }
741                 else if (((h->Flags & RSS_ATOM) != 0) &&
742                          (RSSAggr->ItemType == RSS_ATOM))
743                 {
744                         h->Handler(RSSAggr->CData,
745                                    ri,
746                                    RSSAggr,
747                                    attr);
748                 }
749                 else
750                         syslog(LOG_DEBUG,
751                                           "RSS: START unhandled: [%s] [%s]...",
752                                          pel,
753                                          supplied_el
754                         );
755         }
756         else
757                 syslog(LOG_DEBUG,
758                                  "RSS: START unhandled: [%s] [%s]...",
759                                  pel,
760                                  supplied_el
761                 );
762 }
763
764 void rss_xml_end(void *data, const char *supplied_el)
765 {
766         rss_xml_handler *h;
767         rss_aggregator  *RSSAggr = (rss_aggregator*) data;
768         rss_item        *ri = RSSAggr->Item;
769         const char      *pel;
770         char            *sep = NULL;
771         void            *pv;
772
773         /* Axe the namespace, we don't care about it */
774         pel = supplied_el;
775         while (sep = strchr(pel, ':'), sep) {
776                 pel = sep + 1;
777         }
778         syslog(LOG_DEBUG, "RSS: END %s...", supplied_el);
779         if (pel != supplied_el)
780         {
781                 void *v;
782
783                 if (!GetHash(KnownNameSpaces,
784                              supplied_el,
785                              pel - supplied_el - 1,
786                              &v))
787                 {
788                         syslog(LOG_DEBUG,
789                                          "RSS: END ignoring because of wrong namespace"
790                                          "[%s] = [%s]",
791                                          supplied_el,
792                                          ChrPtr(RSSAggr->CData));
793                         FlushStrBuf(RSSAggr->CData);
794                         return;
795                 }
796         }
797
798         StrBufPlain(RSSAggr->Key, pel, -1);
799         StrBufLowerCase(RSSAggr->Key);
800         if (GetHash(EndHandlers, SKEY(RSSAggr->Key), &pv))
801         {
802                 h = (rss_xml_handler*) pv;
803
804                 if (((h->Flags & RSS_UNSET) != 0) &&
805                     (RSSAggr->ItemType == RSS_UNSET))
806                 {
807                         h->Handler(RSSAggr->CData, ri, RSSAggr, NULL);
808                 }
809                 else if (((h->Flags & RSS_RSS) != 0) &&
810                     (RSSAggr->ItemType == RSS_RSS))
811                 {
812                         h->Handler(RSSAggr->CData, ri, RSSAggr, NULL);
813                 }
814                 else if (((h->Flags & RSS_ATOM) != 0) &&
815                          (RSSAggr->ItemType == RSS_ATOM))
816                 {
817                         h->Handler(RSSAggr->CData, ri, RSSAggr, NULL);
818                 }
819                 else
820                         syslog(LOG_DEBUG,
821                                          "RSS: END   unhandled: [%s]  [%s] = [%s]...",
822                                          pel,
823                                          supplied_el,
824                                          ChrPtr(RSSAggr->CData));
825         }
826         else
827                 syslog(LOG_DEBUG,
828                                  "RSS: END   unhandled: [%s]  [%s] = [%s]...",
829                                  pel,
830                                  supplied_el,
831                                  ChrPtr(RSSAggr->CData));
832         FlushStrBuf(RSSAggr->CData);
833 }
834
835
836
837 eNextState RSSAggregator_ParseReply(AsyncIO *IO)
838 {
839         StrBuf *Buf;
840         rss_aggregator *RSSAggr;
841         rss_item *ri;
842         const char *at;
843         char *ptr;
844         long len;
845         const char *Key;
846
847         RSSAggr = IO->Data;
848         ri = RSSAggr->Item;
849         RSSAggr->CData = NewStrBufPlain(NULL, SIZ);
850         RSSAggr->Key = NewStrBuf();
851         at = NULL;
852         StrBufSipLine(RSSAggr->Key, IO->HttpReq.ReplyData, &at);
853         ptr = NULL;
854
855 #define encoding "encoding=\""
856         ptr = strstr(ChrPtr(RSSAggr->Key), encoding);
857         if (ptr != NULL)
858         {
859                 char *pche;
860
861                 ptr += sizeof (encoding) - 1;
862                 pche = strchr(ptr, '"');
863                 if (pche != NULL)
864                         StrBufCutAt(RSSAggr->Key, -1, pche);
865                 else
866                         ptr = "UTF-8";
867         }
868         else
869                 ptr = "UTF-8";
870
871         syslog(LOG_DEBUG, "RSS: Now parsing [%s]", ChrPtr(RSSAggr->Url));
872
873         RSSAggr->xp = XML_ParserCreateNS(ptr, ':');
874         if (!RSSAggr->xp) {
875                 syslog(LOG_ALERT, "Cannot create XML parser!");
876                 return eAbort;
877         }
878         FlushStrBuf(RSSAggr->Key);
879
880         RSSAggr->Messages = NewHash(1, Flathash);
881         XML_SetElementHandler(RSSAggr->xp, rss_xml_start, rss_xml_end);
882         XML_SetCharacterDataHandler(RSSAggr->xp, rss_xml_chardata);
883         XML_SetUserData(RSSAggr->xp, RSSAggr);
884         XML_SetCdataSectionHandler(RSSAggr->xp,
885                                    rss_xml_cdata_start,
886                                    rss_xml_cdata_end
887         );
888
889         len = StrLength(IO->HttpReq.ReplyData);
890         ptr = SmashStrBuf(&IO->HttpReq.ReplyData);
891         XML_Parse(RSSAggr->xp, ptr, len, 0);
892         free (ptr);
893         if (ri->done_parsing == 0) {
894                 XML_Parse(RSSAggr->xp, "", 0, 1);
895         }
896
897         syslog(LOG_DEBUG, "RSS: XML Status [%s]", XML_ErrorString(XML_GetErrorCode(RSSAggr->xp)));
898
899         XML_ParserFree(RSSAggr->xp);
900         flush_rss_item(ri);
901
902         Buf = NewStrBufDup(RSSAggr->rooms);
903         RSSAggr->recp.recp_room = SmashStrBuf(&Buf);
904         RSSAggr->recp.num_room = RSSAggr->roomlist_parts;
905         RSSAggr->recp.recptypes_magic = RECPTYPES_MAGIC;
906
907         RSSAggr->Pos = GetNewHashPos(RSSAggr->Messages, 1);
908
909 #if 0
910 // FIXME ajc
911         if (GetNextHashPos(RSSAggr->Messages,
912                            RSSAggr->Pos,
913                            &len,
914                            &Key,
915                            (void**) &RSSAggr->ThisMsg)) {
916                 return NextDBOperation(IO, RSS_FetchNetworkUsetableEntry);
917         }
918         else {
919 #endif
920                 return eAbort;
921 #if 0
922         }
923 #endif
924 }
925
926
927 /******************************************************************************
928  *                    RSS handler registering logic                           *
929  ******************************************************************************/
930
931 void AddRSSStartHandler(rss_handler_func Handler,
932                         int Flags,
933                         const char *key,
934                         long len)
935 {
936         rss_xml_handler *h;
937         h = (rss_xml_handler*) malloc(sizeof (rss_xml_handler));
938         h->Flags = Flags;
939         h->Handler = Handler;
940         Put(StartHandlers, key, len, h, NULL);
941 }
942
943 void AddRSSEndHandler(rss_handler_func Handler,
944                       int Flags,
945                       const char *key,
946                       long len)
947 {
948         rss_xml_handler *h;
949         h = (rss_xml_handler*) malloc(sizeof (rss_xml_handler));
950         h->Flags = Flags;
951         h->Handler = Handler;
952         Put(EndHandlers, key, len, h, NULL);
953 }
954
955 void rss_parser_cleanup(void)
956 {
957         DeleteHash(&StartHandlers);
958         DeleteHash(&EndHandlers);
959         DeleteHash(&KnownNameSpaces);
960 }
961
962 void LogDebugEnableRSSATOMParser(const int n)
963 {
964         RSSAtomParserDebugEnabled = n;
965 }
966
967 CTDL_MODULE_INIT(rssparser)
968 {
969         if (!threading)
970         {
971                 StartHandlers = NewHash(1, NULL);
972                 EndHandlers = NewHash(1, NULL);
973
974                 AddRSSStartHandler(RSS_item_rss_start,     RSS_UNSET, HKEY("rss"));
975                 AddRSSStartHandler(RSS_item_rdf_start,     RSS_UNSET, HKEY("rdf"));
976                 AddRSSStartHandler(ATOM_item_feed_start,   RSS_UNSET, HKEY("feed"));
977                 AddRSSStartHandler(RSS_item_item_start,    RSS_RSS, HKEY("item"));
978                 AddRSSStartHandler(ATOM_item_entry_start,  RSS_ATOM, HKEY("entry"));
979                 AddRSSStartHandler(ATOM_item_link_start,   RSS_ATOM, HKEY("link"));
980
981                 AddRSSEndHandler(ATOMRSS_item_title_end,   RSS_ATOM|RSS_RSS|RSS_REQUIRE_BUF, HKEY("title"));
982                 AddRSSEndHandler(RSS_item_guid_end,        RSS_RSS|RSS_REQUIRE_BUF, HKEY("guid"));
983                 AddRSSEndHandler(ATOM_item_id_end,         RSS_ATOM|RSS_REQUIRE_BUF, HKEY("id"));
984                 AddRSSEndHandler(RSS_item_link_end,        RSS_RSS|RSS_REQUIRE_BUF, HKEY("link"));
985 #if 0
986 // hm, rss to the comments of that blog, might be interesting in future, but...
987                 AddRSSEndHandler(RSS_item_relink_end,      RSS_RSS|RSS_REQUIRE_BUF, HKEY("commentrss"));
988 // comment count...
989                 AddRSSEndHandler(RSS_item_relink_end,      RSS_RSS|RSS_REQUIRE_BUF, HKEY("comments"));
990 #endif
991                 AddRSSEndHandler(RSSATOM_item_title_end,   RSS_ATOM|RSS_RSS|RSS_REQUIRE_BUF, HKEY("title"));
992                 AddRSSEndHandler(ATOM_item_content_end,    RSS_ATOM|RSS_REQUIRE_BUF, HKEY("content"));
993                 AddRSSEndHandler(RSS_item_description_end, RSS_RSS|RSS_ATOM|RSS_REQUIRE_BUF, HKEY("encoded"));
994                 AddRSSEndHandler(ATOM_item_summary_end,    RSS_ATOM|RSS_REQUIRE_BUF, HKEY("summary"));
995                 AddRSSEndHandler(RSS_item_description_end, RSS_RSS|RSS_REQUIRE_BUF, HKEY("description"));
996                 AddRSSEndHandler(ATOM_item_published_end,  RSS_ATOM|RSS_REQUIRE_BUF, HKEY("published"));
997                 AddRSSEndHandler(ATOM_item_updated_end,    RSS_ATOM|RSS_REQUIRE_BUF, HKEY("updated"));
998                 AddRSSEndHandler(RSS_item_pubdate_end,     RSS_RSS|RSS_REQUIRE_BUF, HKEY("pubdate"));
999                 AddRSSEndHandler(RSS_item_date_end,        RSS_RSS|RSS_REQUIRE_BUF, HKEY("date"));
1000                 AddRSSEndHandler(RSS_item_author_end,      RSS_RSS|RSS_REQUIRE_BUF, HKEY("author"));
1001                 AddRSSEndHandler(RSS_item_creator_end,     RSS_RSS|RSS_REQUIRE_BUF, HKEY("creator"));
1002 /* <author> */
1003                 AddRSSEndHandler(ATOM_item_email_end,      RSS_ATOM|RSS_REQUIRE_BUF, HKEY("email"));
1004                 AddRSSEndHandler(ATOM_item_name_end,       RSS_ATOM|RSS_REQUIRE_BUF, HKEY("name"));
1005                 AddRSSEndHandler(ATOM_item_uri_end,        RSS_ATOM|RSS_REQUIRE_BUF, HKEY("uri"));
1006 /* </author> */
1007                 AddRSSEndHandler(RSS_item_item_end,        RSS_RSS, HKEY("item"));
1008                 AddRSSEndHandler(RSS_item_rss_end,         RSS_RSS, HKEY("rss"));
1009                 AddRSSEndHandler(RSS_item_rdf_end,         RSS_RSS, HKEY("rdf"));
1010                 AddRSSEndHandler(ATOM_item_entry_end,      RSS_ATOM, HKEY("entry"));
1011
1012
1013 /* at the start of atoms: <seq> <li>link to resource</li></seq> ignore them. */
1014                 AddRSSStartHandler(RSSATOM_item_ignore,      RSS_RSS|RSS_ATOM, HKEY("seq"));
1015                 AddRSSEndHandler  (RSSATOM_item_ignore,      RSS_RSS|RSS_ATOM, HKEY("seq"));
1016                 AddRSSStartHandler(RSSATOM_item_ignore,      RSS_RSS|RSS_ATOM, HKEY("li"));
1017                 AddRSSEndHandler  (RSSATOM_item_ignore,      RSS_RSS|RSS_ATOM, HKEY("li"));
1018
1019 /* links to other feed generators... */
1020                 AddRSSStartHandler(RSSATOM_item_ignore,      RSS_RSS|RSS_ATOM, HKEY("feedflare"));
1021                 AddRSSEndHandler  (RSSATOM_item_ignore,      RSS_RSS|RSS_ATOM, HKEY("feedflare"));
1022                 AddRSSStartHandler(RSSATOM_item_ignore,      RSS_RSS|RSS_ATOM, HKEY("browserfriendly"));
1023                 AddRSSEndHandler  (RSSATOM_item_ignore,      RSS_RSS|RSS_ATOM, HKEY("browserfriendly"));
1024
1025                 KnownNameSpaces = NewHash(1, NULL);
1026                 Put(KnownNameSpaces, HKEY("http://a9.com/-/spec/opensearch/1.1/"), NULL, reference_free_handler);
1027                 Put(KnownNameSpaces, HKEY("http://a9.com/-/spec/opensearchrss/1.0/"), NULL, reference_free_handler);
1028                 Put(KnownNameSpaces, HKEY("http://backend.userland.com/creativeCommonsRssModule"), NULL, reference_free_handler);
1029                 Put(KnownNameSpaces, HKEY("http://purl.org/atom/ns#"), NULL, reference_free_handler);
1030                 Put(KnownNameSpaces, HKEY("http://purl.org/dc/elements/1.1/"), NULL, reference_free_handler);
1031                 Put(KnownNameSpaces, HKEY("http://purl.org/rss/1.0/"), NULL, reference_free_handler);
1032                 Put(KnownNameSpaces, HKEY("http://purl.org/rss/1.0/modules/content/"), NULL, reference_free_handler);
1033                 Put(KnownNameSpaces, HKEY("http://purl.org/rss/1.0/modules/slash/"), NULL, reference_free_handler);
1034                 Put(KnownNameSpaces, HKEY("http://purl.org/rss/1.0/modules/syndication/"), NULL, reference_free_handler);
1035                 Put(KnownNameSpaces, HKEY("http://purl.org/rss/1.0/"), NULL, reference_free_handler);
1036                 Put(KnownNameSpaces, HKEY("http://purl.org/syndication/thread/1.0"), NULL, reference_free_handler);
1037                 Put(KnownNameSpaces, HKEY("http://rssnamespace.org/feedburner/ext/1.0"), NULL, reference_free_handler);
1038                 Put(KnownNameSpaces, HKEY("http://schemas.google.com/g/2005"), NULL, reference_free_handler);
1039                 Put(KnownNameSpaces, HKEY("http://webns.net/mvcb/"), NULL, reference_free_handler);
1040                 Put(KnownNameSpaces, HKEY("http://web.resource.org/cc/"), NULL, reference_free_handler);
1041                 Put(KnownNameSpaces, HKEY("http://wellformedweb.org/CommentAPI/"), NULL, reference_free_handler);
1042                 Put(KnownNameSpaces, HKEY("http://www.georss.org/georss"), NULL, reference_free_handler);
1043                 Put(KnownNameSpaces, HKEY("http://www.w3.org/1999/xhtml"), NULL, reference_free_handler);
1044                 Put(KnownNameSpaces, HKEY("http://www.w3.org/1999/02/22-rdf-syntax-ns#"), NULL, reference_free_handler);
1045                 Put(KnownNameSpaces, HKEY("http://www.w3.org/1999/02/22-rdf-syntax-ns#"), NULL, reference_free_handler);
1046                 Put(KnownNameSpaces, HKEY("http://www.w3.org/2003/01/geo/wgs84_pos#"), NULL, reference_free_handler);
1047                 Put(KnownNameSpaces, HKEY("http://www.w3.org/2005/Atom"), NULL, reference_free_handler);
1048                 Put(KnownNameSpaces, HKEY("urn:flickr:"), NULL, reference_free_handler);
1049 #if 0
1050                 /* we don't like these namespaces because of they shadow our usefull parameters. */
1051                 Put(KnownNameSpaces, HKEY("http://search.yahoo.com/mrss/"), NULL, reference_free_handler);
1052 #endif
1053                 CtdlRegisterDebugFlagHook(HKEY("RSSAtomParser"), LogDebugEnableRSSATOMParser, &RSSAtomParserDebugEnabled);
1054                 CtdlRegisterCleanupHook(rss_parser_cleanup);
1055         }
1056         return "rssparser";
1057 }