610dd2b959ba18d329e0fa0292404184ea711a2e
[citadel.git] / citadel / modules / rssclient / rss_atom_parser.c
1 /*
2  * Bring external RSS feeds into rooms.
3  *
4  * Copyright (c) 2007-2012 by the citadel.org team
5  *
6  * This program is open source software; you can redistribute it and/or modify
7  * it under the terms of the GNU General Public License version 3.
8  * 
9  * 
10  *
11  * This program is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14  * GNU General Public License for more details.
15  *
16  * 
17  * 
18  * 
19  */
20
21 #include <stdlib.h>
22 #include <unistd.h>
23 #include <stdio.h>
24
25 #if TIME_WITH_SYS_TIME
26 # include <sys/time.h>
27 # include <time.h>
28 #else
29 # if HAVE_SYS_TIME_H
30 #  include <sys/time.h>
31 # else
32 #  include <time.h>
33 # endif
34 #endif
35
36 #include <ctype.h>
37 #include <string.h>
38 #include <errno.h>
39 #include <sys/types.h>
40 #include <sys/stat.h>
41 #include <expat.h>
42 #include <curl/curl.h>
43 #include <libcitadel.h>
44 #include "citadel.h"
45 #include "server.h"
46 #include "citserver.h"
47 #include "support.h"
48 #include "config.h"
49 #include "threads.h"
50 #include "ctdl_module.h"
51 #include "clientsocket.h"
52 #include "msgbase.h"
53 #include "parsedate.h"
54 #include "database.h"
55 #include "citadel_dirs.h"
56 #include "md5.h"
57 #include "context.h"
58 #include "event_client.h"
59 #include "rss_atom_parser.h"
60
61 void rss_save_item(rss_item *ri, rss_aggregator *Cfg);
62
63
/*
 * Convert an RDF/RSS/Atom datestamp into a time_t.
 *
 * p	NUL-terminated datestamp text; may be NULL.
 *
 * Returns 0 when p is NULL or shorter than 10 characters.
 * A string shaped like "YYYY-MM-DD[Thh:mm[:ss]]" (W3C datetime, see
 * http://www.w3.org/TR/NOTE-datetime) is converted with mktime(); any
 * time zone designator in the string is not evaluated.
 * Anything else is handed to parsedate(); if that yields nothing
 * positive, the current time is returned.
 *
 * This code, along with parsedate.c, is a potential candidate for
 * moving into libcitadel.
 */
time_t rdf_parsedate(const char *p)
{
	struct tm tm;
	time_t t = 0;
	size_t len;

	if (!p) return 0L;
	len = strlen(p);
	if (len < 10) return 0L;

	memset(&tm, 0, sizeof tm);

	if ( (p[4] == '-') && (p[7] == '-') ) {
		tm.tm_year = atoi(&p[0]) - 1900;
		tm.tm_mon = atoi(&p[5]) - 1;
		tm.tm_mday = atoi(&p[8]);
		/*
		 * Only look at the time part when the string is long enough
		 * to hold it; p[13] lies beyond the terminator for inputs
		 * such as "2012-01-01T".
		 */
		if ( (len >= 14) && (p[10] == 'T') && (p[13] == ':') ) {
			tm.tm_hour = atoi(&p[11]);
			tm.tm_min = atoi(&p[14]);
			/* optional ":ss" */
			if ( (len >= 19) && (p[16] == ':') ) {
				tm.tm_sec = atoi(&p[17]);
			}
		}
		return mktime(&tm);
	}

	/* hmm... try RFC822 date stamp format */
	t = parsedate(p);
	if (t > 0) return(t);

	/* yeesh.  ok, just return the current date and time. */
	return(time(NULL));
}
103
104 void flush_rss_item(rss_item *ri)
105 {
106         /* Initialize the feed item data structure */
107         FreeStrBuf(&ri->guid);
108         FreeStrBuf(&ri->title);
109         FreeStrBuf(&ri->link);
110         FreeStrBuf(&ri->author_or_creator);
111         FreeStrBuf(&ri->author_email);
112         FreeStrBuf(&ri->author_url);
113         FreeStrBuf(&ri->description);
114
115         FreeStrBuf(&ri->linkTitle);
116         FreeStrBuf(&ri->reLink);
117         FreeStrBuf(&ri->reLinkTitle);
118         FreeStrBuf(&ri->channel_title);
119 }
120
121
122 /******************************************************************************
123  *                              XML-Handler                                   *
124  ******************************************************************************/
125
126
127 void RSS_item_rss_start (StrBuf *CData,
128                          rss_item *ri,
129                          rss_aggregator *Cfg,
130                          const char** Attr)
131 {
132         syslog(LOG_DEBUG, "RSS: This is an RSS feed.\n");
133         Cfg->ItemType = RSS_RSS;
134 }
135
136 void RSS_item_rdf_start(StrBuf *CData,
137                         rss_item *ri,
138                         rss_aggregator *Cfg,
139                         const char** Attr)
140 {
141         syslog(LOG_DEBUG, "RSS: This is an RDF feed.\n");
142         Cfg->ItemType = RSS_RSS;
143 }
144
145 void ATOM_item_feed_start(StrBuf *CData,
146                           rss_item *ri,
147                           rss_aggregator *Cfg,
148                           const char** Attr)
149 {
150         syslog(LOG_DEBUG, "RSS: This is an ATOM feed.\n");
151         Cfg->ItemType = RSS_ATOM;
152 }
153
154
155 void RSS_item_item_start(StrBuf *CData,
156                          rss_item *ri,
157                          rss_aggregator *Cfg,
158                          const char** Attr)
159 {
160         ri->item_tag_nesting ++;
161         flush_rss_item(ri);
162 }
163
164 void ATOM_item_entry_start(StrBuf *CData,
165                            rss_item *ri,
166                            rss_aggregator *Cfg,
167                            const char** Attr)
168 {
169 /* Atom feed... */
170         ri->item_tag_nesting ++;
171         flush_rss_item(ri);
172 }
173
174 void ATOM_item_link_start (StrBuf *CData,
175                            rss_item *ri,
176                            rss_aggregator *Cfg,
177                            const char** Attr)
178 {
179         int i;
180         const char *pHref = NULL;
181         const char *pType = NULL;
182         const char *pRel = NULL;
183         const char *pTitle = NULL;
184
185         for (i = 0; Attr[i] != NULL; i+=2)
186         {
187                 if (!strcmp(Attr[i], "href"))
188                 {
189                         pHref = Attr[i+1];
190                 }
191                 else if (!strcmp(Attr[i], "rel"))
192                 {
193                         pRel = Attr[i+1];
194                 }
195                 else if (!strcmp(Attr[i], "type"))
196                 {
197                         pType = Attr[i+1];
198                 }
199                 else if (!strcmp(Attr[i], "title"))
200                 {
201                         pTitle = Attr[i+1];
202                 }
203         }
204         if (pHref == NULL)
205                 return; /* WHUT? Pointing... where? */
206         if ((pType != NULL) && !strcasecmp(pType, "application/atom+xml"))
207                 return;
208         /* these just point to other rss resources,
209            we're not interested in them. */
210         if (pRel != NULL)
211         {
212                 if (!strcasecmp (pRel, "replies"))
213                 {
214                         NewStrBufDupAppendFlush(&ri->reLink, NULL, pHref, -1);
215                         StrBufTrim(ri->link);
216                         NewStrBufDupAppendFlush(&ri->reLinkTitle,
217                                                 NULL,
218                                                 pTitle,
219                                                 -1);
220                 }
221                 else if (!strcasecmp(pRel, "alternate"))
222                 { /* Alternative representation of this Item... */
223                         NewStrBufDupAppendFlush(&ri->link, NULL, pHref, -1);
224                         StrBufTrim(ri->link);
225                         NewStrBufDupAppendFlush(&ri->linkTitle,
226                                                 NULL,
227                                                 pTitle,
228                                                 -1);
229
230                 }
231 #if 0 /* these are also defined, but dunno what to do with them.. */
232                 else if (!strcasecmp(pRel, "related"))
233                 {
234                 }
235                 else if (!strcasecmp(pRel, "self"))
236                 {
237                 }
238                 else if (!strcasecmp(pRel, "enclosure"))
239                 {/*...reference can get big, and is probably the full article*/
240                 }
241                 else if (!strcasecmp(pRel, "via"))
242                 {/* this article was provided via... */
243                 }
244 #endif
245         }
246         else if (StrLength(ri->link) == 0)
247         {
248                 NewStrBufDupAppendFlush(&ri->link, NULL, pHref, -1);
249                 StrBufTrim(ri->link);
250                 NewStrBufDupAppendFlush(&ri->linkTitle, NULL, pTitle, -1);
251         }
252 }
253
254
255
256
257 void ATOMRSS_item_title_end(StrBuf *CData,
258                             rss_item *ri,
259                             rss_aggregator *Cfg,
260                             const char** Attr)
261 {
262         if ((ri->item_tag_nesting == 0) && (StrLength(CData) > 0)) {
263                 NewStrBufDupAppendFlush(&ri->channel_title, CData, NULL, 0);
264                 StrBufTrim(ri->channel_title);
265         }
266 }
267
268 void RSS_item_guid_end(StrBuf *CData,
269                        rss_item *ri,
270                        rss_aggregator *Cfg,
271                        const char** Attr)
272 {
273         if (StrLength(CData) > 0) {
274                 NewStrBufDupAppendFlush(&ri->guid, CData, NULL, 0);
275         }
276 }
277
278 void ATOM_item_id_end(StrBuf *CData,
279                       rss_item *ri, rss_aggregator *Cfg, const char** Attr)
280 {
281         if (StrLength(CData) > 0) {
282                 NewStrBufDupAppendFlush(&ri->guid, CData, NULL, 0);
283         }
284 }
285
286
287 void RSS_item_link_end (StrBuf *CData,
288                         rss_item *ri,
289                         rss_aggregator *Cfg,
290                         const char** Attr)
291 {
292         if (StrLength(CData) > 0) {
293                 NewStrBufDupAppendFlush(&ri->link, CData, NULL, 0);
294                 StrBufTrim(ri->link);
295         }
296 }
297 void RSS_item_relink_end(StrBuf *CData,
298                          rss_item *ri,
299                          rss_aggregator *Cfg,
300                          const char** Attr)
301 {
302         if (StrLength(CData) > 0) {
303                 NewStrBufDupAppendFlush(&ri->reLink, CData, NULL, 0);
304                 StrBufTrim(ri->reLink);
305         }
306 }
307
308 void RSSATOM_item_title_end (StrBuf *CData,
309                              rss_item *ri,
310                              rss_aggregator *Cfg,
311                              const char** Attr)
312 {
313         if (StrLength(CData) > 0) {
314                 NewStrBufDupAppendFlush(&ri->title, CData, NULL, 0);
315                 StrBufTrim(ri->title);
316         }
317 }
318
319 void ATOM_item_content_end (StrBuf *CData,
320                             rss_item *ri,
321                             rss_aggregator *Cfg,
322                             const char** Attr)
323 {
324         long olen = StrLength (ri->description);
325         long clen = StrLength (CData);
326         if (clen > 0)
327         {
328                 if (olen == 0) {
329                         NewStrBufDupAppendFlush(&ri->description,
330                                                 CData,
331                                                 NULL,
332                                                 0);
333                         StrBufTrim(ri->description);
334                 }
335                 else if (olen < clen) {
336                         FlushStrBuf(ri->description);
337                         NewStrBufDupAppendFlush(&ri->description,
338                                                 CData,
339                                                 NULL,
340                                                 0);
341
342                         StrBufTrim(ri->description);
343                 }
344         }
345 }
346 void ATOM_item_summary_end (StrBuf *CData,
347                             rss_item *ri,
348                             rss_aggregator *Cfg,
349                             const char** Attr)
350 {
351         /*
352          * this can contain an abstract of the article.
353          * but we don't want to verwrite a full document if we already have it.
354          */
355         if ((StrLength(CData) > 0) && (StrLength(ri->description) == 0))
356         {
357                 NewStrBufDupAppendFlush(&ri->description, CData, NULL, 0);
358                 StrBufTrim(ri->description);
359         }
360 }
361
362 void RSS_item_description_end (StrBuf *CData,
363                                rss_item *ri,
364                                rss_aggregator *Cfg,
365                                const char** Attr)
366 {
367         long olen = StrLength (ri->description);
368         long clen = StrLength (CData);
369         if (clen > 0)
370         {
371                 if (olen == 0) {
372                         NewStrBufDupAppendFlush(&ri->description,
373                                                 CData,
374                                                 NULL,
375                                                 0);
376                         StrBufTrim(ri->description);
377                 }
378                 else if (olen < clen) {
379                         FlushStrBuf(ri->description);
380                         NewStrBufDupAppendFlush(&ri->description,
381                                                 CData,
382                                                 NULL,
383                                                 0);
384                         StrBufTrim(ri->description);
385                 }
386         }
387 }
388
389 void ATOM_item_published_end (StrBuf *CData,
390                               rss_item *ri,
391                               rss_aggregator *Cfg,
392                               const char** Attr)
393 {
394         if (StrLength(CData) > 0) {
395                 StrBufTrim(CData);
396                 ri->pubdate = rdf_parsedate(ChrPtr(CData));
397         }
398 }
399
400 void ATOM_item_updated_end (StrBuf *CData,
401                             rss_item *ri,
402                             rss_aggregator *Cfg,
403                             const char** Attr)
404 {
405         if (StrLength(CData) > 0) {
406                 StrBufTrim(CData);
407                 ri->pubdate = rdf_parsedate(ChrPtr(CData));
408         }
409 }
410
411 void RSS_item_pubdate_end (StrBuf *CData,
412                            rss_item *ri,
413                            rss_aggregator *Cfg,
414                            const char** Attr)
415 {
416         if (StrLength(CData) > 0) {
417                 StrBufTrim(CData);
418                 ri->pubdate = rdf_parsedate(ChrPtr(CData));
419         }
420 }
421
422
423 void RSS_item_date_end (StrBuf *CData,
424                         rss_item *ri,
425                         rss_aggregator *Cfg,
426                         const char** Attr)
427 {
428         if (StrLength(CData) > 0) {
429                 StrBufTrim(CData);
430                 ri->pubdate = rdf_parsedate(ChrPtr(CData));
431         }
432 }
433
434
435
436 void RSS_item_author_end(StrBuf *CData,
437                          rss_item *ri,
438                          rss_aggregator *Cfg,
439                          const char** Attr)
440 {
441         if (StrLength(CData) > 0) {
442                 NewStrBufDupAppendFlush(&ri->author_or_creator, CData, NULL, 0);
443                 StrBufTrim(ri->author_or_creator);
444         }
445 }
446
447
448 void ATOM_item_name_end(StrBuf *CData,
449                         rss_item *ri,
450                         rss_aggregator *Cfg,
451                         const char** Attr)
452 {
453         if (StrLength(CData) > 0) {
454                 NewStrBufDupAppendFlush(&ri->author_or_creator, CData, NULL, 0);
455                 StrBufTrim(ri->author_or_creator);
456         }
457 }
458
459 void ATOM_item_email_end(StrBuf *CData,
460                          rss_item *ri,
461                          rss_aggregator *Cfg,
462                          const char** Attr)
463 {
464         if (StrLength(CData) > 0) {
465                 NewStrBufDupAppendFlush(&ri->author_email, CData, NULL, 0);
466                 StrBufTrim(ri->author_email);
467         }
468 }
469
470 void RSS_item_creator_end(StrBuf *CData,
471                           rss_item *ri,
472                           rss_aggregator *Cfg,
473                           const char** Attr)
474 {
475         if ((StrLength(CData) > 0) &&
476             (StrLength(ri->author_or_creator) == 0))
477         {
478                 NewStrBufDupAppendFlush(&ri->author_or_creator, CData, NULL, 0);
479                 StrBufTrim(ri->author_or_creator);
480         }
481 }
482
483
484 void ATOM_item_uri_end(StrBuf *CData,
485                        rss_item *ri,
486                        rss_aggregator *Cfg,
487                        const char** Attr)
488 {
489         if (StrLength(CData) > 0) {
490                 NewStrBufDupAppendFlush(&ri->author_url, CData, NULL, 0);
491                 StrBufTrim(ri->author_url);
492         }
493 }
494
495 void RSS_item_item_end(StrBuf *CData,
496                        rss_item *ri,
497                        rss_aggregator *Cfg,
498                        const char** Attr)
499 {
500         --ri->item_tag_nesting;
501         rss_save_item(ri, Cfg);
502 }
503
504
505 void ATOM_item_entry_end(StrBuf *CData,
506                          rss_item *ri,
507                          rss_aggregator *Cfg,
508                          const char** Attr)
509 {
510         --ri->item_tag_nesting;
511         rss_save_item(ri, Cfg);
512 }
513
514 void RSS_item_rss_end(StrBuf *CData,
515                       rss_item *ri,
516                       rss_aggregator *Cfg,
517                       const char** Attr)
518 {
519 //              syslog(LOG_DEBUG, "End of feed detected.  Closing parser.\n");
520         ri->done_parsing = 1;
521 }
522
523 void RSS_item_rdf_end(StrBuf *CData,
524                       rss_item *ri,
525                       rss_aggregator *Cfg,
526                       const char** Attr)
527 {
528 //              syslog(LOG_DEBUG, "End of feed detected.  Closing parser.\n");
529         ri->done_parsing = 1;
530 }
531
532
533 void RSSATOM_item_ignore(StrBuf *CData,
534                          rss_item *ri,
535                          rss_aggregator *Cfg,
536                          const char** Attr)
537 {
538 }
539
540
541
542 /*
543  * This callback stores up the data which appears in between tags.
544  */
545 void rss_xml_cdata_start(void *data)
546 {
547         rss_aggregator *RSSAggr = (rss_aggregator*) data;
548
549         FlushStrBuf(RSSAggr->CData);
550 }
551
/*
 * CDATA-section end callback: nothing to do.
 */
void rss_xml_cdata_end(void *data)
{
	/* intentionally empty */
}
555 void rss_xml_chardata(void *data, const XML_Char *s, int len)
556 {
557         rss_aggregator *RSSAggr = (rss_aggregator*) data;
558
559         StrBufAppendBufPlain (RSSAggr->CData, s, len, 0);
560 }
561
562
563 /******************************************************************************
564  *                            RSS parser logic                                *
565  ******************************************************************************/
566
/*
 * NOTE(review): defined elsewhere and not referenced in the part of
 * the file reviewed here -- confirm what it protects at its definition.
 */
extern pthread_mutex_t RSSQueueMutex;

/* rss_xml_start() looks up the lowercased element name here and gets an rss_xml_handler. */
HashList *StartHandlers = NULL;
/* rss_xml_end() looks up the lowercased element name here and gets an rss_xml_handler. */
HashList *EndHandlers = NULL;
/* Namespace prefixes accepted by rss_xml_start()/rss_xml_end(); elements with any other prefix are skipped. */
HashList *KnownNameSpaces = NULL;
572
573 void FreeNetworkSaveMessage (void *vMsg)
574 {
575         networker_save_message *Msg = (networker_save_message *) vMsg;
576
577         CtdlFreeMessageContents(&Msg->Msg);
578         FreeStrBuf(&Msg->Message);
579         FreeStrBuf(&Msg->MsgGUID);
580         free(Msg);
581 }
582
583
584 void AppendLink(StrBuf *Message,
585                 StrBuf *link,
586                 StrBuf *LinkTitle,
587                 const char *Title)
588 {
589         if (StrLength(link) > 0)
590         {
591                 StrBufAppendBufPlain(Message, HKEY("<a href=\""), 0);
592                 StrBufAppendBuf(Message, link, 0);
593                 StrBufAppendBufPlain(Message, HKEY("\">"), 0);
594                 if (StrLength(LinkTitle) > 0)
595                         StrBufAppendBuf(Message, LinkTitle, 0);
596                 else if ((Title != NULL) && !IsEmptyStr(Title))
597                         StrBufAppendBufPlain(Message, Title, -1, 0);
598                 else
599                         StrBufAppendBuf(Message, link, 0);
600                 StrBufAppendBufPlain(Message, HKEY("</a><br>\n"), 0);
601         }
602 }
603
604 /*
605  * Commit a fetched and parsed RSS item to disk
606  */
607 void rss_save_item(rss_item *ri, rss_aggregator *Cfg)
608 {
609         networker_save_message *SaveMsg;
610         struct MD5Context md5context;
611         u_char rawdigest[MD5_DIGEST_LEN];
612         int msglen = 0;
613         StrBuf *Message;
614         StrBuf *guid;
615         AsyncIO *IO = &Cfg->IO;
616         int n;
617
618
619         SaveMsg = (networker_save_message *) malloc(
620                 sizeof(networker_save_message));
621         memset(SaveMsg, 0, sizeof(networker_save_message));
622
623         /* Construct a GUID to use in the S_USETABLE table.
624          * If one is not present in the item itself, make one up.
625          */
626         if (ri->guid != NULL) {
627                 StrBufSpaceToBlank(ri->guid);
628                 StrBufTrim(ri->guid);
629                 guid = NewStrBufPlain(HKEY("rss/"));
630                 StrBufAppendBuf(guid, ri->guid, 0);
631         }
632         else {
633                 MD5Init(&md5context);
634                 if (ri->title != NULL) {
635                         MD5Update(&md5context,
636                                   (const unsigned char*)SKEY(ri->title));
637                 }
638                 if (ri->link != NULL) {
639                         MD5Update(&md5context,
640                                   (const unsigned char*)SKEY(ri->link));
641                 }
642                 MD5Final(rawdigest, &md5context);
643                 guid = NewStrBufPlain(NULL,
644                                       MD5_DIGEST_LEN * 2 + 12 /* _rss2ctdl*/);
645                 StrBufHexEscAppend(guid, NULL, rawdigest, MD5_DIGEST_LEN);
646                 StrBufAppendBufPlain(guid, HKEY("_rss2ctdl"), 0);
647         }
648
649         /* translate Item into message. */
650         EVM_syslog(LOG_DEBUG, "RSS: translating item...\n");
651         if (ri->description == NULL) ri->description = NewStrBufPlain(HKEY(""));
652         StrBufSpaceToBlank(ri->description);
653         SaveMsg->Msg.cm_magic = CTDLMESSAGE_MAGIC;
654         SaveMsg->Msg.cm_anon_type = MES_NORMAL;
655         SaveMsg->Msg.cm_format_type = FMT_RFC822;
656
657         if (ri->guid != NULL) {
658                 SaveMsg->Msg.cm_fields['E'] = strdup(ChrPtr(ri->guid));
659         }
660
661         if (ri->author_or_creator != NULL) {
662                 char *From;
663                 StrBuf *Encoded = NULL;
664                 int FromAt;
665
666                 From = html_to_ascii(ChrPtr(ri->author_or_creator),
667                                      StrLength(ri->author_or_creator),
668                                      512, 0);
669                 StrBufPlain(ri->author_or_creator, From, -1);
670                 StrBufTrim(ri->author_or_creator);
671                 free(From);
672
673                 FromAt = strchr(ChrPtr(ri->author_or_creator), '@') != NULL;
674                 if (!FromAt && StrLength (ri->author_email) > 0)
675                 {
676                         StrBufRFC2047encode(&Encoded, ri->author_or_creator);
677                         SaveMsg->Msg.cm_fields['A'] = SmashStrBuf(&Encoded);
678                         SaveMsg->Msg.cm_fields['P'] =
679                                 SmashStrBuf(&ri->author_email);
680                 }
681                 else
682                 {
683                         if (FromAt)
684                         {
685                                 SaveMsg->Msg.cm_fields['A'] =
686                                         SmashStrBuf(&ri->author_or_creator);
687                                 SaveMsg->Msg.cm_fields['P'] =
688                                         strdup(SaveMsg->Msg.cm_fields['A']);
689                         }
690                         else
691                         {
692                                 StrBufRFC2047encode(&Encoded,
693                                                     ri->author_or_creator);
694                                 SaveMsg->Msg.cm_fields['A'] =
695                                         SmashStrBuf(&Encoded);
696                                 SaveMsg->Msg.cm_fields['P'] =
697                                         strdup("rss@localhost");
698
699                         }
700                         if (ri->pubdate <= 0) {
701                                 ri->pubdate = time(NULL);
702                         }
703                 }
704         }
705         else {
706                 SaveMsg->Msg.cm_fields['A'] = strdup("rss");
707         }
708
709         SaveMsg->Msg.cm_fields['N'] = strdup(NODENAME);
710         if (ri->title != NULL) {
711                 long len;
712                 char *Sbj;
713                 StrBuf *Encoded, *QPEncoded;
714
715                 QPEncoded = NULL;
716                 StrBufSpaceToBlank(ri->title);
717                 len = StrLength(ri->title);
718                 Sbj = html_to_ascii(ChrPtr(ri->title), len, 512, 0);
719                 len = strlen(Sbj);
720                 if (Sbj[len - 1] == '\n')
721                 {
722                         len --;
723                         Sbj[len] = '\0';
724                 }
725                 Encoded = NewStrBufPlain(Sbj, len);
726                 free(Sbj);
727
728                 StrBufTrim(Encoded);
729                 StrBufRFC2047encode(&QPEncoded, Encoded);
730
731                 SaveMsg->Msg.cm_fields['U'] = SmashStrBuf(&QPEncoded);
732                 FreeStrBuf(&Encoded);
733         }
734         SaveMsg->Msg.cm_fields['T'] = malloc(64);
735         snprintf(SaveMsg->Msg.cm_fields['T'], 64, "%ld", ri->pubdate);
736         if (ri->channel_title != NULL) {
737                 if (StrLength(ri->channel_title) > 0) {
738                         SaveMsg->Msg.cm_fields['O'] =
739                                 strdup(ChrPtr(ri->channel_title));
740                 }
741         }
742         if (ri->link == NULL)
743                 ri->link = NewStrBufPlain(HKEY(""));
744
745 #if 0 /* temporarily disable shorter urls. */
746         SaveMsg->Msg.cm_fields[TMP_SHORTER_URLS] =
747                 GetShorterUrls(ri->description);
748 #endif
749
750         msglen += 1024 + StrLength(ri->link) + StrLength(ri->description) ;
751
752         Message = NewStrBufPlain(NULL, StrLength(ri->description));
753
754         StrBufPlain(Message, HKEY(
755                             "Content-type: text/html; charset=\"UTF-8\"\r\n\r\n"
756                             "<html><body>\n"));
757 #if 0 /* disable shorter url for now. */
758         SaveMsg->Msg.cm_fields[TMP_SHORTER_URL_OFFSET] = StrLength(Message);
759 #endif
760         StrBufAppendBuf(Message, ri->description, 0);
761         StrBufAppendBufPlain(Message, HKEY("<br><br>\n"), 0);
762
763         AppendLink(Message, ri->link, ri->linkTitle, NULL);
764         AppendLink(Message, ri->reLink, ri->reLinkTitle, "Reply to this");
765         StrBufAppendBufPlain(Message, HKEY("</body></html>\n"), 0);
766
767         SaveMsg->MsgGUID = guid;
768         SaveMsg->Message = Message;
769
770         n = GetCount(Cfg->Messages) + 1;
771         Put(Cfg->Messages, IKEY(n), SaveMsg, FreeNetworkSaveMessage);
772 }
773
774
775 void rss_xml_start(void *data, const char *supplied_el, const char **attr)
776 {
777         rss_xml_handler *h;
778         rss_aggregator  *RSSAggr = (rss_aggregator*) data;
779         rss_item        *ri = RSSAggr->Item;
780         void            *pv;
781         const char      *pel;
782         char            *sep = NULL;
783
784         /* Axe the namespace, we don't care about it */
785         /*
786           syslog(LOG_DEBUG,
787           "RSS: supplied el %d: %s\n", RSSAggr->Cfg->ItemType, supplied_el);
788         */
789         pel = supplied_el;
790         while (sep = strchr(pel, ':'), sep) {
791                 pel = sep + 1;
792         }
793
794         if (pel != supplied_el)
795         {
796                 void *v;
797
798                 if (!GetHash(KnownNameSpaces,
799                              supplied_el,
800                              pel - supplied_el - 1,
801                              &v))
802                 {
803 #ifdef DEBUG_RSS
804                         syslog(LOG_DEBUG,
805                                "RSS: START ignoring "
806                                "because of wrong namespace [%s]\n",
807                                supplied_el);
808 #endif
809                         return;
810                 }
811         }
812
813         StrBufPlain(RSSAggr->Key, pel, -1);
814         StrBufLowerCase(RSSAggr->Key);
815         if (GetHash(StartHandlers, SKEY(RSSAggr->Key), &pv))
816         {
817                 h = (rss_xml_handler*) pv;
818
819                 if (((h->Flags & RSS_UNSET) != 0) &&
820                     (RSSAggr->ItemType == RSS_UNSET))
821                 {
822                         h->Handler(RSSAggr->CData, ri, RSSAggr, attr);
823                 }
824                 else if (((h->Flags & RSS_RSS) != 0) &&
825                     (RSSAggr->ItemType == RSS_RSS))
826                 {
827                         h->Handler(RSSAggr->CData, ri, RSSAggr, attr);
828                 }
829                 else if (((h->Flags & RSS_ATOM) != 0) &&
830                          (RSSAggr->ItemType == RSS_ATOM))
831                 {
832                         h->Handler(RSSAggr->CData,
833                                    ri,
834                                    RSSAggr,
835                                    attr);
836                 }
837 #ifdef DEBUG_RSS
838                 else
839                         syslog(LOG_DEBUG,
840                                "RSS: START unhandled: [%s] [%s]...\n",
841                                pel,
842                                supplied_el);
843 #endif
844         }
845 #ifdef DEBUG_RSS
846         else
847                 syslog(LOG_DEBUG,
848                        "RSS: START unhandled: [%s] [%s]...\n",
849                        pel,
850                        supplied_el);
851 #endif
852 }
853
854 void rss_xml_end(void *data, const char *supplied_el)
855 {
856         rss_xml_handler *h;
857         rss_aggregator  *RSSAggr = (rss_aggregator*) data;
858         rss_item        *ri = RSSAggr->Item;
859         const char      *pel;
860         char            *sep = NULL;
861         void            *pv;
862
863         /* Axe the namespace, we don't care about it */
864         pel = supplied_el;
865         while (sep = strchr(pel, ':'), sep) {
866                 pel = sep + 1;
867         }
868 //      syslog(LOG_DEBUG, "RSS: END %s...\n", el);
869         if (pel != supplied_el)
870         {
871                 void *v;
872
873                 if (!GetHash(KnownNameSpaces,
874                              supplied_el,
875                              pel - supplied_el - 1,
876                              &v))
877                 {
878 #ifdef DEBUG_RSS
879                         syslog(LOG_DEBUG,
880                                "RSS: END ignoring because of wrong namespace"
881                                "[%s] = [%s]\n",
882                                supplied_el,
883                                ChrPtr(RSSAggr->CData));
884 #endif
885                         FlushStrBuf(RSSAggr->CData);
886                         return;
887                 }
888         }
889
890         StrBufPlain(RSSAggr->Key, pel, -1);
891         StrBufLowerCase(RSSAggr->Key);
892         if (GetHash(EndHandlers, SKEY(RSSAggr->Key), &pv))
893         {
894                 h = (rss_xml_handler*) pv;
895
896                 if (((h->Flags & RSS_UNSET) != 0) &&
897                     (RSSAggr->ItemType == RSS_UNSET))
898                 {
899                         h->Handler(RSSAggr->CData, ri, RSSAggr, NULL);
900                 }
901                 else if (((h->Flags & RSS_RSS) != 0) &&
902                     (RSSAggr->ItemType == RSS_RSS))
903                 {
904                         h->Handler(RSSAggr->CData, ri, RSSAggr, NULL);
905                 }
906                 else if (((h->Flags & RSS_ATOM) != 0) &&
907                          (RSSAggr->ItemType == RSS_ATOM))
908                 {
909                         h->Handler(RSSAggr->CData, ri, RSSAggr, NULL);
910                 }
911 #ifdef DEBUG_RSS
912                 else
913                         syslog(LOG_DEBUG,
914                                "RSS: END   unhandled: [%s]  [%s] = [%s]...\n",
915                                pel,
916                                supplied_el,
917                                ChrPtr(RSSAggr->CData));
918 #endif
919         }
920 #ifdef DEBUG_RSS
921         else
922                 syslog(LOG_DEBUG,
923                        "RSS: END   unhandled: [%s]  [%s] = [%s]...\n",
924                        pel,
925                        supplied_el,
926                        ChrPtr(RSSAggr->CData));
927 #endif
928         FlushStrBuf(RSSAggr->CData);
929 }
930
931 /*
 * Callback function for passing libcurl's output to expat for parsing.
 * Disabled: we don't do streamed parsing, so that expat can handle
 * non-UTF-8 documents.
934 size_t rss_libcurl_callback(void *ptr, size_t size, size_t nmemb, void *stream)
935 {
936         XML_Parse((XML_Parser)stream, ptr, (size * nmemb), 0);
937         return (size*nmemb);
938 }
939  */
940
941 eNextState RSSAggregator_ParseReply(AsyncIO *IO)
942 {
943         StrBuf *Buf;
944         rss_aggregator *RSSAggr;
945         rss_item *ri;
946         const char *at;
947         char *ptr;
948         long len;
949         const char *Key;
950
951
952         if (IO->HttpReq.httpcode != 200)
953         {
954
955                 EV_syslog(LOG_DEBUG, "need a 200, got a %ld !\n",
956                           IO->HttpReq.httpcode);
957 // TODO: aide error message with rate limit
958                 return eAbort;
959         }
960
961         RSSAggr = IO->Data;
962         ri = RSSAggr->Item;
963         RSSAggr->CData = NewStrBufPlain(NULL, SIZ);
964         RSSAggr->Key = NewStrBuf();
965         at = NULL;
966         StrBufSipLine(RSSAggr->Key, IO->HttpReq.ReplyData, &at);
967         ptr = NULL;
968
969 #define encoding "encoding=\""
970         ptr = strstr(ChrPtr(RSSAggr->Key), encoding);
971         if (ptr != NULL)
972         {
973                 char *pche;
974
975                 ptr += sizeof (encoding) - 1;
976                 pche = strchr(ptr, '"');
977                 if (pche != NULL)
978                         StrBufCutAt(RSSAggr->Key, -1, pche);
979                 else
980                         ptr = "UTF-8";
981         }
982         else
983                 ptr = "UTF-8";
984
985         syslog(LOG_DEBUG, "RSS: Now parsing [%s] \n", ChrPtr(RSSAggr->Url));
986
987         RSSAggr->xp = XML_ParserCreateNS(ptr, ':');
988         if (!RSSAggr->xp) {
989                 syslog(LOG_DEBUG, "Cannot create XML parser!\n");
990                 return eAbort;
991         }
992         FlushStrBuf(RSSAggr->Key);
993
994         RSSAggr->Messages = NewHash(1, Flathash);
995         XML_SetElementHandler(RSSAggr->xp, rss_xml_start, rss_xml_end);
996         XML_SetCharacterDataHandler(RSSAggr->xp, rss_xml_chardata);
997         XML_SetUserData(RSSAggr->xp, RSSAggr);
998         XML_SetCdataSectionHandler(RSSAggr->xp,
999                                    rss_xml_cdata_start,
1000                                    rss_xml_cdata_end);
1001
1002
1003         len = StrLength(IO->HttpReq.ReplyData);
1004         ptr = SmashStrBuf(&IO->HttpReq.ReplyData);
1005         XML_Parse(RSSAggr->xp, ptr, len, 0);
1006         free (ptr);
1007         if (ri->done_parsing == 0)
1008                 XML_Parse(RSSAggr->xp, "", 0, 1);
1009
1010
1011         syslog(LOG_DEBUG, "RSS: XML Status [%s] \n",
1012                XML_ErrorString(XML_GetErrorCode(RSSAggr->xp)));
1013
1014         XML_ParserFree(RSSAggr->xp);
1015         flush_rss_item(ri);
1016
1017         Buf = NewStrBufDup(RSSAggr->rooms);
1018         RSSAggr->recp.recp_room = SmashStrBuf(&Buf);
1019         RSSAggr->recp.num_room = RSSAggr->roomlist_parts;
1020         RSSAggr->recp.recptypes_magic = RECPTYPES_MAGIC;
1021
1022         RSSAggr->Pos = GetNewHashPos(RSSAggr->Messages, 1);
1023
1024 //Cfg->next_poll = time(NULL) + config.c_net_freq;
1025         if (GetNextHashPos(RSSAggr->Messages,
1026                            RSSAggr->Pos,
1027                            &len,
1028                            &Key,
1029                            (void**) &RSSAggr->ThisMsg))
1030                 return QueueDBOperation(IO, RSS_FetchNetworkUsetableEntry);
1031         else
1032                 return eAbort;
1033 }
1034
1035
1036 /******************************************************************************
1037  *                    RSS handler registering logic                           *
1038  ******************************************************************************/
1039
1040 void AddRSSStartHandler(rss_handler_func Handler,
1041                         int Flags,
1042                         const char *key,
1043                         long len)
1044 {
1045         rss_xml_handler *h;
1046         h = (rss_xml_handler*) malloc(sizeof (rss_xml_handler));
1047         h->Flags = Flags;
1048         h->Handler = Handler;
1049         Put(StartHandlers, key, len, h, NULL);
1050 }
1051
1052 void AddRSSEndHandler(rss_handler_func Handler,
1053                       int Flags,
1054                       const char *key,
1055                       long len)
1056 {
1057         rss_xml_handler *h;
1058         h = (rss_xml_handler*) malloc(sizeof (rss_xml_handler));
1059         h->Flags = Flags;
1060         h->Handler = Handler;
1061         Put(EndHandlers, key, len, h, NULL);
1062 }
1063
1064 void rss_parser_cleanup(void)
1065 {
1066         DeleteHash(&StartHandlers);
1067         DeleteHash(&EndHandlers);
1068         DeleteHash(&KnownNameSpaces);
1069 }
1070
1071
1072 CTDL_MODULE_INIT(rssparser)
1073 {
1074         if (!threading)
1075         {
1076                 StartHandlers = NewHash(1, NULL);
1077                 EndHandlers = NewHash(1, NULL);
1078
1079                 AddRSSStartHandler(RSS_item_rss_start,     RSS_UNSET, HKEY("rss"));
1080                 AddRSSStartHandler(RSS_item_rdf_start,     RSS_UNSET, HKEY("rdf"));
1081                 AddRSSStartHandler(ATOM_item_feed_start,   RSS_UNSET, HKEY("feed"));
1082                 AddRSSStartHandler(RSS_item_item_start,    RSS_RSS, HKEY("item"));
1083                 AddRSSStartHandler(ATOM_item_entry_start,  RSS_ATOM, HKEY("entry"));
1084                 AddRSSStartHandler(ATOM_item_link_start,   RSS_ATOM, HKEY("link"));
1085
1086                 AddRSSEndHandler(ATOMRSS_item_title_end,   RSS_ATOM|RSS_RSS|RSS_REQUIRE_BUF, HKEY("title"));
1087                 AddRSSEndHandler(RSS_item_guid_end,        RSS_RSS|RSS_REQUIRE_BUF, HKEY("guid"));
1088                 AddRSSEndHandler(ATOM_item_id_end,         RSS_ATOM|RSS_REQUIRE_BUF, HKEY("id"));
1089                 AddRSSEndHandler(RSS_item_link_end,        RSS_RSS|RSS_REQUIRE_BUF, HKEY("link"));
1090 #if 0
1091 // hm, rss to the comments of that blog, might be interesting in future, but...
1092                 AddRSSEndHandler(RSS_item_relink_end,      RSS_RSS|RSS_REQUIRE_BUF, HKEY("commentrss"));
1093 // comment count...
1094                 AddRSSEndHandler(RSS_item_relink_end,      RSS_RSS|RSS_REQUIRE_BUF, HKEY("comments"));
1095 #endif
1096                 AddRSSEndHandler(RSSATOM_item_title_end,   RSS_ATOM|RSS_RSS|RSS_REQUIRE_BUF, HKEY("title"));
1097                 AddRSSEndHandler(ATOM_item_content_end,    RSS_ATOM|RSS_REQUIRE_BUF, HKEY("content"));
1098                 AddRSSEndHandler(RSS_item_description_end, RSS_RSS|RSS_ATOM|RSS_REQUIRE_BUF, HKEY("encoded"));
1099                 AddRSSEndHandler(ATOM_item_summary_end,    RSS_ATOM|RSS_REQUIRE_BUF, HKEY("summary"));
1100                 AddRSSEndHandler(RSS_item_description_end, RSS_RSS|RSS_REQUIRE_BUF, HKEY("description"));
1101                 AddRSSEndHandler(ATOM_item_published_end,  RSS_ATOM|RSS_REQUIRE_BUF, HKEY("published"));
1102                 AddRSSEndHandler(ATOM_item_updated_end,    RSS_ATOM|RSS_REQUIRE_BUF, HKEY("updated"));
1103                 AddRSSEndHandler(RSS_item_pubdate_end,     RSS_RSS|RSS_REQUIRE_BUF, HKEY("pubdate"));
1104                 AddRSSEndHandler(RSS_item_date_end,        RSS_RSS|RSS_REQUIRE_BUF, HKEY("date"));
1105                 AddRSSEndHandler(RSS_item_author_end,      RSS_RSS|RSS_REQUIRE_BUF, HKEY("author"));
1106                 AddRSSEndHandler(RSS_item_creator_end,     RSS_RSS|RSS_REQUIRE_BUF, HKEY("creator"));
1107 /* <author> */
1108                 AddRSSEndHandler(ATOM_item_email_end,      RSS_ATOM|RSS_REQUIRE_BUF, HKEY("email"));
1109                 AddRSSEndHandler(ATOM_item_name_end,       RSS_ATOM|RSS_REQUIRE_BUF, HKEY("name"));
1110                 AddRSSEndHandler(ATOM_item_uri_end,        RSS_ATOM|RSS_REQUIRE_BUF, HKEY("uri"));
1111 /* </author> */
1112                 AddRSSEndHandler(RSS_item_item_end,        RSS_RSS, HKEY("item"));
1113                 AddRSSEndHandler(RSS_item_rss_end,         RSS_RSS, HKEY("rss"));
1114                 AddRSSEndHandler(RSS_item_rdf_end,         RSS_RSS, HKEY("rdf"));
1115                 AddRSSEndHandler(ATOM_item_entry_end,      RSS_ATOM, HKEY("entry"));
1116
1117
1118 /* at the start of atoms: <seq> <li>link to resource</li></seq> ignore them. */
1119                 AddRSSStartHandler(RSSATOM_item_ignore,      RSS_RSS|RSS_ATOM, HKEY("seq"));
1120                 AddRSSEndHandler  (RSSATOM_item_ignore,      RSS_RSS|RSS_ATOM, HKEY("seq"));
1121                 AddRSSStartHandler(RSSATOM_item_ignore,      RSS_RSS|RSS_ATOM, HKEY("li"));
1122                 AddRSSEndHandler  (RSSATOM_item_ignore,      RSS_RSS|RSS_ATOM, HKEY("li"));
1123
1124 /* links to other feed generators... */
1125                 AddRSSStartHandler(RSSATOM_item_ignore,      RSS_RSS|RSS_ATOM, HKEY("feedflare"));
1126                 AddRSSEndHandler  (RSSATOM_item_ignore,      RSS_RSS|RSS_ATOM, HKEY("feedflare"));
1127                 AddRSSStartHandler(RSSATOM_item_ignore,      RSS_RSS|RSS_ATOM, HKEY("browserfriendly"));
1128                 AddRSSEndHandler  (RSSATOM_item_ignore,      RSS_RSS|RSS_ATOM, HKEY("browserfriendly"));
1129
1130                 KnownNameSpaces = NewHash(1, NULL);
1131                 Put(KnownNameSpaces, HKEY("http://a9.com/-/spec/opensearch/1.1/"), NULL, reference_free_handler);
1132                 Put(KnownNameSpaces, HKEY("http://a9.com/-/spec/opensearchrss/1.0/"), NULL, reference_free_handler);
1133                 Put(KnownNameSpaces, HKEY("http://backend.userland.com/creativeCommonsRssModule"), NULL, reference_free_handler);
1134                 Put(KnownNameSpaces, HKEY("http://purl.org/atom/ns#"), NULL, reference_free_handler);
1135                 Put(KnownNameSpaces, HKEY("http://purl.org/dc/elements/1.1/"), NULL, reference_free_handler);
1136                 Put(KnownNameSpaces, HKEY("http://purl.org/rss/1.0/"), NULL, reference_free_handler);
1137                 Put(KnownNameSpaces, HKEY("http://purl.org/rss/1.0/modules/content/"), NULL, reference_free_handler);
1138                 Put(KnownNameSpaces, HKEY("http://purl.org/rss/1.0/modules/slash/"), NULL, reference_free_handler);
1139                 Put(KnownNameSpaces, HKEY("http://purl.org/rss/1.0/modules/syndication/"), NULL, reference_free_handler);
1140                 Put(KnownNameSpaces, HKEY("http://purl.org/rss/1.0/"), NULL, reference_free_handler);
1141                 Put(KnownNameSpaces, HKEY("http://purl.org/syndication/thread/1.0"), NULL, reference_free_handler);
1142                 Put(KnownNameSpaces, HKEY("http://rssnamespace.org/feedburner/ext/1.0"), NULL, reference_free_handler);
1143                 Put(KnownNameSpaces, HKEY("http://schemas.google.com/g/2005"), NULL, reference_free_handler);
1144                 Put(KnownNameSpaces, HKEY("http://webns.net/mvcb/"), NULL, reference_free_handler);
1145                 Put(KnownNameSpaces, HKEY("http://web.resource.org/cc/"), NULL, reference_free_handler);
1146                 Put(KnownNameSpaces, HKEY("http://wellformedweb.org/CommentAPI/"), NULL, reference_free_handler);
1147                 Put(KnownNameSpaces, HKEY("http://www.georss.org/georss"), NULL, reference_free_handler);
1148                 Put(KnownNameSpaces, HKEY("http://www.w3.org/1999/xhtml"), NULL, reference_free_handler);
1149                 Put(KnownNameSpaces, HKEY("http://www.w3.org/1999/02/22-rdf-syntax-ns#"), NULL, reference_free_handler);
1150                 Put(KnownNameSpaces, HKEY("http://www.w3.org/1999/02/22-rdf-syntax-ns#"), NULL, reference_free_handler);
1151                 Put(KnownNameSpaces, HKEY("http://www.w3.org/2003/01/geo/wgs84_pos#"), NULL, reference_free_handler);
1152                 Put(KnownNameSpaces, HKEY("http://www.w3.org/2005/Atom"), NULL, reference_free_handler);
1153                 Put(KnownNameSpaces, HKEY("urn:flickr:"), NULL, reference_free_handler);
1154 #if 0
1155                 /* we don't like these namespaces because of they shadow our usefull parameters. */
1156                 Put(KnownNameSpaces, HKEY("http://search.yahoo.com/mrss/"), NULL, reference_free_handler);
1157 #endif
1158                 CtdlRegisterCleanupHook(rss_parser_cleanup);
1159         }
1160         return "rssparser";
1161 }