X-Git-Url: https://code.citadel.org/?a=blobdiff_plain;f=citadel%2Fmodules%2Frssclient%2Frss_atom_parser.c;h=7bdb94bafe59969ae87d70cb644d82e7e7f504a9;hb=58f686487cf5f14d5da5357c67f2e6624dbde027;hp=9bbeb8a1ffe26b4d5749f5f3a5fa9b52663eab67;hpb=f6877197c739711b051826923760edec66006376;p=citadel.git diff --git a/citadel/modules/rssclient/rss_atom_parser.c b/citadel/modules/rssclient/rss_atom_parser.c index 9bbeb8a1f..7bdb94baf 100644 --- a/citadel/modules/rssclient/rss_atom_parser.c +++ b/citadel/modules/rssclient/rss_atom_parser.c @@ -1,21 +1,15 @@ /* * Bring external RSS feeds into rooms. * - * Copyright (c) 2007-2010 by the citadel.org team - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 3 of the License, or - * (at your option) any later version. + * Copyright (c) 2007-2015 by the citadel.org team * + * This program is open source software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 3. + * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ #include @@ -58,29 +52,11 @@ #include "event_client.h" #include "rss_atom_parser.h" -extern pthread_mutex_t RSSQueueMutex; - -HashList *StartHandlers = NULL; -HashList *EndHandlers = NULL; -HashList *KnownNameSpaces = NULL; -void AddRSSStartHandler(rss_handler_func Handler, int Flags, const char *key, long len) -{ - rss_xml_handler *h; - h = (rss_xml_handler*) malloc(sizeof (rss_xml_handler)); - h->Flags = Flags; - h->Handler = Handler; - Put(StartHandlers, key, len, h, NULL); -} -void AddRSSEndHandler(rss_handler_func Handler, int Flags, const char *key, long len) -{ - rss_xml_handler *h; - h = (rss_xml_handler*) malloc(sizeof (rss_xml_handler)); - h->Flags = Flags; - h->Handler = Handler; - Put(EndHandlers, key, len, h, NULL); -} +void rss_remember_item(rss_item *ri, rss_aggregator *Cfg); +int RSSAtomParserDebugEnabled = 0; +#define N ((rss_aggregator*)IO->Data)->Cfg.QRnumber /* * Convert an RDF/RSS datestamp into a time_t @@ -132,177 +108,70 @@ void flush_rss_item(rss_item *ri) FreeStrBuf(&ri->author_email); FreeStrBuf(&ri->author_url); FreeStrBuf(&ri->description); -} - -void rss_xml_start(void *data, const char *supplied_el, const char **attr) -{ - rss_xml_handler *h; - rss_aggregator *rssc = (rss_aggregator*) data; - rss_item *ri = rssc->Item; - void *pv; - const char *pel; - char *sep = NULL; - - /* Axe the namespace, we don't care about it */ -/// CtdlLogPrintf(0, "RSS: supplied el %d: %s...\n", rssc->Cfg->ItemType, supplied_el); - pel = supplied_el; - while (sep = strchr(pel, ':'), sep) { - pel = sep + 1; - } - - if (pel != supplied_el) - { - void *v; - - if (!GetHash(KnownNameSpaces, - supplied_el, - pel - supplied_el - 1, - &v)) - { -#ifdef DEBUG_RSS - CtdlLogPrintf(0, "RSS: START ignoring because of wrong namespace [%s] = [%s]\n", - supplied_el); -#endif - return; - } - } - - StrBufPlain(rssc->Key, pel, -1); - StrBufLowerCase(rssc->Key); - if (GetHash(StartHandlers, SKEY(rssc->Key), &pv)) - { - rssc->Current = h = (rss_xml_handler*) pv; - - if (((h->Flags & RSS_UNSET) != 0) && - (rssc->ItemType == RSS_UNSET)) - { - h->Handler(rssc->CData, ri, rssc, attr); - } - else if (((h->Flags & RSS_RSS) != 0) && - (rssc->ItemType == RSS_RSS)) - { - h->Handler(rssc->CData, ri, rssc, attr); - } - else if (((h->Flags & RSS_ATOM) != 0) && - (rssc->ItemType == RSS_ATOM)) - { - h->Handler(rssc->CData, ri, rssc, attr); - } -#ifdef DEBUG_RSS - else - CtdlLogPrintf(0, "RSS: START unhandled: [%s] [%s]...\n", pel, supplied_el); -#endif - } -#ifdef DEBUG_RSS - else - CtdlLogPrintf(0, "RSS: START unhandled: [%s] [%s]...\n", pel, supplied_el); -#endif -} - -void rss_xml_end(void *data, const char *supplied_el) -{ - rss_xml_handler *h; - rss_aggregator *rssc = (rss_aggregator*) data; - rss_item *ri = rssc->Item; - const char *pel; - char *sep = NULL; - void *pv; - /* Axe the namespace, we don't care about it */ - pel = supplied_el; - while (sep = strchr(pel, ':'), sep) { - pel = sep + 1; - } -// CtdlLogPrintf(0, "RSS: END %s...\n", el); - if (pel != supplied_el) - { - void *v; - - if (!GetHash(KnownNameSpaces, - supplied_el, - pel - supplied_el - 1, - &v)) - { -#ifdef DEBUG_RSS - CtdlLogPrintf(0, "RSS: END ignoring because of wrong namespace [%s] = [%s]\n", - supplied_el, ChrPtr(rssc->CData)); -#endif - FlushStrBuf(rssc->CData); - return; - } - } - - StrBufPlain(rssc->Key, pel, -1); - StrBufLowerCase(rssc->Key); - if (GetHash(EndHandlers, SKEY(rssc->Key), &pv)) - { - h = (rss_xml_handler*) pv; - - if (((h->Flags & RSS_UNSET) != 0) && - (rssc->ItemType == RSS_UNSET)) - { - h->Handler(rssc->CData, ri, rssc, NULL); - } - else if (((h->Flags & RSS_RSS) != 0) && - (rssc->ItemType == RSS_RSS)) - { - h->Handler(rssc->CData, ri, rssc, NULL); - } - else if (((h->Flags & RSS_ATOM) != 0) && - (rssc->ItemType == RSS_ATOM)) - { - h->Handler(rssc->CData, ri, rssc, NULL); - } -#ifdef DEBUG_RSS - else - CtdlLogPrintf(0, "RSS: END unhandled: [%s] [%s] = [%s]...\n", pel, supplied_el, ChrPtr(rssc->CData)); -#endif - } -#ifdef DEBUG_RSS - else - CtdlLogPrintf(0, "RSS: END unhandled: [%s] [%s] = [%s]...\n", pel, supplied_el, ChrPtr(rssc->CData)); -#endif - FlushStrBuf(rssc->CData); - rssc->Current = NULL; + FreeStrBuf(&ri->linkTitle); + FreeStrBuf(&ri->reLink); + FreeStrBuf(&ri->reLinkTitle); + FreeStrBuf(&ri->channel_title); } +/****************************************************************************** + * XML-Handler * + ******************************************************************************/ - -void RSS_item_rss_start (StrBuf *CData, rss_item *ri, rss_aggregator *Cfg, const char** Attr) +void RSS_item_rss_start (StrBuf *CData, + rss_item *ri, + rss_aggregator *RSSAggr, + const char** Attr) { - syslog(LOG_DEBUG, "RSS: This is an RSS feed.\n"); - Cfg->ItemType = RSS_RSS; + syslog(LOG_DEBUG, "RSS: This is an RSS feed."); + RSSAggr->ItemType = RSS_RSS; } -void RSS_item_rdf_start(StrBuf *CData, rss_item *ri, rss_aggregator *Cfg, const char** Attr) +void RSS_item_rdf_start(StrBuf *CData, + rss_item *ri, + rss_aggregator *RSSAggr, + const char** Attr) { - syslog(LOG_DEBUG, "RSS: This is an RDF feed.\n"); - Cfg->ItemType = RSS_RSS; + syslog(LOG_DEBUG, "RSS: This is an RDF feed."); + RSSAggr->ItemType = RSS_RSS; } -void ATOM_item_feed_start(StrBuf *CData, rss_item *ri, rss_aggregator *Cfg, const char** Attr) +void ATOM_item_feed_start(StrBuf *CData, + rss_item *ri, + rss_aggregator *RSSAggr, + const char** Attr) { - syslog(LOG_DEBUG, "RSS: This is an ATOM feed.\n"); - Cfg->ItemType = RSS_ATOM; + syslog(LOG_DEBUG, "RSS: This is an ATOM feed."); + RSSAggr->ItemType = RSS_ATOM; } -void RSS_item_item_start(StrBuf *CData, rss_item *ri, rss_aggregator *Cfg, const char** Attr) +void RSS_item_item_start(StrBuf *CData, + rss_item *ri, + rss_aggregator *RSSAggr, + const char** Attr) { ri->item_tag_nesting ++; flush_rss_item(ri); } -void ATOM_item_entry_start(StrBuf *CData, rss_item *ri, rss_aggregator *Cfg, const char** Attr) +void ATOM_item_entry_start(StrBuf *CData, + rss_item *ri, + rss_aggregator *RSSAggr, + const char** Attr) { /* Atom feed... */ ri->item_tag_nesting ++; flush_rss_item(ri); } -void ATOM_item_link_start (StrBuf *CData, rss_item *ri, rss_aggregator *Cfg, const char** Attr) +void ATOM_item_link_start (StrBuf *CData, + rss_item *ri, + rss_aggregator *RSSAggr, + const char** Attr) { int i; const char *pHref = NULL; @@ -332,20 +201,28 @@ void ATOM_item_link_start (StrBuf *CData, rss_item *ri, rss_aggregator *Cfg, con if (pHref == NULL) return; /* WHUT? Pointing... where? */ if ((pType != NULL) && !strcasecmp(pType, "application/atom+xml")) - return; /* these just point to other rss resources, we're not interested in them. */ + return; + /* these just point to other rss resources, + we're not interested in them. */ if (pRel != NULL) { if (!strcasecmp (pRel, "replies")) { NewStrBufDupAppendFlush(&ri->reLink, NULL, pHref, -1); StrBufTrim(ri->link); - NewStrBufDupAppendFlush(&ri->reLinkTitle, NULL, pTitle, -1); + NewStrBufDupAppendFlush(&ri->reLinkTitle, + NULL, + pTitle, + -1); } - else if (!strcasecmp(pRel, "alternate")) /* Alternative representation of this Item... */ - { + else if (!strcasecmp(pRel, "alternate")) + { /* Alternative representation of this Item... */ NewStrBufDupAppendFlush(&ri->link, NULL, pHref, -1); StrBufTrim(ri->link); - NewStrBufDupAppendFlush(&ri->linkTitle, NULL, pTitle, -1); + NewStrBufDupAppendFlush(&ri->linkTitle, + NULL, + pTitle, + -1); } #if 0 /* these are also defined, but dunno what to do with them.. */ @@ -356,7 +233,7 @@ void ATOM_item_link_start (StrBuf *CData, rss_item *ri, rss_aggregator *Cfg, con { } else if (!strcasecmp(pRel, "enclosure")) - {/* this reference can get big, and is probably the full article... */ + {/*...reference can get big, and is probably the full article*/ } else if (!strcasecmp(pRel, "via")) {/* this article was provided via... */ @@ -374,7 +251,10 @@ void ATOM_item_link_start (StrBuf *CData, rss_item *ri, rss_aggregator *Cfg, con -void ATOMRSS_item_title_end(StrBuf *CData, rss_item *ri, rss_aggregator *Cfg, const char** Attr) +void ATOMRSS_item_title_end(StrBuf *CData, + rss_item *ri, + rss_aggregator *RSSAggr, + const char** Attr) { if ((ri->item_tag_nesting == 0) && (StrLength(CData) > 0)) { NewStrBufDupAppendFlush(&ri->channel_title, CData, NULL, 0); @@ -382,14 +262,18 @@ void ATOMRSS_item_title_end(StrBuf *CData, rss_item *ri, rss_aggregator *Cfg, co } } -void RSS_item_guid_end(StrBuf *CData, rss_item *ri, rss_aggregator *Cfg, const char** Attr) +void RSS_item_guid_end(StrBuf *CData, + rss_item *ri, + rss_aggregator *RSSAggr, + const char** Attr) { if (StrLength(CData) > 0) { NewStrBufDupAppendFlush(&ri->guid, CData, NULL, 0); } } -void ATOM_item_id_end(StrBuf *CData, rss_item *ri, rss_aggregator *Cfg, const char** Attr) +void ATOM_item_id_end(StrBuf *CData, + rss_item *ri, rss_aggregator *RSSAggr, const char** Attr) { if (StrLength(CData) > 0) { NewStrBufDupAppendFlush(&ri->guid, CData, NULL, 0); @@ -397,14 +281,20 @@ void ATOM_item_id_end(StrBuf *CData, rss_item *ri, rss_aggregator *Cfg, const ch } -void RSS_item_link_end (StrBuf *CData, rss_item *ri, rss_aggregator *Cfg, const char** Attr) +void RSS_item_link_end (StrBuf *CData, + rss_item *ri, + rss_aggregator *RSSAggr, + const char** Attr) { if (StrLength(CData) > 0) { NewStrBufDupAppendFlush(&ri->link, CData, NULL, 0); StrBufTrim(ri->link); } } -void RSS_item_relink_end(StrBuf *CData, rss_item *ri, rss_aggregator *Cfg, const char** Attr) +void RSS_item_relink_end(StrBuf *CData, + rss_item *ri, + rss_aggregator *RSSAggr, + const char** Attr) { if (StrLength(CData) > 0) { NewStrBufDupAppendFlush(&ri->reLink, CData, NULL, 0); @@ -412,7 +302,10 @@ void RSS_item_relink_end(StrBuf *CData, rss_item *ri, rss_aggregator *Cfg, const } } -void RSSATOM_item_title_end (StrBuf *CData, rss_item *ri, rss_aggregator *Cfg, const char** Attr) +void RSSATOM_item_title_end (StrBuf *CData, + rss_item *ri, + rss_aggregator *RSSAggr, + const char** Attr) { if (StrLength(CData) > 0) { NewStrBufDupAppendFlush(&ri->title, CData, NULL, 0); @@ -420,26 +313,42 @@ void RSSATOM_item_title_end (StrBuf *CData, rss_item *ri, rss_aggregator *Cfg, c } } -void ATOM_item_content_end (StrBuf *CData, rss_item *ri, rss_aggregator *Cfg, const char** Attr) +void ATOM_item_content_end (StrBuf *CData, + rss_item *ri, + rss_aggregator *RSSAggr, + const char** Attr) { long olen = StrLength (ri->description); long clen = StrLength (CData); - if (clen > 0) + if (clen > 0) { if (olen == 0) { - NewStrBufDupAppendFlush(&ri->description, CData, NULL, 0); + NewStrBufDupAppendFlush(&ri->description, + CData, + NULL, + 0); StrBufTrim(ri->description); } else if (olen < clen) { FlushStrBuf(ri->description); - NewStrBufDupAppendFlush(&ri->description, CData, NULL, 0); + NewStrBufDupAppendFlush(&ri->description, + CData, + NULL, + 0); + StrBufTrim(ri->description); } } } -void ATOM_item_summary_end (StrBuf *CData, rss_item *ri, rss_aggregator *Cfg, const char** Attr) +void ATOM_item_summary_end (StrBuf *CData, + rss_item *ri, + rss_aggregator *RSSAggr, + const char** Attr) { - /* this can contain an abstract of the article. but we don't want to verwrite a full document if we already have it. */ + /* + * this can contain an abstract of the article. + * but we don't want to verwrite a full document if we already have it. + */ if ((StrLength(CData) > 0) && (StrLength(ri->description) == 0)) { NewStrBufDupAppendFlush(&ri->description, CData, NULL, 0); @@ -447,33 +356,48 @@ void ATOM_item_summary_end (StrBuf *CData, rss_item *ri, rss_aggregator *Cfg, co } } -void RSS_item_description_end (StrBuf *CData, rss_item *ri, rss_aggregator *Cfg, const char** Attr) +void RSS_item_description_end (StrBuf *CData, + rss_item *ri, + rss_aggregator *RSSAggr, + const char** Attr) { long olen = StrLength (ri->description); long clen = StrLength (CData); - if (clen > 0) + if (clen > 0) { if (olen == 0) { - NewStrBufDupAppendFlush(&ri->description, CData, NULL, 0); + NewStrBufDupAppendFlush(&ri->description, + CData, + NULL, + 0); StrBufTrim(ri->description); } else if (olen < clen) { FlushStrBuf(ri->description); - NewStrBufDupAppendFlush(&ri->description, CData, NULL, 0); + NewStrBufDupAppendFlush(&ri->description, + CData, + NULL, + 0); StrBufTrim(ri->description); } } } -void ATOM_item_published_end (StrBuf *CData, rss_item *ri, rss_aggregator *Cfg, const char** Attr) -{ +void ATOM_item_published_end (StrBuf *CData, + rss_item *ri, + rss_aggregator *RSSAggr, + const char** Attr) +{ if (StrLength(CData) > 0) { StrBufTrim(CData); ri->pubdate = rdf_parsedate(ChrPtr(CData)); } } -void ATOM_item_updated_end (StrBuf *CData, rss_item *ri, rss_aggregator *Cfg, const char** Attr) +void ATOM_item_updated_end (StrBuf *CData, + rss_item *ri, + rss_aggregator *RSSAggr, + const char** Attr) { if (StrLength(CData) > 0) { StrBufTrim(CData); @@ -481,7 +405,10 @@ void ATOM_item_updated_end (StrBuf *CData, rss_item *ri, rss_aggregator *Cfg, co } } -void RSS_item_pubdate_end (StrBuf *CData, rss_item *ri, rss_aggregator *Cfg, const char** Attr) +void RSS_item_pubdate_end (StrBuf *CData, + rss_item *ri, + rss_aggregator *RSSAggr, + const char** Attr) { if (StrLength(CData) > 0) { StrBufTrim(CData); @@ -490,7 +417,10 @@ void RSS_item_pubdate_end (StrBuf *CData, rss_item *ri, rss_aggregator *Cfg, con } -void RSS_item_date_end (StrBuf *CData, rss_item *ri, rss_aggregator *Cfg, const char** Attr) +void RSS_item_date_end (StrBuf *CData, + rss_item *ri, + rss_aggregator *RSSAggr, + const char** Attr) { if (StrLength(CData) > 0) { StrBufTrim(CData); @@ -500,7 +430,10 @@ void RSS_item_date_end (StrBuf *CData, rss_item *ri, rss_aggregator *Cfg, const -void RSS_item_author_end(StrBuf *CData, rss_item *ri, rss_aggregator *Cfg, const char** Attr) +void RSS_item_author_end(StrBuf *CData, + rss_item *ri, + rss_aggregator *RSSAggr, + const char** Attr) { if (StrLength(CData) > 0) { NewStrBufDupAppendFlush(&ri->author_or_creator, CData, NULL, 0); @@ -509,7 +442,10 @@ void RSS_item_author_end(StrBuf *CData, rss_item *ri, rss_aggregator *Cfg, const } -void ATOM_item_name_end(StrBuf *CData, rss_item *ri, rss_aggregator *Cfg, const char** Attr) +void ATOM_item_name_end(StrBuf *CData, + rss_item *ri, + rss_aggregator *RSSAggr, + const char** Attr) { if (StrLength(CData) > 0) { NewStrBufDupAppendFlush(&ri->author_or_creator, CData, NULL, 0); @@ -517,7 +453,10 @@ void ATOM_item_name_end(StrBuf *CData, rss_item *ri, rss_aggregator *Cfg, const } } -void ATOM_item_email_end(StrBuf *CData, rss_item *ri, rss_aggregator *Cfg, const char** Attr) +void ATOM_item_email_end(StrBuf *CData, + rss_item *ri, + rss_aggregator *RSSAggr, + const char** Attr) { if (StrLength(CData) > 0) { NewStrBufDupAppendFlush(&ri->author_email, CData, NULL, 0); @@ -525,9 +464,12 @@ void ATOM_item_email_end(StrBuf *CData, rss_item *ri, rss_aggregator *Cfg, const } } -void RSS_item_creator_end(StrBuf *CData, rss_item *ri, rss_aggregator *Cfg, const char** Attr) +void RSS_item_creator_end(StrBuf *CData, + rss_item *ri, + rss_aggregator *RSSAggr, + const char** Attr) { - if ((StrLength(CData) > 0) && + if ((StrLength(CData) > 0) && (StrLength(ri->author_or_creator) == 0)) { NewStrBufDupAppendFlush(&ri->author_or_creator, CData, NULL, 0); @@ -536,7 +478,10 @@ void RSS_item_creator_end(StrBuf *CData, rss_item *ri, rss_aggregator *Cfg, cons } -void ATOM_item_uri_end(StrBuf *CData, rss_item *ri, rss_aggregator *Cfg, const char** Attr) +void ATOM_item_uri_end(StrBuf *CData, + rss_item *ri, + rss_aggregator *RSSAggr, + const char** Attr) { if (StrLength(CData) > 0) { NewStrBufDupAppendFlush(&ri->author_url, CData, NULL, 0); @@ -544,33 +489,48 @@ void ATOM_item_uri_end(StrBuf *CData, rss_item *ri, rss_aggregator *Cfg, const c } } -void RSS_item_item_end(StrBuf *CData, rss_item *ri, rss_aggregator *Cfg, const char** Attr) +void RSS_item_item_end(StrBuf *CData, + rss_item *ri, + rss_aggregator *RSSAggr, + const char** Attr) { --ri->item_tag_nesting; - rss_save_item(ri, Cfg); + rss_remember_item(ri, RSSAggr); } -void ATOM_item_entry_end(StrBuf *CData, rss_item *ri, rss_aggregator *Cfg, const char** Attr) +void ATOM_item_entry_end(StrBuf *CData, + rss_item *ri, + rss_aggregator *RSSAggr, + const char** Attr) { --ri->item_tag_nesting; - rss_save_item(ri, Cfg); + rss_remember_item(ri, RSSAggr); } -void RSS_item_rss_end(StrBuf *CData, rss_item *ri, rss_aggregator *Cfg, const char** Attr) +void RSS_item_rss_end(StrBuf *CData, + rss_item *ri, + rss_aggregator *RSSAggr, + const char** Attr) { -// syslog(LOG_DEBUG, "End of feed detected. Closing parser.\n"); + syslog(LOG_DEBUG, "End of feed detected. Closing parser."); ri->done_parsing = 1; - } -void RSS_item_rdf_end(StrBuf *CData, rss_item *ri, rss_aggregator *Cfg, const char** Attr) + +void RSS_item_rdf_end(StrBuf *CData, + rss_item *ri, + rss_aggregator *RSSAggr, + const char** Attr) { -// syslog(LOG_DEBUG, "End of feed detected. Closing parser.\n"); + syslog(LOG_DEBUG, "End of feed detected. Closing parser."); ri->done_parsing = 1; } -void RSSATOM_item_ignore(StrBuf *CData, rss_item *ri, rss_aggregator *Cfg, const char** Attr) +void RSSATOM_item_ignore(StrBuf *CData, + rss_item *ri, + rss_aggregator *RSSAggr, + const char** Attr) { } @@ -579,64 +539,321 @@ void RSSATOM_item_ignore(StrBuf *CData, rss_item *ri, rss_aggregator *Cfg, const /* * This callback stores up the data which appears in between tags. */ -void rss_xml_cdata_start(void *data) +void rss_xml_cdata_start(void *data) { - rss_aggregator *rssc = (rss_aggregator*) data; + rss_aggregator *RSSAggr = (rss_aggregator*) data; - FlushStrBuf(rssc->CData); + FlushStrBuf(RSSAggr->CData); } -void rss_xml_cdata_end(void *data) +void rss_xml_cdata_end(void *data) { } -void rss_xml_chardata(void *data, const XML_Char *s, int len) +void rss_xml_chardata(void *data, const XML_Char *s, int len) { - rss_aggregator *rssc = (rss_aggregator*) data; + rss_aggregator *RSSAggr = (rss_aggregator*) data; - StrBufAppendBufPlain (rssc->CData, s, len, 0); + StrBufAppendBufPlain (RSSAggr->CData, s, len, 0); } + +/****************************************************************************** + * RSS parser logic * + ******************************************************************************/ + +extern pthread_mutex_t RSSQueueMutex; + +HashList *StartHandlers = NULL; +HashList *EndHandlers = NULL; +HashList *KnownNameSpaces = NULL; + +void FreeNetworkSaveMessage (void *vMsg) +{ + networker_save_message *Msg = (networker_save_message *) vMsg; + + CM_FreeContents(&Msg->Msg); + FreeStrBuf(&Msg->Message); + FreeStrBuf(&Msg->MsgGUID); + + FreeStrBuf(&Msg->author_email); + FreeStrBuf(&Msg->author_or_creator); + FreeStrBuf(&Msg->title); + FreeStrBuf(&Msg->description); + + FreeStrBuf(&Msg->link); + FreeStrBuf(&Msg->linkTitle); + + FreeStrBuf(&Msg->reLink); + FreeStrBuf(&Msg->reLinkTitle); + + free(Msg); +} + + /* - * Callback function for passing libcurl's output to expat for parsing + * Commit a fetched and parsed RSS item to disk */ -size_t rss_libcurl_callback(void *ptr, size_t size, size_t nmemb, void *stream) +void rss_remember_item(rss_item *ri, rss_aggregator *RSSAggr) +{ + networker_save_message *SaveMsg; + struct MD5Context md5context; + u_char rawdigest[MD5_DIGEST_LEN]; + StrBuf *guid; + int n; + + SaveMsg = (networker_save_message *) malloc(sizeof(networker_save_message)); + memset(SaveMsg, 0, sizeof(networker_save_message)); + + /* Construct a GUID to use in the S_USETABLE table. + * If one is not present in the item itself, make one up. + */ + if (ri->guid != NULL) { + StrBufSpaceToBlank(ri->guid); + StrBufTrim(ri->guid); + guid = NewStrBufPlain(HKEY("rss/")); + StrBufAppendBuf(guid, ri->guid, 0); + } + else { + MD5Init(&md5context); + if (ri->title != NULL) { + MD5Update(&md5context, (const unsigned char*)SKEY(ri->title)); + } + if (ri->link != NULL) { + MD5Update(&md5context, (const unsigned char*)SKEY(ri->link)); + } + MD5Final(rawdigest, &md5context); + guid = NewStrBufPlain(NULL, MD5_DIGEST_LEN * 2 + 12 /* _rss2ctdl*/); + StrBufHexEscAppend(guid, NULL, rawdigest, MD5_DIGEST_LEN); + StrBufAppendBufPlain(guid, HKEY("_rss2ctdl"), 0); + } + + /* translate Item into message. */ + syslog(LOG_DEBUG, "RSS: translating item..."); + if (ri->description == NULL) ri->description = NewStrBufPlain(HKEY("")); + StrBufSpaceToBlank(ri->description); + SaveMsg->Msg.cm_magic = CTDLMESSAGE_MAGIC; + SaveMsg->Msg.cm_anon_type = MES_NORMAL; + SaveMsg->Msg.cm_format_type = FMT_RFC822; + + /* gather the cheaply computed information now... */ + + if (ri->guid != NULL) { + CM_SetField(&SaveMsg->Msg, eExclusiveID, SKEY(ri->guid)); + } + + SaveMsg->MsgGUID = guid; + + if (ri->pubdate <= 0) { + ri->pubdate = time(NULL); + } + CM_SetFieldLONG(&SaveMsg->Msg, eTimestamp, ri->pubdate); + if (ri->channel_title != NULL) { + if (StrLength(ri->channel_title) > 0) { + CM_SetField(&SaveMsg->Msg, eOriginalRoom, SKEY(ri->channel_title)); + } + } + + /* remember the ones for defferred processing to save computing power after we know if we realy need it. */ + + SaveMsg->author_or_creator = ri->author_or_creator; + ri->author_or_creator = NULL; + + SaveMsg->author_email = ri->author_email; + ri->author_email = NULL; + + SaveMsg->title = ri->title; + ri->title = NULL; + + SaveMsg->link = ri->link; + ri->link = NULL; + + SaveMsg->description = ri->description; + ri->description = NULL; + + SaveMsg->linkTitle = ri->linkTitle; + ri->linkTitle = NULL; + + SaveMsg->reLink = ri->reLink; + ri->reLink = NULL; + + SaveMsg->reLinkTitle = ri->reLinkTitle; + ri->reLinkTitle = NULL; + + n = GetCount(RSSAggr->Messages) + 1; + Put(RSSAggr->Messages, IKEY(n), SaveMsg, FreeNetworkSaveMessage); +} + + + +void rss_xml_start(void *data, const char *supplied_el, const char **attr) +{ + rss_xml_handler *h; + rss_aggregator *RSSAggr = (rss_aggregator*) data; + rss_item *ri = RSSAggr->Item; + void *pv; + const char *pel; + char *sep = NULL; + + /* Axe the namespace, we don't care about it */ + /* + syslog(LOG_DEBUG, + "RSS: supplied el %d: %s\n", RSSAggr->RSSAggr->ItemType, supplied_el); + */ + pel = supplied_el; + while (sep = strchr(pel, ':'), sep) { + pel = sep + 1; + } + + if (pel != supplied_el) + { + void *v; + + if (!GetHash(KnownNameSpaces, + supplied_el, + pel - supplied_el - 1, + &v)) + { + syslog(LOG_DEBUG, + "RSS: START ignoring " + "because of wrong namespace [%s]", + supplied_el + ); + return; + } + } + + StrBufPlain(RSSAggr->Key, pel, -1); + StrBufLowerCase(RSSAggr->Key); + if (GetHash(StartHandlers, SKEY(RSSAggr->Key), &pv)) + { + h = (rss_xml_handler*) pv; + + if (((h->Flags & RSS_UNSET) != 0) && + (RSSAggr->ItemType == RSS_UNSET)) + { + h->Handler(RSSAggr->CData, ri, RSSAggr, attr); + } + else if (((h->Flags & RSS_RSS) != 0) && + (RSSAggr->ItemType == RSS_RSS)) + { + h->Handler(RSSAggr->CData, ri, RSSAggr, attr); + } + else if (((h->Flags & RSS_ATOM) != 0) && + (RSSAggr->ItemType == RSS_ATOM)) + { + h->Handler(RSSAggr->CData, + ri, + RSSAggr, + attr); + } + else + syslog(LOG_DEBUG, + "RSS: START unhandled: [%s] [%s]...", + pel, + supplied_el + ); + } + else + syslog(LOG_DEBUG, + "RSS: START unhandled: [%s] [%s]...", + pel, + supplied_el + ); +} + +void rss_xml_end(void *data, const char *supplied_el) { - XML_Parse((XML_Parser)stream, ptr, (size * nmemb), 0); - return (size*nmemb); + rss_xml_handler *h; + rss_aggregator *RSSAggr = (rss_aggregator*) data; + rss_item *ri = RSSAggr->Item; + const char *pel; + char *sep = NULL; + void *pv; + + /* Axe the namespace, we don't care about it */ + pel = supplied_el; + while (sep = strchr(pel, ':'), sep) { + pel = sep + 1; + } + syslog(LOG_DEBUG, "RSS: END %s...", supplied_el); + if (pel != supplied_el) + { + void *v; + + if (!GetHash(KnownNameSpaces, + supplied_el, + pel - supplied_el - 1, + &v)) + { + syslog(LOG_DEBUG, + "RSS: END ignoring because of wrong namespace" + "[%s] = [%s]", + supplied_el, + ChrPtr(RSSAggr->CData)); + FlushStrBuf(RSSAggr->CData); + return; + } + } + + StrBufPlain(RSSAggr->Key, pel, -1); + StrBufLowerCase(RSSAggr->Key); + if (GetHash(EndHandlers, SKEY(RSSAggr->Key), &pv)) + { + h = (rss_xml_handler*) pv; + + if (((h->Flags & RSS_UNSET) != 0) && + (RSSAggr->ItemType == RSS_UNSET)) + { + h->Handler(RSSAggr->CData, ri, RSSAggr, NULL); + } + else if (((h->Flags & RSS_RSS) != 0) && + (RSSAggr->ItemType == RSS_RSS)) + { + h->Handler(RSSAggr->CData, ri, RSSAggr, NULL); + } + else if (((h->Flags & RSS_ATOM) != 0) && + (RSSAggr->ItemType == RSS_ATOM)) + { + h->Handler(RSSAggr->CData, ri, RSSAggr, NULL); + } + else + syslog(LOG_DEBUG, + "RSS: END unhandled: [%s] [%s] = [%s]...", + pel, + supplied_el, + ChrPtr(RSSAggr->CData)); + } + else + syslog(LOG_DEBUG, + "RSS: END unhandled: [%s] [%s] = [%s]...", + pel, + supplied_el, + ChrPtr(RSSAggr->CData)); + FlushStrBuf(RSSAggr->CData); } -eNextState ParseRSSReply(AsyncIO *IO) +eNextState RSSAggregator_ParseReply(AsyncIO *IO) { StrBuf *Buf; - rss_aggregator *rssc; + rss_aggregator *RSSAggr; rss_item *ri; const char *at; char *ptr; long len; const char *Key; - - if (IO->HttpReq.httpcode != 200) - { - - EV_syslog(LOG_DEBUG, "need a 200, got a %ld !\n", - IO->HttpReq.httpcode); -// TODO: aide error message with rate limit - return eAbort; - } - - rssc = IO->Data; - ri = rssc->Item; - rssc->CData = NewStrBufPlain(NULL, SIZ); - rssc->Key = NewStrBuf(); + RSSAggr = IO->Data; + ri = RSSAggr->Item; + RSSAggr->CData = NewStrBufPlain(NULL, SIZ); + RSSAggr->Key = NewStrBuf(); at = NULL; - StrBufSipLine(rssc->Key, IO->HttpReq.ReplyData, &at); + StrBufSipLine(RSSAggr->Key, IO->HttpReq.ReplyData, &at); ptr = NULL; #define encoding "encoding=\"" - ptr = strstr(ChrPtr(rssc->Key), encoding); + ptr = strstr(ChrPtr(RSSAggr->Key), encoding); if (ptr != NULL) { char *pche; @@ -644,70 +861,108 @@ eNextState ParseRSSReply(AsyncIO *IO) ptr += sizeof (encoding) - 1; pche = strchr(ptr, '"'); if (pche != NULL) - StrBufCutAt(rssc->Key, -1, pche); - else + StrBufCutAt(RSSAggr->Key, -1, pche); + else ptr = "UTF-8"; } else ptr = "UTF-8"; - syslog(LOG_DEBUG, "RSS: Now parsing [%s] \n", ChrPtr(rssc->Url)); + syslog(LOG_DEBUG, "RSS: Now parsing [%s]", ChrPtr(RSSAggr->Url)); - rssc->xp = XML_ParserCreateNS(ptr, ':'); - if (!rssc->xp) { - syslog(LOG_DEBUG, "Cannot create XML parser!\n"); + RSSAggr->xp = XML_ParserCreateNS(ptr, ':'); + if (!RSSAggr->xp) { + syslog(LOG_ALERT, "Cannot create XML parser!"); return eAbort; } - FlushStrBuf(rssc->Key); + FlushStrBuf(RSSAggr->Key); - rssc->Messages = NewHash(1, Flathash); - XML_SetElementHandler(rssc->xp, rss_xml_start, rss_xml_end); - XML_SetCharacterDataHandler(rssc->xp, rss_xml_chardata); - XML_SetUserData(rssc->xp, rssc); - XML_SetCdataSectionHandler(rssc->xp, + RSSAggr->Messages = NewHash(1, Flathash); + XML_SetElementHandler(RSSAggr->xp, rss_xml_start, rss_xml_end); + XML_SetCharacterDataHandler(RSSAggr->xp, rss_xml_chardata); + XML_SetUserData(RSSAggr->xp, RSSAggr); + XML_SetCdataSectionHandler(RSSAggr->xp, rss_xml_cdata_start, - rss_xml_cdata_end); - + rss_xml_cdata_end + ); len = StrLength(IO->HttpReq.ReplyData); ptr = SmashStrBuf(&IO->HttpReq.ReplyData); - XML_Parse(rssc->xp, ptr, len, 0); + XML_Parse(RSSAggr->xp, ptr, len, 0); free (ptr); - if (ri->done_parsing == 0) - XML_Parse(rssc->xp, "", 0, 1); - + if (ri->done_parsing == 0) { + XML_Parse(RSSAggr->xp, "", 0, 1); + } - syslog(LOG_DEBUG, "RSS: XML Status [%s] \n", - XML_ErrorString( - XML_GetErrorCode(rssc->xp))); + syslog(LOG_DEBUG, "RSS: XML Status [%s]", XML_ErrorString(XML_GetErrorCode(RSSAggr->xp))); - XML_ParserFree(rssc->xp); + XML_ParserFree(RSSAggr->xp); flush_rss_item(ri); - FreeStrBuf(&rssc->CData); - FreeStrBuf(&rssc->Key); - Buf = NewStrBufDup(rssc->rooms); - rssc->recp.recp_room = SmashStrBuf(&Buf); - rssc->recp.num_room = rssc->roomlist_parts; - rssc->recp.recptypes_magic = RECPTYPES_MAGIC; + Buf = NewStrBufDup(RSSAggr->rooms); + RSSAggr->recp.recp_room = SmashStrBuf(&Buf); + RSSAggr->recp.num_room = RSSAggr->roomlist_parts; + RSSAggr->recp.recptypes_magic = RECPTYPES_MAGIC; - rssc->Pos = GetNewHashPos(rssc->Messages, 1); + RSSAggr->Pos = GetNewHashPos(RSSAggr->Messages, 1); - ///Cfg->next_poll = time(NULL) + config.c_net_freq; - if (GetNextHashPos(rssc->Messages, rssc->Pos, &len, &Key, (void**) &rssc->ThisMsg)) - return QueueDBOperation(IO, RSS_FetchNetworkUsetableEntry); - else +#if 0 +// FIXME ajc + if (GetNextHashPos(RSSAggr->Messages, + RSSAggr->Pos, + &len, + &Key, + (void**) &RSSAggr->ThisMsg)) { + return NextDBOperation(IO, RSS_FetchNetworkUsetableEntry); + } + else { +#endif return eAbort; +#if 0 + } +#endif } +/****************************************************************************** + * RSS handler registering logic * + ******************************************************************************/ + +void AddRSSStartHandler(rss_handler_func Handler, + int Flags, + const char *key, + long len) +{ + rss_xml_handler *h; + h = (rss_xml_handler*) malloc(sizeof (rss_xml_handler)); + h->Flags = Flags; + h->Handler = Handler; + Put(StartHandlers, key, len, h, NULL); +} + +void AddRSSEndHandler(rss_handler_func Handler, + int Flags, + const char *key, + long len) +{ + rss_xml_handler *h; + h = (rss_xml_handler*) malloc(sizeof (rss_xml_handler)); + h->Flags = Flags; + h->Handler = Handler; + Put(EndHandlers, key, len, h, NULL); +} + void rss_parser_cleanup(void) { - DeleteHash(&StartHandlers); - DeleteHash(&EndHandlers); + DeleteHash(&StartHandlers); + DeleteHash(&EndHandlers); DeleteHash(&KnownNameSpaces); } +void LogDebugEnableRSSATOMParser(const int n) +{ + RSSAtomParserDebugEnabled = n; +} CTDL_MODULE_INIT(rssparser) { @@ -718,7 +973,7 @@ CTDL_MODULE_INIT(rssparser) AddRSSStartHandler(RSS_item_rss_start, RSS_UNSET, HKEY("rss")); AddRSSStartHandler(RSS_item_rdf_start, RSS_UNSET, HKEY("rdf")); - AddRSSStartHandler(ATOM_item_feed_start, RSS_UNSET, HKEY("feed")); + AddRSSStartHandler(ATOM_item_feed_start, RSS_UNSET, HKEY("feed")); AddRSSStartHandler(RSS_item_item_start, RSS_RSS, HKEY("item")); AddRSSStartHandler(ATOM_item_entry_start, RSS_ATOM, HKEY("entry")); AddRSSStartHandler(ATOM_item_link_start, RSS_ATOM, HKEY("link")); @@ -727,8 +982,8 @@ CTDL_MODULE_INIT(rssparser) AddRSSEndHandler(RSS_item_guid_end, RSS_RSS|RSS_REQUIRE_BUF, HKEY("guid")); AddRSSEndHandler(ATOM_item_id_end, RSS_ATOM|RSS_REQUIRE_BUF, HKEY("id")); AddRSSEndHandler(RSS_item_link_end, RSS_RSS|RSS_REQUIRE_BUF, HKEY("link")); -#if 0 -// hm, rss to the comments of that blog, might be interesting in future, but... +#if 0 +// hm, rss to the comments of that blog, might be interesting in future, but... AddRSSEndHandler(RSS_item_relink_end, RSS_RSS|RSS_REQUIRE_BUF, HKEY("commentrss")); // comment count... AddRSSEndHandler(RSS_item_relink_end, RSS_RSS|RSS_REQUIRE_BUF, HKEY("comments")); @@ -795,7 +1050,8 @@ CTDL_MODULE_INIT(rssparser) /* we don't like these namespaces because of they shadow our usefull parameters. */ Put(KnownNameSpaces, HKEY("http://search.yahoo.com/mrss/"), NULL, reference_free_handler); #endif - CtdlRegisterCleanupHook(rss_parser_cleanup); + CtdlRegisterDebugFlagHook(HKEY("RSSAtomParser"), LogDebugEnableRSSATOMParser, &RSSAtomParserDebugEnabled); + CtdlRegisterCleanupHook(rss_parser_cleanup); } return "rssparser"; }