X-Git-Url: https://code.citadel.org/?a=blobdiff_plain;f=citadel%2Fmodules%2Frssclient%2Fserv_rssclient.c;h=ca05dd62d5067d619f77e030bf797b11a9b450a4;hb=0387f48886a9395d89eaca01cd40ab751610426f;hp=909587d6284f21d50852117f38870c56317196b4;hpb=58f686487cf5f14d5da5357c67f2e6624dbde027;p=citadel.git diff --git a/citadel/modules/rssclient/serv_rssclient.c b/citadel/modules/rssclient/serv_rssclient.c index 909587d62..ca05dd62d 100644 --- a/citadel/modules/rssclient/serv_rssclient.c +++ b/citadel/modules/rssclient/serv_rssclient.c @@ -1,14 +1,16 @@ /* - * Bring external RSS feeds into rooms. + * Bring external RSS and/or Atom feeds into rooms. This module implements a + * very loose parser that scrapes both kinds of feeds and is not picky about + * the standards compliance of the source data. * - * Copyright (c) 2007-2017 by the citadel.org team + * Copyright (c) 2007-2020 by the citadel.org team * - * This program is open source software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 3. + * This program is open source software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License version 3. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.See the + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. */ @@ -46,9 +48,8 @@ #include "parsedate.h" #include "database.h" #include "citadel_dirs.h" -#include "md5.h" #include "context.h" -#include "rss_atom_parser.h" +#include "internet_addressing.h" struct rssroom { struct rssroom *next; @@ -61,12 +62,257 @@ struct rssurl { struct rssroom *rooms; }; +struct rssparser { + StrBuf *CData; + struct CtdlMessage *msg; + char *link; + char *description; + char *item_id; + struct rssroom *rooms; +}; time_t last_run = 0L; -struct CitContext rss_CC; struct rssurl *rsstodo = NULL; +// This handler is called whenever an XML tag opens. +// +void rss_start_element(void *data, const char *el, const char **attribute) +{ + struct rssparser *r = (struct rssparser *)data; + int i; + + if ( + (!strcasecmp(el, "entry")) + || (!strcasecmp(el, "item")) + ) { + // this is the start of a new item(rss) or entry(atom) + if (r->msg != NULL) { + CM_Free(r->msg); + r->msg = NULL; + } + r->msg = malloc(sizeof(struct CtdlMessage)); + memset(r->msg, 0, sizeof(struct CtdlMessage)); + r->msg->cm_magic = CTDLMESSAGE_MAGIC; + r->msg->cm_anon_type = MES_NORMAL; + r->msg->cm_format_type = FMT_RFC822; + } + + else if (!strcasecmp(el, "link")) { // atom feeds have the link as an attribute + for(i = 0; attribute[i]; i += 2) { + if (!strcasecmp(attribute[i], "href")) { + if (r->link != NULL) { + free(r->link); + r->link = NULL; + } + r->link = strdup(attribute[i+1]); + striplt(r->link); + } + } + } +} + + +// This handler is called whenever an XML tag closes. +// +void rss_end_element(void *data, const char *el) +{ + struct rssparser *r = (struct rssparser *)data; + StrBuf *encoded_field; + + if (StrLength(r->CData) > 0) { // strip leading/trailing whitespace from field + StrBufTrim(r->CData); + } + + if ( // end of a new item(rss) or entry(atom) + (!strcasecmp(el, "entry")) + || (!strcasecmp(el, "item")) + ) { + if (r->msg != NULL) { // Save the message to the rooms + + // use the link as an item id if nothing else is available + if ((r->item_id == NULL) && (r->link != NULL)) { + r->item_id = strdup(r->link); + } + + // check the use table + StrBuf *u = NewStrBuf(); + StrBufAppendPrintf(u, "rss/%s", r->item_id); + int already_seen = CheckIfAlreadySeen(u); + FreeStrBuf(&u); + + if (already_seen == 0) { + + // Compose the message text + StrBuf *TheMessage = NewStrBuf(); + StrBufAppendPrintf(TheMessage, + "Content-type: text/html\n\n" + "\n\n" + "" + ); + + if (r->description != NULL) { + StrBufAppendPrintf(TheMessage, "%s

\r\n", r->description); + free(r->description); + r->description = NULL; + } + + if (r->link != NULL) { + StrBufAppendPrintf(TheMessage, "%s\r\n", r->link, r->link); + free(r->link); + r->link = NULL; + } + + StrBufAppendPrintf(TheMessage, "\r\n"); + CM_SetField(r->msg, eMesageText, ChrPtr(TheMessage), StrLength(TheMessage)); + FreeStrBuf(&TheMessage); + + if (CM_IsEmpty(r->msg, eAuthor)) { + CM_SetField(r->msg, eAuthor, HKEY("rss")); + } + + if (CM_IsEmpty(r->msg, eTimestamp)) { + CM_SetFieldLONG(r->msg, eTimestamp, time(NULL)); + } + + // Save it to the room(s) + struct rssroom *rr = NULL; + long msgnum = (-1); + for (rr=r->rooms; rr!=NULL; rr=rr->next) { + if (rr == r->rooms) { + msgnum = CtdlSubmitMsg(r->msg, NULL, rr->room); // in first room, save msg + } + else { + CtdlSaveMsgPointerInRoom(rr->room, msgnum, 0, NULL); // elsewhere, save a pointer + } + syslog(LOG_DEBUG, "rssclient: saved message %ld to %s", msgnum, rr->room); + } + } + else { + syslog(LOG_DEBUG, "rssclient: already seen %s", r->item_id); + } + + CM_Free(r->msg); + r->msg = NULL; + } + + if (r->item_id != NULL) { + free(r->item_id); + r->item_id = NULL; + } + } + + else if (!strcasecmp(el, "title")) { // item subject (rss and atom) + if ((r->msg != NULL) && (CM_IsEmpty(r->msg, eMsgSubject))) { + encoded_field = NewStrBuf(); + StrBufRFC2047encode(&encoded_field, r->CData); + CM_SetAsFieldSB(r->msg, eMsgSubject, &encoded_field); + } + } + + else if (!strcasecmp(el, "creator")) { // can be used if is not present + if ((r->msg != NULL) && (CM_IsEmpty(r->msg, eAuthor))) { + encoded_field = NewStrBuf(); + StrBufRFC2047encode(&encoded_field, r->CData); + CM_SetAsFieldSB(r->msg, eAuthor, &encoded_field); + } + } + + else if (!strcasecmp(el, "author")) { // supercedes if both are present + if (r->msg != NULL) { + encoded_field = NewStrBuf(); + StrBufRFC2047encode(&encoded_field, r->CData); + CM_SetAsFieldSB(r->msg, eAuthor, &encoded_field); + } + } + + else if (!strcasecmp(el, "pubdate")) { // date/time stamp (rss) Sat, 25 Feb 2017 14:28:01 EST + if ((r->msg)&&(r->msg->cm_fields[eTimestamp]==NULL)) { + CM_SetFieldLONG(r->msg, eTimestamp, parsedate(ChrPtr(r->CData))); + } + } + + else if (!strcasecmp(el, "updated")) { // date/time stamp (atom) 2003-12-13T18:30:02Z + if ((r->msg)&&(r->msg->cm_fields[eTimestamp]==NULL)) { + struct tm t; + char zulu; + memset(&t, 0, sizeof t); + sscanf(ChrPtr(r->CData), "%d-%d-%dT%d:%d:%d%c", &t.tm_year, &t.tm_mon, &t.tm_mday, &t.tm_hour, &t.tm_min, &t.tm_sec, &zulu); + t.tm_year -= 1900; + t.tm_mon -= 1; + CM_SetFieldLONG(r->msg, eTimestamp, mktime(&t)); + } + } + + else if (!strcasecmp(el, "link")) { // link to story (rss) + if (r->link != NULL) { + free(r->link); + r->link = NULL; + } + r->link = strdup(ChrPtr(r->CData)); + } + + else if ( + (!strcasecmp(el, "guid")) // unique item id (rss) + || (!strcasecmp(el, "id")) // unique item id (atom) + ) { + if (r->item_id != NULL) { + free(r->item_id); + r->item_id = NULL; + } + r->item_id = strdup(ChrPtr(r->CData)); + } + + else if ( + (!strcasecmp(el, "description")) // message text (rss) + || (!strcasecmp(el, "summary")) // message text (atom) + || (!strcasecmp(el, "content")) // message text (atom) + ) { + if (r->description != NULL) { + free(r->description); + r->description = NULL; + } + r->description = strdup(ChrPtr(r->CData)); + } + + if (r->CData != NULL) { + FreeStrBuf(&r->CData); + r->CData = NULL; + } +} + + +// This handler is called whenever data appears between opening and closing tags. +// +void rss_handle_data(void *data, const char *content, int length) +{ + struct rssparser *r = (struct rssparser *)data; + + if (r->CData == NULL) { + r->CData = NewStrBuf(); + } + + StrBufAppendBufPlain(r->CData, content, length, 0); +} + + +// Feed has been downloaded, now parse it. +// +void rss_parse_feed(StrBuf *Feed, struct rssroom *rooms) +{ + struct rssparser r; + + memset(&r, 0, sizeof r); + r.rooms = rooms; + XML_Parser p = XML_ParserCreate("UTF-8"); + XML_SetElementHandler(p, rss_start_element, rss_end_element); + XML_SetCharacterDataHandler(p, rss_handle_data); + XML_SetUserData(p, (void *)&r); + XML_Parse(p, ChrPtr(Feed), StrLength(Feed), XML_TRUE); + XML_ParserFree(p); +} + + // Add a feed/room pair into the todo list // void rssclient_push_todo(char *rssurl, char *roomname) @@ -75,13 +321,14 @@ void rssclient_push_todo(char *rssurl, char *roomname) struct rssurl *thisone = NULL; struct rssroom *newroom = NULL; - syslog(LOG_DEBUG, "rssclient_push_todo(%s, %s)", rssurl, roomname); + syslog(LOG_DEBUG, "rssclient: will fetch %s to %s", rssurl, roomname); for (r=rsstodo; r!=NULL; r=r->next) { if (!strcasecmp(r->url, rssurl)) { thisone = r; } } + if (thisone == NULL) { thisone = malloc(sizeof(struct rssurl)); thisone->url = strdup(rssurl); @@ -97,27 +344,15 @@ void rssclient_push_todo(char *rssurl, char *roomname) } -// Callback function for curl -// -size_t rss_pof_write_data(void *buffer, size_t size, size_t nmemb, void *userp) -{ - StrBuf *Downloaded = (StrBuf *)userp; - size_t bytes = size * nmemb; - StrBufAppendBufPlain(Downloaded, buffer, bytes, 0); - return(bytes); -} - - // pull one feed (possibly multiple rooms) // void rss_pull_one_feed(struct rssurl *url) { - struct rssroom *r; CURL *curl; CURLcode res; StrBuf *Downloaded = NULL; - syslog(LOG_DEBUG, "rss_pull_one_feed(%s)", url->url); + syslog(LOG_DEBUG, "rssclient: fetching %s", url->url); curl = curl_easy_init(); if (!curl) { @@ -127,24 +362,20 @@ void rss_pull_one_feed(struct rssurl *url) Downloaded = NewStrBuf(); curl_easy_setopt(curl, CURLOPT_URL, url->url); + curl_easy_setopt(curl, CURLOPT_SSL_VERIFYPEER, 0L); + curl_easy_setopt(curl, CURLOPT_SSL_VERIFYHOST, 0L); curl_easy_setopt(curl, CURLOPT_FOLLOWLOCATION, 1L); // Follow redirects - curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, rss_pof_write_data); // What to do with downloaded data + curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, CurlFillStrBuf_callback); // What to do with downloaded data curl_easy_setopt(curl, CURLOPT_WRITEDATA, Downloaded); // Give it our StrBuf to work with curl_easy_setopt(curl, CURLOPT_TIMEOUT, 20L); // Time out after 20 seconds res = curl_easy_perform(curl); // Perform the request if (res != CURLE_OK) { - syslog(LOG_WARNING, "Failed to load feed: %s", curl_easy_strerror(res)); + syslog(LOG_WARNING, "rssclient: failed to load feed: %s", curl_easy_strerror(res)); } curl_easy_cleanup(curl); - // FIXME parse the feed, dummeh ... it's in ChrPtr(Downloaded) - - for (r=url->rooms; r!=NULL; r=r->next) { - syslog(LOG_DEBUG, "Saving item to %s", r->room); - // FIXME save to rooms - } - - FreeStrBuf(&Downloaded); + rss_parse_feed(Downloaded, url->rooms); // parse the feed + FreeStrBuf(&Downloaded); // free the downloaded feed data } @@ -211,18 +442,17 @@ void rssclient_scan(void) { /* Run no more than once every 15 minutes. */ if ((now - last_run) < 900) { syslog(LOG_DEBUG, - "Client: polling interval not yet reached; last run was %ldm%lds ago", - ((now - last_run) / 60), - ((now - last_run) % 60) + "rssclient: polling interval not yet reached; last run was %ldm%lds ago", + ((now - last_run) / 60), + ((now - last_run) % 60) ); return; } - become_session(&rss_CC); - syslog(LOG_DEBUG, "rssclient started"); + syslog(LOG_DEBUG, "rssclient: started"); CtdlForEachRoom(rssclient_scan_room, NULL); rss_pull_feeds(); - syslog(LOG_DEBUG, "rssclient ended"); + syslog(LOG_DEBUG, "rssclient: ended"); last_run = time(NULL); return; } @@ -232,13 +462,9 @@ CTDL_MODULE_INIT(rssclient) { if (!threading) { - syslog(LOG_INFO, "%s", curl_version()); + syslog(LOG_INFO, "rssclient: using %s", curl_version()); CtdlRegisterSessionHook(rssclient_scan, EVT_TIMER, PRIO_AGGR + 300); } - else - { - CtdlFillSystemContext(&rss_CC, "rssclient"); - } return "rssclient"; }