d13d1e9582edcd443e6e888a8791537ba4adccf4
[citadel.git] / citadel / modules / urldeshortener / serv_expand_shorter_urls.c
1 /*
2  *
3  * Copyright (c) 1998-2009 by the citadel.org team
4  *
5  *  This program is free software; you can redistribute it and/or modify
6  *  it under the terms of the GNU General Public License as published by
7  *  the Free Software Foundation; either version 3 of the License, or
8  *  (at your option) any later version.
9  *
10  *  This program is distributed in the hope that it will be useful,
11  *  but WITHOUT ANY WARRANTY; without even the implied warranty of
12  *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13  *  GNU General Public License for more details.
14  *
15  *  You should have received a copy of the GNU General Public License
16  *  along with this program; if not, write to the Free Software
17  *  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
18  */
19
20 #include "sysdep.h"
21 #include <stdlib.h>
22 #include <unistd.h>
23 #include <stdio.h>
24 #include <termios.h>
25 #include <fcntl.h>
26 #include <signal.h>
27 #include <pwd.h>
28 #include <errno.h>
29 #include <sys/types.h>
30 #include <syslog.h>
31
32 #if TIME_WITH_SYS_TIME
33 # include <sys/time.h>
34 # include <time.h>
35 #else
36 # if HAVE_SYS_TIME_H
37 #  include <sys/time.h>
38 # else
39 #  include <time.h>
40 # endif
41 #endif
42 #include <sys/wait.h>
43 #include <ctype.h>
44 #include <string.h>
45 #include <limits.h>
46 #include <sys/socket.h>
47 #include <netinet/in.h>
48 #include <arpa/inet.h>
49 #include <assert.h>
50
51 #include <libcitadel.h>
52 #include "citadel.h"
53 #include "server.h"
54 #include "citserver.h"
55 #include "support.h"
56 #include "config.h"
57 #include "control.h"
58 #include "user_ops.h"
59 #include "database.h"
60 #include "msgbase.h"
61 #include "internet_addressing.h"
62 #include "genstamp.h"
63 #include "domain.h"
64 #include "ctdl_module.h"
65 #include "locate_host.h"
66 #include "citadel_dirs.h"
67
68 #include "event_client.h"
69
/*
 * Table of known URL-shortener prefixes.  LoadUrlShorteners() creates and
 * fills it (one StrBuf per entry, keyed by a running int index) and
 * shorter_url_cleanup() releases it.
 */
70 HashList *UrlShorteners = NULL;
71
72 size_t GetLocationString( void *ptr, size_t size, size_t nmemb, void *userdata)
73 {
74 #define LOCATION "location"
75         if (strncasecmp((char*)ptr, LOCATION, sizeof(LOCATION) - 1) == 0)
76         {
77                 StrBuf *pURL = (StrBuf*) userdata;
78                 char *pch = (char*) ptr;
79                 char *pche;
80                 
81                 pche = pch + (size * nmemb);
82                 pch += sizeof(LOCATION);
83                 
84                 while (isspace(*pch) || (*pch == ':'))
85                         pch ++;
86
87                 while (isspace(*pche) || (*pche == '\0'))
88                         pche--;
89                 
90                 FlushStrBuf(pURL);
91                 StrBufPlain(pURL, pch, pche - pch + 1); 
92         }
93         return size * nmemb;
94 }
95
96 eNextState TerminateLookupUrl(AsyncIO *IO)
97 {
98 //TOOD
99         return eAbort;
100 }
101 eNextState LookupUrlResult(AsyncIO *IO)
102 {
103         return eTerminateConnection; /// /TODO
104 }
105
106 int LookupUrl(StrBuf *ShorterUrlStr)
107 {
108         CURLcode sta;
109         int rc = 0;
110         CURL *chnd;
111         AsyncIO *IO;
112
113
114         IO = (AsyncIO*) malloc(sizeof(AsyncIO));
115         memset(IO, 0, sizeof(AsyncIO));
116         IO->CitContext = CloneContext(CC);
117
118         ParseURL(&IO->ConnectMe, ShorterUrlStr, 80);
119         CurlPrepareURL(IO->ConnectMe);
120         if (! evcurl_init(IO, 
121 //                        Ctx, 
122                           NULL,
123                           "Citadel RSS ShorterURL Expander",
124                           LookupUrlResult, 
125                           TerminateLookupUrl))
126         {
127                 syslog(LOG_ALERT, "Unable to initialize libcurl.\n");
128                 goto shutdown;
129         }
130         chnd = IO->HttpReq.chnd;
131
132         OPT(SSL_VERIFYPEER, 0);
133         OPT(SSL_VERIFYHOST, 0);
134         OPT(FOLLOWLOCATION, 10);
135 #ifdef CURLOPT_HTTP_CONTENT_DECODING
136         OPT(HTTP_CONTENT_DECODING, 1);
137         OPT(ENCODING, "");
138 #endif 
139         OPT(HEADERFUNCTION , GetLocationString);
140         OPT(WRITEHEADER, ShorterUrlStr);
141
142
143         if (server_shutting_down)
144                 goto shutdown ;
145
146         evcurl_handle_start(IO);
147
148 shutdown:
149
150         return rc;
151
152 }
153
154
155
156 void CrawlMessageForShorterUrls(HashList *pUrls, StrBuf *Message)
157 {
158         int nHits = 0;
159         void *pv;
160         int nShorter = 0;
161         const char *pch;
162         const char *pUrl;
163         ConstStr *pCUrl;
164
165         while (GetHash(UrlShorteners, IKEY(nShorter), &pv))
166         {
167                 nShorter++;
168                 pch = ChrPtr(Message);
169                 pUrl = strstr(pch, ChrPtr((StrBuf*)pv));
170                 while ((pUrl != NULL) && (nHits < 99))
171                 {
172                         pCUrl = malloc(sizeof(ConstStr));
173
174                         pCUrl->Key = pUrl;
175                         pch = pUrl + StrLength((StrBuf*)pv);
176                         while (isalnum(*pch)||(*pch == '-')||(*pch == '/'))
177                                 pch++;
178                         pCUrl->len = pch - pCUrl->Key;
179
180                         Put(pUrls, IKEY(nHits), pCUrl, NULL);
181                         nHits ++;
182                         pUrl = strstr(pch, ChrPtr((StrBuf*)pv));
183                 }
184         }
185 }
186
187 int SortConstStrByPosition(const void *Item1, const void *Item2)
188 {
189         const ConstStr *p1, *p2;
190         p1 = (const ConstStr*) Item1;
191         p2 = (const ConstStr*) Item2;
192         if (p1->Key == p2->Key)
193                 return 0;
194         if (p1->Key > p2->Key)
195                 return 1;
196         return -1;
197 }
198
199 HashList *GetShorterUrls(StrBuf *Message)
200 {
201         HashList *pUrls;
202         /* we just suspect URL shorteners to be inside of feeds from twitter
203          * or other short content messages, so don't crawl through real blogs.
204          */
205         if (StrLength(Message) > 500)
206                 return NULL;
207
208         pUrls = NewHash(1, Flathash);
209         CrawlMessageForShorterUrls(pUrls, Message);
210
211         if (GetCount(pUrls) > 0)
212                 return pUrls;
213         else 
214                 return NULL;
215
216 }
217
218 void ExpandShortUrls(StrBuf *Message, HashList *pUrls, int Callback)
219 {
220         StrBuf *Shadow;
221         ConstStr *pCUrl;
222         const char *pch;
223         const char *pche;
224
225         StrBuf *ShorterUrlStr;
226         HashPos *Pos;
227         const char *Key;
228         void *pv;
229         long len;
230         
231         Shadow = NewStrBufPlain(NULL, StrLength(Message));
232         SortByPayload (pUrls, SortConstStrByPosition);
233                 
234         ShorterUrlStr = NewStrBufPlain(NULL, StrLength(Message));
235                 
236         pch = ChrPtr(Message);
237         pche = pch + StrLength(Message);
238         Pos = GetNewHashPos(pUrls, 1);
239         while (GetNextHashPos(pUrls, Pos, &len, &Key, &pv))
240         {
241                 pCUrl = (ConstStr*) pv;
242
243                 if (pch != pCUrl->Key)
244                         StrBufAppendBufPlain(Shadow, pch, pCUrl->Key - pch, 0);
245                         
246                 StrBufPlain(ShorterUrlStr, CKEY(*pCUrl));
247                 if (LookupUrl(ShorterUrlStr))
248                 {
249                         StrBufAppendBufPlain(Shadow, HKEY("<a href=\""), 0);
250                         StrBufAppendBuf(Shadow, ShorterUrlStr, 0);
251                         StrBufAppendBufPlain(Shadow, HKEY("\">"), 0);
252                         StrBufAppendBuf(Shadow, ShorterUrlStr, 0);
253                         StrBufAppendBufPlain(Shadow, HKEY("["), 0);
254                         StrBufAppendBufPlain(Shadow, pCUrl->Key, pCUrl->len, 0);
255                         StrBufAppendBufPlain(Shadow, HKEY("]</a>"), 0);
256                 }
257                 else
258                 {
259                         StrBufAppendBufPlain(Shadow, HKEY("<a href=\""), 0);
260                         StrBufAppendBufPlain(Shadow, pCUrl->Key, pCUrl->len, 0);
261                         StrBufAppendBufPlain(Shadow, HKEY("\">"), 0);
262                         StrBufAppendBufPlain(Shadow, pCUrl->Key, pCUrl->len, 0);
263                         StrBufAppendBufPlain(Shadow, HKEY("</a>"), 0);
264                 }
265                 pch = pCUrl->Key + pCUrl->len + 1;
266
267         }
268         if (pch < pche)
269                 StrBufAppendBufPlain(Shadow, pch, pche - pch, 0);
270         FlushStrBuf(Message);
271         StrBufAppendBuf(Message, Shadow, 0);
272
273         FreeStrBuf(&ShorterUrlStr);
274         FreeStrBuf(&Shadow);
275         DeleteHashPos(&Pos);
276         
277
278         DeleteHash(&pUrls);
279 }
280
281 void LoadUrlShorteners(void)
282 {
283         int i = 0;
284         int fd;
285         const char *POS = NULL;
286         const char *Err = NULL;
287         StrBuf *Content, *Line;
288
289
290         UrlShorteners = NewHash(0, Flathash);
291
292         fd = open(file_citadel_urlshorteners, 0);
293
294         if (fd != 0)
295         {
296                 Content = NewStrBufPlain(NULL, SIZ);
297                 Line = NewStrBuf();
298                 while (POS != StrBufNOTNULL)
299                 {
300                         StrBufTCP_read_buffered_line_fast (Line, Content, &POS, &fd, 1, 1, &Err);
301                         StrBufTrim(Line);
302                         if ((*ChrPtr(Line) != '#') && (StrLength(Line) > 0))
303                         {
304                                 Put(UrlShorteners, IKEY(i), Line, HFreeStrBuf);
305                                 i++;
306                                 Line = NewStrBuf();
307                         }
308                         else
309                                 FlushStrBuf(Line);
310                         if (POS == NULL)
311                                 POS = StrBufNOTNULL;
312                 }
313                 FreeStrBuf(&Line);
314                 FreeStrBuf(&Content);
315         }
316         close(fd);
317 }
318
/*
 * Cleanup hook (registered by the module init function of this file):
 * releases the UrlShorteners table through DeleteHash().
 */
319 void shorter_url_cleanup(void)
320 {
321         DeleteHash(&UrlShorteners);
322 }
323
324
325 CTDL_MODULE_INIT(urldeshortener)
326 {
327         if (threading)
328         {
329                 syslog(LOG_INFO, "%s\n", curl_version());
330         }
331         else 
332         {
333                 LoadUrlShorteners ();
334                 CtdlRegisterCleanupHook(shorter_url_cleanup);
335         }
336         return "UrlShortener";
337 }