rss2ctdl/xmlparse.c

   1 /*
   2  * $Id$
   3  *
   4  * Copyright 2003-2004 Rene Puls <rpuls@gmx.net> and
   5  *                     Oliver Feiler <kiza@kcore.de>
   6  *
   7  * http://kiza.kcore.de/software/snownews/
   8  * http://home.kcore.de/~kianga/study/c/xmlparse.c
   9  *
  10  * xmlparse.c
  11  *
  12  * This program is free software; you can redistribute it and/or modify
  13  * it under the terms of the GNU General Public License version 2 as
  14  * published by the Free Software Foundation.
  15  *
  16  * This program is distributed in the hope that it will be useful,
  17  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  18  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  19  * GNU General Public License for more details.
  20  *
  21  * You should have received a copy of the GNU General Public License
  22  * along with this program; if not, write to the Free Software
  23  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
  24  *
  25  */
  26
  27
  28 #include <string.h>
  29
  30 #include "config.h"
  31 #include "xmlparse.h"
  32 #include "conversions.h"
  33
  34 #include "parsedate.h"
  35 #include "rdf_parsedate.h"
  36
   37 int saverestore;              /* Set by DeXML(): 1 if old items existed and were copied to firstcopy, else 0; read by parse_rdf10_item(). */
   38 struct newsitem *copy;        /* Node currently being built by DeXML() while it copies the old item list. */
   39 struct newsitem *firstcopy;   /* Head of the copied item list; searched by parse_rdf10_item(), freed by DeXML(). */
  40
  41 /* During the parsens one calls, if we meet a <channel> element.
  42  * The function returns a new Struct for the new feed. */
  43
  44 void parse_rdf10_channel(struct feed *feed, xmlDocPtr doc, xmlNodePtr node) {
  45         xmlNodePtr cur;
  46
  47         /* Free everything before we write to it again. */
  48         free (feed->title);
  49         free (feed->link);
  50
  51         if (feed->items != NULL) {
  52                 while (feed->items->next_ptr != NULL) {
  53                         feed->items = feed->items->next_ptr;
  54                         free (feed->items->prev_ptr->data->title);
  55                         free (feed->items->prev_ptr->data->link);
  56                         free (feed->items->prev_ptr->data->guid);
  57                         free (feed->items->prev_ptr->data->description);
  58                         free (feed->items->prev_ptr->data);
  59                         free (feed->items->prev_ptr);
  60                 }
  61                 free (feed->items->data->title);
  62                 free (feed->items->data->link);
  63                 free (feed->items->data->guid);
  64                 free (feed->items->data->description);
  65                 free (feed->items->data);
  66                 free (feed->items);
  67         }
  68
  69         /* At the moment we have still no Items, so set the list to null. */
  70         feed->items = NULL;
  71         feed->title = NULL;
  72         feed->link= NULL;
  73         feed->description = NULL;
  74
  75         /* Go through all <channel> tags and extract the information */
  76         for (cur = node; cur != NULL; cur = cur->next) {
  77                 if (cur->type != XML_ELEMENT_NODE)
  78                         continue;
  79                 if (xmlStrcmp(cur->name, "title") == 0) {
  80                         feed->title = xmlNodeListGetString(doc, cur->children, 1);
  81                         CleanupString (feed->title, 1);
  82                         /* Remove trailing newline */
  83                         if (feed->title != NULL) {
  84                                 if (strlen(feed->title) > 1) {
  85                                         if (feed->title[strlen(feed->title)-1] == '\n')
  86                                                 feed->title[strlen(feed->title)-1] = '\0';
  87                                 }
  88                         }
  89                 }
  90                 else if (xmlStrcmp(cur->name, "link") == 0) {
  91                         feed->link = xmlNodeListGetString(doc, cur->children, 1);
  92                         /* Remove trailing newline */
  93                         if (feed->link != NULL) {
  94                                 if (strlen(feed->link) > 1) {
  95                                         if (feed->link[strlen(feed->link)-1] == '\n')
  96                                                 feed->link[strlen(feed->link)-1] = '\0';
  97                                 }
  98                         }
  99                 }
 100                 else if (xmlStrcmp(cur->name, "description") == 0) {
 101                         feed->description = xmlNodeListGetString(doc, cur->children, 1);
 102                         CleanupString (feed->description, 0);
 103                 }
 104         }
 105 }
 106
 107
 108 void parse_rdf20_channel(struct feed *feed, xmlDocPtr doc, xmlNodePtr node)
 109 {
 110         xmlNodePtr cur;
 111
 112         /* Free everything before we write to it again. */
 113         free (feed->title);
 114         free (feed->link);
 115         free (feed->description);
 116
 117         if (feed->items != NULL) {
 118                 while (feed->items->next_ptr != NULL) {
 119                         feed->items = feed->items->next_ptr;
 120                         free (feed->items->prev_ptr->data->title);
 121                         free (feed->items->prev_ptr->data->link);
 122                         free (feed->items->prev_ptr->data->guid);
 123                         free (feed->items->prev_ptr->data->description);
 124                         free (feed->items->prev_ptr->data);
 125                         free (feed->items->prev_ptr);
 126                 }
 127                 free (feed->items->data->title);
 128                 free (feed->items->data->link);
 129                 free (feed->items->data->guid);
 130                 free (feed->items->data->description);
 131                 free (feed->items->data);
 132                 free (feed->items);
 133         }
 134
 135         /* At the moment we have still no Items, so set the list to NULL. */
 136         feed->items = NULL;
 137         feed->title = NULL;
 138         feed->link = NULL;
 139         feed->description = NULL;
 140
 141         /* Go through all tags in <channel> and extract the information. */
 142         for (cur = node; cur != NULL; cur = cur->next) {
 143                 if (cur->type != XML_ELEMENT_NODE)
 144                         continue;
 145                 if (xmlStrcmp(cur->name, "title") == 0) {
 146                         feed->title = xmlNodeListGetString(doc, cur->children, 1);
 147                         CleanupString (feed->title, 1);
 148                         /* Remove trailing newline */
 149                         if (feed->title != NULL) {
 150                                 if (strlen(feed->title) > 1) {
 151                                         if (feed->title[strlen(feed->title)-1] == '\n')
 152                                                 feed->title[strlen(feed->title)-1] = '\0';
 153                                 }
 154                         }
 155                 }
 156                 else if (xmlStrcmp(cur->name, "link") == 0) {
 157                         feed->link = xmlNodeListGetString(doc, cur->children, 1);
 158                         /* Remove trailing newline */
 159                         if (feed->link != NULL) {
 160                                 if (strlen(feed->link) > 1) {
 161                                         if (feed->link[strlen(feed->link)-1] == '\n')
 162                                                 feed->link[strlen(feed->link)-1] = '\0';
 163                                 }
 164                         }
 165                 }
 166                 else if (xmlStrcmp(cur->name, "description") == 0) {
 167                         feed->description = xmlNodeListGetString(doc, cur->children, 1);
 168                         CleanupString (feed->description, 0);
 169                 } else if (xmlStrcmp(cur->name, "item") == 0) {
 170                         parse_rdf10_item(feed, doc, cur->children);
 171                 }
 172         }
 173 }
 174
 175 /* This function is called each mark, if we meet on. As parameter it needs the
 176  * current new feed (new feed struct *), as well as the current XML
 177  * document-acts and the current element, both comes directly of libxml.
 178  */
 179
 180 void parse_rdf10_item(struct feed *feed, xmlDocPtr doc, xmlNodePtr node)
 181 {
 182         xmlNodePtr cur;
 183         xmlChar *readstatusstring;
 184
 185         struct newsitem *item;
 186         struct newsitem *current;
 187
 188         /* Speicher für ein neues Newsitem reservieren */
 189         item = malloc(sizeof (struct newsitem));
 190         item->data = malloc (sizeof (struct newsdata));
 191
 192         item->data->title = NULL;
 193         item->data->link = NULL;
 194         item->data->guid = NULL;
 195         item->data->description = NULL;
 196         item->data->date = 0L;
 197         item->data->readstatus = 0;
 198         item->data->parent = feed;
 199
 200         /* Alle Tags im <item> Tag durchgehen und die Informationen extrahieren.
 201            Selbe Vorgehensweise wie in der parse_channel() Funktion */
 202         for (cur = node; cur != NULL; cur = cur->next) {
 203                 if (cur->type != XML_ELEMENT_NODE)
 204                         continue;
 205                 if (xmlStrcmp(cur->name, "title") == 0) {
 206                         item->data->title = xmlNodeListGetString(doc, cur->children, 1);
 207                         CleanupString (item->data->title, 1);
 208                         /* Remove trailing newline */
 209                         if (item->data->title != NULL) {
 210                                 if (strlen(item->data->title) > 1) {
 211                                         if (item->data->title[strlen(item->data->title)-1] == '\n')
 212                                                 item->data->title[strlen(item->data->title)-1] = '\0';
 213                                 }
 214                         }
 215                 }
 216                 else if (xmlStrcmp(cur->name, "link") == 0) {
 217                         item->data->link = xmlNodeListGetString(doc, cur->children, 1);
 218                         if (item->data->link == NULL) {
 219                                 if (xmlStrcmp(cur->name, "guid") == 0)
 220                                         item->data->link = xmlNodeListGetString(doc, cur->children, 1);
 221                         }
 222                         /* Remove trailing newline */
 223                         if (item->data->link != NULL) {
 224                                 if (strlen(item->data->link) > 1) {
 225                                         if (item->data->link[strlen(item->data->link)-1] == '\n')
 226                                                 item->data->link[strlen(item->data->link)-1] = '\0';
 227                                 }
 228                         }
 229                 }
 230                 else if (xmlStrcmp(cur->name, "guid") == 0) {
 231                         item->data->guid = xmlNodeListGetString(doc, cur->children, 1);
 232                         if (item->data->guid == NULL) {
 233                                 if (xmlStrcmp(cur->name, "guid") == 0)
 234                                         item->data->guid = xmlNodeListGetString(doc, cur->children, 1);
 235                         }
 236                         /* Remove trailing newline */
 237                         if (item->data->guid != NULL) {
 238                                 if (strlen(item->data->guid) > 1) {
 239                                         if (item->data->guid[strlen(item->data->guid)-1] == '\n')
 240                                                 item->data->guid[strlen(item->data->guid)-1] = '\0';
 241                                 }
 242                         }
 243                 }
 244                 else if (xmlStrcmp(cur->name, "description") == 0) {
 245                         item->data->description = xmlNodeListGetString(doc, cur->children, 1);
 246                         CleanupString (item->data->description, 0);
 247                 }
 248                 /* pubDate will be in the form of: Thu, 15 Sep 2005 14:32:44 +0000 */
 249                 else if (xmlStrcmp(cur->name, "pubDate") == 0) {
 250                         item->data->date = parsedate(xmlNodeListGetString(doc, cur->children, 1));
 251                 }
 252                 /* RSS style date will be in the form of: 2005-09-17T06:18:00+00:00
 253                  * Only use it if no pubDate was already found.
 254                  */
 255                 else if (xmlStrcmp(cur->name, "date") == 0) {
 256                         if (item->data->date <= 0L) {
 257                                 item->data->date = rdf_parsedate(xmlNodeListGetString(doc, cur->children, 1));
 258                         }
 259                 }
 260                 else if (xmlStrcmp(cur->name, "readstatus") == 0) {
 261                         /* Will cause memory leak otherwise, xmlNodeListGetString must be freed. */
 262                         readstatusstring = xmlNodeListGetString(doc, cur->children, 1);
 263                         item->data->readstatus = atoi (readstatusstring);
 264                         xmlFree (readstatusstring);
 265                 }
 266         }
 267
 268         /* If saverestore == 1, restore readstatus. */
 269         if (saverestore == 1) {
 270                 for (current = firstcopy; current != NULL; current = current->next_ptr) {
 271                         if ((current->data->link != NULL) && (item->data->link != NULL)) {
 272                                 if ((current->data->title != NULL) && (item->data->title != NULL)) {
 273                                         if ((strcmp(item->data->link, current->data->link) == 0) &&
 274                                                 (strcmp(item->data->title, current->data->title) == 0))
 275                                                 item->data->readstatus = current->data->readstatus;
 276                                 } else {
 277                                         if (strcmp(item->data->link, current->data->link) == 0)
 278                                                 item->data->readstatus = current->data->readstatus;
 279                                 }
 280                         }
 281                 }
 282         }
 283
 284         item->next_ptr = NULL;
 285         if (feed->items == NULL) {
 286                 item->prev_ptr = NULL;
 287                 feed->items = item;
 288         } else {
 289                 item->prev_ptr = feed->items;
 290                 while (item->prev_ptr->next_ptr != NULL)
 291                         item->prev_ptr = item->prev_ptr->next_ptr;
 292                 item->prev_ptr->next_ptr = item;
 293         }
 294 }
 295
 296
 297 /* rrr */
 298
 299 int DeXML (struct feed *cur_ptr) {
 300         xmlDocPtr doc;
 301         xmlNodePtr cur;
 302         struct newsitem *cur_item;
 303
 304         if (cur_ptr->feed == NULL)
 305                 return -1;
 306
 307         saverestore = 0;
 308         /* Wenn cur_ptr->items != NULL dann können wir uns item->readstatus
 309            zwischenspeichern. */
 310         if (cur_ptr->items != NULL) {
 311                 saverestore = 1;
 312
 313                 firstcopy = NULL;
 314
 315                 /* Copy current newsitem struct. */
 316                 for (cur_item = cur_ptr->items; cur_item != NULL; cur_item = cur_item->next_ptr) {
 317                         copy = malloc (sizeof(struct newsitem));
 318                         copy->data = malloc (sizeof (struct newsdata));
 319                         copy->data->title = NULL;
 320                         copy->data->link = NULL;
 321                         copy->data->guid = NULL;
 322                         copy->data->description = NULL;
 323                         copy->data->date = 0L;
 324                         copy->data->readstatus = cur_item->data->readstatus;
 325                         if (cur_item->data->link != NULL)
 326                                 copy->data->link = strdup (cur_item->data->link);
 327                         if (cur_item->data->title != NULL)
 328                                 copy->data->title = strdup (cur_item->data->title);
 329
 330                         copy->next_ptr = NULL;
 331                         if (firstcopy == NULL) {
 332                                 copy->prev_ptr = NULL;
 333                                 firstcopy = copy;
 334                         } else {
 335                                 copy->prev_ptr = firstcopy;
 336                                 while (copy->prev_ptr->next_ptr != NULL)
 337                                         copy->prev_ptr = copy->prev_ptr->next_ptr;
 338                                 copy->prev_ptr->next_ptr = copy;
 339                         }
 340                 }
 341         }
 342
 343         /* xmlRecoverMemory:
 344            parse an XML in-memory document and build a tree.
 345        In case the document is not Well Formed, a tree is built anyway. */
 346         doc = xmlRecoverMemory(cur_ptr->feed, strlen(cur_ptr->feed));
 347
 348         if (doc == NULL)
 349                 return 2;
 350
 351         /* Das Root-Element finden (in unserem Fall sollte es "<RDF:RDF>" heißen.
 352            Dabei wird das RDF: Prefix fürs Erste ignoriert, bis der Jaguar
 353            herausfindet, wie man das genau auslesen kann (jau). */
 354         cur = xmlDocGetRootElement(doc);
 355
 356         if (cur == NULL) {
 357                 xmlFreeDoc (doc);
 358                 return 2;
 359         }
 360
 361         /* Überprüfen, ob das Element auch wirklich <RDF> heißt */
 362         if (xmlStrcmp(cur->name, "RDF") == 0) {
 363
 364                 /* Jetzt gehen wir alle Elemente im Dokument durch. Diese Schleife
 365                    selbst läuft jedoch nur durch die Elemente auf höchster Ebene
 366                    (bei HTML wären das nur HEAD und BODY), wandert also nicht die
 367                    gesamte Struktur nach unten durch. Dafür sind die Funktionen zuständig,
 368                    die wir dann in der Schleife selbst aufrufen. */
 369                 for (cur = cur->children; cur != NULL; cur = cur->next) {
 370                         if (cur->type != XML_ELEMENT_NODE)
 371                                 continue;
 372                         if (xmlStrcmp(cur->name, "channel") == 0)
 373                                 parse_rdf10_channel(cur_ptr, doc, cur->children);
 374                         if (xmlStrcmp(cur->name, "item") == 0)
 375                                 parse_rdf10_item(cur_ptr, doc, cur->children);
 376                         /* Last-Modified is only used when reading from internal feeds (disk cache). */
 377                         if (xmlStrcmp(cur->name, "lastmodified") == 0)
 378                                 cur_ptr->lastmodified = xmlNodeListGetString(doc, cur->children, 1);
 379                 }
 380         } else if (xmlStrcmp(cur->name, "rss") == 0) {
 381                 for (cur = cur->children; cur != NULL; cur = cur->next) {
 382                         if (cur->type != XML_ELEMENT_NODE)
 383                                 continue;
 384                         if (xmlStrcmp(cur->name, "channel") == 0)
 385                                 parse_rdf20_channel(cur_ptr, doc, cur->children);
 386                 }
 387         } else {
 388                 xmlFreeDoc(doc);
 389                 return 3;
 390         }
 391
 392         xmlFreeDoc(doc);
 393
 394         if (saverestore == 1) {
 395                 /* free struct newsitem *copy. */
 396                 while (firstcopy->next_ptr != NULL) {
 397                         firstcopy = firstcopy->next_ptr;
 398                         free (firstcopy->prev_ptr->data->link);
 399                         free (firstcopy->prev_ptr->data->guid);
 400                         free (firstcopy->prev_ptr->data->title);
 401                         free (firstcopy->prev_ptr->data);
 402                         free (firstcopy->prev_ptr);
 403                 }
 404                 free (firstcopy->data->link);
 405                 free (firstcopy->data->guid);
 406                 free (firstcopy->data->title);
 407                 free (firstcopy->data);
 408                 free (firstcopy);
 409         }
 410
 411         if (cur_ptr->original != NULL)
 412                 free (cur_ptr->original);
 413
 414         /* Set -> title to something if it's a NULL pointer to avoid crash with strdup below. */
 415         if (cur_ptr->title == NULL)
 416                 cur_ptr->title = strdup (cur_ptr->feedurl);
 417         cur_ptr->original = strdup (cur_ptr->title);
 418
 419         return 0;
 420 }