rss2ctdl/xmlparse.c

   1 /*
   2  * $Id$
   3  *
   4  * Copyright 2003-2004 Rene Puls <rpuls@gmx.net> and
   5  *                     Oliver Feiler <kiza@kcore.de>
   6  *
   7  * http://kiza.kcore.de/software/snownews/
   8  * http://home.kcore.de/~kianga/study/c/xmlparse.c
   9  *
  10  * xmlparse.c
  11  *
  12  * This program is free software; you can redistribute it and/or modify
  13  * it under the terms of the GNU General Public License version 2 as
  14  * published by the Free Software Foundation.
  15  *
  16  * This program is distributed in the hope that it will be useful,
  17  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  18  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  19  * GNU General Public License for more details.
  20  *
  21  * You should have received a copy of the GNU General Public License
  22  * along with this program; if not, write to the Free Software
  23  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
  24  *
  25  */
  26
  27
  28 #include <string.h>
  29
  30 #include "config.h"
  31 #include "xmlparse.h"
  32 #include "conversions.h"
  33
  34 int saverestore;             /* 1 while DeXML() holds a snapshot of the previous items; parse_rdf10_item() then restores readstatus from it. */
  35 struct newsitem *copy;       /* Snapshot node most recently allocated by DeXML(). */
  36 struct newsitem *firstcopy;  /* Head of the snapshot list built by DeXML() and searched by parse_rdf10_item(). */
  37
  38 /* During the parsens one calls, if we meet a <channel> element.
  39  * The function returns a new Struct for the new feed. */
  40
  41 void parse_rdf10_channel(struct feed *feed, xmlDocPtr doc, xmlNodePtr node) {
  42         xmlNodePtr cur;
  43
  44         /* Free everything before we write to it again. */
  45         free (feed->title);
  46         free (feed->link);
  47         free (feed->description);
  48
  49         if (feed->items != NULL) {
  50                 while (feed->items->next_ptr != NULL) {
  51                         feed->items = feed->items->next_ptr;
  52                         free (feed->items->prev_ptr->data->title);
  53                         free (feed->items->prev_ptr->data->link);
  54                         free (feed->items->prev_ptr->data->guid);
  55                         free (feed->items->prev_ptr->data->description);
  56                         free (feed->items->prev_ptr->data);
  57                         free (feed->items->prev_ptr);
  58                 }
  59                 free (feed->items->data->title);
  60                 free (feed->items->data->link);
  61                 free (feed->items->data->guid);
  62                 free (feed->items->data->description);
  63                 free (feed->items->data);
  64                 free (feed->items);
  65         }
  66
  67         /* At the moment we have still no Items, so set the list to null. */
  68         feed->items = NULL;
  69         feed->title = NULL;
  70         feed->link= NULL;
  71         feed->description = NULL;
  72
  73         /* Go through all <channel> tags and extract the information */
  74         for (cur = node; cur != NULL; cur = cur->next) {
  75                 if (cur->type != XML_ELEMENT_NODE)
  76                         continue;
  77                 if (xmlStrcmp(cur->name, "title") == 0) {
  78                         feed->title = xmlNodeListGetString(doc, cur->children, 1);
  79                         CleanupString (feed->title, 1);
  80                         /* Remove trailing newline */
  81                         if (feed->title != NULL) {
  82                                 if (strlen(feed->title) > 1) {
  83                                         if (feed->title[strlen(feed->title)-1] == '\n')
  84                                                 feed->title[strlen(feed->title)-1] = '\0';
  85                                 }
  86                         }
  87                 }
  88                 else if (xmlStrcmp(cur->name, "link") == 0) {
  89                         feed->link = xmlNodeListGetString(doc, cur->children, 1);
  90                         /* Remove trailing newline */
  91                         if (feed->link != NULL) {
  92                                 if (strlen(feed->link) > 1) {
  93                                         if (feed->link[strlen(feed->link)-1] == '\n')
  94                                                 feed->link[strlen(feed->link)-1] = '\0';
  95                                 }
  96                         }
  97                 }
  98                 else if (xmlStrcmp(cur->name, "description") == 0) {
  99                         feed->description = xmlNodeListGetString(doc, cur->children, 1);
 100                         CleanupString (feed->description, 0);
 101                 }
 102         }
 103 }
 104
 105
 106 void parse_rdf20_channel(struct feed *feed, xmlDocPtr doc, xmlNodePtr node)
 107 {
 108         xmlNodePtr cur;
 109
 110         /* Free everything before we write to it again. */
 111         free (feed->title);
 112         free (feed->link);
 113         free (feed->description);
 114
 115         if (feed->items != NULL) {
 116                 while (feed->items->next_ptr != NULL) {
 117                         feed->items = feed->items->next_ptr;
 118                         free (feed->items->prev_ptr->data->title);
 119                         free (feed->items->prev_ptr->data->link);
 120                         free (feed->items->prev_ptr->data->guid);
 121                         free (feed->items->prev_ptr->data->description);
 122                         free (feed->items->prev_ptr->data);
 123                         free (feed->items->prev_ptr);
 124                 }
 125                 free (feed->items->data->title);
 126                 free (feed->items->data->link);
 127                 free (feed->items->data->guid);
 128                 free (feed->items->data->description);
 129                 free (feed->items->data);
 130                 free (feed->items);
 131         }
 132
 133         /* Im Augenblick haben wir noch keine Items, also die Liste auf NULL setzen. */
 134         feed->items = NULL;
 135         feed->title = NULL;
 136         feed->link = NULL;
 137         feed->description = NULL;
 138
 139         /* Alle Tags im <channel> Tag durchgehen und die Informationen extrahieren */
 140         for (cur = node; cur != NULL; cur = cur->next) {
 141                 if (cur->type != XML_ELEMENT_NODE)
 142                         continue;
 143                 if (xmlStrcmp(cur->name, "title") == 0) {
 144                         feed->title = xmlNodeListGetString(doc, cur->children, 1);
 145                         CleanupString (feed->title, 1);
 146                         /* Remove trailing newline */
 147                         if (feed->title != NULL) {
 148                                 if (strlen(feed->title) > 1) {
 149                                         if (feed->title[strlen(feed->title)-1] == '\n')
 150                                                 feed->title[strlen(feed->title)-1] = '\0';
 151                                 }
 152                         }
 153                 }
 154                 else if (xmlStrcmp(cur->name, "link") == 0) {
 155                         feed->link = xmlNodeListGetString(doc, cur->children, 1);
 156                         /* Remove trailing newline */
 157                         if (feed->link != NULL) {
 158                                 if (strlen(feed->link) > 1) {
 159                                         if (feed->link[strlen(feed->link)-1] == '\n')
 160                                                 feed->link[strlen(feed->link)-1] = '\0';
 161                                 }
 162                         }
 163                 }
 164                 else if (xmlStrcmp(cur->name, "description") == 0) {
 165                         feed->description = xmlNodeListGetString(doc, cur->children, 1);
 166                         CleanupString (feed->description, 0);
 167                 } else if (xmlStrcmp(cur->name, "item") == 0) {
 168                         parse_rdf10_item(feed, doc, cur->children);
 169                 }
 170         }
 171 }
 172
 173 /* This function is called each mark, if we meet on. As parameter it needs the
 174  * current new feed (new feed struct *), as well as the current XML
 175  * document-acts and the current element, both comes directly of libxml.
 176  */
 177
 178 void parse_rdf10_item(struct feed *feed, xmlDocPtr doc, xmlNodePtr node)
 179 {
 180         xmlNodePtr cur;
 181         xmlChar *readstatusstring;
 182
 183         struct newsitem *item;
 184         struct newsitem *current;
 185
 186         /* Speicher für ein neues Newsitem reservieren */
 187         item = malloc(sizeof (struct newsitem));
 188         item->data = malloc (sizeof (struct newsdata));
 189
 190         item->data->title = NULL;
 191         item->data->link = NULL;
 192         item->data->guid = NULL;
 193         item->data->description = NULL;
 194         item->data->readstatus = 0;
 195         item->data->parent = feed;
 196
 197         /* Alle Tags im <item> Tag durchgehen und die Informationen extrahieren.
 198            Selbe Vorgehensweise wie in der parse_channel() Funktion */
 199         for (cur = node; cur != NULL; cur = cur->next) {
 200                 if (cur->type != XML_ELEMENT_NODE)
 201                         continue;
 202                 if (xmlStrcmp(cur->name, "title") == 0) {
 203                         item->data->title = xmlNodeListGetString(doc, cur->children, 1);
 204                         CleanupString (item->data->title, 1);
 205                         /* Remove trailing newline */
 206                         if (item->data->title != NULL) {
 207                                 if (strlen(item->data->title) > 1) {
 208                                         if (item->data->title[strlen(item->data->title)-1] == '\n')
 209                                                 item->data->title[strlen(item->data->title)-1] = '\0';
 210                                 }
 211                         }
 212                 }
 213                 else if (xmlStrcmp(cur->name, "link") == 0) {
 214                         item->data->link = xmlNodeListGetString(doc, cur->children, 1);
 215                         if (item->data->link == NULL) {
 216                                 if (xmlStrcmp(cur->name, "guid") == 0)
 217                                         item->data->link = xmlNodeListGetString(doc, cur->children, 1);
 218                         }
 219                         /* Remove trailing newline */
 220                         if (item->data->link != NULL) {
 221                                 if (strlen(item->data->link) > 1) {
 222                                         if (item->data->link[strlen(item->data->link)-1] == '\n')
 223                                                 item->data->link[strlen(item->data->link)-1] = '\0';
 224                                 }
 225                         }
 226                 }
 227                 else if (xmlStrcmp(cur->name, "guid") == 0) {
 228                         item->data->guid = xmlNodeListGetString(doc, cur->children, 1);
 229                         if (item->data->guid == NULL) {
 230                                 if (xmlStrcmp(cur->name, "guid") == 0)
 231                                         item->data->guid = xmlNodeListGetString(doc, cur->children, 1);
 232                         }
 233                         /* Remove trailing newline */
 234                         if (item->data->guid != NULL) {
 235                                 if (strlen(item->data->guid) > 1) {
 236                                         if (item->data->guid[strlen(item->data->guid)-1] == '\n')
 237                                                 item->data->guid[strlen(item->data->guid)-1] = '\0';
 238                                 }
 239                         }
 240                 }
 241                 else if (xmlStrcmp(cur->name, "description") == 0) {
 242                         item->data->description = xmlNodeListGetString(doc, cur->children, 1);
 243                         CleanupString (item->data->description, 0);
 244                 }
 245                 else if (xmlStrcmp(cur->name, "readstatus") == 0) {
 246                         /* Will cause memory leak otherwise, xmlNodeListGetString must be freed. */
 247                         readstatusstring = xmlNodeListGetString(doc, cur->children, 1);
 248                         item->data->readstatus = atoi (readstatusstring);
 249                         xmlFree (readstatusstring);
 250                 }
 251         }
 252
 253         /* If saverestore == 1, restore readstatus. */
 254         if (saverestore == 1) {
 255                 for (current = firstcopy; current != NULL; current = current->next_ptr) {
 256                         if ((current->data->link != NULL) && (item->data->link != NULL)) {
 257                                 if ((current->data->title != NULL) && (item->data->title != NULL)) {
 258                                         if ((strcmp(item->data->link, current->data->link) == 0) &&
 259                                                 (strcmp(item->data->title, current->data->title) == 0))
 260                                                 item->data->readstatus = current->data->readstatus;
 261                                 } else {
 262                                         if (strcmp(item->data->link, current->data->link) == 0)
 263                                                 item->data->readstatus = current->data->readstatus;
 264                                 }
 265                         }
 266                 }
 267         }
 268
 269         item->next_ptr = NULL;
 270         if (feed->items == NULL) {
 271                 item->prev_ptr = NULL;
 272                 feed->items = item;
 273         } else {
 274                 item->prev_ptr = feed->items;
 275                 while (item->prev_ptr->next_ptr != NULL)
 276                         item->prev_ptr = item->prev_ptr->next_ptr;
 277                 item->prev_ptr->next_ptr = item;
 278         }
 279 }
 280
 281
 282 /* rrr */
 283
 284 int DeXML (struct feed *cur_ptr) {
 285         xmlDocPtr doc;
 286         xmlNodePtr cur;
 287         struct newsitem *cur_item;
 288
 289         if (cur_ptr->feed == NULL)
 290                 return -1;
 291
 292         saverestore = 0;
 293         /* Wenn cur_ptr->items != NULL dann können wir uns item->readstatus
 294            zwischenspeichern. */
 295         if (cur_ptr->items != NULL) {
 296                 saverestore = 1;
 297
 298                 firstcopy = NULL;
 299
 300                 /* Copy current newsitem struct. */
 301                 for (cur_item = cur_ptr->items; cur_item != NULL; cur_item = cur_item->next_ptr) {
 302                         copy = malloc (sizeof(struct newsitem));
 303                         copy->data = malloc (sizeof (struct newsdata));
 304                         copy->data->title = NULL;
 305                         copy->data->link = NULL;
 306                         copy->data->guid = NULL;
 307                         copy->data->description = NULL;
 308                         copy->data->readstatus = cur_item->data->readstatus;
 309                         if (cur_item->data->link != NULL)
 310                                 copy->data->link = strdup (cur_item->data->link);
 311                         if (cur_item->data->title != NULL)
 312                                 copy->data->title = strdup (cur_item->data->title);
 313
 314                         copy->next_ptr = NULL;
 315                         if (firstcopy == NULL) {
 316                                 copy->prev_ptr = NULL;
 317                                 firstcopy = copy;
 318                         } else {
 319                                 copy->prev_ptr = firstcopy;
 320                                 while (copy->prev_ptr->next_ptr != NULL)
 321                                         copy->prev_ptr = copy->prev_ptr->next_ptr;
 322                                 copy->prev_ptr->next_ptr = copy;
 323                         }
 324                 }
 325         }
 326
 327         /* xmlRecoverMemory:
 328            parse an XML in-memory document and build a tree.
 329        In case the document is not Well Formed, a tree is built anyway. */
 330         doc = xmlRecoverMemory(cur_ptr->feed, strlen(cur_ptr->feed));
 331
 332         if (doc == NULL)
 333                 return 2;
 334
 335         /* Das Root-Element finden (in unserem Fall sollte es "<RDF:RDF>" heißen.
 336            Dabei wird das RDF: Prefix fürs Erste ignoriert, bis der Jaguar
 337            herausfindet, wie man das genau auslesen kann (jau). */
 338         cur = xmlDocGetRootElement(doc);
 339
 340         if (cur == NULL) {
 341                 xmlFreeDoc (doc);
 342                 return 2;
 343         }
 344
 345         /* Überprüfen, ob das Element auch wirklich <RDF> heißt */
 346         if (xmlStrcmp(cur->name, "RDF") == 0) {
 347
 348                 /* Jetzt gehen wir alle Elemente im Dokument durch. Diese Schleife
 349                    selbst läuft jedoch nur durch die Elemente auf höchster Ebene
 350                    (bei HTML wären das nur HEAD und BODY), wandert also nicht die
 351                    gesamte Struktur nach unten durch. Dafür sind die Funktionen zuständig,
 352                    die wir dann in der Schleife selbst aufrufen. */
 353                 for (cur = cur->children; cur != NULL; cur = cur->next) {
 354                         if (cur->type != XML_ELEMENT_NODE)
 355                                 continue;
 356                         if (xmlStrcmp(cur->name, "channel") == 0)
 357                                 parse_rdf10_channel(cur_ptr, doc, cur->children);
 358                         if (xmlStrcmp(cur->name, "item") == 0)
 359                                 parse_rdf10_item(cur_ptr, doc, cur->children);
 360                         /* Last-Modified is only used when reading from internal feeds (disk cache). */
 361                         if (xmlStrcmp(cur->name, "lastmodified") == 0)
 362                                 cur_ptr->lastmodified = xmlNodeListGetString(doc, cur->children, 1);
 363                 }
 364         } else if (xmlStrcmp(cur->name, "rss") == 0) {
 365                 for (cur = cur->children; cur != NULL; cur = cur->next) {
 366                         if (cur->type != XML_ELEMENT_NODE)
 367                                 continue;
 368                         if (xmlStrcmp(cur->name, "channel") == 0)
 369                                 parse_rdf20_channel(cur_ptr, doc, cur->children);
 370                 }
 371         } else {
 372                 xmlFreeDoc(doc);
 373                 return 3;
 374         }
 375
 376         xmlFreeDoc(doc);
 377
 378         if (saverestore == 1) {
 379                 /* free struct newsitem *copy. */
 380                 while (firstcopy->next_ptr != NULL) {
 381                         firstcopy = firstcopy->next_ptr;
 382                         free (firstcopy->prev_ptr->data->link);
 383                         free (firstcopy->prev_ptr->data->guid);
 384                         free (firstcopy->prev_ptr->data->title);
 385                         free (firstcopy->prev_ptr->data);
 386                         free (firstcopy->prev_ptr);
 387                 }
 388                 free (firstcopy->data->link);
 389                 free (firstcopy->data->guid);
 390                 free (firstcopy->data->title);
 391                 free (firstcopy->data);
 392                 free (firstcopy);
 393         }
 394
 395         if (cur_ptr->original != NULL)
 396                 free (cur_ptr->original);
 397
 398         /* Set -> title to something if it's a NULL pointer to avoid crash with strdup below. */
 399         if (cur_ptr->title == NULL)
 400                 cur_ptr->title = strdup (cur_ptr->feedurl);
 401         cur_ptr->original = strdup (cur_ptr->title);
 402
 403         return 0;
 404 }