htmltitle.cpp
Go to the documentation of this file.
00001 /***************************************************************************
00002  *                                  _   _ ____  _
00003  *  Project                     ___| | | |  _ \| |
00004  *                             / __| | | | |_) | |
00005  *                            | (__| |_| |  _ <| |___
00006  *                             \___|\___/|_| \_\_____|
00007  *
00008  * Copyright (C) 1998 - 2015, Daniel Stenberg, <daniel@haxx.se>, et al.
00009  *
00010  * This software is licensed as described in the file COPYING, which
00011  * you should have received as part of this distribution. The terms
00012  * are also available at https://curl.haxx.se/docs/copyright.html.
00013  *
00014  * You may opt to use, copy, modify, merge, publish, distribute and/or sell
00015  * copies of the Software, and permit persons to whom the Software is
00016  * furnished to do so, under the terms of the COPYING file.
00017  *
00018  * This software is distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY
00019  * KIND, either express or implied.
00020  *
00021  ***************************************************************************/
00022 /* <DESC>
00023  * Get a web page, extract the title with libxml.
00024  * </DESC>
00025  */
00026 // Written by Lars Nilsson
00027 //
00028 // GNU C++ compile command line suggestion (edit paths accordingly):
00029 //
00030 // g++ -Wall -I/opt/curl/include -I/opt/libxml/include/libxml2 htmltitle.cpp \
00031 // -o htmltitle -L/opt/curl/lib -L/opt/libxml/lib -lcurl -lxml2
00032 
#include <limits.h>
#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#include <string>
#include <curl/curl.h>
#include <libxml/HTMLparser.h>
00039 
//
//  Case-insensitive string comparison
//
//  COMPARE(a, b) is true when the two NUL-terminated strings are equal
//  ignoring case. MSVC builds use _stricmp; every other toolchain uses
//  strcasecmp.
//

#if !defined(_MSC_VER)
#define COMPARE(a, b) (strcasecmp((a), (b)) == 0)
#else
#define COMPARE(a, b) (_stricmp((a), (b)) == 0)
#endif
00049 
//
//  libxml callback context structure
//
//  State shared by the SAX callbacks: a flag telling the character
//  callbacks whether to collect text, and the text collected so far.
//

struct Context
{
  bool addTitle;      // true while character data should be appended to title
  std::string title;  // title text gathered so far

  // Start with collection switched off and an empty title.
  Context()
    : addTitle(false),
      title()
  {
  }
};
00061 
00062 //
00063 //  libcurl variables for error strings and returned data
00064 
00065 static char errorBuffer[CURL_ERROR_SIZE];
00066 static std::string buffer;
00067 
//
//  libcurl write callback function
//
//  Appends the size * nmemb bytes starting at data to *writerData and
//  returns the number of bytes appended. Returns 0 without touching
//  anything when writerData is NULL.
//
//  The return type is size_t to match the prototype libcurl documents for
//  CURLOPT_WRITEFUNCTION; an int return would not match that prototype and
//  could not represent large byte counts.
//

static size_t writer(char *data, size_t size, size_t nmemb,
                     std::string *writerData)
{
  if (writerData == NULL)
    return 0;

  const size_t total = size * nmemb;

  writerData->append(data, total);

  return total;
}
00082 
00083 //
00084 //  libcurl connection initialization
00085 //
00086 
00087 static bool init(CURL *&conn, char *url)
00088 {
00089   CURLcode code;
00090 
00091   conn = curl_easy_init();
00092 
00093   if (conn == NULL)
00094   {
00095     fprintf(stderr, "Failed to create CURL connection\n");
00096 
00097     exit(EXIT_FAILURE);
00098   }
00099 
00100   code = curl_easy_setopt(conn, CURLOPT_ERRORBUFFER, errorBuffer);
00101   if (code != CURLE_OK)
00102   {
00103     fprintf(stderr, "Failed to set error buffer [%d]\n", code);
00104 
00105     return false;
00106   }
00107 
00108   code = curl_easy_setopt(conn, CURLOPT_URL, url);
00109   if (code != CURLE_OK)
00110   {
00111     fprintf(stderr, "Failed to set URL [%s]\n", errorBuffer);
00112 
00113     return false;
00114   }
00115 
00116   code = curl_easy_setopt(conn, CURLOPT_FOLLOWLOCATION, 1L);
00117   if (code != CURLE_OK)
00118   {
00119     fprintf(stderr, "Failed to set redirect option [%s]\n", errorBuffer);
00120 
00121     return false;
00122   }
00123 
00124   code = curl_easy_setopt(conn, CURLOPT_WRITEFUNCTION, writer);
00125   if (code != CURLE_OK)
00126   {
00127     fprintf(stderr, "Failed to set writer [%s]\n", errorBuffer);
00128 
00129     return false;
00130   }
00131 
00132   code = curl_easy_setopt(conn, CURLOPT_WRITEDATA, &buffer);
00133   if (code != CURLE_OK)
00134   {
00135     fprintf(stderr, "Failed to set write data [%s]\n", errorBuffer);
00136 
00137     return false;
00138   }
00139 
00140   return true;
00141 }
00142 
00143 //
00144 //  libxml start element callback function
00145 //
00146 
00147 static void StartElement(void *voidContext,
00148                          const xmlChar *name,
00149                          const xmlChar **attributes)
00150 {
00151   Context *context = (Context *)voidContext;
00152 
00153   if (COMPARE((char *)name, "TITLE"))
00154   {
00155     context->title = "";
00156     context->addTitle = true;
00157   }
00158   (void) attributes;
00159 }
00160 
00161 //
00162 //  libxml end element callback function
00163 //
00164 
00165 static void EndElement(void *voidContext,
00166                        const xmlChar *name)
00167 {
00168   Context *context = (Context *)voidContext;
00169 
00170   if (COMPARE((char *)name, "TITLE"))
00171     context->addTitle = false;
00172 }
00173 
00174 //
00175 //  Text handling helper function
00176 //
00177 
00178 static void handleCharacters(Context *context,
00179                              const xmlChar *chars,
00180                              int length)
00181 {
00182   if (context->addTitle)
00183     context->title.append((char *)chars, length);
00184 }
00185 
00186 //
00187 //  libxml PCDATA callback function
00188 //
00189 
00190 static void Characters(void *voidContext,
00191                        const xmlChar *chars,
00192                        int length)
00193 {
00194   Context *context = (Context *)voidContext;
00195 
00196   handleCharacters(context, chars, length);
00197 }
00198 
00199 //
00200 //  libxml CDATA callback function
00201 //
00202 
00203 static void cdata(void *voidContext,
00204                   const xmlChar *chars,
00205                   int length)
00206 {
00207   Context *context = (Context *)voidContext;
00208 
00209   handleCharacters(context, chars, length);
00210 }
00211 
00212 //
00213 //  libxml SAX callback structure
00214 //
00215 
00216 static htmlSAXHandler saxHandler =
00217 {
00218   NULL,
00219   NULL,
00220   NULL,
00221   NULL,
00222   NULL,
00223   NULL,
00224   NULL,
00225   NULL,
00226   NULL,
00227   NULL,
00228   NULL,
00229   NULL,
00230   NULL,
00231   NULL,
00232   StartElement,
00233   EndElement,
00234   NULL,
00235   Characters,
00236   NULL,
00237   NULL,
00238   NULL,
00239   NULL,
00240   NULL,
00241   NULL,
00242   NULL,
00243   cdata,
00244   NULL
00245 };
00246 
00247 //
00248 //  Parse given (assumed to be) HTML text and return the title
00249 //
00250 
00251 static void parseHtml(const std::string &html,
00252                       std::string &title)
00253 {
00254   htmlParserCtxtPtr ctxt;
00255   Context context;
00256 
00257   ctxt = htmlCreatePushParserCtxt(&saxHandler, &context, "", 0, "",
00258                                   XML_CHAR_ENCODING_NONE);
00259 
00260   htmlParseChunk(ctxt, html.c_str(), html.size(), 0);
00261   htmlParseChunk(ctxt, "", 0, 1);
00262 
00263   htmlFreeParserCtxt(ctxt);
00264 
00265   title = context.title;
00266 }
00267 
00268 int main(int argc, char *argv[])
00269 {
00270   CURL *conn = NULL;
00271   CURLcode code;
00272   std::string title;
00273 
00274   // Ensure one argument is given
00275 
00276   if (argc != 2)
00277   {
00278     fprintf(stderr, "Usage: %s <url>\n", argv[0]);
00279 
00280     exit(EXIT_FAILURE);
00281   }
00282 
00283   curl_global_init(CURL_GLOBAL_DEFAULT);
00284 
00285   // Initialize CURL connection
00286 
00287   if (!init(conn, argv[1]))
00288   {
00289     fprintf(stderr, "Connection initializion failed\n");
00290 
00291     exit(EXIT_FAILURE);
00292   }
00293 
00294   // Retrieve content for the URL
00295 
00296   code = curl_easy_perform(conn);
00297   curl_easy_cleanup(conn);
00298 
00299   if (code != CURLE_OK)
00300   {
00301     fprintf(stderr, "Failed to get '%s' [%s]\n", argv[1], errorBuffer);
00302 
00303     exit(EXIT_FAILURE);
00304   }
00305 
00306   // Parse the (assumed) HTML code
00307 
00308   parseHtml(buffer, title);
00309 
00310   // Display the extracted title
00311 
00312   printf("Title: %s\n", title.c_str());
00313 
00314   return EXIT_SUCCESS;
00315 }


rc_visard_driver
Author(s): Heiko Hirschmueller , Christian Emmerich , Felix Ruess
autogenerated on Thu Jun 6 2019 20:43:04