htmltitle.cpp
Go to the documentation of this file.
1 /***************************************************************************
2  * _ _ ____ _
3  * Project ___| | | | _ \| |
4  * / __| | | | |_) | |
5  * | (__| |_| | _ <| |___
6  * \___|\___/|_| \_\_____|
7  *
8  * Copyright (C) 1998 - 2017, Daniel Stenberg, <daniel@haxx.se>, et al.
9  *
10  * This software is licensed as described in the file COPYING, which
11  * you should have received as part of this distribution. The terms
12  * are also available at https://curl.haxx.se/docs/copyright.html.
13  *
14  * You may opt to use, copy, modify, merge, publish, distribute and/or sell
15  * copies of the Software, and permit persons to whom the Software is
16  * furnished to do so, under the terms of the COPYING file.
17  *
18  * This software is distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY
19  * KIND, either express or implied.
20  *
21  ***************************************************************************/
22 /* <DESC>
23  * Get a web page, extract the title with libxml.
24  * </DESC>
25 
26  Written by Lars Nilsson
27 
28  GNU C++ compile command line suggestion (edit paths accordingly):
29 
30  g++ -Wall -I/opt/curl/include -I/opt/libxml/include/libxml2 htmltitle.cpp \
31  -o htmltitle -L/opt/curl/lib -L/opt/libxml/lib -lcurl -lxml2
32 */
33 #include <stdio.h>
34 #include <string.h>
35 #include <stdlib.h>
36 #include <string>
37 #include <curl/curl.h>
38 #include <libxml/HTMLparser.h>
39 
//
// Case-insensitive string equality test
//
// COMPARE(a, b) is non-zero when the two C strings are equal ignoring
// letter case, and zero otherwise.
//

#if defined(_MSC_VER)
// Microsoft's runtime spells the case-insensitive compare _stricmp
#define COMPARE(a, b) (_stricmp((a), (b)) == 0)
#else
// everywhere else use strcasecmp
#define COMPARE(a, b) (strcasecmp((a), (b)) == 0)
#endif
49 
//
// libxml callback context structure
//
// Holds the state shared by the SAX callbacks while one document is parsed.
//

struct Context
{
  // Start with collection switched off so character data seen before any
  // <title> start tag is ignored instead of depending on an uninitialized
  // flag.
  Context() : addTitle(false) { }

  bool addTitle;      // true while character data should be added to title
  std::string title;  // text collected from the <title> element
};
61 
62 //
63 // libcurl variables for error strings and returned data
64 
67 
//
// libcurl write callback function
//
// Appends the size * nmemb bytes starting at data to *writerData and
// returns the number of bytes taken. Returns 0 when writerData is NULL.
//
// The return type is size_t because that is what libcurl's write callback
// prototype requires; the function is only ever invoked by libcurl through
// the pointer registered with CURLOPT_WRITEFUNCTION.
//

static size_t writer(char *data, size_t size, size_t nmemb,
                     std::string *writerData)
{
  const size_t total = size * nmemb;

  if(writerData == NULL)
    return 0;

  writerData->append(data, total);

  return total;
}
82 
83 //
84 // libcurl connection initialization
85 //
86 
87 static bool init(CURL *&conn, char *url)
88 {
89  CURLcode code;
90 
91  conn = curl_easy_init();
92 
93  if(conn == NULL) {
94  fprintf(stderr, "Failed to create CURL connection\n");
95  exit(EXIT_FAILURE);
96  }
97 
98  code = curl_easy_setopt(conn, CURLOPT_ERRORBUFFER, errorBuffer);
99  if(code != CURLE_OK) {
100  fprintf(stderr, "Failed to set error buffer [%d]\n", code);
101  return false;
102  }
103 
104  code = curl_easy_setopt(conn, CURLOPT_URL, url);
105  if(code != CURLE_OK) {
106  fprintf(stderr, "Failed to set URL [%s]\n", errorBuffer);
107  return false;
108  }
109 
110  code = curl_easy_setopt(conn, CURLOPT_FOLLOWLOCATION, 1L);
111  if(code != CURLE_OK) {
112  fprintf(stderr, "Failed to set redirect option [%s]\n", errorBuffer);
113  return false;
114  }
115 
116  code = curl_easy_setopt(conn, CURLOPT_WRITEFUNCTION, writer);
117  if(code != CURLE_OK) {
118  fprintf(stderr, "Failed to set writer [%s]\n", errorBuffer);
119  return false;
120  }
121 
122  code = curl_easy_setopt(conn, CURLOPT_WRITEDATA, &buffer);
123  if(code != CURLE_OK) {
124  fprintf(stderr, "Failed to set write data [%s]\n", errorBuffer);
125  return false;
126  }
127 
128  return true;
129 }
130 
131 //
132 // libxml start element callback function
133 //
134 
135 static void StartElement(void *voidContext,
136  const xmlChar *name,
137  const xmlChar **attributes)
138 {
139  Context *context = (Context *)voidContext;
140 
141  if(COMPARE((char *)name, "TITLE")) {
142  context->title = "";
143  context->addTitle = true;
144  }
145  (void) attributes;
146 }
147 
148 //
149 // libxml end element callback function
150 //
151 
152 static void EndElement(void *voidContext,
153  const xmlChar *name)
154 {
155  Context *context = (Context *)voidContext;
156 
157  if(COMPARE((char *)name, "TITLE"))
158  context->addTitle = false;
159 }
160 
161 //
162 // Text handling helper function
163 //
164 
165 static void handleCharacters(Context *context,
166  const xmlChar *chars,
167  int length)
168 {
169  if(context->addTitle)
170  context->title.append((char *)chars, length);
171 }
172 
173 //
174 // libxml PCDATA callback function
175 //
176 
177 static void Characters(void *voidContext,
178  const xmlChar *chars,
179  int length)
180 {
181  Context *context = (Context *)voidContext;
182 
183  handleCharacters(context, chars, length);
184 }
185 
186 //
187 // libxml CDATA callback function
188 //
189 
190 static void cdata(void *voidContext,
191  const xmlChar *chars,
192  int length)
193 {
194  Context *context = (Context *)voidContext;
195 
196  handleCharacters(context, chars, length);
197 }
198 
199 //
200 // libxml SAX callback structure
201 //
202 
203 static htmlSAXHandler saxHandler =
204 {
205  NULL,
206  NULL,
207  NULL,
208  NULL,
209  NULL,
210  NULL,
211  NULL,
212  NULL,
213  NULL,
214  NULL,
215  NULL,
216  NULL,
217  NULL,
218  NULL,
219  StartElement,
220  EndElement,
221  NULL,
222  Characters,
223  NULL,
224  NULL,
225  NULL,
226  NULL,
227  NULL,
228  NULL,
229  NULL,
230  cdata,
231  NULL
232 };
233 
234 //
235 // Parse given (assumed to be) HTML text and return the title
236 //
237 
238 static void parseHtml(const std::string &html,
240 {
241  htmlParserCtxtPtr ctxt;
242  Context context;
243 
244  ctxt = htmlCreatePushParserCtxt(&saxHandler, &context, "", 0, "",
245  XML_CHAR_ENCODING_NONE);
246 
247  htmlParseChunk(ctxt, html.c_str(), html.size(), 0);
248  htmlParseChunk(ctxt, "", 0, 1);
249 
250  htmlFreeParserCtxt(ctxt);
251 
252  title = context.title;
253 }
254 
255 int main(int argc, char *argv[])
256 {
257  CURL *conn = NULL;
258  CURLcode code;
260 
261  // Ensure one argument is given
262 
263  if(argc != 2) {
264  fprintf(stderr, "Usage: %s <url>\n", argv[0]);
265  exit(EXIT_FAILURE);
266  }
267 
269 
270  // Initialize CURL connection
271 
272  if(!init(conn, argv[1])) {
273  fprintf(stderr, "Connection initializion failed\n");
274  exit(EXIT_FAILURE);
275  }
276 
277  // Retrieve content for the URL
278 
279  code = curl_easy_perform(conn);
280  curl_easy_cleanup(conn);
281 
282  if(code != CURLE_OK) {
283  fprintf(stderr, "Failed to get '%s' [%s]\n", argv[1], errorBuffer);
284  exit(EXIT_FAILURE);
285  }
286 
287  // Parse the (assumed) HTML code
288  parseHtml(buffer, title);
289 
290  // Display the extracted title
291  printf("Title: %s\n", title.c_str());
292 
293  return EXIT_SUCCESS;
294 }
static int writer(char *data, size_t size, size_t nmemb, std::string *writerData)
Definition: htmltitle.cpp:72
static bool init(CURL *&conn, char *url)
Definition: htmltitle.cpp:87
#define false
bool addTitle
Definition: htmltitle.cpp:58
::std::string string
Definition: gtest-port.h:1129
static void StartElement(void *voidContext, const xmlChar *name, const xmlChar **attributes)
Definition: htmltitle.cpp:135
CURLcode
Definition: curl.h:454
int main(int argc, char *argv[])
Definition: htmltitle.cpp:255
static void handleCharacters(Context *context, const xmlChar *chars, int length)
Definition: htmltitle.cpp:165
#define curl_easy_setopt(handle, option, value)
Definition: typecheck-gcc.h:41
std::string title
Definition: htmltitle.cpp:59
static void cdata(void *voidContext, const xmlChar *chars, int length)
Definition: htmltitle.cpp:190
static std::string buffer
Definition: htmltitle.cpp:66
#define CURL_GLOBAL_DEFAULT
Definition: curl.h:2521
#define printf
Definition: curl_printf.h:40
CURL_EXTERN CURL * curl_easy_init(void)
Definition: easy.c:343
static void parseHtml(const std::string &html, std::string &title)
Definition: htmltitle.cpp:238
CURL_EXTERN void curl_easy_cleanup(CURL *curl)
static void EndElement(void *voidContext, const xmlChar *name)
Definition: htmltitle.cpp:152
Definition: curl.h:455
#define COMPARE(a, b)
Definition: htmltitle.cpp:47
CURL_EXTERN CURLcode curl_global_init(long flags)
curl_global_init() globally initializes curl given a bitwise set of the different features of what to...
Definition: easy.c:271
void CURL
Definition: curl.h:102
#define CURL_ERROR_SIZE
Definition: curl.h:724
static htmlSAXHandler saxHandler
Definition: htmltitle.cpp:203
static void Characters(void *voidContext, const xmlChar *chars, int length)
Definition: htmltitle.cpp:177
size_t size
Definition: unit1302.c:52
#define fprintf
Definition: curl_printf.h:41
static char errorBuffer[CURL_ERROR_SIZE]
Definition: htmltitle.cpp:65
const char * name
Definition: curl_sasl.c:54
Definition: debug.c:29
CURL_EXTERN CURLcode curl_easy_perform(CURL *curl)


rc_tagdetect_client
Author(s): Monika Florek-Jasinska , Raphael Schaller
autogenerated on Sat Feb 13 2021 03:42:15