htmltidy.c
Go to the documentation of this file.
1 /***************************************************************************
2  * _ _ ____ _
3  * Project ___| | | | _ \| |
4  * / __| | | | |_) | |
5  * | (__| |_| | _ <| |___
6  * \___|\___/|_| \_\_____|
7  *
8  * Copyright (C) 1998 - 2017, Daniel Stenberg, <daniel@haxx.se>, et al.
9  *
10  * This software is licensed as described in the file COPYING, which
11  * you should have received as part of this distribution. The terms
12  * are also available at https://curl.haxx.se/docs/copyright.html.
13  *
14  * You may opt to use, copy, modify, merge, publish, distribute and/or sell
15  * copies of the Software, and permit persons to whom the Software is
16  * furnished to do so, under the terms of the COPYING file.
17  *
18  * This software is distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY
19  * KIND, either express or implied.
20  *
21  ***************************************************************************/
22 /* <DESC>
23  * Download a document and use libtidy to parse the HTML.
24  * </DESC>
25  */
26 /*
27  * LibTidy => http://tidy.sourceforge.net
28  */
29 
30 #include <stdio.h>
31 #include <tidy/tidy.h>
32 #include <tidy/buffio.h>
33 #include <curl/curl.h>
34 
35 /* curl write callback, to fill tidy's input buffer... */
36 uint write_cb(char *in, uint size, uint nmemb, TidyBuffer *out)
37 {
38  uint r;
39  r = size * nmemb;
40  tidyBufAppend(out, in, r);
41  return r;
42 }
43 
44 /* Traverse the document tree */
45 void dumpNode(TidyDoc doc, TidyNode tnod, int indent)
46 {
47  TidyNode child;
48  for(child = tidyGetChild(tnod); child; child = tidyGetNext(child) ) {
49  ctmbstr name = tidyNodeGetName(child);
50  if(name) {
51  /* if it has a name, then it's an HTML tag ... */
52  TidyAttr attr;
53  printf("%*.*s%s ", indent, indent, "<", name);
54  /* walk the attribute list */
55  for(attr = tidyAttrFirst(child); attr; attr = tidyAttrNext(attr) ) {
56  printf(tidyAttrName(attr));
57  tidyAttrValue(attr)?printf("=\"%s\" ",
58  tidyAttrValue(attr)):printf(" ");
59  }
60  printf(">\n");
61  }
62  else {
63  /* if it doesn't have a name, then it's probably text, cdata, etc... */
64  TidyBuffer buf;
65  tidyBufInit(&buf);
66  tidyNodeGetText(doc, child, &buf);
67  printf("%*.*s\n", indent, indent, buf.bp?(char *)buf.bp:"");
68  tidyBufFree(&buf);
69  }
70  dumpNode(doc, child, indent + 4); /* recursive */
71  }
72 }
73 
74 
75 int main(int argc, char **argv)
76 {
77  CURL *curl;
78  char curl_errbuf[CURL_ERROR_SIZE];
79  TidyDoc tdoc;
80  TidyBuffer docbuf = {0};
81  TidyBuffer tidy_errbuf = {0};
82  int err;
83  if(argc == 2) {
84  curl = curl_easy_init();
85  curl_easy_setopt(curl, CURLOPT_URL, argv[1]);
86  curl_easy_setopt(curl, CURLOPT_ERRORBUFFER, curl_errbuf);
87  curl_easy_setopt(curl, CURLOPT_NOPROGRESS, 0L);
88  curl_easy_setopt(curl, CURLOPT_VERBOSE, 1L);
89  curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, write_cb);
90 
91  tdoc = tidyCreate();
92  tidyOptSetBool(tdoc, TidyForceOutput, yes); /* try harder */
93  tidyOptSetInt(tdoc, TidyWrapLen, 4096);
94  tidySetErrorBuffer(tdoc, &tidy_errbuf);
95  tidyBufInit(&docbuf);
96 
97  curl_easy_setopt(curl, CURLOPT_WRITEDATA, &docbuf);
98  err = curl_easy_perform(curl);
99  if(!err) {
100  err = tidyParseBuffer(tdoc, &docbuf); /* parse the input */
101  if(err >= 0) {
102  err = tidyCleanAndRepair(tdoc); /* fix any problems */
103  if(err >= 0) {
104  err = tidyRunDiagnostics(tdoc); /* load tidy error buffer */
105  if(err >= 0) {
106  dumpNode(tdoc, tidyGetRoot(tdoc), 0); /* walk the tree */
107  fprintf(stderr, "%s\n", tidy_errbuf.bp); /* show errors */
108  }
109  }
110  }
111  }
112  else
113  fprintf(stderr, "%s\n", curl_errbuf);
114 
115  /* clean-up */
116  curl_easy_cleanup(curl);
117  tidyBufFree(&docbuf);
118  tidyBufFree(&tidy_errbuf);
119  tidyRelease(tdoc);
120  return err;
121 
122  }
123  else
124  printf("usage: %s <url>\n", argv[0]);
125 
126  return 0;
127 }
void dumpNode(TidyDoc doc, TidyNode tnod, int indent)
Definition: htmltidy.c:45
#define curl_easy_setopt(handle, option, value)
Definition: typecheck-gcc.h:41
int main(int argc, char **argv)
Definition: htmltidy.c:75
uint write_cb(char *in, uint size, uint nmemb, TidyBuffer *out)
Definition: htmltidy.c:36
#define printf
Definition: curl_printf.h:40
CURL_EXTERN CURL * curl_easy_init(void)
Definition: easy.c:343
CURL_EXTERN void curl_easy_cleanup(CURL *curl)
char buf[3]
Definition: unit1398.c:32
void CURL
Definition: curl.h:102
#define CURL_ERROR_SIZE
Definition: curl.h:724
size_t size
Definition: unit1302.c:52
#define fprintf
Definition: curl_printf.h:41
const char * name
Definition: curl_sasl.c:54
static CURL * curl
Definition: sessioninfo.c:35
CURL_EXTERN CURLcode curl_easy_perform(CURL *curl)


rc_tagdetect_client
Author(s): Monika Florek-Jasinska, Raphael Schaller
autogenerated on Sat Feb 13 2021 03:42:15