HTMLparser.h

00001 /*
00002 
00003  * Summary: interface for an HTML 4.0 non-verifying parser
00004 
00005  * Description: this module implements an HTML 4.0 non-verifying parser
00006 
00007  *              with an API compatible with the XML parser's. It should
00008 
00009  *              be able to parse "real world" HTML, even if severely
00010 
00011  *              broken from a specification point of view.
00012 
00013  *
00014 
00015  * Copy: See Copyright for the status of this software.
00016 
00017  *
00018 
00019  * Author: Daniel Veillard
00020 
00021  */
00022 
00023 
00024 
00025 #ifndef __HTML_PARSER_H__
00026 
00027 #define __HTML_PARSER_H__
00028 
00029 #include <libxml/xmlversion.h>
00030 
00031 #include <libxml/parser.h>
00032 
00033 
00034 
00035 #ifdef LIBXML_HTML_ENABLED
00036 
00037 
00038 
00039 #ifdef __cplusplus
00040 
00041 extern "C" {
00042 
00043 #endif
00044 
00045 
00046 
00047 /*
00048 
00049  * Most of the back-end structures from XML and HTML are shared.
00050 
00051  */
00052 
/*
 * Each html* name below is a plain typedef alias of the xml* type written
 * on the same line, so the two spellings denote the same type and can be
 * used interchangeably. The xml* types are not declared in this header;
 * presumably they come from <libxml/parser.h> and the headers it pulls
 * in -- confirm there.
 */
00053 typedef xmlParserCtxt htmlParserCtxt;
00054 
00055 typedef xmlParserCtxtPtr htmlParserCtxtPtr;
00056 
00057 typedef xmlParserNodeInfo htmlParserNodeInfo;
00058 
00059 typedef xmlSAXHandler htmlSAXHandler;
00060 
00061 typedef xmlSAXHandlerPtr htmlSAXHandlerPtr;
00062 
00063 typedef xmlParserInput htmlParserInput;
00064 
00065 typedef xmlParserInputPtr htmlParserInputPtr;
00066 
00067 typedef xmlDocPtr htmlDocPtr;
00068 
00069 typedef xmlNodePtr htmlNodePtr;
00070 
00071 
00072 
00073 /*
00074 
00075  * Internal description of an HTML element, representing HTML 4.01
00076 
00077  * and XHTML 1.0 (which share the same structure).
00078 
00079  */
00080 
00081 typedef struct _htmlElemDesc htmlElemDesc;
00082 
00083 typedef htmlElemDesc *htmlElemDescPtr;
00084 
/*
 * Layout: the tag name, seven single-char flags, a description string,
 * then (added Jan. 2003) the content model and attribute lists.
 * All string members are pointers to const char, so a descriptor does not
 * grant write access to the text it refers to.
 */
00085 struct _htmlElemDesc {
00086 
00087     const char *name;   /* The tag name */
00088 
00089     char startTag;      /* Whether the start tag can be implied */
00090 
00091     char endTag;        /* Whether the end tag can be implied */
00092 
00093     char saveEndTag;    /* Whether the end tag should be saved */
00094 
00095     char empty;         /* Is this an empty element? */
00096 
00097     char depr;          /* Is this a deprecated element? */
00098 
00099     char dtd;           /* 1: only in the Loose DTD, 2: only in the Frameset DTD
                           (presumably 0: in every DTD -- TODO confirm) */
00100 
00101     char isinline;      /* 0: block element, 1: inline element */
00102 
00103     const char *desc;   /* the description */
00104 
00105 
00106 
00107 /* NRK Jan.2003
00108 
00109  * New fields encapsulating HTML structure
00110 
00111  *
00112 
00113  * Bugs:
00114 
00115  *      This is a very limited representation.  It fails to tell us when
00116 
00117  *      an element *requires* subelements (we only have whether they're
00118 
00119  *      allowed or not), and it doesn't tell us where CDATA and PCDATA
00120 
00121  *      are allowed.  Some element relationships are not fully represented:
00122 
00123  *      these are flagged with the word MODIFIER
00124 
00125  */
00126 
    /* NOTE(review): the const char** members below are presumably
       NULL-terminated arrays of names; this header does not say how their
       end is marked -- confirm against the tables in the implementation. */
00127     const char** subelts;               /* allowed sub-elements of this element */
00128 
00129     const char* defaultsubelt;  /* subelement for suggested auto-repair
00130 
00131                                            if necessary or NULL */
00132 
00133     const char** attrs_opt;             /* Optional Attributes */
00134 
00135     const char** attrs_depr;            /* Additional deprecated attributes */
00136 
00137     const char** attrs_req;             /* Required attributes */
00138 
00139 };
00140 
00141 
00142 
00143 /*
00144 
00145  * Internal description of an HTML entity.
00146 
00147  */
00148 
00149 typedef struct _htmlEntityDesc htmlEntityDesc;
00150 
00151 typedef htmlEntityDesc *htmlEntityDescPtr;
00152 
/*
 * Ties an entity's numeric character value to its name and a
 * human-readable description. Both strings are pointers to const char.
 */
00153 struct _htmlEntityDesc {
00154 
00155     unsigned int value; /* the Unicode value for the character */
00156 
00157     const char *name;   /* The entity name */
00158 
00159     const char *desc;   /* the description */
00160 
00161 };
00162 
00163 
00164 
00165 /*
00166 
00167  * There are only a few public functions.
00168 
00169  */
00170 
/*
 * Descriptor lookups: by tag name, by entity name, and by numeric
 * character value. All three return a pointer to a const descriptor, so
 * the caller may read the result but not modify it.
 * NOTE(review): the not-found result (presumably NULL) and the case
 * sensitivity of the name lookups are not visible from this header --
 * confirm in the implementation.
 */
00171 XMLPUBFUN const htmlElemDesc * XMLCALL  
00172 
00173                         htmlTagLookup   (const xmlChar *tag);
00174 
00175 XMLPUBFUN const htmlEntityDesc * XMLCALL        
00176 
00177                         htmlEntityLookup(const xmlChar *name);
00178 
00179 XMLPUBFUN const htmlEntityDesc * XMLCALL        
00180 
00181                         htmlEntityValueLookup(unsigned int value);
00182 
00183 
00184 
/*
 * Two auto-close queries that take a document and a node, followed by
 * three lower-level parsing entry points that take an existing parser
 * context.
 * htmlParseEntityRef() returns a pointer to a const entity descriptor and
 * takes `str` as a pointer to a `const xmlChar *`.
 * NOTE(review): `str` is presumably an out-parameter, and the meaning of
 * the int results is not stated in this header -- confirm both in the
 * implementation.
 */
00185 XMLPUBFUN int XMLCALL                   
00186 
00187                         htmlIsAutoClosed(htmlDocPtr doc,
00188 
00189                                          htmlNodePtr elem);
00190 
00191 XMLPUBFUN int XMLCALL                   
00192 
00193                         htmlAutoCloseTag(htmlDocPtr doc,
00194 
00195                                          const xmlChar *name,
00196 
00197                                          htmlNodePtr elem);
00198 
00199 XMLPUBFUN const htmlEntityDesc * XMLCALL        
00200 
00201                         htmlParseEntityRef(htmlParserCtxtPtr ctxt,
00202 
00203                                          const xmlChar **str);
00204 
00205 XMLPUBFUN int XMLCALL                   
00206 
00207                         htmlParseCharRef(htmlParserCtxtPtr ctxt);
00208 
00209 XMLPUBFUN void XMLCALL                  
00210 
00211                         htmlParseElement(htmlParserCtxtPtr ctxt);
00212 
00213 
00214 
/*
 * Returns a parser context built from `buffer` and `size`.
 * NOTE(review): `size` is presumably a byte count; whether the buffer is
 * copied or must outlive the context, and what is returned on failure
 * (presumably NULL), are not visible here -- confirm in the
 * implementation.
 */
00215 XMLPUBFUN htmlParserCtxtPtr XMLCALL     
00216 
00217                         htmlCreateMemoryParserCtxt(const char *buffer,
00218 
00219                                                    int size);
00220 
00221 
00222 
/*
 * One-shot parsing entry points.
 * htmlParseDocument() works on an existing context and returns an int.
 * The other four return an htmlDocPtr and take either an xmlChar string
 * (`cur`) or a `filename`, plus an `encoding` name. The SAX variants also
 * take a handler table and an opaque `userData` pointer.
 * NOTE(review): presumably `userData` is passed back to the SAX
 * callbacks, `encoding` may be NULL, and the returned document belongs to
 * the caller. None of that is stated in this header -- confirm.
 */
00223 XMLPUBFUN int XMLCALL                   
00224 
00225                         htmlParseDocument(htmlParserCtxtPtr ctxt);
00226 
00227 XMLPUBFUN htmlDocPtr XMLCALL            
00228 
00229                         htmlSAXParseDoc (xmlChar *cur,
00230 
00231                                          const char *encoding,
00232 
00233                                          htmlSAXHandlerPtr sax,
00234 
00235                                          void *userData);
00236 
00237 XMLPUBFUN htmlDocPtr XMLCALL            
00238 
00239                         htmlParseDoc    (xmlChar *cur,
00240 
00241                                          const char *encoding);
00242 
00243 XMLPUBFUN htmlDocPtr XMLCALL            
00244 
00245                         htmlSAXParseFile(const char *filename,
00246 
00247                                          const char *encoding,
00248 
00249                                          htmlSAXHandlerPtr sax,
00250 
00251                                          void *userData);
00252 
00253 XMLPUBFUN htmlDocPtr XMLCALL            
00254 
00255                         htmlParseFile   (const char *filename,
00256 
00257                                          const char *encoding);
00258 
/*
 * Conversion and miscellaneous helpers.
 * UTF8ToHtml() and htmlEncodeEntities() share one buffer convention: an
 * output buffer `out` with `outlen`, and a read-only input buffer `in`
 * with `inlen`, both lengths passed by pointer. htmlEncodeEntities()
 * additionally takes `quoteChar`.
 * NOTE(review): the lengths are presumably updated on return to the
 * amounts produced and consumed, and the int results presumably signal
 * success or failure -- confirm in the implementation.
 */
00259 XMLPUBFUN int XMLCALL                   
00260 
00261                         UTF8ToHtml      (unsigned char *out,
00262 
00263                                          int *outlen,
00264 
00265                                          const unsigned char *in,
00266 
00267                                          int *inlen);
00268 
00269 XMLPUBFUN int XMLCALL                   
00270 
00271                         htmlEncodeEntities(unsigned char *out,
00272 
00273                                          int *outlen,
00274 
00275                                          const unsigned char *in,
00276 
00277                                          int *inlen, int quoteChar);
00278 
00279 XMLPUBFUN int XMLCALL                   
00280 
00281                         htmlIsScriptAttribute(const xmlChar *name);
00282 
00283 XMLPUBFUN int XMLCALL                   
00284 
00285                         htmlHandleOmittedElem(int val);
00286 
00287 
00288 
00289 #ifdef LIBXML_PUSH_ENABLED
00290 
/*
 * Push-mode interface, declared only when LIBXML_PUSH_ENABLED is defined.
 * htmlCreatePushParserCtxt() returns a context and takes a SAX handler
 * table, an opaque `user_data` pointer, a first `chunk` with its `size`,
 * a `filename` and an encoding value.
 * htmlParseChunk() takes a further `chunk`/`size` pair for an existing
 * context, plus a `terminate` flag.
 * NOTE(review): `terminate` is presumably non-zero on the last chunk, and
 * `size` is presumably a byte count -- confirm in the implementation.
 */
00297 XMLPUBFUN htmlParserCtxtPtr XMLCALL     
00298 
00299                         htmlCreatePushParserCtxt(htmlSAXHandlerPtr sax,
00300 
00301                                                  void *user_data,
00302 
00303                                                  const char *chunk,
00304 
00305                                                  int size,
00306 
00307                                                  const char *filename,
00308 
00309                                                  xmlCharEncoding enc);
00310 
00311 XMLPUBFUN int XMLCALL                   
00312 
00313                         htmlParseChunk          (htmlParserCtxtPtr ctxt,
00314 
00315                                                  const char *chunk,
00316 
00317                                                  int size,
00318 
00319                                                  int terminate);
00320 
00321 #endif /* LIBXML_PUSH_ENABLED */
00322 
00323 
00324 
/*
 * Takes a parser context and returns nothing.
 * NOTE(review): presumably releases a context obtained from one of the
 * htmlCreate*ParserCtxt() functions in this header; whether a NULL `ctxt`
 * is accepted is not visible here -- confirm in the implementation.
 */
00325 XMLPUBFUN void XMLCALL                  
00326 
00327                         htmlFreeParserCtxt      (htmlParserCtxtPtr ctxt);
00328 
00329 
00330 
00331 /*
00332 
00333  * New set of simpler/more flexible APIs
00334 
00335  */
00336 
/*
 * Parser option flags. Every enumerator is a distinct single bit, so
 * values can be combined with bitwise OR.
 * NOTE(review): presumably these are the values expected by the
 * `int options` parameters declared in this header. Only bits 5-8 and 11
 * are used; the gaps presumably keep the values aligned with the XML
 * parser's option bits -- confirm against xmlParserOption.
 */
00349 typedef enum {
00350 
00351     HTML_PARSE_NOERROR  = 1<<5, /* suppress error reports */
00352 
00353     HTML_PARSE_NOWARNING= 1<<6, /* suppress warning reports */
00354 
00355     HTML_PARSE_PEDANTIC = 1<<7, /* pedantic error reporting */
00356 
00357     HTML_PARSE_NOBLANKS = 1<<8, /* remove blank nodes */
00358 
00359     HTML_PARSE_NONET    = 1<<11 /* Forbid network access */
00360 
00361 } htmlParserOption;
00362 
00363 
00364 
/*
 * Context maintenance for the newer API: htmlCtxtReset() takes a context
 * and returns nothing; htmlCtxtUseOptions() takes a context plus an
 * `options` int and returns an int.
 * NOTE(review): `options` is presumably an OR of htmlParserOption values,
 * and the meaning of the int result is not stated here -- confirm in the
 * implementation.
 */
00365 XMLPUBFUN void XMLCALL
00366 
00367                 htmlCtxtReset           (htmlParserCtxtPtr ctxt);
00368 
00369 XMLPUBFUN int XMLCALL
00370 
00371                 htmlCtxtUseOptions      (htmlParserCtxtPtr ctxt,
00372 
00373                                          int options);
00374 
/*
 * htmlRead* family: each function returns an htmlDocPtr and differs only
 * in its input source -- an xmlChar string, a URL/file name, a memory
 * buffer with its size, a file descriptor, or a pair of read/close
 * callbacks with their `ioctx` pointer. All of them end with the same
 * `encoding` and `options` parameters, and all but htmlReadFile() also
 * take a separate `URL`.
 * NOTE(review): `options` is presumably an OR of htmlParserOption values,
 * `URL` presumably the document's base, and the result presumably NULL on
 * failure -- confirm in the implementation.
 */
00375 XMLPUBFUN htmlDocPtr XMLCALL
00376 
00377                 htmlReadDoc             (const xmlChar *cur,
00378 
00379                                          const char *URL,
00380 
00381                                          const char *encoding,
00382 
00383                                          int options);
00384 
00385 XMLPUBFUN htmlDocPtr XMLCALL
00386 
00387                 htmlReadFile            (const char *URL,
00388 
00389                                          const char *encoding,
00390 
00391                                          int options);
00392 
00393 XMLPUBFUN htmlDocPtr XMLCALL
00394 
00395                 htmlReadMemory          (const char *buffer,
00396 
00397                                          int size,
00398 
00399                                          const char *URL,
00400 
00401                                          const char *encoding,
00402 
00403                                          int options);
00404 
00405 XMLPUBFUN htmlDocPtr XMLCALL
00406 
00407                 htmlReadFd              (int fd,
00408 
00409                                          const char *URL,
00410 
00411                                          const char *encoding,
00412 
00413                                          int options);
00414 
00415 XMLPUBFUN htmlDocPtr XMLCALL
00416 
00417                 htmlReadIO              (xmlInputReadCallback ioread,
00418 
00419                                          xmlInputCloseCallback ioclose,
00420 
00421                                          void *ioctx,
00422 
00423                                          const char *URL,
00424 
00425                                          const char *encoding,
00426 
00427                                          int options);
00428 
/*
 * htmlCtxtRead* family: same input sources and trailing
 * `encoding`/`options` parameters as the htmlRead* functions, with a
 * caller-supplied parser context as the first argument.
 * The context parameter is spelled xmlParserCtxtPtr here, whereas the
 * rest of this header uses htmlParserCtxtPtr; the two names are the same
 * type (htmlParserCtxtPtr is a typedef of xmlParserCtxtPtr).
 * NOTE(review): presumably these let one context be reused across several
 * parses -- confirm in the implementation.
 */
00429 XMLPUBFUN htmlDocPtr XMLCALL
00430 
00431                 htmlCtxtReadDoc         (xmlParserCtxtPtr ctxt,
00432 
00433                                          const xmlChar *cur,
00434 
00435                                          const char *URL,
00436 
00437                                          const char *encoding,
00438 
00439                                          int options);
00440 
00441 XMLPUBFUN htmlDocPtr XMLCALL
00442 
00443                 htmlCtxtReadFile                (xmlParserCtxtPtr ctxt,
00444 
00445                                          const char *filename,
00446 
00447                                          const char *encoding,
00448 
00449                                          int options);
00450 
00451 XMLPUBFUN htmlDocPtr XMLCALL
00452 
00453                 htmlCtxtReadMemory              (xmlParserCtxtPtr ctxt,
00454 
00455                                          const char *buffer,
00456 
00457                                          int size,
00458 
00459                                          const char *URL,
00460 
00461                                          const char *encoding,
00462 
00463                                          int options);
00464 
00465 XMLPUBFUN htmlDocPtr XMLCALL
00466 
00467                 htmlCtxtReadFd          (xmlParserCtxtPtr ctxt,
00468 
00469                                          int fd,
00470 
00471                                          const char *URL,
00472 
00473                                          const char *encoding,
00474 
00475                                          int options);
00476 
00477 XMLPUBFUN htmlDocPtr XMLCALL
00478 
00479                 htmlCtxtReadIO          (xmlParserCtxtPtr ctxt,
00480 
00481                                          xmlInputReadCallback ioread,
00482 
00483                                          xmlInputCloseCallback ioclose,
00484 
00485                                          void *ioctx,
00486 
00487                                          const char *URL,
00488 
00489                                          const char *encoding,
00490 
00491                                          int options);
00492 
00493 
00494 
00495 /* NRK/Jan2003: further knowledge of HTML structure
00496 
00497  */
00498 
/*
 * Status codes returned by the checking functions declared after this
 * enum. The non-zero values are bit patterns: HTML_REQUIRED (0xc) is
 * 0x8 | HTML_VALID, so testing `status & HTML_VALID` is true for both
 * HTML_VALID and HTML_REQUIRED, and false for the other three values.
 */
00499 typedef enum {
00500 
00501   HTML_NA = 0 ,         /* something we don't check at all */
00502 
00503   HTML_INVALID = 0x1 ,
00504 
00505   HTML_DEPRECATED = 0x2 ,
00506 
00507   HTML_VALID = 0x4 ,
00508 
00509   HTML_REQUIRED = 0xc /* VALID bit set so ( & HTML_VALID ) is TRUE */
00510 
00511 } htmlStatus ;
00512 
00513 
00514 
00515 /* Using htmlElemDesc rather than name here, to emphasise the fact
00516 
00517    that otherwise there's a lookup overhead
00518 
00519 */
00520 
/*
 * Structure checks. The prototypes omit parameter names, so the roles
 * below are read from the types only.
 * NOTE(review): presumably the first htmlElemDesc is the enclosing or
 * owning element and the second argument is the attribute or child being
 * tested, by name or by descriptor. The meaning of the trailing int of
 * htmlAttrAllowed() and htmlNodeStatus() is not visible here -- confirm
 * in the implementation.
 * In htmlNodeStatus(), `const` qualifies the htmlNodePtr parameter
 * itself. Assuming htmlNodePtr is a pointer typedef (TODO confirm), the
 * node it points to is not const-qualified by this prototype.
 */
00521 XMLPUBFUN htmlStatus XMLCALL htmlAttrAllowed(const htmlElemDesc*, const xmlChar*, int) ;
00522 
00523 XMLPUBFUN int XMLCALL htmlElementAllowedHere(const htmlElemDesc*, const xmlChar*) ;
00524 
00525 XMLPUBFUN htmlStatus XMLCALL htmlElementStatusHere(const htmlElemDesc*, const htmlElemDesc*) ;
00526 
00527 XMLPUBFUN htmlStatus XMLCALL htmlNodeStatus(const htmlNodePtr, int) ;
00528 
/**
 * htmlDefaultSubelement:
 * @elt: pointer to an element descriptor (htmlElemDesc)
 *
 * Expands to the defaultsubelt member of the descriptor @elt points to.
 * The argument and the whole expansion are parenthesized so that
 * arguments such as `*pp` or `base + i` bind to `->` as a unit instead of
 * being split by operator precedence. @elt is evaluated exactly once.
 */
#define htmlDefaultSubelement(elt) ((elt)->defaultsubelt)
00542 
/*
 * htmlElementAllowedHereDesc(parent, elt): descriptor-based form of
 * htmlElementAllowedHere(). Forwards `parent` unchanged and passes the
 * name member of the descriptor `elt` points to as the second argument.
 * Each macro argument is evaluated once.
 */
00563 #define htmlElementAllowedHereDesc(parent,elt) \
00564 
00565         htmlElementAllowedHere((parent), (elt)->name)
00566 
00567 
/**
 * htmlRequiredAttrs:
 * @elt: pointer to an element descriptor (htmlElemDesc)
 *
 * Expands to the attrs_req member (the required-attribute list) of the
 * descriptor @elt points to. The whole expansion is parenthesized so the
 * macro behaves as a single primary expression wherever it is used.
 * @elt is evaluated exactly once.
 */
#define htmlRequiredAttrs(elt) ((elt)->attrs_req)
00580 
00581 
00582 
00583 
00584 
00585 #ifdef __cplusplus
00586 
00587 }
00588 
00589 #endif
00590 
00591 
00592 
00593 #endif /* LIBXML_HTML_ENABLED */
00594 
00595 #endif /* __HTML_PARSER_H__ */
00596 

Generated on Wed Jul 26 13:30:46 2006 for XSB by  doxygen 1.4.5