libwww_parse_html.c

00001 /* File:      libwww_parse_html.c
00002 ** Author(s): kifer, Yang Yang
00003 ** Contact:   xsb-contact@cs.sunysb.edu
00004 ** 
00005 ** Copyright (C) The Research Foundation of SUNY, 2000
00006 ** 
00007 ** XSB is free software; you can redistribute it and/or modify it under the
00008 ** terms of the GNU Library General Public License as published by the Free
00009 ** Software Foundation; either version 2 of the License, or (at your option)
00010 ** any later version.
00011 ** 
00012 ** XSB is distributed in the hope that it will be useful, but WITHOUT ANY
00013 ** WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
00014 ** FOR A PARTICULAR PURPOSE.  See the GNU Library General Public License for
00015 ** more details.
00016 ** 
00017 ** You should have received a copy of the GNU Library General Public License
00018 ** along with XSB; if not, write to the Free Software Foundation,
00019 ** Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
00020 **
00021 ** $Id: libwww_parse_html.c,v 1.12 2003/12/31 01:04:51 kifer Exp $
00022 ** 
00023 */
00024 
00025 
00026 #include "libwww_util.h"
00027 #include "libwww_parse.h"
00028 #include "libwww_parse_html.h"
00029 
00030 
00031 /* BOOL, PRIVATE, PUBLIC, etc., are defined in a Libwww header */
00032 
00033 /* This is the callback that captures start tag events */
00034 PRIVATE void html_beginElement(USERDATA *htext, /* where we build everything */
00035                                int      element_number, /* internal tag # */
00036                                /* bitmap: tells which tag attrs are present */
00037                                const BOOL *present,
00038                                /* array of values for the attributes
00039                                   specified by the "present" bitmap */ 
00040                                const char **value)
00041 {
00042 #ifdef LIBWWW_DEBUG
00043   HTTag *tag = SGML_findTag(htext->dtd, element_number);
00044   xsb_dbgmsg((LOG_DEBUG,"***In html_beginElement(%s): stackptr=%d tag=%s suppress=%d choose=%d",
00045              RequestID(htext->request),
00046              htext->stackptr, HTTag_name(tag),
00047              IS_SUPPRESSED_TAG((HKEY)element_number, htext->request),
00048              IS_SELECTED_TAG((HKEY)element_number, htext->request)
00049               ));
00050 #endif
00051 
00052   if (IS_STRIPPED_TAG((HKEY)element_number, htext->request)) return;
00053 
00054   if ((suppressing(htext) && !IS_SELECTED_TAG((HKEY)element_number, htext->request))
00055       || (parsing(htext) && IS_SUPPRESSED_TAG((HKEY)element_number, htext->request))) {
00056     html_push_suppressed_element(htext, element_number);
00057     return;
00058   }
00059 
00060   /* parsing or suppressing & found a selected tag */
00061   if ((parsing(htext) && !IS_SUPPRESSED_TAG((HKEY)element_number,htext->request))
00062       || (suppressing(htext) 
00063           && IS_SELECTED_TAG((HKEY)element_number, htext->request))) {
00064     html_push_element(htext,element_number,present,value);
00065     return;
00066   }
00067 }
00068 
00069 
00070 /* The callback for the end-tag event */
00071 PRIVATE void html_endElement (USERDATA *htext, int element_number)
00072 {
00073   int i, match;
00074 
00075 #ifdef LIBWWW_DEBUG
00076   xsb_dbgmsg((LOG_DEBUG,"***In html_endElement(%s): stackptr=%d",
00077               RequestID(htext->request), htext->stackptr));
00078 #endif
00079 
00080   if (IS_STRIPPED_TAG((HKEY)element_number, htext->request)) return;
00081 
00082   match = find_matching_elt(htext, element_number);
00083   /* the closing tag is probably out of place */
00084   if (match < 0) return;
00085 
00086 #ifdef LIBWWW_DEBUG_VERBOSE
00087   xsb_dbgmsg((LOG_DEBUG,"***match=%d", match));
00088 #endif
00089 
00090   for (i=htext->stackptr; i>=match; i--)
00091     if (parsing(htext))
00092       html_pop_element(htext);
00093     else
00094       html_pop_suppressed_element(htext);
00095 
00096 #ifdef LIBWWW_DEBUG_VERBOSE
00097   if (htext->stackptr >= 0) {
00098     if (!STACK_TOP(htext).suppress)
00099       print_prolog_term(STACK_TOP(htext).elt_term, "elt_term");
00100   }
00101 #endif
00102 
00103   return;
00104 }
00105 
00106 
00107 
00108 /* The callback to capture text events */
00109 PRIVATE void html_addText (USERDATA *htext, const char *textbuf, int len)
00110 {
00111   static XSB_StrDefine(pcdata_buf);
00112   int shift = 0;
00113   REQUEST_CONTEXT *context =
00114     (REQUEST_CONTEXT *)HTRequest_context(htext->request);
00115 
00116 #ifdef LIBWWW_DEBUG_VERBOSE
00117   xsb_dbgmsg((LOG_DEBUG,"***In html_addText: Request %s", RequestID(htext->request)));
00118 #endif
00119 
00120   if (IS_STRIPPED_TAG((HKEY)PCDATA_SPECIAL, htext->request)) return;
00121   if (suppressing(htext)) return;
00122 
00123 
00124   /* strip useless newlines */
00125   if (strncmp(textbuf,"\n", len) == 0) return;
00126 
00127   html_push_element(htext, PCDATA_SPECIAL, NULL, NULL);
00128 
00129   /* copy textbuf (which isn't null-terminated) into a variable length str */
00130   XSB_StrEnsureSize(&pcdata_buf, len+1);
00131   strncpy(pcdata_buf.string, textbuf, len);
00132   pcdata_buf.length = len;
00133   XSB_StrNullTerminate(&pcdata_buf);
00134 
00135   /* if string starts with a newline, skip the newline */
00136   if (strncmp(textbuf,"\n", strlen("\n")) == 0)
00137     shift = strlen("\n");
00138 
00139   /* put the text string into the elt term and then pop it */
00140   if (context->convert2list)
00141     c2p_chars(pcdata_buf.string+shift, p2p_arg(STACK_TOP(htext).elt_term,3));
00142   else 
00143     c2p_string(pcdata_buf.string+shift, p2p_arg(STACK_TOP(htext).elt_term,3));
00144 
00145   html_pop_element(htext);
00146   return;
00147 }
00148 
00149 
00150 /* Collect tag's attributes and make them into a list of the form
00151    [attval(attr,val), ...]; bind it to Arg 2 of ELT_TERM */
00152 PRIVATE void collect_html_attributes ( prolog_term  elt_term,
00153                                        HTTag        *tag,
00154                                        const BOOL   *present,
00155                                        const char  **value)
00156 {
00157   int tag_attributes_number = HTTag_attributes(tag);
00158   static XSB_StrDefine(attrname);
00159   int cnt;
00160   prolog_term
00161     prop_list = p2p_arg(elt_term,2),
00162     prop_list_tail = prop_list,
00163     prop_list_head;
00164 
00165   c2p_list(prop_list_tail);
00166 
00167 #ifdef LIBWWW_DEBUG_VERBOSE
00168   xsb_dbgmsg((LOG_DEBUG,"***In collect_html_attributes: tag_attributes_number=%d",
00169               tag_attributes_number));
00170 #endif
00171 
00172   for (cnt=0; cnt<tag_attributes_number; cnt++) {
00173     if (present[cnt]) {
00174       XSB_StrEnsureSize(&attrname, strlen(HTTag_attributeName(tag, cnt)));
00175       strcpy_lower(attrname.string, HTTag_attributeName(tag, cnt));
00176       
00177 #ifdef LIBWWW_DEBUG_VERBOSE
00178       xsb_dbgmsg((LOG_DEBUG,"***attr=%s, val=%s ",
00179                   attrname.string, (char *)value[cnt]));
00180 #endif
00181       prop_list_head = p2p_car(prop_list_tail);
00182       c2p_functor("attval",2,prop_list_head);
00183       c2p_string(attrname.string, p2p_arg(prop_list_head,1));
00184       /* some attrs, like "checked", are boolean and have no value; in this
00185          case we leave the value arg uninstantiated */
00186       if ((char *)value[cnt])
00187         c2p_string((char *)value[cnt], p2p_arg(prop_list_head, 2));
00188     
00189       prop_list_tail = p2p_cdr(prop_list_tail);
00190       c2p_list(prop_list_tail);
00191     }
00192   }
00193 
00194   /* Terminate the property list */
00195   c2p_nil(prop_list_tail);
00196   return;
00197 }
00198 
00199 
00200 /* push element onto HTEXT->stack */
00201 PRIVATE void html_push_element (USERDATA       *htext,
00202                                 int            element_number,
00203                                 const BOOL     *present,
00204                                 const char     **value)
00205 {
00206   static XSB_StrDefine(tagname);
00207   HTTag *tag = special_find_tag(htext, element_number);
00208   prolog_term location;
00209 
00210   /*   If tag is not valid for HTML */
00211   if (tag == NULL) return;
00212 
00213   if (htext->stackptr < 0)
00214     location = htext->parsed_term_tail;
00215   else 
00216     location = STACK_TOP(htext).content_list_tail;
00217 
00218   htext->stackptr++;
00219 
00220 #ifdef LIBWWW_DEBUG_VERBOSE
00221   xsb_dbgmsg((LOG_DEBUG,"***In html_push_element(%s): stackptr=%d",
00222               RequestID(htext->request), htext->stackptr));
00223 #endif
00224 
00225   CHECK_STACK_OVERFLOW(htext);
00226 
00227   /* wire the new elt into where it should be in the content list */
00228   STACK_TOP(htext).elt_term = p2p_car(location);
00229 
00230   STACK_TOP(htext).element_number = element_number;
00231   STACK_TOP(htext).suppress = FALSE;
00232 
00233   /* normal tags look like elt(tagname, attrlist, contentlist);
00234      pcdata tags are: elt(pcdata,[],text);
00235      empty tags look like elt(tagname,attrlist,[]); */
00236   STACK_TOP(htext).element_type = HTTag_content(tag);
00237   c2p_functor("elt",3,STACK_TOP(htext).elt_term);
00238 
00239   XSB_StrEnsureSize(&tagname, strlen(HTTag_name(tag)));
00240   strcpy_lower(tagname.string, HTTag_name(tag));
00241   c2p_string(tagname.string, p2p_arg(STACK_TOP(htext).elt_term, 1));
00242   collect_html_attributes(STACK_TOP(htext).elt_term, tag, present, value);
00243 #ifdef LIBWWW_DEBUG_VERBOSE
00244   xsb_dbgmsg((LOG_DEBUG,"***elt_name=%s", HTTag_name(tag)));
00245   print_prolog_term(STACK_TOP(htext).elt_term, "elt_term");
00246 #endif
00247 
00248   switch (STACK_TOP(htext).element_type) {
00249   case SGML_EMPTY:
00250     c2p_nil(p2p_arg(STACK_TOP(htext).elt_term,3));
00251     html_pop_element(htext);
00252     break;
00253   case PCDATA_SPECIAL:
00254     /* nothing to do: we pop this after htext is inserted in html_addText */
00255     break;
00256   default: /* normal elt */
00257     STACK_TOP(htext).content_list_tail = p2p_arg(STACK_TOP(htext).elt_term,3);
00258     c2p_list(STACK_TOP(htext).content_list_tail);
00259   }
00260 }
00261 
00262 
00263 /* When done with an elt, close its contents list and pop the stack */
00264 PRIVATE void html_pop_element(USERDATA *htext)
00265 {
00266 #ifdef LIBWWW_DEBUG_VERBOSE
00267   xsb_dbgmsg((LOG_DEBUG,"***In html_pop_element(%s): stackptr=%d, elt_name=%s",
00268              RequestID(htext->request),
00269              htext->stackptr,
00270               HTTag_name(special_find_tag(htext, STACK_TOP(htext).element_number))));
00271 #endif
00272   /* close the property list, for notmal elements */
00273   switch (STACK_TOP(htext).element_type) {
00274   case SGML_EMPTY: /* this case can't occur */
00275     break;
00276   case PCDATA_SPECIAL:
00277     break;
00278   default: /* normal element */
00279     c2p_nil(STACK_TOP(htext).content_list_tail);
00280   }
00281 
00282   /* insert new list cell into the tail and change content_list_tail to point
00283      to the new tail */
00284   if (htext->stackptr > 0) {
00285     STACK_PREV(htext).content_list_tail =
00286       p2p_cdr(STACK_PREV(htext).content_list_tail);
00287     c2p_list(STACK_PREV(htext).content_list_tail);
00288   } else {
00289     htext->parsed_term_tail = p2p_cdr(htext->parsed_term_tail);
00290     c2p_list(htext->parsed_term_tail);
00291   }
00292 
00293   htext->stackptr--;
00294 
00295 #ifdef LIBWWW_DEBUG_VERBOSE
00296   if (htext->stackptr >= 0)
00297     print_prolog_term(STACK_TOP(htext).content_list_tail, "content_list_tail");
00298   else
00299     print_prolog_term(htext->parsed_term_tail, "parsed_term_tail");
00300 #endif
00301 
00302   return;
00303 }
00304 
00305 
00306 /* Push tag, but keep only the tag info; don't convert to prolog term */
00307 PRIVATE void html_push_suppressed_element(USERDATA *htext, int element_number)
00308 {
00309   /* if empty tag, then just return */
00310   if (SGML_findTagContents(htext->dtd, element_number) == SGML_EMPTY)
00311       return;
00312   /* non-empty tag */
00313   htext->stackptr++; /* advance ptr, but don't push tag */
00314 
00315   STACK_TOP(htext).element_number = element_number;
00316   STACK_TOP(htext).suppress = TRUE;
00317 
00318   /* passing content list tail through suppressed elements */
00319   if (htext->stackptr == 0)
00320     STACK_TOP(htext).content_list_tail = htext->parsed_term_tail;
00321   else 
00322     STACK_TOP(htext).content_list_tail = STACK_PREV(htext).content_list_tail;
00323 
00324   return;
00325 }
00326 
00327 
00328 PRIVATE void html_pop_suppressed_element(USERDATA *htext)
00329 {
00330   /* chain the list tails back through the sequence of suppressed tags */
00331   if (htext->stackptr > 0) {
00332     STACK_PREV(htext).content_list_tail = STACK_TOP(htext).content_list_tail;
00333   } else {
00334     htext->parsed_term_tail = STACK_TOP(htext).content_list_tail;
00335   }
00336 
00337   htext->stackptr--;
00338 
00339 #ifdef LIBWWW_DEBUG_VERBOSE
00340   xsb_dbgmsg((LOG_DEBUG,"***In html_pop_suppressed_element(%s): stackptr=%d",
00341               RequestID(htext->request), htext->stackptr));
00342   if (htext->stackptr >= 0)
00343     print_prolog_term(STACK_TOP(htext).content_list_tail, "content_list_tail");
00344   else
00345     print_prolog_term(htext->parsed_term_tail, "parsed_term_tail");
00346 #endif
00347 
00348   return;
00349 }
00350 
00351 /* search the stack to see if there is a matching element */
00352 PRIVATE int find_matching_elt(USERDATA *htext, int elt_number)
00353 {
00354   int i;
00355   for (i=htext->stackptr; i>=0; i--) {
00356 #ifdef LIBWWW_DEBUG_VERBOSE
00357     xsb_dbgmsg((LOG_DEBUG,"***In find_matching_elt"));
00358     xsb_dbgmsg((LOG_DEBUG,"***i=%d htext->stack[i].element_number=%d(%s) elt_number=%d(%s)",
00359                i,
00360                htext->stack[i].element_number, 
00361                SGML_findTagName(htext->dtd, htext->stack[i].element_number),
00362                elt_number,
00363                 SGML_findTagName(htext->dtd, elt_number)));
00364 #endif
00365     if (htext->stack[i].element_number == elt_number)
00366       return i;
00367   }
00368   return -1;
00369 }
00370 
00371 
00372 PRIVATE inline HTTag *special_find_tag(USERDATA *htext, int element_number)
00373 {
00374   static HTTag pcdata_tag = {"pcdata", NULL, 0, PCDATA_SPECIAL};
00375   if (element_number == PCDATA_SPECIAL)
00376     return &pcdata_tag;
00377   return SGML_findTag(htext->dtd, element_number);
00378 }
00379 
00380 
00381 /* USERDATA creation and deletion callbacks */
00382 USERDATA *html_create_userData( HTRequest *             request,
00383                                 HTParentAnchor *        anchor,
00384                                 HTStream *              output_stream)
00385 {
00386   USERDATA *me = NULL;
00387 
00388 #ifdef LIBWWW_DEBUG
00389   xsb_dbgmsg((LOG_DEBUG,"***Start html_create_userData(%s):", RequestID(request)));
00390 #endif
00391   if (request) {
00392     /* make sure that MIME type is appropriate for HTML */
00393     if (!verifyMIMEformat(request, HTMLPARSE)) {
00394       /* The following causes segfault, so we xsb_abort instead 
00395          HTStream * input = HTRequest_inputStream(request);
00396          HTRequest_kill(request);
00397          (*input->isa->abort)(input, NULL);
00398          return NULL;
00399       */
00400       xsb_abort("[LIBWWW_REQUEST] Bug: Request type/MIME type mismatch");
00401     }
00402     if ((me = (USERDATA *) HT_CALLOC(1, sizeof(USERDATA))) == NULL)
00403       HT_OUTOFMEM("libwww_parse_html");
00404     me->delete_method = html_delete_userData;
00405     me->request = request;
00406     me->node_anchor =  anchor;
00407     me->target = output_stream;
00408     me->dtd = HTML_dtd();
00409     me->suppress_is_default = 
00410       ((REQUEST_CONTEXT *)HTRequest_context(request))->suppress_is_default;
00411     me->parsed_term = p2p_new();
00412     c2p_list(me->parsed_term);
00413     me->parsed_term_tail = me->parsed_term;
00414     SETUP_STACK(me);
00415   }
00416 
00417 #ifdef LIBWWW_DEBUG
00418   xsb_dbgmsg((LOG_DEBUG,"***In html_create_userData(%s):", RequestID(request)));
00419 #endif
00420 
00421   /* Hook up userdata to the request context */
00422   ((REQUEST_CONTEXT *)HTRequest_context(request))->userdata = (void *)me;
00423   return me;
00424 }
00425 
00426 
00427 PRIVATE void html_delete_userData(void *userdata)
00428 {
00429   int i;
00430   prolog_term parsed_result, status_term;
00431   USERDATA *me = (USERDATA *)userdata;
00432   HTRequest *request = me->request;
00433 
00434   if (me->request) {
00435     parsed_result =
00436       ((REQUEST_CONTEXT *)HTRequest_context(request))->request_result;
00437     status_term =
00438       ((REQUEST_CONTEXT *)HTRequest_context(request))->status_term;
00439   } else return;
00440 
00441 #ifdef LIBWWW_DEBUG
00442   xsb_dbgmsg((LOG_DEBUG,"***In html_delete_userData(%s): stackptr=%d",
00443               RequestID(request), me->stackptr));
00444 #endif
00445 
00446   /* close open tags on stack */
00447   for (i=me->stackptr; i>=0; i--)
00448     if (parsing(me))
00449       html_pop_element(me);
00450     else
00451       html_pop_suppressed_element(me);
00452 
00453   /* terminate the parsed prolog terms list */
00454   c2p_nil(me->parsed_term_tail);
00455 
00456   /* pass the result to the outside world */
00457   if (is_var(me->parsed_term))
00458     p2p_unify(parsed_result, me->parsed_term);
00459   else
00460     xsb_abort("[LIBWWW_REQUEST] Request %s: Arg 4 (Result) must be unbound variable",
00461               RequestID(request));
00462 
00463 
00464   if (me->target) FREE_TARGET(me);
00465   if (me->stack) HT_FREE(me->stack);
00466   HT_FREE(me);
00467 
00468 #ifdef LIBWWW_DEBUG
00469   xsb_dbgmsg((LOG_DEBUG,"***Request %s: freed the USERDATA obj", RequestID(request)));
00470 #endif
00471 
00472   return;
00473 }
00474 
00475 
00476 void html_register_callbacks()
00477 {
00478   /* register callback for begin/end element events */
00479   HText_registerElementCallback(html_beginElement, html_endElement);
00480   /* register callback for text chunks */
00481   HText_registerTextCallback(html_addText);
00482   /* register callbacks to create and delete the HText (USERDATA)
00483      objects. These are objects where we build parsed terms */
00484   HText_registerCDCallback(html_create_userData, 
00485                            (HText_delete *)html_delete_userData);
00486   return;
00487 }
00488 
00489 
00490 void set_html_conversions()
00491 {
00492   /* Must delete old converter and create new. Apparently something in libwww
00493      releases the atoms used in thes converters, which causes it to crash 
00494      in HTStreamStack() on the second call to rdfparse. */
00495   HTPresentation_deleteAll(HTML_converter);
00496   HTML_converter = HTList_new();
00497 
00498   HTConversion_add(HTML_converter,"*/*", "www/debug",
00499                    HTBlackHoleConverter, 1.0, 0.0, 0.0);
00500   HTConversion_add(HTML_converter,"message/rfc822", "*/*",
00501                    HTMIMEConvert, 1.0, 0.0, 0.0);
00502   HTConversion_add(HTML_converter,"message/x-rfc822-foot", "*/*",
00503                    HTMIMEFooter, 1.0, 0.0, 0.0);
00504   HTConversion_add(HTML_converter,"message/x-rfc822-head", "*/*",
00505                    HTMIMEHeader, 1.0, 0.0, 0.0);
00506   HTConversion_add(HTML_converter,"message/x-rfc822-cont", "*/*",
00507                    HTMIMEContinue, 1.0, 0.0, 0.0);
00508   HTConversion_add(HTML_converter,"message/x-rfc822-upgrade","*/*",
00509                    HTMIMEUpgrade, 1.0, 0.0, 0.0);
00510   HTConversion_add(HTML_converter,"message/x-rfc822-partial", "*/*",
00511                    HTMIMEPartial, 1.0, 0.0, 0.0);
00512   HTConversion_add(HTML_converter,"multipart/*", "*/*",
00513                    HTBoundary, 1.0, 0.0, 0.0);
00514   HTConversion_add(HTML_converter,"text/x-http", "*/*",
00515                    HTTPStatus_new, 1.0, 0.0, 0.0);
00516   /* www/html is invented by us to force html conversion */
00517   HTConversion_add(HTML_converter,"text/html", "www/html",
00518                    HTMLPresent, 1.0, 0.0, 0.0);
00519   HTConversion_add(HTML_converter,"text/plain", "www/html",
00520                    HTMLPresent, 1.0, 0.0, 0.0);
00521   HTConversion_add(HTML_converter,"www/present", "www/html",
00522                    HTMLPresent, 1.0, 0.0, 0.0);
00523   HTConversion_add(HTML_converter,"text/xml", "www/html",
00524                    HTMLPresent, 1.0, 0.0, 0.0);
00525   HTConversion_add(HTML_converter,"text/rdf", "www/html",
00526                    HTMLPresent, 1.0, 0.0, 0.0);
00527   HTConversion_add(HTML_converter, "application/html", "*/*",
00528                    HTMLPresent, 1.0, 0.0, 0.0);
00529 }
00530 
00531 

Generated on Wed Jul 26 13:30:45 2006 for XSB by  doxygen 1.4.5