token_xsb.c

00001 /* File:      token_xsb.c
00002 ** Author(s): Richard A. O'Keefe, Deeporn H. Beardsley, Baoqiu Cui,
00003 **            C.R. Ramakrishnan 
00004 ** Contact:   xsb-contact@cs.sunysb.edu
00005 ** 
00006 ** Copyright (C) The Research Foundation of SUNY, 1986, 1993-1998
00007 ** 
00008 ** XSB is free software; you can redistribute it and/or modify it under the
00009 ** terms of the GNU Library General Public License as published by the Free
00010 ** Software Foundation; either version 2 of the License, or (at your option)
00011 ** any later version.
00012 ** 
00013 ** XSB is distributed in the hope that it will be useful, but WITHOUT ANY
00014 ** WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
00015 ** FOR A PARTICULAR PURPOSE.  See the GNU Library General Public License for
00016 ** more details.
00017 ** 
00018 ** You should have received a copy of the GNU Library General Public License
00019 ** along with XSB; if not, write to the Free Software Foundation,
00020 ** Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
00021 **
00022 ** $Id: token_xsb.c,v 1.25 2006/04/30 18:27:14 tswift Exp $
00023 ** 
00024 */
00025 
00026 #include "xsb_config.h"
00027 
00028 #include <stdio.h>
00029 #include <stdlib.h>
00030 #include <math.h>
00031 
00032 #include "auxlry.h"
00033 #include "cell_xsb.h"
00034 #include "context.h"
00035 #include "token_xsb.h"
00036 #include "psc_xsb.h"
00037 #include "subp.h"
00038 #include "register.h"
00039 #include "error_xsb.h"
00040 #include "memory_xsb.h"
00041 
00042 #define exit_if_null(x) {\
00043   if(x == NULL){\
00044    xsb_exit("Malloc Failed !\n");\
00045   }\
00046 }
00047 
00048 #define Char unsigned char
00049 #define AlphabetSize 256
00050  
00051 #define InRange(X,L,U) ((unsigned)((X)-(L)) <= (unsigned)((U)-(L)))
00052 #define IsLayout(X) InRange(InType(X), SPACE, EOLN)
00053  
00054 
00055 /*  VERY IMPORTANT NOTE: I assume that the stdio library returns the value
00056     EOF when character input hits the end of the file, and that this value
00057     is actually the integer -1.  You will note the DigVal(), InType(), and
00058     OuType() macros below, and there is a ChType() macro used in crack().
00059     They all depend on this assumption.
00060 */
00061  
00062 #define InType(c)       (intab.chtype+1)[c]
00063 #define DigVal(c)       (digval+1)[c]
00064 
00065 //Char outqt[EOFCH+1];   /* All the "+1" appear because of the EOF char */
00066  
00067 struct CHARS
00068     {
00069         int     eolcom;       /* End-of-line comment, default % */
00070         int     endeol;       /* early terminator of eolcoms, default none */
00071         int     begcom;       /* In-line comment start, default / */
00072         int     astcom;       /* In-line comment second, default * */
00073         int     endcom;       /* In-line comment finish, default / */
00074         int     radix;        /* Radix character, default ' */
00075         int     dpoint;       /* Decimal point, default . */
00076         int     escape;       /* String escape character, default \ */
00077         int     termin;       /* Terminates a clause */
00078         char    chtype[AlphabetSize+1];
00079     };
00080  
00081 struct CHARS intab =   /* Special character table */
00082     {
00083         '%',                  /* eolcom: end of line comments */
00084         -1,                   /* endeol: early end for eolcoms */
00085         '/',                  /* begcom: in-line comments */
00086         '*',                  /* astcom: in-line comments */
00087         '/',                  /* endcom: in-line comments */
00088         '\'',                 /* radix : radix separator */
00089         '.',                  /* dpoint: decimal point */
00090         '\\',                 /* escape: string escape character */
00091         '.',                  /* termin: ends clause, sign or solo */
00092     {
00093         EOFCH,                /* really the -1th element of the table: */
00094     /*  ^@      ^A      ^B      ^C      ^D      ^E      ^F      ^G      */
00095         SPACE,  SPACE,  SPACE,  SPACE,  SPACE,  SPACE,  SPACE,  SPACE,
00096     /*  ^H      ^I      ^J      ^K      ^L      ^M      ^N      ^O      */
00097         SPACE,  SPACE,  EOLN,   SPACE,  EOLN,   SPACE,  SPACE,  SPACE,
00098     /*  ^P      ^Q      ^R      ^S      ^T      ^U      ^V      ^W      */
00099         SPACE,  SPACE,  SPACE,  SPACE,  SPACE,  SPACE,  SPACE,  SPACE,
00100     /*  ^X      ^Y      ^Z      ^[      ^\      ^]      ^^      ^_      */
00101         SPACE,  SPACE,  SPACE,  SPACE,  SPACE,  SPACE,  SPACE,  SPACE,
00102     /*  sp      !       "       #       $       %       &       '       */
00103         SPACE,  NOBLE,  LISQT,  SIGN,   SIGN,  PUNCT,  SIGN,   ATMQT,
00104     /*  (       )       *       +       ,       -       .       /       */
00105         PUNCT,  PUNCT,  SIGN,   SIGN,   PUNCT,  SIGN,   SIGN,   SIGN,
00106     /*  0       1       2       3       4       5       6       7       */
00107         DIGIT,  DIGIT,  DIGIT,  DIGIT,  DIGIT,  DIGIT,  DIGIT,  DIGIT,
00108     /*  8       9       :       ;       <       =       >       ?       */
00109         DIGIT,  DIGIT,  SIGN,   PUNCT,  SIGN,   SIGN,   SIGN,   SIGN,
00110     /*  @       A       B       C       D       E       F       G       */
00111         SIGN,   UPPER,  UPPER,  UPPER,  UPPER,  UPPER,  UPPER,  UPPER,
00112     /*  H       I       J       K       L       M       N       O       */
00113         UPPER,  UPPER,  UPPER,  UPPER,  UPPER,  UPPER,  UPPER,  UPPER,
00114     /*  P       Q       R       S       T       U       V       W       */
00115         UPPER,  UPPER,  UPPER,  UPPER,  UPPER,  UPPER,  UPPER,  UPPER,
00116     /*  X       Y       Z       [       \       ]       ^       _       */
00117         UPPER,  UPPER,  UPPER,  PUNCT,  SIGN,   PUNCT,  SIGN,   BREAK,
00118     /*  `       a       b       c       d       e       f       g       */
00119         SIGN,   LOWER,  LOWER,  LOWER,  LOWER,  LOWER,  LOWER,  LOWER,
00120     /*  h       i       j       k       l       m       n       o       */
00121         LOWER,  LOWER,  LOWER,  LOWER,  LOWER,  LOWER,  LOWER,  LOWER,
00122     /*  p       q       r       s       t       u       v       w       */
00123         LOWER,  LOWER,  LOWER,  LOWER,  LOWER,  LOWER,  LOWER,  LOWER,
00124     /*  x       y       z       {       |       }       ~       ^?      */
00125         LOWER,  LOWER,  LOWER,  PUNCT,  PUNCT,  PUNCT,  SIGN,   SPACE,
00126     /*  128     129     130     131     132     133     134     135     */
00127         SPACE,  SPACE,  SPACE,  SPACE,  SPACE,  SPACE,  SPACE,  SPACE,
00128     /*  136     137     138     139     140     141     142     143     */
00129         SPACE,  SPACE,  SPACE,  SPACE,  SPACE,  SPACE,  SPACE,  SPACE,
00130     /*  144     145     146     147     148     149     150     151     */
00131         SPACE,  SPACE,  SPACE,  SPACE,  SPACE,  SPACE,  SPACE,  SPACE,
00132     /*  152     153     154     155     156     157     158     159     */
00133         SPACE,  SPACE,  SPACE,  SPACE,  SPACE,  SPACE,  SPACE,  SPACE,
00134     /*  NBSP    !-inv   cents   pounds  ching   yen     brobar  section */
00135         SPACE,  SIGN,   SIGN,   SIGN,   SIGN,   SIGN,   SIGN,   SIGN,
00136     /*  "accent copyr   -a ord  <<      nothook SHY     (reg)   ovbar   */
00137         SIGN,   SIGN,   LOWER,  SIGN,   SIGN,   SIGN,   SIGN,   SIGN,
00138     /*  degrees +/-     super 2 super 3 -       micron  pilcrow -       */
00139         SIGN,   SIGN,   LOWER,  LOWER,  SIGN,   SIGN,   SIGN,   SIGN,
00140     /*  ,       super 1 -o ord  >>      1/4     1/2     3/4     ?-inv   */
00141         SIGN,   LOWER,  LOWER,  SIGN,   SIGN,   SIGN,   SIGN,   SIGN,
00142     /*  `A      'A      ^A      ~A      "A      oA      AE      ,C      */
00143         UPPER,  UPPER,  UPPER,  UPPER,  UPPER,  UPPER,  UPPER,  UPPER,
00144     /*  `E      'E      ^E      "E      `I      'I      ^I      "I      */
00145         UPPER,  UPPER,  UPPER,  UPPER,  UPPER,  UPPER,  UPPER,  UPPER,
00146     /*  ETH     ~N      `O      'O      ^O      ~O      "O      x times */
00147         UPPER,  UPPER,  UPPER,  UPPER,  UPPER,  UPPER,  UPPER,  SIGN,
00148     /*  /O      `U      'U      ^U      "U      'Y      THORN   ,B      */
00149         UPPER,  UPPER,  UPPER,  UPPER,  UPPER,  UPPER,  UPPER,  LOWER,
00150     /*  `a      'a      ^a      ~a      "a      oa      ae      ,c      */
00151         LOWER,  LOWER,  LOWER,  LOWER,  LOWER,  LOWER,  LOWER,  LOWER,
00152     /*  `e      'e      ^e      "e      `i      'i      ^i      "i      */
00153         LOWER,  LOWER,  LOWER,  LOWER,  LOWER,  LOWER,  LOWER,  LOWER,
00154     /*  eth     ~n      `o      'o      ^o      ~o      "o      -:-     */
00155 #ifdef  vms
00156         LOWER,  LOWER,  LOWER,  LOWER,  LOWER,  LOWER,  LOWER,  LOWER,
00157 #else
00158         LOWER,  LOWER,  LOWER,  LOWER,  LOWER,  LOWER,  LOWER,  SIGN,
00159 #endif
00160     /*  /o      `u      'u      ^u      "u      'y      thorn  "y       */
00161 #ifdef  vms
00162         LOWER,  LOWER,  LOWER,  LOWER,  LOWER,  LOWER,  LOWER,  SPACE
00163 #else
00164         LOWER,  LOWER,  LOWER,  LOWER,  LOWER,  LOWER,  LOWER,  LOWER
00165 #endif
00166 }};
00167  
00168 char digval[AlphabetSize+1] =
00169     {
00170         99,                     /* really the -1th element of the table */
00171     /*  ^@      ^A      ^B      ^C      ^D      ^E      ^F      ^G      */
00172         99,     99,     99,     99,     99,     99,     99,     99,
00173     /*  ^H      ^I      ^J      ^K      ^L      ^M      ^N      ^O      */
00174         99,     99,     99,     99,     99,     99,     99,     99,
00175     /*  ^P      ^Q      ^R      ^S      ^T      ^U      ^V      ^W      */
00176         99,     99,     99,     99,     99,     99,     99,     99,
00177     /*  ^X      ^Y      ^Z      ^[      ^\      ^]      ^^      ^_      */
00178         99,     99,     99,     99,     99,     99,     99,     99,
00179     /*  sp      !       "       #       $       %       &       '       */
00180         99,     99,     99,     99,     99,     99,     99,     99,
00181     /*  (       )       *       +       ,       -       .       /       */
00182         99,     99,     99,     99,     99,     99,     99,     99,
00183     /*  0       1       2       3       4       5       6       7       */
00184         0,      1,      2,      3,      4,      5,      6,      7,
00185     /*  8       9       :       ;       <       =       >       ?       */
00186         8,      9,      99,     99,     99,     99,     99,     99,
00187     /*  @       A       B       C       D       E       F       G       */
00188         99,     10,     11,     12,     13,     14,     15,     16,
00189     /*  H       I       J       K       L       M       N       O       */
00190         17,     18,     19,     20,     21,     22,     23,     24,
00191     /*  P       Q       R       S       T       U       V       W       */
00192         25,     26,     27,     28,     29,     30,     31,     32,
00193     /*  X       Y       Z       [       \       ]       ^       _       */
00194         33,     34,     35,     99,     99,     99,     99,     0,  /*NB*/
00195     /*  `       a       b       c       d       e       f       g       */
00196         99,     10,     11,     12,     13,     14,     15,     16,
00197     /*  h       i       j       k       l       m       n       o       */
00198         17,     18,     19,     20,     21,     22,     23,     24,
00199     /*  p       q       r       s       t       u       v       w       */
00200         25,     26,     27,     28,     29,     30,     31,     32,
00201     /*  x       y       z       {       |       }       ~       ^?      */
00202         33,     34,     35,     99,     99,     99,     99,     99,
00203     /*  128     129     130     131     132     133     134     135     */
00204         99,     99,     99,     99,     99,     99,     99,     99,
00205     /*  136     137     138     139     140     141     142     143     */
00206         99,     99,     99,     99,     99,     99,     99,     99,
00207     /*  144     145     146     147     148     149     150     151     */
00208         99,     99,     99,     99,     99,     99,     99,     99,
00209     /*  152     153     154     155     156     157     158     159     */
00210         99,     99,     99,     99,     99,     99,     99,     99,
00211     /*  160     161     162     163     164     165     166     167     */
00212         99,     99,     99,     99,     99,     99,     99,     99,
00213     /*  168     169     170(-a) 171     172     173     174     175     */
00214         99,     99,     99,     99,     99,     99,     99,     99,
00215     /*  176     177     178(2)  179(3)  180     181     182     183     */
00216         99,     99,     2,      3,      99,     99,     99,     99,
00217     /*  184     185(1)  186(-o) 187     188     189     190     191     */
00218         99,     1,      99,     99,     99,     99,     99,     99,
00219     /*  192     193     194     195     196     197     198     199     */
00220         99,     99,     99,     99,     99,     99,     99,     99,
00221     /*  200     201     202     203     204     205     206     207     */
00222         99,     99,     99,     99,     99,     99,     99,     99,
00223     /*  208     209     210     211     212     213     214     215     */
00224         99,     99,     99,     99,     99,     99,     99,     99,
00225     /*  216     217     218     219     220     221     222     223     */
00226         99,     99,     99,     99,     99,     99,     99,     99,
00227     /*  224     225     226     227     228     229     230     231     */
00228         99,     99,     99,     99,     99,     99,     99,     99,
00229     /*  232     233     234     235     236     237     238     239     */
00230         99,     99,     99,     99,     99,     99,     99,     99,
00231     /*  240     241     242     243     244     245     246     247     */
00232         99,     99,     99,     99,     99,     99,     99,     99,
00233     /*  248     249     250     251     252     253     254     255     */
00234         99,     99,     99,     99,     99,     99,     99,     99
00235     };
00236 
00237 int intype(int c)
00238 {
00239   return (intab.chtype+1)[c];
00240 }
00241 
00242 static void SyntaxError(char *description)
00243 {
00244         xsb_abort("[TOKENIZER] Syntax error: %s", description);
00245 }
00246  
00247 
00248 void unGetC(int d, FILE *card, STRFILE *instr)
00249 {
00250   if (instr) {
00251     (instr)->strcnt++;
00252     (instr)->strptr--;
00253   }
00254   else ungetc(d, card);
00255 }
00256 
00257  
00258  
00259 /*  GetToken() reads a single token from the input stream and returns
00260     its type, which is one of the following:
00261 
00262         TK_INT          -- an integer 
00263         TK_INTFUNC      -- an integer functor
00264         TK_VARFUNC      -- a HiLog variable( pair 
00265         TK_FUNC         -- an atom( pair
00266         TK_ATOM         -- an atom
00267         TK_VAR          -- a variable
00268         TK_PUNC         -- a single punctuation mark
00269         TK_HPUNC        -- punctuation ) followed by a ( in HiLog terms
00270         TK_LIST         -- a quoted list of character codes (in buffer)
00271         TK_STR          -- a quoted string
00272         TK_EOC          -- end of clause (normally '.\n').
00273         TK_EOF          -- signifies end-of-file.
00274         TK_REAL         -- a real, in double_v.
00275         TK_REALFUNC     -- a real, in double_v.
00276 
00277     In most of the above cases (except the last two), the text of the 
00278     token is in AtomStr.
00279     There are two questions: between which pairs of adjacent tokens is
00280     a space (a) necessary, (b) desirable?  There is an additional
00281     dummy token type used by the output routines, namely
00282         NOBLE           -- extra space is definitely not needed.
00283     I leave it as an exercise for the reader to answer question (a).
00284     Since this program is to produce output I find palatable (even if
00285     it isn't exactly what I'd write myself), extra spaces are ok.  In
00286     fact, the main use of this program is as an editor command, so it
00287     is normal to do a bit of manual post-processing.  Question (b) is
00288     the one to worry about then.  My answer is that a space is never
00289     written
00290         - after  PUNCT ( [ { |
00291         - before PUNCT ) ] } | , <ENDCL>
00292     is written after comma only sometimes, and is otherwise always
00293     written.  The variable lastput thus takes these values:
00294 
00295         ALPHA      -- put a space except before PUNCT
00296         SIGN       -- as alpha, but different so ENDCL knows to put a space.
00297         NOBLE      -- don't put a space
00298         ENDCL      -- just ended a clause
00299         EOFCH      -- at beginning of file
00300 */
00301 
00302 #ifndef MULTI_THREAD
00303 struct token_t res_str;
00304 struct token_t *token = &res_str;
00305 
00306 int     lastc = ' ';    /* previous character */
00307 char*   strbuff = NULL;             /* Pointer to token buffer; Will be
00308                                        allocated on first call to GetToken */
00309 int     strbuff_len = InitStrLen;   /* length of first allocation will be
00310                                        InitStrLen; will be doubled on
00311                                        subsequent overflows */
00312 double  double_v;
00313 long    rad_int;
00314 #endif
00315 
00316 char    tok2long[]      = "token too long";
00317 char    eofinrem[]      = "end of file in comment";
00318 char    badexpt[]       = "bad exponent";
00319 char    badradix[]      = "radix not 0 or 2..36";
00320  
00321  
00322 /*  read_character(FILE* card, STRFILE* instr, Char q)
00323     reads one character from a quoted atom, list, string, or character.
00324     Doubled quotes are read as single characters, otherwise a
00325     quote is returned as -1 and lastc is set to the next character.
00326     If the input syntax has character escapes, they are processed.
00327     Note that many more character escape sequences are accepted than
00328     are generated.  There is a divergence from C: \xhh sequences are
00329     two hexadecimal digits long, not three.
00330     Note that the \c and <space> sequences combine to make a pretty
00331     way of continuing strings.  Do it like this:
00332         "This is a string, which \c
00333        \ has to be continued over \c
00334        \ several lines.\n".
00335 
00336     -- If encounters the EOF, then return -2. (Baoqiu, 2/16/1997)
00337 */
00338  
00339 static int read_character(CTXTdeclc register FILE *card,
00340                           register STRFILE *instr,
00341                           register int q)
00342 {
00343         register int c;
00344  
00345         c = GetC(card,instr);
00346 BACK:   if (c < 0) {
00347           if (c == EOF) /* to mostly handle cygwin stdio.h bug ... */
00348 READ_ERROR: if (q < 0) {
00349                 SyntaxError("end of file in character constant");
00350                 //              return -2;              /* encounters EOF */
00351             } else {
00352                 char message[80];
00353                 sprintf(message, "end of file in %cquoted%c constant", q, q);
00354                 SyntaxError(message);
00355                 //              return -2;              /* encounters EOF */
00356             }
00357           else c = c & 0xff;  /* in which getc returns "signed" char? */
00358         }
00359         if (c == q) {
00360             c = GetC(card,instr);
00361             if (c == q) return c;
00362             lastc = c;
00363             return -1;
00364         } else
00365         if (c != intab.escape) {
00366             return c;
00367         }
00368         /*  If we get here, we have read the "\" of an escape sequence  */
00369         c = GetC(card,instr);
00370         switch (c) {
00371             case EOF:
00372                 clearerr(card);
00373                 goto READ_ERROR;
00374             case 'a':                   /* alarm */
00375                 return  '\a';
00376             case 'b':                   /* backspace */
00377                 return  '\b';
00378             case 'f':                   /* formfeed */
00379                 return '\f';
00380             case '\n':                  /* seeing a newline */
00381                 while (IsLayout(c = GetC(card,instr)));
00382                 goto BACK;
00383             case 'n':                   /* newline */
00384                 return '\n';
00385             case 'r':                   /* return */
00386                 return '\r';
00387             case 't':                   /* tab */
00388                 return  '\t';
00389             case 'v':                   /* vertical tab */
00390                 return '\v';
00391             case 'x':                   /* hexadecimal */
00392                 {   int i, n;
00393                     for (n = 0, i = 2; --i >= 0; n = (n<<4) + DigVal(c))
00394                         if (DigVal(c = GetC(card,instr)) >= 16) {
00395                             if (c < 0) goto READ_ERROR;
00396                             (void)unGetC(c, card, instr);
00397                             break;
00398                         }
00399                     return n & 255;
00400                 }
00401             case '0': case '1': case '2': case '3':
00402             case '4': case '5': case '6': case '7':
00403                 {   int i, n;
00404                     for (n = c-'0', i = 2; --i >= 0; n = (n<<3) + DigVal(c))
00405                         if (DigVal(c = GetC(card,instr)) >= 8) {
00406                             if (c < 0) goto READ_ERROR;
00407                             (void) unGetC(c, card, instr);
00408                             break;
00409                         }
00410                     return n & 255;
00411                 }
00412             case '\\':                  /* backslash */
00413                 return '\\';
00414 // Don't include ISO's single quote escape; it breaks '/\', which is (commonly?) used in XSB
00415 // If this is changed, change double_quotes() in io_builtins_xsb.c
00416 //          case '\'':                  /* single quote */
00417 //              return '\'';
00418             case '"':                   /* double quote */
00419                 return '"';
00420             case '`':                   /* back quote */
00421                 return '`';
00422             default:                    /* return the \, not an escape */
00423               (void) unGetC(c, card, instr);
00424               return '\\';
00425         }
00426     }
00427  
00428 /*  com0plain(card, instr, endeol)
00429     These comments have the form
00430         <eolcom> <char>* <newline>                      {PUNCT}
00431     or  <eolcom><eolcom> <char>* <newline>              {SIGN }
00432     depending on the classification of <eolcom>.  Note that we could
00433     handle ADA comments with no trouble at all.  There was a Pop-2
00434     dialect which had end-of-line comments using "!" where the comment
00435     could also be terminated by "!".  You could obtain the effect of
00436     including a "!" in the comment by doubling it, but what you had
00437     then was of course two comments.  The endeol parameter of this
00438     function allows the handling of comments like that which can be
00439     terminated either by a new-line character or an <endeol>, whichever
00440     comes first.  For ordinary purposes, endeol = -1 will do fine.
00441     When this is called, the initial <eolcom>s have been consumed.
00442     We return the first character after the comment.
00443     If the end of the source file is encountered, we do not treat it
00444     as an error, but quietly close the comment and return EOF as the
00445     "following" character.
00446  
00447 */
00448 static int com0plain(register FILE *card,       /* source file */
00449                      register STRFILE *instr,   /* source string, if non-NULL */
00450                      register int endeol)       /* The closing character "!" */
00451 {
00452     register int c;
00453  
00454     while ((c = GetC(card,instr)) >= 0 && c != '\n' && c != endeol) ;
00455     if (c >= 0) c = GetC(card,instr);
00456     return c;
00457 }
00458  
00459  
00460 /*  The states in the next two functions are
00461         0       - after an uninteresting character
00462         1       - after an "astcom"
00463         2       - after a  "begcom"
00464     Assuming begcom = "(", astom = "#", endcom = ")",
00465     com2plain will accept "(#)" as a complete comment.  This can
00466     be changed by initialising the state to 0 rather than 1.
00467     The same is true of com2nest, which accepts "(#(#)#) as a
00468     complete comment.  Changing it would be rather harder.
00469     Fixing the bug where the closing <astcom> is copied if it is
00470     not an asterisk may entail rejecting "(#)".
00471 */
00472  
00473 /*  com2plain(card, instr, stcom, endcom)
00474     handles PL/I-style comments, that is, comments which begin with
00475     a pair of characters <begcom><astcom> and end with a pair of
00476     chracters <astcom><endcom>, where nesting is not allowed.  For
00477     example, if we take begcom='(', astcom='*', endcom=')' as in
00478     Pascal, the comment "(* not a (* plain *)^ comment *) ends at
00479     the "^".
00480     For this kind of comment, it is perfectly sensible for any of
00481     the characters to be equal.  For example, if all three of the
00482     bracket characters are "#", then "## stuff ##" is a comment.
00483     When this is called, the initial <begcom><astcom> has been consumed.
00484 */
00485 static int com2plain(register FILE *card,       /* source file */
00486                      register STRFILE *instr,   /* source string, if non-NULL */
00487                      int astcom,                /* The asterisk character "*" */
00488                      int endcom)                /* The closing character "/" */
00489 {
00490         register int c;
00491         register int state;
00492 
00493         for (state = 0; (c = GetC(card,instr)) >= 0; ) {
00494             if (c == endcom && state) break;
00495             state = c == astcom;
00496         }
00497         if (c < 0) return 1; 
00498         else return 0; 
00499 }
00500 
00501 #ifndef MULTI_THREAD 
00502 int token_too_long_warning = 1;
00503 #endif
00504 
00505 void realloc_strbuff(CTXTdeclc char **pstrbuff, char **ps, int *pn)
00506      /* Expand token buffer when needed.
00507       * pstrbuff: base address of current buffer
00508       * ps: tail of current buffer
00509       * pn: number of elements remaining in the current buffer
00510       * --  C.R., 7/27/97
00511      */
00512 { 
00513   char *newbuff;
00514 
00515   newbuff = (char *)realloc(*pstrbuff, strbuff_len * 2);
00516   exit_if_null(newbuff);
00517   if (token_too_long_warning) {
00518     xsb_warn("Extra-long token. Runaway string?");
00519     token_too_long_warning = 0;
00520   }
00521 
00522   if (*pstrbuff != newbuff) {
00523     /* Aha, base address has changed, so change s too*/
00524     *ps += newbuff - *pstrbuff;
00525   }
00526   *pstrbuff = newbuff;
00527   *pn += strbuff_len;
00528   strbuff_len *= 2;
00529   return;
00530 }
00531 
00532 struct token_t *GetToken(CTXTdeclc FILE *card, STRFILE *instr, int prevch)
00533 {
00534         char *s;
00535         register int c, d = 0;
00536         long oldv = 0, newv = 0; 
00537         int n;
00538 
00539         if (strbuff == NULL)
00540           {
00541             /* First call for GetToken, so allocate a buffer */
00542             strbuff = (char *)mem_alloc(strbuff_len,OTHER_SPACE);
00543             exit_if_null(strbuff);
00544           }
00545         s = strbuff;
00546         n = strbuff_len;
00547 
00548         c = prevch; 
00549 START:
00550         switch (InType(c)) {
00551  
00552             case DIGIT:
00553                 /*  The following kinds of numbers exist:
00554                       (1) unsigned decimal integers: d+
00555                       (2) unsigned based integers: d+Ro+[R]
00556                       (3) unsigned floats: d* [. d*] [e +/-] d+
00557                       (4) characters: 0Rc[R]
00558                     We allow underscores in numbers too, ignoring them.
00559                 */
00560                 do {
00561                     if (c != '_') *s++ = c;
00562                     c = GetC(card,instr);
00563                 } while (InType(c) <= BREAK);
00564                 if (c == intab.radix) {  
00565                     *s = 0;
00566                     for (d = 0, s = strbuff; (c = *s++);) {
00567                       d = d*10-'0'+c;
00568                     }
00569                     if (d == 1 || d > 36) {
00570                       SyntaxError(badradix);
00571                       //                      token->type = TK_ERROR;
00572                       //                      return token;
00573                     }
00574                     if (d == 0) {
00575                       /*  0'c['] is a character code  */
00576                       d = read_character(CTXTc card, instr, -1);
00577                       //                      sprintf(strbuff, "%d", d);
00578                       rad_int = d;
00579                       d = GetC(card,instr);
00580                       //                      rad_int = atoi(strbuff);
00581                       token->nextch = d == intab.radix ? GetC(card,instr):d;
00582                       token->value = (char *)(&rad_int);
00583                       token->type = TK_INT;
00584                       return token;
00585                     }
00586                     /* handle non-0 radix */
00587 NONZERO_RADIX:      while (c = GetC(card,instr), DigVal(c) < d)
00588                         if (c != '_') {
00589                             oldv = newv;
00590                             newv = newv*d + DigVal(c);
00591                             if (newv < oldv || newv > MY_MAXINT) {
00592                                 xsb_error("Overflow in radix notation, returning float");
00593                                 double_v = oldv*1.0*d + DigVal(c);
00594                                 while (c = GetC(card,instr), DigVal(c) < 99)
00595                                     if (c != '_') 
00596                                         double_v = double_v*d + DigVal(c);
00597                                 if (c == intab.radix) 
00598                                         c = GetC(card,instr);
00599                                 token->nextch = c;
00600                                 token->value = (char *)(&double_v);
00601                                 if (c == '(')   /* Modified for HiLog */        
00602                                         token->type = TK_REALFUNC;
00603                                 else
00604                                         token->type = TK_REAL;
00605                                 return token;
00606                             }
00607                         }
00608                     rad_int = newv;
00609                     if (c == intab.radix) 
00610                         c = GetC(card,instr);
00611                     token->nextch = c;
00612                     token->value = (char *)(&rad_int);
00613                     if (c == '(')       /* Modified for HiLog */
00614                         token->type = TK_INTFUNC;
00615                     else
00616                         token->type = TK_INT;
00617                     return token;
00618                 }
00619                 else if (c == intab.dpoint) {
00620                     d = GetC(card,instr);
00621                     if (InType(d) == DIGIT) {
00622 LAB_DECIMAL:                *s++ = '.';
00623                         do {
00624                             if (d != '_') *s++ = d;
00625                             d = GetC(card,instr);
00626                         } while (InType(d) <= BREAK);
00627                         if ((d | 32) == 'e') {
00628                             *s++ = 'E';
00629                             d = GetC(card,instr);
00630                             if (d == '-') *s++ = d, d = GetC(card,instr);
00631                             else if (d == '+') d = GetC(card,instr);
00632                             if (InType(d) > BREAK) {
00633                                 SyntaxError(badexpt);
00634                                 //                              token->type = TK_ERROR;
00635                                 //                              return token;
00636                             }
00637                             do {
00638                                 if (d != '_') *s++ = d;
00639                                 d = GetC(card,instr);
00640                             } while (InType(d) <= BREAK);
00641                         }
00642                         c = d;
00643                         *s = 0;
00644                         sscanf(strbuff, "%lf", &double_v);
00645                         token->nextch = c;
00646                         token->value = (char *)(&double_v);
00647                         if (c == '(')   /* Modified for HiLog */        
00648                                 token->type = TK_REALFUNC;
00649                         else
00650                                 token->type = TK_REAL;
00651                         return token;
00652                     } else {
00653                         unGetC(d, card, instr);
00654                         /* c has not changed */
00655                     }
00656                 }
00657                 else if (c == 'b' || c == 'o' || c == 'x') {
00658                   int oc = c;
00659                   *s = 0;
00660                   for (d = 0, s = strbuff; (oc = *s++);)
00661                     d = d*10-'0'+oc;
00662                   if (d == 0) {
00663                     if (c == 'b') d = 2;
00664                     else if (c == 'o') d = 8;
00665                     else /*if (c == 'x')*/ d = 16;
00666                   } else {
00667                     token->nextch = c;
00668                     rad_int = d;
00669                     token->value = (char *)(&rad_int);
00670                     token->type = TK_INT;
00671                     return token;
00672                   }
00673                   goto NONZERO_RADIX;
00674                 }
00675                 token->nextch = c;
00676                 *s = 0;
00677                 for (rad_int = 0, s = strbuff; (c = *s++);) {
00678                   d = rad_int;
00679                   rad_int = rad_int*10-'0'+c;
00680                   if (rad_int < d || rad_int > MY_MAXINT) {
00681                     xsb_error("Overflow in integer, returning MAX_INT");
00682                     rad_int = MY_MAXINT;
00683                     break;
00684                   }
00685                 }
00686                   //            rad_int = atoi(strbuff);
00687                 token->value = (char *)(&rad_int);
00688                 if (c == '(')   /* Modified for HiLog */
00689                         token->type = TK_INTFUNC;
00690                 else
00691                         token->type = TK_INT;
00692                 return token;
00693  
00694             case BREAK:        /* Modified for HiLog */
00695               do {
00696                     if (--n < 0) {
00697                       realloc_strbuff(CTXTc &strbuff, &s, &n); 
00698                       }
00699                     *s++ = c, c = GetC(card,instr);
00700                 } while (InType(c) <= LOWER);
00701                 *s = 0;
00702                 if (c == '(') {
00703                     token->nextch = c;
00704                     token->value = strbuff;
00705                     token->type = TK_VVARFUNC;
00706                     return token;
00707                 } else {
00708                     token->nextch = c;
00709                     token->value = strbuff;
00710                     token->type = TK_VVAR;
00711                     return token;
00712                 }
00713  
00714             case UPPER:         /* Modified for HiLog */
00715                 do {
00716                     if (--n < 0) {
00717                       realloc_strbuff(CTXTc &strbuff, &s, &n);
00718                     }
00719                     *s++ = c, c = GetC(card,instr);
00720                 } while (InType(c) <= LOWER);
00721                 *s = 0;
00722                 if (c == '(') {
00723                     token->nextch = c;
00724                     token->value = strbuff;
00725                     token->type = TK_VARFUNC; 
00726                     return token;
00727                 } else {
00728                     token->nextch = c;
00729                     token->value = strbuff;
00730                     token->type = TK_VAR;
00731                     return token;
00732                 }
00733  
00734             case LOWER:
00735                 do {
00736                     if (--n < 0) {
00737                       realloc_strbuff(CTXTc &strbuff, &s, &n);
00738                     }
00739                     *s++ = c, c = GetC(card,instr);
00740                 } while (InType(c) <= LOWER);
00741                 *s = 0;
00742 SYMBOL:         if (c == '(') {
00743                     token->nextch = c;
00744                     token->value = strbuff;
00745                     token->type = TK_FUNC;
00746                     return token;
00747                 } else {
00748                     token->nextch = c;
00749                     token->value = strbuff;
00750                     token->type = TK_ATOM;
00751                     return token;
00752                 }
00753  
00754             case SIGN:
00755                 *s = c, d = GetC(card,instr);
00756                 if (c == intab.begcom && d == intab.astcom) {
00757 ASTCOM:             if (com2plain(card, instr, d, intab.endcom)) {
00758                         SyntaxError(eofinrem);
00759                         //                      token->type = TK_ERROR;
00760                         //                      return token;
00761                     }
00762                     c = GetC(card,instr);
00763                     goto START;
00764                 } else
00765                 if (c == intab.dpoint && InType(d) == DIGIT) {
00766                     *s++ = '0';
00767                     goto LAB_DECIMAL;
00768                 }
00769                 while (InType(d) == SIGN) {
00770                     if (--n == 0) {
00771                       realloc_strbuff(CTXTc &strbuff, &s, &n);
00772                     }
00773                     *++s = d, d = GetC(card,instr);
00774                 }
00775                 *++s = 0;
00776                 if (InType(d)>=SPACE && c==intab.termin && strbuff[1]==0) {
00777                     token->nextch = d;
00778                     token->value = 0;
00779                     token->type = TK_EOC;
00780                     return token;       /* i.e. '.' followed by layout */
00781                 }
00782                 c = d;
00783                 goto SYMBOL;
00784  
00785             case NOBLE:
00786                 if (c == intab.termin) {
00787                     *s = 0;
00788                     token->nextch = ' ';
00789                     token->value = 0;
00790                     token->type = TK_EOC;
00791                     return token;
00792                 } else
00793                 if (c == intab.eolcom) {
00794                     c = com0plain(card, instr, intab.endeol);
00795                     goto START;
00796                 }
00797                 *s++ = c, *s = 0;
00798                 c = GetC(card,instr);
00799                 goto SYMBOL;
00800  
00801             case PUNCT:
00802                 if (c == intab.termin) {
00803                     *s = 0;
00804                     token->nextch = ' ';
00805                     token->value = 0;
00806                     token->type = TK_EOC;
00807                     return token;
00808                 } else
00809                 if (c == intab.eolcom) {
00810                     c = com0plain(card, instr, intab.endeol);
00811                     goto START;
00812                 }
00813                 d = GetC(card,instr);
00814                 if (c == intab.begcom && d == intab.astcom) goto ASTCOM;
00815  
00816               /*  If we arrive here, c is an ordinary punctuation mark  */
00817 /*                  if (c == '(')  *s++ = ' '; */
00818                     /* In PSBProlog (as in most other Prologs) it was     */
00819                     /* necessary to distinguish between atom( and atom (  */
00820                     /* This was originally used for operators but it was  */
00821                     /* deleted by Jiyang - seems to cause no problem for  */
00822                     /* HiLog.                                             */
00823                 *s++ = c, *s = 0;
00824                 token->nextch = d;
00825                 token->value = strbuff;
00826            /*  In HiLog we need the following distinction so that we do not */
00827            /*  recognize terms of the form f(p) (c,d) which are not HiLog   */
00828            /*  terms as the same HiLog term as f(p)(c,d) which is a legal   */
00829            /*  HiLog term. All this mess is caused by the fact that this    */
00830            /*  scanner throws away all the spaces and we have no other way  */
00831            /*  of recognizing whether the next left parenthesis belongs to  */
00832            /*  the same term being read, (especially since it is not        */
00833            /*  desirable to keep the previous character read).              */
00834                 if (c == ')' && d == '(')
00835                   token->type = TK_HPUNC;
00836                 else
00837                   token->type = TK_PUNC;
00838                 return token;
00839  
00840             case CHRQT:
00841                 /*  `c[`] is read as an integer.
00842                     Eventually we should treat characters as a distinct
00843                     token type, so they can be generated on output.
00844                     If the character quote, atom quote, list quote,
00845                     or string quote is the radix character, we should
00846                     generate 0'x notation, otherwise `x`.
00847                 */
00848                 d = read_character(CTXTc card, instr, -1);
00849                 sprintf(strbuff, "%d", d);
00850                 d = GetC(card,instr);
00851                 rad_int = atoi(strbuff);
00852                 token->nextch = d == c ? GetC(card,instr) : d;
00853                 token->value = (char *)(&rad_int);
00854                 token->type = TK_INT;
00855                 return token;
00856  
00857             case ATMQT:
00858                 while ((d = read_character(CTXTc card, instr, c)) >= 0) {
00859                     if (--n < 0) {
00860                       realloc_strbuff(CTXTc &strbuff, &s, &n);
00861                     }
00862                     *s++ = d;
00863                 }
00864                 *s = 0;
00865                 c = lastc;
00866                 goto SYMBOL;
00867 
00868 /**** this case deleted, messed up treatment of $, which was STRQT
00869             case STRQT:
00870                 while ((d = read_character(CTXTc card, instr, c)) >= 0) {
00871                     if (--n < 0) {
00872                       realloc_strbuff(CTXTc &strbuff, &s, &n);
00873                     }
00874                     *s++ = d;
00875                 }
00876                 *s = 0;
00877                 token->nextch = lastc;
00878                 token->value = strbuff;
00879                 token->type = TK_STR;
00880                 return token;
00881 case deleted ****/
00882 
00883             case LISQT: 
00884                 while ((d = read_character(CTXTc card, instr, c)) >= 0) {
00885                     if (--n < 0) {
00886                       realloc_strbuff(CTXTc &strbuff, &s, &n);
00887                     }
00888                     *s++ = d;
00889                 }
00890                 *s = 0;
00891                 token->nextch = lastc;
00892                 token->value = strbuff;
00893                 token->type = TK_LIST;
00894                 return token;
00895 
00896             case EOLN:
00897             case SPACE:
00898                 c = GetC(card,instr);
00899                 goto START;
00900  
00901             case EOFCH:
00902                 if (!instr) clearerr(card);
00903                 token->nextch = ' ';
00904                 token->value = 0;
00905                 token->type = TK_EOF;
00906                 return token;
00907 
00908         }
00909         /* There is no way we can get here */
00910         xsb_abort("[Internal error] InType(%d)==%d\n", c, InType(c));
00911         /*NOTREACHED*/
00912         return FALSE; /* to pacify the compiler */
00913 }
00914 
00915 /* --- Testing routines (usually commented) ---  
00916  
00917 void main(int arc, char *argv[])
00918 {
00919   FILE *card;
00920   struct token_t *res;
00921 
00922   card = fopen(argv[1], "r");
00923   if (!card) exit(1);
00924   token->nextch = ' ';
00925   do {
00926     res = GetToken(CTXTc card, NULL, token->nextch);
00927     print_token(res->type, res->value);
00928   } while (res->type != TK_EOF);
00929 }
00930 
00931 void print_token(int token_type, char *ptr)
00932 {
00933   switch (token_type) {
00934   case TK_PUNC          : printf("TK_PUNC: %c\t", *ptr); break;
00935   case TK_VARFUNC       : printf("TK_VARFUNC: %s\t", ptr); break;
00936   case TK_VAR           : printf("TK_VAR: %s\t", ptr); break;
00937   case TK_FUNC          : printf("TK_FUNC: %s\t", ptr); break;
00938   case TK_INT           : printf("TK_INT: %ld\t", *(Integer *)ptr); break;
00939   case TK_ATOM          : printf("TK_ATOM: %s\t", ptr); break;
00940   case TK_EOC           : printf("\nTK_EOC\n"); break;
00941   case TK_VVAR          : printf("TK_VVAR: %s\t", ptr); break;
00942   case TK_VVARFUNC      : printf("TK_VVARFUNC: %s\t", ptr); break;
00943   case TK_REAL          : printf("TK_REAL: %f\t", *(double *)ptr); break;
00944   case TK_EOF           : printf("\nTK_EOF\n"); break;
00945   case TK_STR           : printf("TK_STR: %s\t", ptr); break;
00946   case TK_LIST          : printf("TK_LIST: %s\t", ptr); break;
00947   case TK_HPUNC         : printf("TK_HPUNC: %c\t", *ptr); break;
00948   case TK_INTFUNC       : printf("TK_INTFUNC: %ld\t", *(Integer *)ptr); break;
00949   case TK_REALFUNC      : printf("TK_REALFUNC: %f\t", *(double *)ptr); break;
00950   }
00951 }
00952 
00953  ----------------------------------------------- */

Generated on Wed Jul 26 13:30:43 2006 for XSB by  doxygen 1.4.5