string_xsb.c

00001 /* File:      string_xsb.c  -- string manipulation stuff
00002 ** Author(s): kifer
00003 ** Contact:   xsb-contact@cs.sunysb.edu
00004 ** 
00005 ** Copyright (C) The Research Foundation of SUNY, 1999
00006 ** 
00007 ** XSB is free software; you can redistribute it and/or modify it under the
00008 ** terms of the GNU Library General Public License as published by the Free
00009 ** Software Foundation; either version 2 of the License, or (at your option)
00010 ** any later version.
00011 ** 
00012 ** XSB is distributed in the hope that it will be useful, but WITHOUT ANY
00013 ** WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
00014 ** FOR A PARTICULAR PURPOSE.  See the GNU Library General Public License for
00015 ** more details.
00016 ** 
00017 ** You should have received a copy of the GNU Library General Public License
00018 ** along with XSB; if not, write to the Free Software Foundation,
00019 ** Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
00020 **
00021 ** $Id: string_xsb.c,v 1.19 2005/11/16 17:32:05 dwarren Exp $
00022 ** 
00023 */
00024 
00025 #include "xsb_config.h"
00026 #include "xsb_debug.h"
00027 
00028 #include <stdio.h>
00029 #include <stdlib.h>
00030 #include <string.h>
00031 
00032 #include "auxlry.h"
00033 #include "cell_xsb.h"
00034 #include "error_xsb.h"
00035 #include "psc_xsb.h"
00036 #include "hash_xsb.h"
00037 #include "tries.h"
00038 #include "choice.h"
00039 #include "deref.h"
00040 #include "memory_xsb.h"
00041 #include "heap_xsb.h"
00042 #include "register.h"
00043 #include "flags_xsb.h"
00044 #include "binding.h"
00045 #include "builtin.h"
00046 #include "cinterf.h"
00047 
00048 #include "sp_unify_xsb_i.h"
00049 #include "string_xsb.h"
00050 
00051 extern char *p_charlist_to_c_string(CTXTdeclc prolog_term term, VarString *outstring, 
00052                                     char *in_func, char *where);
00053 extern void c_string_to_p_charlist(char *name, prolog_term list,
00054                                    int regs_to_protect, char *in_func, char *where);
00055 
00056 static Cell term, term2, term3;
00057 
00058 static XSB_StrDefine(input_buffer);
00059 static XSB_StrDefine(subst_buf);
00060 static XSB_StrDefine(output_buffer);
00061 
00062 
00063 #include "ptoc_tag_xsb_i.h"
00064 
00065 
00066 xsbBool str_cat(CTXTdecl)
00067 {
00068   char *str1, *str2, *tmpstr;
00069   int tmpstr_len;
00070 
00071   term = ptoc_tag(CTXTc 1);
00072   term2 = ptoc_tag(CTXTc 2);
00073   if (isatom(term) && isatom(term2)) {
00074     str1 = string_val(term);
00075     str2 = string_val(term2);
00076     tmpstr_len = strlen(str1) + strlen(str2) + 1;
00077     
00078     tmpstr = (char *)mem_alloc(tmpstr_len,LEAK_SPACE);
00079     strcpy(tmpstr, str1);
00080     strcat(tmpstr, str2);
00081     str1 = string_find(tmpstr, 1);
00082     mem_dealloc(tmpstr,tmpstr_len,LEAK_SPACE);
00083     return atom_unify(CTXTc makestring(str1), ptoc_tag(CTXTc 3));
00084   } else return FALSE;
00085 }
00086 
00087 
00088 /*
00089   Input:
00090       Arg1: +Substr
00091       Arg2: + String
00092       Arg3: +forward/reverse (checks only f/r)
00093         f means the first match from the start of String
00094         r means the first match from the end of String
00095   Output:
00096       Arg4: Beg
00097         Beg is the offset where Substr matches. Must be a variable or an
00098         integer
00099       Arg5: End
00100         End is the offset of the next character after the end of Substr
00101         Must be a variable or an integer.
00102 
00103       Both Beg and End can be negative, in which case they represent the offset
00104       from the 2nd character past the end of String.
00105       For instance, -1 means the next character past the end of String,
00106       so End = -1 means that Substr must be a suffix of String..
00107 
00108       The meaning of End and of negative offsets is consistent with substring
00109       and string_substitute predicates.
00110 */
00111 xsbBool str_match(CTXTdecl)
00112 {
00113   static char *subptr, *stringptr, *direction, *matchptr;
00114   static int substr_beg, substr_end;
00115   int reverse=TRUE; /* search in reverse */
00116   int beg_bos_offset=TRUE; /* measure beg offset from the beg of string */
00117   int end_bos_offset=TRUE; /* measure end offset from the beg of string */
00118   int str_len, sub_len; /* length of string and substring */
00119   Cell beg_offset_term, end_offset_term;
00120 
00121   term = ptoc_tag(CTXTc 1);
00122   term2 = ptoc_tag(CTXTc 2);
00123   term3 = ptoc_tag(CTXTc 3);
00124   beg_offset_term = ptoc_tag(CTXTc 4);
00125   end_offset_term = ptoc_tag(CTXTc 5);
00126   if (!isatom(term) || !isatom(term2) || !isatom(term3)) {
00127     xsb_abort("STR_MATCH: Arguments 1,2,3 must be bound to strings");
00128   }
00129   subptr = string_val(term);
00130   stringptr = string_val(term2);
00131   direction = string_val(term3);
00132 
00133   if (*direction == 'f')
00134     reverse=FALSE;
00135   else if (*direction != 'r')
00136     xsb_abort("STR_MATCH: Argument 3 must be bound to forward/reverse");
00137 
00138   str_len=strlen(stringptr);
00139   sub_len=strlen(subptr);
00140 
00141   if (isinteger(beg_offset_term)|isboxedinteger(beg_offset_term)) {
00142     if (int_val(beg_offset_term) < 0) {
00143       beg_bos_offset = FALSE;
00144     }
00145   }
00146   if (isinteger(end_offset_term)|isboxedinteger(end_offset_term)) {
00147     if (int_val(end_offset_term) < 0) {
00148       end_bos_offset = FALSE;
00149     }
00150   }
00151 
00152   if (reverse)
00153     matchptr = xsb_strrstr(stringptr, subptr);
00154   else
00155     matchptr = strstr(stringptr, subptr);
00156 
00157   if (matchptr == NULL) return FALSE;
00158 
00159   substr_beg = (beg_bos_offset?
00160                 matchptr - stringptr : -(str_len - (matchptr - stringptr))
00161                 );
00162   substr_end = (end_bos_offset?
00163                 (matchptr - stringptr) + sub_len
00164                 : -(str_len + 1 - (matchptr - stringptr) - sub_len)
00165                 );
00166   
00167   return
00168     (p2p_unify(CTXTc beg_offset_term, makeint(substr_beg))
00169      && p2p_unify(CTXTc end_offset_term, makeint(substr_end)));
00170 }
00171 
00172 
00173 
00174 
00175 /* XSB string substitution entry point
00176    In: 
00177       Arg1: string
00178       Arg2: beginning offset
00179       Arg3: ending offset. `_' or -1: end of string, -2: char before last, etc.
00180    Out:
00181       Arg4: new (output) string
00182    Always succeeds, unless error.
00183 */
00184 xsbBool substring(CTXTdecl)
00185 {
00186   /* Prolog args are first assigned to these, so we could examine the types
00187      of these objects to determine if we got strings or atoms. */
00188   prolog_term input_term, output_term;
00189   prolog_term beg_offset_term, end_offset_term;
00190   char *input_string=NULL;    /* string where matches are to be found */
00191   int beg_offset=0, end_offset=0, input_len=0, substring_len=0;
00192   int conversion_required=FALSE;
00193 
00194   XSB_StrSet(&output_buffer,"");
00195 
00196   input_term = reg_term(CTXTc 1);  /* Arg1: string to find matches in */
00197   if (isatom(input_term)) /* check it */
00198     input_string = string_val(input_term);
00199   else if (islist(input_term)) {
00200     input_string = p_charlist_to_c_string(CTXTc input_term, &input_buffer,
00201                                           "SUBSTRING", "input string");
00202     conversion_required = TRUE;
00203   } else
00204     xsb_abort("[SUBSTRING] Arg 1 (the input string) must be an atom or a character list");
00205 
00206   input_len = strlen(input_string);
00207 
00208   /* arg 2: beginning offset */
00209   beg_offset_term = reg_term(CTXTc 2);
00210   if (! (isinteger(beg_offset_term)|isboxedinteger(beg_offset_term)))
00211     xsb_abort("[SUBSTRING] Arg 2 (the beginning offset) must be an integer");
00212   beg_offset = int_val(beg_offset_term);
00213   if (beg_offset < 0)
00214     beg_offset = 0;
00215   else if (beg_offset > input_len)
00216     beg_offset = input_len;
00217 
00218   /* arg 3: ending offset */
00219   end_offset_term = reg_term(CTXTc 3);
00220   if (isref(end_offset_term))
00221     end_offset = input_len;
00222   else if (! (isinteger(end_offset_term)|isboxedinteger(end_offset_term)))
00223     xsb_abort("[SUBSTRING] Arg 3 (the end offset) must be integer or _");
00224   else end_offset = int_val(end_offset_term);
00225 
00226   if (end_offset < 0)
00227     end_offset = input_len + 1 + end_offset;
00228   else if (end_offset > input_len)
00229     end_offset = input_len;
00230   else if (end_offset < beg_offset)
00231     end_offset = beg_offset;
00232 
00233   output_term = reg_term(CTXTc 4);
00234   if (! isref(output_term))
00235     xsb_abort("[SUBSTRING] Arg 4 (the output string) must be an unbound variable");
00236 
00237   /* do the actual replacement */
00238   substring_len = end_offset-beg_offset;
00239   XSB_StrAppendBlk(&output_buffer, input_string+beg_offset, substring_len);
00240   XSB_StrNullTerminate(&output_buffer);
00241   
00242   /* get result out */
00243   if (conversion_required)
00244     c_string_to_p_charlist(output_buffer.string, output_term,
00245                            4, "SUBSTRING", "Arg 4");
00246   else
00247     c2p_string(CTXTc output_buffer.string, output_term);
00248   
00249   return(TRUE);
00250 }
00251 
00252 
00253 /* XSB string substitution entry point: replace substrings specified in Arg2
00254    with strings in Arg3.
00255    In: 
00256        Arg1: string
00257        Arg2: substring specification, a list [s(B1,E1),s(B2,E2),...]
00258        Arg3: list of replacement strings
00259    Out:
00260        Arg4: new (output) string
00261    Always succeeds, unless error.
00262 */
00263 xsbBool string_substitute(CTXTdecl)
00264 {
00265   /* Prolog args are first assigned to these, so we could examine the types
00266      of these objects to determine if we got strings or atoms. */
00267   prolog_term input_term, output_term;
00268   prolog_term subst_reg_term, subst_spec_list_term, subst_spec_list_term1;
00269   prolog_term subst_str_term=(prolog_term)0,
00270     subst_str_list_term, subst_str_list_term1;
00271   char *input_string=NULL;    /* string where matches are to be found */
00272   char *subst_string=NULL;
00273   prolog_term beg_term, end_term;
00274   int beg_offset=0, end_offset=0, input_len;
00275   int last_pos = 0; /* last scanned pos in input string */
00276   /* the output buffer is made large enough to include the input string and the
00277      substitution string. */
00278   int conversion_required=FALSE; /* from C string to Prolog char list */
00279 
00280   XSB_StrSet(&output_buffer,"");
00281 
00282   input_term = reg_term(CTXTc 1);  /* Arg1: string to find matches in */
00283   if (isatom(input_term)) /* check it */
00284     input_string = string_val(input_term);
00285   else if (islist(input_term)) {
00286     input_string = p_charlist_to_c_string(CTXTc input_term, &input_buffer,
00287                                           "STRING_SUBSTITUTE", "input string");
00288     conversion_required = TRUE;
00289   } else
00290     xsb_abort("[STRING_SUBSTITUTE] Arg 1 (the input string) must be an atom or a character list");
00291 
00292   input_len = strlen(input_string);
00293 
00294   /* arg 2: substring specification */
00295   subst_spec_list_term = reg_term(CTXTc 2);
00296   if (!islist(subst_spec_list_term) && !isnil(subst_spec_list_term))
00297     xsb_abort("[STRING_SUBSTITUTE] Arg 2 must be a list [s(B1,E1),s(B2,E2),...]");
00298 
00299   /* handle substitution string */
00300   subst_str_list_term = reg_term(CTXTc 3);
00301   if (! islist(subst_str_list_term))
00302     xsb_abort("[STRING_SUBSTITUTE] Arg 3 must be a list of strings");
00303 
00304   output_term = reg_term(CTXTc 4);
00305   if (! isref(output_term))
00306     xsb_abort("[STRING_SUBSTITUTE] Arg 4 (the output) must be an unbound variable");
00307 
00308   subst_spec_list_term1 = subst_spec_list_term;
00309   subst_str_list_term1 = subst_str_list_term;
00310 
00311   if (isnil(subst_spec_list_term1)) {
00312     XSB_StrSet(&output_buffer, input_string);
00313     goto EXIT;
00314   }
00315   if (isnil(subst_str_list_term1))
00316     xsb_abort("[STRING_SUBSTITUTE] Arg 3 must not be an empty list");
00317 
00318   do {
00319     subst_reg_term = p2p_car(subst_spec_list_term1);
00320     subst_spec_list_term1 = p2p_cdr(subst_spec_list_term1);
00321 
00322     if (!isnil(subst_str_list_term1)) {
00323       subst_str_term = p2p_car(subst_str_list_term1);
00324       subst_str_list_term1 = p2p_cdr(subst_str_list_term1);
00325 
00326       if (isatom(subst_str_term)) {
00327         subst_string = string_val(subst_str_term);
00328       } else if (islist(subst_str_term)) {
00329         subst_string = p_charlist_to_c_string(CTXTc subst_str_term, &subst_buf,
00330                                               "STRING_SUBSTITUTE",
00331                                               "substitution string");
00332       } else 
00333         xsb_abort("[STRING_SUBSTITUTE] Arg 3 must be a list of strings");
00334     }
00335 
00336     beg_term = p2p_arg(subst_reg_term,1);
00337     end_term = p2p_arg(subst_reg_term,2);
00338 
00339     if (!(isinteger(beg_term)|isboxedinteger(beg_term)) || 
00340         !(isinteger(end_term)|isboxedinteger(end_term)))
00341       xsb_abort("[STRING_SUBSTITUTE] Non-integer in Arg 2");
00342     else{
00343       beg_offset = int_val(beg_term);
00344       end_offset = int_val(end_term);
00345     }
00346     /* -1 means end of string */
00347     if (end_offset < 0)
00348       end_offset = input_len;
00349     if ((end_offset < beg_offset) || (beg_offset < last_pos))
00350       xsb_abort("[STRING_SUBSTITUTE] Substitution regions in Arg 2 not sorted");
00351 
00352     /* do the actual replacement */
00353     XSB_StrAppendBlk(&output_buffer,input_string+last_pos,beg_offset-last_pos);
00354     XSB_StrAppend(&output_buffer, subst_string);
00355     
00356     last_pos = end_offset;
00357 
00358   } while (!isnil(subst_spec_list_term1));
00359 
00360   XSB_StrAppend(&output_buffer, input_string+end_offset);
00361 
00362  EXIT:
00363   /* get result out */
00364   if (conversion_required)
00365     c_string_to_p_charlist(output_buffer.string, output_term,
00366                            4, "STRING_SUBSTITUTE", "Arg 4");
00367   else
00368     c2p_string(CTXTc output_buffer.string, output_term);
00369   
00370   return(TRUE);
00371 }
00372 
00373 
00374 /*
00375  * strrstr.c -- find last occurence of string in another string
00376  *
00377  */
00378 
00379 char *xsb_strrstr(char *str, char *pat)
00380 {
00381   size_t len, patlen;
00382   const char *p;
00383   
00384   len = strlen(str);
00385   patlen = strlen(pat);
00386   
00387   if (patlen > len)
00388     return NULL;
00389   for (p = str + (len - patlen); p >= str; --p)
00390     if (*p == *pat && strncmp(p, pat, patlen) == 0)
00391       return (char *) p;
00392   return NULL;
00393 }

Generated on Wed Jul 26 13:30:42 2006 for XSB by  doxygen 1.4.5