00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024
00025
00026 #include "xsb_config.h"
00027
00028 #include <stdio.h>
00029 #include <string.h>
00030 #include <regex.h>
00031
00032 #include "auxlry.h"
00033 #include "cell_xsb.h"
00034 #include "error_xsb.h"
00035 #include "cinterf.h"
00036 #include "heap_xsb.h"
00037
00038 extern char *p_charlist_to_c_string(prolog_term term, VarString *outstring,
00039 char *in_func, char *where);
00040 extern void c_string_to_p_charlist(char *name, prolog_term list,
00041 char *in_func, char *where);
00042
00043
00044 extern unsigned long hash(char *objname, byte arity, unsigned long tbl_size);
00045
00046 #define REGEXP_TBL_SIZE 17
00047 #define NMATCH 31
00048
00049
00050
00051
00052 #define GET_MATCH_PTR(i) (input_string + match_array[i].rm_so)
00053 #define GET_MATCH_SIZE(i) (match_array[i].rm_eo - match_array[i].rm_so)
00054
00055
00056 struct regexp_tbl_entry {
00057 int flags;
00058 char *original;
00059 regex_t compiled;
00060 };
00061
00062 static struct regexp_tbl_entry regexp_tbl[REGEXP_TBL_SIZE];
00063
00064 static int xsb_re_match(char *regexp_ptr, char* match_str, int match_flags,
00065 regmatch_t **match_array, int *paren_number,
00066 char *context);
00067 static void initialize_regexp_tbl(void);
00068 static int make_flags(prolog_term flag_term, char *context);
00069
00070 static int first_call = TRUE;
00071
00072
00073 static XSB_StrDefine(input_buffer);
00074 static XSB_StrDefine(subst_buf);
00075 static XSB_StrDefine(output_buffer);
00076 static XSB_StrDefine(regexp_buffer);
00077
00078
00079
00080
00081
00082
00083
00084
00085
00086
00087
00088
00089
00090
00091
00092
00093
00094 int do_regmatch__(void)
00095 {
00096 prolog_term listHead, listTail;
00097
00098
00099 prolog_term regexp_term, input_term, offset_term;
00100 prolog_term output_term = p2p_new();
00101 int i;
00102 char *regexp_ptr=NULL;
00103 char *input_string=NULL;
00104 int match_flags=0;
00105 int return_code, paren_number, offset;
00106 regmatch_t *match_array;
00107
00108 if (first_call)
00109 initialize_regexp_tbl();
00110
00111 regexp_term = reg_term(1);
00112 if (is_string(regexp_term))
00113 regexp_ptr = string_val(regexp_term);
00114 else if (is_list(regexp_term))
00115 regexp_ptr = p_charlist_to_c_string(regexp_term, ®exp_buffer,
00116 "RE_MATCH", "regular expression");
00117 else
00118 xsb_abort("[RE_MATCH] Arg 1 (the regular expression) must be an atom or a character list");
00119
00120 input_term = reg_term(2);
00121 if (is_string(input_term))
00122 input_string = string_val(input_term);
00123 else if (is_list(input_term)) {
00124 input_string = p_charlist_to_c_string(input_term, &input_buffer,
00125 "RE_MATCH", "input string");
00126 } else
00127 xsb_abort("[RE_MATCH] Arg 2 (the input string) must be an atom or a character list");
00128
00129 offset_term = reg_term(3);
00130 if (! is_int(offset_term))
00131 xsb_abort("[RE_MATCH] Arg 3 (the offset) must be an integer");
00132 offset = int_val(offset_term);
00133 if (offset < 0 || offset > strlen(input_string))
00134 xsb_abort("[RE_MATCH] Arg 3 (=%d) must be between 0 and %d",
00135 offset, strlen(input_string));
00136
00137
00138 match_flags = make_flags(reg_term(4), "RE_MATCH");
00139
00140
00141 return_code = xsb_re_match(regexp_ptr, input_string+offset, match_flags,
00142 &match_array, &paren_number, "RE_MATCH");
00143
00144 if (! return_code) return FALSE;
00145
00146
00147
00148
00149 listTail = output_term;
00150 for (i=0; i <= paren_number; i++) {
00151 c2p_list(listTail);
00152 listHead = p2p_car(listTail);
00153
00154
00155 c2p_functor("match", 2, listHead);
00156 c2p_int(match_array[i].rm_so+offset, p2p_arg(listHead,1));
00157 c2p_int(match_array[i].rm_eo+offset, p2p_arg(listHead,2));
00158
00159 listTail = p2p_cdr(listTail);
00160 }
00161
00162 c2p_nil(listTail);
00163 return p2p_unify(output_term, reg_term(5));
00164 }
00165
00166
00167
00168
00169
00170
00171
00172
00173
00174
00175
00176
00177
00178
00179
00180
00181
00182 int do_bulkmatch__(void)
00183 {
00184 prolog_term listHead, listTail;
00185
00186
00187 prolog_term regexp_term, input_term, offset_term;
00188 prolog_term output_term = p2p_new();
00189 char *regexp_ptr=NULL;
00190 char *input_string=NULL;
00191 int match_flags=FALSE;
00192 int return_code, paren_number, offset;
00193 regmatch_t *match_array;
00194 int last_pos=0, input_len;
00195
00196 if (first_call)
00197 initialize_regexp_tbl();
00198
00199 regexp_term = reg_term(1);
00200 if (is_string(regexp_term))
00201 regexp_ptr = string_val(regexp_term);
00202 else if (is_list(regexp_term))
00203 regexp_ptr = p_charlist_to_c_string(regexp_term, ®exp_buffer,
00204 "RE_BULKMATCH", "regular expression");
00205 else
00206 xsb_abort("[RE_BULKMATCH] Arg 1 (the regular expression) must be an atom or a character list");
00207
00208 input_term = reg_term(2);
00209 if (is_string(input_term))
00210 input_string = string_val(input_term);
00211 else if (is_list(input_term)) {
00212 input_string = p_charlist_to_c_string(input_term, &input_buffer,
00213 "RE_BULKMATCH", "input string");
00214 } else
00215 xsb_abort("[RE_BULKMATCH] Arg 2 (the input string) must be an atom or a character list");
00216
00217 input_len = strlen(input_string);
00218
00219 offset_term = reg_term(3);
00220 if (! is_int(offset_term))
00221 xsb_abort("[RE_BULKMATCH] Arg 3 (the offset) must be an integer");
00222 offset = int_val(offset_term);
00223 if (offset < 0 || offset > input_len)
00224 xsb_abort("[RE_BULKMATCH] Arg 3 (=%d) must be between 0 and %d", input_len);
00225
00226
00227 match_flags = make_flags(reg_term(4), "RE_BULKMATCH");
00228
00229 last_pos = offset;
00230
00231 listTail = output_term;
00232 while (last_pos < input_len) {
00233 return_code = xsb_re_match(regexp_ptr, input_string+last_pos, match_flags,
00234 &match_array, &paren_number, "RE_BULKMATCH");
00235
00236 if (! return_code) break;
00237
00238 c2p_list(listTail);
00239 listHead = p2p_car(listTail);
00240
00241
00242 c2p_functor("match", 2, listHead);
00243 c2p_int(match_array[0].rm_so+last_pos, p2p_arg(listHead,1));
00244 c2p_int(match_array[0].rm_eo+last_pos, p2p_arg(listHead,2));
00245
00246 listTail = p2p_cdr(listTail);
00247 if (match_array[0].rm_eo > 0)
00248 last_pos = match_array[0].rm_eo+last_pos;
00249 else
00250 last_pos++;
00251 }
00252
00253 c2p_nil(listTail);
00254 return p2p_unify(output_term, reg_term(5));
00255 }
00256
00257
00258
00259
00260
00261
00262
00263
00264
00265
00266
00267
00268
00269 int do_regsubstitute__(void)
00270 {
00271
00272
00273 prolog_term input_term, output_term;
00274 prolog_term subst_reg_term, subst_spec_list_term, subst_spec_list_term1;
00275 prolog_term subst_str_term=(prolog_term)0,
00276 subst_str_list_term, subst_str_list_term1;
00277 char *input_string=NULL;
00278 char *subst_string=NULL;
00279 prolog_term beg_term, end_term;
00280 int beg_offset=0, end_offset=0, input_len;
00281 int last_pos = 0;
00282
00283
00284 int conversion_required=FALSE;
00285
00286 XSB_StrSet(&output_buffer,"");
00287
00288 input_term = reg_term(1);
00289 if (is_string(input_term))
00290 input_string = string_val(input_term);
00291 else if (is_list(input_term)) {
00292 input_string = p_charlist_to_c_string(input_term, &input_buffer,
00293 "RE_SUBSTITUTE", "input string");
00294 conversion_required = TRUE;
00295 } else
00296 xsb_abort("[RE_SUBSTITUTE] Arg 1 (the input string) must be an atom or a character list");
00297
00298 input_len = strlen(input_string);
00299
00300
00301 subst_spec_list_term = reg_term(2);
00302 if (!is_list(subst_spec_list_term) && !is_nil(subst_spec_list_term))
00303 xsb_abort("[RE_SUBSTITUTE] Arg 2 must be a list [s(B1,E1),s(B2,E2),...]");
00304
00305
00306 subst_str_list_term = reg_term(3);
00307 if (! is_list(subst_str_list_term))
00308 xsb_abort("[RE_SUBSTITUTE] Arg 3 must be a list of strings");
00309
00310 output_term = reg_term(4);
00311 if (! is_var(output_term))
00312 xsb_abort("[RE_SUBSTITUTE] Arg 4 (the output) must be an unbound variable");
00313
00314 subst_spec_list_term1 = subst_spec_list_term;
00315 subst_str_list_term1 = subst_str_list_term;
00316
00317 if (is_nil(subst_spec_list_term1)) {
00318 XSB_StrSet(&output_buffer, input_string);
00319 goto EXIT;
00320 }
00321 if (is_nil(subst_str_list_term1))
00322 xsb_abort("[RE_SUBSTITUTE] Arg 3 must not be an empty list");
00323
00324 do {
00325 subst_reg_term = p2p_car(subst_spec_list_term1);
00326 subst_spec_list_term1 = p2p_cdr(subst_spec_list_term1);
00327
00328 if (!is_nil(subst_str_list_term1)) {
00329 subst_str_term = p2p_car(subst_str_list_term1);
00330 subst_str_list_term1 = p2p_cdr(subst_str_list_term1);
00331
00332 if (is_string(subst_str_term)) {
00333 subst_string = string_val(subst_str_term);
00334 } else if (is_list(subst_str_term)) {
00335 subst_string = p_charlist_to_c_string(subst_str_term, &subst_buf,
00336 "RE_SUBSTITUTE",
00337 "substitution string");
00338 } else
00339 xsb_abort("[RE_SUBSTITUTE] Arg 3 must be a list of strings");
00340 }
00341
00342 beg_term = p2p_arg(subst_reg_term,1);
00343 end_term = p2p_arg(subst_reg_term,2);
00344
00345 if (!is_int(beg_term) || !is_int(end_term))
00346 xsb_abort("[RE_SUBSTITUTE] Non-integer in Arg 2");
00347 else{
00348 beg_offset = int_val(beg_term);
00349 end_offset = int_val(end_term);
00350 }
00351
00352 if (end_offset < 0)
00353 end_offset = input_len;
00354 if ((end_offset < beg_offset) || (beg_offset < last_pos))
00355 xsb_abort("[RE_SUBSTITUTE] Substitution regions in Arg 2 not sorted");
00356
00357
00358 XSB_StrAppendBlk(&output_buffer,input_string+last_pos,beg_offset-last_pos);
00359 XSB_StrAppend(&output_buffer, subst_string);
00360
00361 last_pos = end_offset;
00362
00363 } while (!is_nil(subst_spec_list_term1));
00364
00365 XSB_StrAppend(&output_buffer, input_string+end_offset);
00366
00367 EXIT:
00368
00369 if (conversion_required)
00370 c_string_to_p_charlist(output_buffer.string, output_term,
00371 "RE_SUBSTITUTE", "Arg 4");
00372 else
00373
00374
00375
00376
00377
00378
00379
00380 ctop_string(4, output_buffer.string);
00381
00382 return(TRUE);
00383 }
00384
00385
00386
00387
00388
00389
00390
00391
00392
00393
00394
00395
00396 int do_regsubstring__(void)
00397 {
00398
00399
00400 prolog_term input_term, output_term;
00401 prolog_term beg_offset_term, end_offset_term;
00402 char *input_string=NULL;
00403 int beg_offset, end_offset, input_len, substring_len;
00404 int conversion_required=FALSE;
00405
00406 XSB_StrSet(&output_buffer,"");
00407
00408 input_term = reg_term(1);
00409 if (is_string(input_term))
00410 input_string = string_val(input_term);
00411 else if (is_list(input_term)) {
00412 input_string = p_charlist_to_c_string(input_term, &input_buffer,
00413 "RE_SUBSTRING", "input string");
00414 conversion_required = TRUE;
00415 } else
00416 xsb_abort("[RE_SUBSTRING] Arg 1 (the input string) must be an atom or a character list");
00417
00418 input_len = strlen(input_string);
00419
00420
00421 beg_offset_term = reg_term(2);
00422 if (! is_int(beg_offset_term))
00423 xsb_abort("[RE_SUBSTRING] Arg 2 (the beginning offset) must be an integer");
00424 beg_offset = int_val(beg_offset_term);
00425 if (beg_offset < 0 || beg_offset > input_len)
00426 xsb_abort("[RE_SUBSTRING] Arg 2 (=%d) must be between 0 and %d",
00427 beg_offset, input_len);
00428
00429
00430 end_offset_term = reg_term(3);
00431 if (! is_int(end_offset_term))
00432 xsb_abort("[RE_SUBSTRING] Arg 3 (the ending offset) must be an integer");
00433 end_offset = int_val(end_offset_term);
00434 if (end_offset < 0)
00435 end_offset = input_len;
00436 else if (end_offset > input_len || end_offset < beg_offset)
00437 xsb_abort("[RE_SUBSTRING] Arg 3 (=%d) must be < 0 or between %d and %d",
00438 end_offset, beg_offset, input_len);
00439
00440 output_term = reg_term(4);
00441 if (! is_var(output_term))
00442 xsb_abort("[RE_SUBSTRING] Arg 4 (the output string) must be an unbound variable");
00443
00444
00445 substring_len = end_offset-beg_offset;
00446 XSB_StrAppendBlk(&output_buffer, input_string+beg_offset, substring_len);
00447 XSB_StrNullTerminate(&output_buffer);
00448
00449
00450 if (conversion_required)
00451 c_string_to_p_charlist(output_buffer.string, output_term,
00452 "RE_SUBSTITUTE", "Arg 4");
00453 else
00454
00455
00456
00457
00458
00459
00460
00461 ctop_string(4, output_buffer.string);
00462
00463 return(TRUE);
00464 }
00465
00466
00467
00468
00469 static XSB_StrDefine(temp_buffer);
00470 int do_regcharlist_to_string__(void)
00471 {
00472
00473 prolog_term input_term = reg_term(1);
00474
00475 p_charlist_to_c_string(input_term, &temp_buffer,
00476 "RE_CHARLIST_TO_STRING", "input string");
00477 ctop_string(2, temp_buffer.string);
00478 return TRUE;
00479 }
00480
00481
00482 static int make_flags(prolog_term flag_term, char *context)
00483 {
00484 int flags = 0;
00485 prolog_term aux_list=flag_term, head_trm;
00486 char *head;
00487
00488 if (is_var(flag_term))
00489 return REG_EXTENDED;
00490 else if (is_int(flag_term))
00491 return (REG_EXTENDED | REG_ICASE);
00492
00493 if (is_nil(flag_term))
00494 return 0;
00495
00496 if (! is_list(flag_term))
00497 xsb_abort("[%s] Arg 4 (flags) must be a variable, an integer, or a list",
00498 context);
00499
00500 do {
00501 head_trm = p2p_car(aux_list);
00502 aux_list = p2p_cdr(aux_list);
00503 if (!is_string(head_trm))
00504 xsb_abort("[%s] Arg 4: allowed flags are `extended' and `ignorecase'",
00505 context);
00506 head = string_val(head_trm);
00507 if (strcmp(head,"extended")==0)
00508 flags = flags | REG_EXTENDED;
00509 else if (strcmp(head,"ignorecase")==0)
00510 flags = flags | REG_ICASE;
00511 } while (!is_nil(aux_list));
00512
00513 return flags;
00514 }
00515
00516
00517
00518
00519
00520
00521
00522
00523
00524
00525
00526
00527
00528
00529
00530
00531 #define ERR_MSG_LEN 100
00532 static int xsb_re_match(char *regexp_ptr, char *match_str, int flags,
00533 regmatch_t **match_array, int *paren_number,
00534 char *context)
00535 {
00536 static regmatch_t matches[NMATCH];
00537 regex_t *compiled_re;
00538 int idx, err_code;
00539 char err_msg[ERR_MSG_LEN];
00540
00541 *match_array = matches;
00542
00543 idx = hash(regexp_ptr, 1, REGEXP_TBL_SIZE);
00544
00545
00546
00547
00548 compiled_re = ®exp_tbl[idx].compiled;
00549 if ((regexp_tbl[idx].original == NULL)
00550 || (0 != strcmp(regexp_ptr, regexp_tbl[idx].original))
00551 || (regexp_tbl[idx].flags != flags)
00552 ) {
00553
00554 regexp_tbl[idx].original = regexp_ptr;
00555 regexp_tbl[idx].flags = flags;
00556 if (0 == (err_code = regcomp(®exp_tbl[idx].compiled,regexp_ptr,flags)))
00557 regexp_tbl[idx].original = regexp_ptr;
00558 else {
00559 regexp_tbl[idx].original = NULL;
00560 regerror(err_code, compiled_re, err_msg, ERR_MSG_LEN);
00561 xsb_abort("[%s] %s", context, err_msg);
00562 }
00563 }
00564
00565 *paren_number = compiled_re->re_nsub;
00566 err_code = regexec(®exp_tbl[idx].compiled, match_str, NMATCH, matches, 0);
00567
00568 if (err_code == REG_NOMATCH) return FALSE;
00569
00570 if (err_code != 0) {
00571 regerror(err_code, compiled_re, err_msg, ERR_MSG_LEN);
00572 xsb_abort("[%s] %s", context, err_msg);
00573 }
00574
00575 return TRUE;
00576 }
00577
00578
00579 void initialize_regexp_tbl()
00580 {
00581 int i;
00582 first_call = FALSE;
00583 for (i=0; i<REGEXP_TBL_SIZE; i++) {
00584 regexp_tbl[i].original = NULL;
00585 regexp_tbl[i].flags = 0;
00586 }
00587 }
00588