regexec.c 28 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982983984985986987988989990991992993994995996997998999100010011002100310041005100610071008100910101011101210131014101510161017101810191020102110221023102410251026
  1. /*
  2. regexec.c - TRE POSIX compatible matching functions (and more).
  3. Copyright (c) 2001-2009 Ville Laurikari <[email protected]>
  4. All rights reserved.
  5. Redistribution and use in source and binary forms, with or without
  6. modification, are permitted provided that the following conditions
  7. are met:
  8. 1. Redistributions of source code must retain the above copyright
  9. notice, this list of conditions and the following disclaimer.
  10. 2. Redistributions in binary form must reproduce the above copyright
  11. notice, this list of conditions and the following disclaimer in the
  12. documentation and/or other materials provided with the distribution.
  13. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER AND CONTRIBUTORS
  14. ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  15. LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
  16. A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
  17. HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
  18. SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
  19. LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  20. DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  21. THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  22. (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  23. OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  24. */
  25. #include <stdlib.h>
  26. #include <string.h>
  27. #include <wchar.h>
  28. #include <wctype.h>
  29. #include <limits.h>
  30. #include <stdint.h>
  31. #include <regex.h>
  32. #include "tre.h"
  33. #include <assert.h>
  34. static void
  35. tre_fill_pmatch(size_t nmatch, regmatch_t pmatch[], int cflags,
  36. const tre_tnfa_t *tnfa, int *tags, int match_eo);
  37. /***********************************************************************
  38. from tre-match-utils.h
  39. ***********************************************************************/
  40. #define GET_NEXT_WCHAR() do { \
  41. prev_c = next_c; pos += pos_add_next; \
  42. if ((pos_add_next = mbtowc(&next_c, str_byte, MB_LEN_MAX)) <= 0) { \
  43. if (pos_add_next < 0) { ret = REG_NOMATCH; goto error_exit; } \
  44. else pos_add_next++; \
  45. } \
  46. str_byte += pos_add_next; \
  47. } while (0)
  48. #define IS_WORD_CHAR(c) ((c) == L'_' || tre_isalnum(c))
  49. #define CHECK_ASSERTIONS(assertions) \
  50. (((assertions & ASSERT_AT_BOL) \
  51. && (pos > 0 || reg_notbol) \
  52. && (prev_c != L'\n' || !reg_newline)) \
  53. || ((assertions & ASSERT_AT_EOL) \
  54. && (next_c != L'\0' || reg_noteol) \
  55. && (next_c != L'\n' || !reg_newline)) \
  56. || ((assertions & ASSERT_AT_BOW) \
  57. && (IS_WORD_CHAR(prev_c) || !IS_WORD_CHAR(next_c))) \
  58. || ((assertions & ASSERT_AT_EOW) \
  59. && (!IS_WORD_CHAR(prev_c) || IS_WORD_CHAR(next_c))) \
  60. || ((assertions & ASSERT_AT_WB) \
  61. && (pos != 0 && next_c != L'\0' \
  62. && IS_WORD_CHAR(prev_c) == IS_WORD_CHAR(next_c))) \
  63. || ((assertions & ASSERT_AT_WB_NEG) \
  64. && (pos == 0 || next_c == L'\0' \
  65. || IS_WORD_CHAR(prev_c) != IS_WORD_CHAR(next_c))))
  66. #define CHECK_CHAR_CLASSES(trans_i, tnfa, eflags) \
  67. (((trans_i->assertions & ASSERT_CHAR_CLASS) \
  68. && !(tnfa->cflags & REG_ICASE) \
  69. && !tre_isctype((tre_cint_t)prev_c, trans_i->u.class)) \
  70. || ((trans_i->assertions & ASSERT_CHAR_CLASS) \
  71. && (tnfa->cflags & REG_ICASE) \
  72. && !tre_isctype(tre_tolower((tre_cint_t)prev_c),trans_i->u.class) \
  73. && !tre_isctype(tre_toupper((tre_cint_t)prev_c),trans_i->u.class)) \
  74. || ((trans_i->assertions & ASSERT_CHAR_CLASS_NEG) \
  75. && tre_neg_char_classes_match(trans_i->neg_classes,(tre_cint_t)prev_c,\
  76. tnfa->cflags & REG_ICASE)))
  77. /* Returns 1 if `t1' wins `t2', 0 otherwise. */
  78. static int
  79. tre_tag_order(int num_tags, tre_tag_direction_t *tag_directions,
  80. int *t1, int *t2)
  81. {
  82. int i;
  83. for (i = 0; i < num_tags; i++)
  84. {
  85. if (tag_directions[i] == TRE_TAG_MINIMIZE)
  86. {
  87. if (t1[i] < t2[i])
  88. return 1;
  89. if (t1[i] > t2[i])
  90. return 0;
  91. }
  92. else
  93. {
  94. if (t1[i] > t2[i])
  95. return 1;
  96. if (t1[i] < t2[i])
  97. return 0;
  98. }
  99. }
  100. /* assert(0);*/
  101. return 0;
  102. }
  103. static int
  104. tre_neg_char_classes_match(tre_ctype_t *classes, tre_cint_t wc, int icase)
  105. {
  106. while (*classes != (tre_ctype_t)0)
  107. if ((!icase && tre_isctype(wc, *classes))
  108. || (icase && (tre_isctype(tre_toupper(wc), *classes)
  109. || tre_isctype(tre_tolower(wc), *classes))))
  110. return 1; /* Match. */
  111. else
  112. classes++;
  113. return 0; /* No match. */
  114. }
  115. /***********************************************************************
  116. from tre-match-parallel.c
  117. ***********************************************************************/
  118. /*
  119. This algorithm searches for matches basically by reading characters
  120. in the searched string one by one, starting at the beginning. All
  121. matching paths in the TNFA are traversed in parallel. When two or
  122. more paths reach the same state, exactly one is chosen according to
  123. tag ordering rules; if returning submatches is not required it does
  124. not matter which path is chosen.
  125. The worst case time required for finding the leftmost and longest
  126. match, or determining that there is no match, is always linearly
  127. dependent on the length of the text being searched.
  128. This algorithm cannot handle TNFAs with back referencing nodes.
  129. See `tre-match-backtrack.c'.
  130. */
  131. typedef struct {
  132. tre_tnfa_transition_t *state;
  133. int *tags;
  134. } tre_tnfa_reach_t;
  135. typedef struct {
  136. int pos;
  137. int **tags;
  138. } tre_reach_pos_t;
  139. static reg_errcode_t
  140. tre_tnfa_run_parallel(const tre_tnfa_t *tnfa, const void *string,
  141. int *match_tags, int eflags,
  142. int *match_end_ofs)
  143. {
  144. /* State variables required by GET_NEXT_WCHAR. */
  145. tre_char_t prev_c = 0, next_c = 0;
  146. const char *str_byte = string;
  147. int pos = -1;
  148. int pos_add_next = 1;
  149. #ifdef TRE_MBSTATE
  150. mbstate_t mbstate;
  151. #endif /* TRE_MBSTATE */
  152. int reg_notbol = eflags & REG_NOTBOL;
  153. int reg_noteol = eflags & REG_NOTEOL;
  154. int reg_newline = tnfa->cflags & REG_NEWLINE;
  155. reg_errcode_t ret;
  156. char *buf;
  157. tre_tnfa_transition_t *trans_i;
  158. tre_tnfa_reach_t *reach, *reach_next, *reach_i, *reach_next_i;
  159. tre_reach_pos_t *reach_pos;
  160. int *tag_i;
  161. int num_tags, i;
  162. int match_eo = -1; /* end offset of match (-1 if no match found yet) */
  163. int new_match = 0;
  164. int *tmp_tags = NULL;
  165. int *tmp_iptr;
  166. #ifdef TRE_MBSTATE
  167. memset(&mbstate, '\0', sizeof(mbstate));
  168. #endif /* TRE_MBSTATE */
  169. if (!match_tags)
  170. num_tags = 0;
  171. else
  172. num_tags = tnfa->num_tags;
  173. /* Allocate memory for temporary data required for matching. This needs to
  174. be done for every matching operation to be thread safe. This allocates
  175. everything in a single large block with calloc(). */
  176. {
  177. size_t tbytes, rbytes, pbytes, xbytes, total_bytes;
  178. char *tmp_buf;
  179. /* Ensure that tbytes and xbytes*num_states cannot overflow, and that
  180. * they don't contribute more than 1/8 of SIZE_MAX to total_bytes. */
  181. if (num_tags > SIZE_MAX/(8 * sizeof(int) * tnfa->num_states))
  182. goto error_exit;
  183. /* Likewise check rbytes. */
  184. if (tnfa->num_states+1 > SIZE_MAX/(8 * sizeof(*reach_next)))
  185. goto error_exit;
  186. /* Likewise check pbytes. */
  187. if (tnfa->num_states > SIZE_MAX/(8 * sizeof(*reach_pos)))
  188. goto error_exit;
  189. /* Compute the length of the block we need. */
  190. tbytes = sizeof(*tmp_tags) * num_tags;
  191. rbytes = sizeof(*reach_next) * (tnfa->num_states + 1);
  192. pbytes = sizeof(*reach_pos) * tnfa->num_states;
  193. xbytes = sizeof(int) * num_tags;
  194. total_bytes =
  195. (sizeof(long) - 1) * 4 /* for alignment paddings */
  196. + (rbytes + xbytes * tnfa->num_states) * 2 + tbytes + pbytes;
  197. /* Allocate the memory. */
  198. buf = calloc(total_bytes, 1);
  199. if (buf == NULL)
  200. return REG_ESPACE;
  201. /* Get the various pointers within tmp_buf (properly aligned). */
  202. tmp_tags = (void *)buf;
  203. tmp_buf = buf + tbytes;
  204. tmp_buf += ALIGN(tmp_buf, long);
  205. reach_next = (void *)tmp_buf;
  206. tmp_buf += rbytes;
  207. tmp_buf += ALIGN(tmp_buf, long);
  208. reach = (void *)tmp_buf;
  209. tmp_buf += rbytes;
  210. tmp_buf += ALIGN(tmp_buf, long);
  211. reach_pos = (void *)tmp_buf;
  212. tmp_buf += pbytes;
  213. tmp_buf += ALIGN(tmp_buf, long);
  214. for (i = 0; i < tnfa->num_states; i++)
  215. {
  216. reach[i].tags = (void *)tmp_buf;
  217. tmp_buf += xbytes;
  218. reach_next[i].tags = (void *)tmp_buf;
  219. tmp_buf += xbytes;
  220. }
  221. }
  222. for (i = 0; i < tnfa->num_states; i++)
  223. reach_pos[i].pos = -1;
  224. GET_NEXT_WCHAR();
  225. pos = 0;
  226. reach_next_i = reach_next;
  227. while (1)
  228. {
  229. /* If no match found yet, add the initial states to `reach_next'. */
  230. if (match_eo < 0)
  231. {
  232. trans_i = tnfa->initial;
  233. while (trans_i->state != NULL)
  234. {
  235. if (reach_pos[trans_i->state_id].pos < pos)
  236. {
  237. if (trans_i->assertions
  238. && CHECK_ASSERTIONS(trans_i->assertions))
  239. {
  240. trans_i++;
  241. continue;
  242. }
  243. reach_next_i->state = trans_i->state;
  244. for (i = 0; i < num_tags; i++)
  245. reach_next_i->tags[i] = -1;
  246. tag_i = trans_i->tags;
  247. if (tag_i)
  248. while (*tag_i >= 0)
  249. {
  250. if (*tag_i < num_tags)
  251. reach_next_i->tags[*tag_i] = pos;
  252. tag_i++;
  253. }
  254. if (reach_next_i->state == tnfa->final)
  255. {
  256. match_eo = pos;
  257. new_match = 1;
  258. for (i = 0; i < num_tags; i++)
  259. match_tags[i] = reach_next_i->tags[i];
  260. }
  261. reach_pos[trans_i->state_id].pos = pos;
  262. reach_pos[trans_i->state_id].tags = &reach_next_i->tags;
  263. reach_next_i++;
  264. }
  265. trans_i++;
  266. }
  267. reach_next_i->state = NULL;
  268. }
  269. else
  270. {
  271. if (num_tags == 0 || reach_next_i == reach_next)
  272. /* We have found a match. */
  273. break;
  274. }
  275. /* Check for end of string. */
  276. if (!next_c) break;
  277. GET_NEXT_WCHAR();
  278. /* Swap `reach' and `reach_next'. */
  279. reach_i = reach;
  280. reach = reach_next;
  281. reach_next = reach_i;
  282. /* For each state in `reach', weed out states that don't fulfill the
  283. minimal matching conditions. */
  284. if (tnfa->num_minimals && new_match)
  285. {
  286. new_match = 0;
  287. reach_next_i = reach_next;
  288. for (reach_i = reach; reach_i->state; reach_i++)
  289. {
  290. int skip = 0;
  291. for (i = 0; tnfa->minimal_tags[i] >= 0; i += 2)
  292. {
  293. int end = tnfa->minimal_tags[i];
  294. int start = tnfa->minimal_tags[i + 1];
  295. if (end >= num_tags)
  296. {
  297. skip = 1;
  298. break;
  299. }
  300. else if (reach_i->tags[start] == match_tags[start]
  301. && reach_i->tags[end] < match_tags[end])
  302. {
  303. skip = 1;
  304. break;
  305. }
  306. }
  307. if (!skip)
  308. {
  309. reach_next_i->state = reach_i->state;
  310. tmp_iptr = reach_next_i->tags;
  311. reach_next_i->tags = reach_i->tags;
  312. reach_i->tags = tmp_iptr;
  313. reach_next_i++;
  314. }
  315. }
  316. reach_next_i->state = NULL;
  317. /* Swap `reach' and `reach_next'. */
  318. reach_i = reach;
  319. reach = reach_next;
  320. reach_next = reach_i;
  321. }
  322. /* For each state in `reach' see if there is a transition leaving with
  323. the current input symbol to a state not yet in `reach_next', and
  324. add the destination states to `reach_next'. */
  325. reach_next_i = reach_next;
  326. for (reach_i = reach; reach_i->state; reach_i++)
  327. {
  328. for (trans_i = reach_i->state; trans_i->state; trans_i++)
  329. {
  330. /* Does this transition match the input symbol? */
  331. if (trans_i->code_min <= (tre_cint_t)prev_c &&
  332. trans_i->code_max >= (tre_cint_t)prev_c)
  333. {
  334. if (trans_i->assertions
  335. && (CHECK_ASSERTIONS(trans_i->assertions)
  336. || CHECK_CHAR_CLASSES(trans_i, tnfa, eflags)))
  337. {
  338. continue;
  339. }
  340. /* Compute the tags after this transition. */
  341. for (i = 0; i < num_tags; i++)
  342. tmp_tags[i] = reach_i->tags[i];
  343. tag_i = trans_i->tags;
  344. if (tag_i != NULL)
  345. while (*tag_i >= 0)
  346. {
  347. if (*tag_i < num_tags)
  348. tmp_tags[*tag_i] = pos;
  349. tag_i++;
  350. }
  351. if (reach_pos[trans_i->state_id].pos < pos)
  352. {
  353. /* Found an unvisited node. */
  354. reach_next_i->state = trans_i->state;
  355. tmp_iptr = reach_next_i->tags;
  356. reach_next_i->tags = tmp_tags;
  357. tmp_tags = tmp_iptr;
  358. reach_pos[trans_i->state_id].pos = pos;
  359. reach_pos[trans_i->state_id].tags = &reach_next_i->tags;
  360. if (reach_next_i->state == tnfa->final
  361. && (match_eo == -1
  362. || (num_tags > 0
  363. && reach_next_i->tags[0] <= match_tags[0])))
  364. {
  365. match_eo = pos;
  366. new_match = 1;
  367. for (i = 0; i < num_tags; i++)
  368. match_tags[i] = reach_next_i->tags[i];
  369. }
  370. reach_next_i++;
  371. }
  372. else
  373. {
  374. assert(reach_pos[trans_i->state_id].pos == pos);
  375. /* Another path has also reached this state. We choose
  376. the winner by examining the tag values for both
  377. paths. */
  378. if (tre_tag_order(num_tags, tnfa->tag_directions,
  379. tmp_tags,
  380. *reach_pos[trans_i->state_id].tags))
  381. {
  382. /* The new path wins. */
  383. tmp_iptr = *reach_pos[trans_i->state_id].tags;
  384. *reach_pos[trans_i->state_id].tags = tmp_tags;
  385. if (trans_i->state == tnfa->final)
  386. {
  387. match_eo = pos;
  388. new_match = 1;
  389. for (i = 0; i < num_tags; i++)
  390. match_tags[i] = tmp_tags[i];
  391. }
  392. tmp_tags = tmp_iptr;
  393. }
  394. }
  395. }
  396. }
  397. }
  398. reach_next_i->state = NULL;
  399. }
  400. *match_end_ofs = match_eo;
  401. ret = match_eo >= 0 ? REG_OK : REG_NOMATCH;
  402. error_exit:
  403. xfree(buf);
  404. return ret;
  405. }
  406. /***********************************************************************
  407. from tre-match-backtrack.c
  408. ***********************************************************************/
  409. /*
  410. This matcher is for regexps that use back referencing. Regexp matching
  411. with back referencing is an NP-complete problem on the number of back
  412. references. The easiest way to match them is to use a backtracking
  413. routine which basically goes through all possible paths in the TNFA
  414. and chooses the one which results in the best (leftmost and longest)
  415. match. This can be spectacularly expensive and may run out of stack
  416. space, but there really is no better known generic algorithm. Quoting
  417. Henry Spencer from comp.compilers:
  418. <URL: http://compilers.iecc.com/comparch/article/93-03-102>
  419. POSIX.2 REs require longest match, which is really exciting to
  420. implement since the obsolete ("basic") variant also includes
  421. \<digit>. I haven't found a better way of tackling this than doing
  422. a preliminary match using a DFA (or simulation) on a modified RE
  423. that just replicates subREs for \<digit>, and then doing a
  424. backtracking match to determine whether the subRE matches were
  425. right. This can be rather slow, but I console myself with the
  426. thought that people who use \<digit> deserve very slow execution.
  427. (Pun unintentional but very appropriate.)
  428. */
  429. typedef struct {
  430. int pos;
  431. const char *str_byte;
  432. tre_tnfa_transition_t *state;
  433. int state_id;
  434. int next_c;
  435. int *tags;
  436. #ifdef TRE_MBSTATE
  437. mbstate_t mbstate;
  438. #endif /* TRE_MBSTATE */
  439. } tre_backtrack_item_t;
  440. typedef struct tre_backtrack_struct {
  441. tre_backtrack_item_t item;
  442. struct tre_backtrack_struct *prev;
  443. struct tre_backtrack_struct *next;
  444. } *tre_backtrack_t;
  445. #ifdef TRE_MBSTATE
  446. #define BT_STACK_MBSTATE_IN stack->item.mbstate = (mbstate)
  447. #define BT_STACK_MBSTATE_OUT (mbstate) = stack->item.mbstate
  448. #else /* !TRE_MBSTATE */
  449. #define BT_STACK_MBSTATE_IN
  450. #define BT_STACK_MBSTATE_OUT
  451. #endif /* !TRE_MBSTATE */
  452. #define tre_bt_mem_new tre_mem_new
  453. #define tre_bt_mem_alloc tre_mem_alloc
  454. #define tre_bt_mem_destroy tre_mem_destroy
  455. #define BT_STACK_PUSH(_pos, _str_byte, _str_wide, _state, _state_id, _next_c, _tags, _mbstate) \
  456. do \
  457. { \
  458. int i; \
  459. if (!stack->next) \
  460. { \
  461. tre_backtrack_t s; \
  462. s = tre_bt_mem_alloc(mem, sizeof(*s)); \
  463. if (!s) \
  464. { \
  465. tre_bt_mem_destroy(mem); \
  466. if (tags) \
  467. xfree(tags); \
  468. if (pmatch) \
  469. xfree(pmatch); \
  470. if (states_seen) \
  471. xfree(states_seen); \
  472. return REG_ESPACE; \
  473. } \
  474. s->prev = stack; \
  475. s->next = NULL; \
  476. s->item.tags = tre_bt_mem_alloc(mem, \
  477. sizeof(*tags) * tnfa->num_tags); \
  478. if (!s->item.tags) \
  479. { \
  480. tre_bt_mem_destroy(mem); \
  481. if (tags) \
  482. xfree(tags); \
  483. if (pmatch) \
  484. xfree(pmatch); \
  485. if (states_seen) \
  486. xfree(states_seen); \
  487. return REG_ESPACE; \
  488. } \
  489. stack->next = s; \
  490. stack = s; \
  491. } \
  492. else \
  493. stack = stack->next; \
  494. stack->item.pos = (_pos); \
  495. stack->item.str_byte = (_str_byte); \
  496. stack->item.state = (_state); \
  497. stack->item.state_id = (_state_id); \
  498. stack->item.next_c = (_next_c); \
  499. for (i = 0; i < tnfa->num_tags; i++) \
  500. stack->item.tags[i] = (_tags)[i]; \
  501. BT_STACK_MBSTATE_IN; \
  502. } \
  503. while (0)
  504. #define BT_STACK_POP() \
  505. do \
  506. { \
  507. int i; \
  508. assert(stack->prev); \
  509. pos = stack->item.pos; \
  510. str_byte = stack->item.str_byte; \
  511. state = stack->item.state; \
  512. next_c = stack->item.next_c; \
  513. for (i = 0; i < tnfa->num_tags; i++) \
  514. tags[i] = stack->item.tags[i]; \
  515. BT_STACK_MBSTATE_OUT; \
  516. stack = stack->prev; \
  517. } \
  518. while (0)
  519. #undef MIN
  520. #define MIN(a, b) ((a) <= (b) ? (a) : (b))
  521. static reg_errcode_t
  522. tre_tnfa_run_backtrack(const tre_tnfa_t *tnfa, const void *string,
  523. int *match_tags, int eflags, int *match_end_ofs)
  524. {
  525. /* State variables required by GET_NEXT_WCHAR. */
  526. tre_char_t prev_c = 0, next_c = 0;
  527. const char *str_byte = string;
  528. int pos = 0;
  529. int pos_add_next = 1;
  530. #ifdef TRE_MBSTATE
  531. mbstate_t mbstate;
  532. #endif /* TRE_MBSTATE */
  533. int reg_notbol = eflags & REG_NOTBOL;
  534. int reg_noteol = eflags & REG_NOTEOL;
  535. int reg_newline = tnfa->cflags & REG_NEWLINE;
  536. /* These are used to remember the necessary values of the above
  537. variables to return to the position where the current search
  538. started from. */
  539. int next_c_start;
  540. const char *str_byte_start;
  541. int pos_start = -1;
  542. #ifdef TRE_MBSTATE
  543. mbstate_t mbstate_start;
  544. #endif /* TRE_MBSTATE */
  545. /* End offset of best match so far, or -1 if no match found yet. */
  546. int match_eo = -1;
  547. /* Tag arrays. */
  548. int *next_tags, *tags = NULL;
  549. /* Current TNFA state. */
  550. tre_tnfa_transition_t *state;
  551. int *states_seen = NULL;
  552. /* Memory allocator to for allocating the backtracking stack. */
  553. tre_mem_t mem = tre_bt_mem_new();
  554. /* The backtracking stack. */
  555. tre_backtrack_t stack;
  556. tre_tnfa_transition_t *trans_i;
  557. regmatch_t *pmatch = NULL;
  558. int ret;
  559. #ifdef TRE_MBSTATE
  560. memset(&mbstate, '\0', sizeof(mbstate));
  561. #endif /* TRE_MBSTATE */
  562. if (!mem)
  563. return REG_ESPACE;
  564. stack = tre_bt_mem_alloc(mem, sizeof(*stack));
  565. if (!stack)
  566. {
  567. ret = REG_ESPACE;
  568. goto error_exit;
  569. }
  570. stack->prev = NULL;
  571. stack->next = NULL;
  572. if (tnfa->num_tags)
  573. {
  574. tags = xmalloc(sizeof(*tags) * tnfa->num_tags);
  575. if (!tags)
  576. {
  577. ret = REG_ESPACE;
  578. goto error_exit;
  579. }
  580. }
  581. if (tnfa->num_submatches)
  582. {
  583. pmatch = xmalloc(sizeof(*pmatch) * tnfa->num_submatches);
  584. if (!pmatch)
  585. {
  586. ret = REG_ESPACE;
  587. goto error_exit;
  588. }
  589. }
  590. if (tnfa->num_states)
  591. {
  592. states_seen = xmalloc(sizeof(*states_seen) * tnfa->num_states);
  593. if (!states_seen)
  594. {
  595. ret = REG_ESPACE;
  596. goto error_exit;
  597. }
  598. }
  599. retry:
  600. {
  601. int i;
  602. for (i = 0; i < tnfa->num_tags; i++)
  603. {
  604. tags[i] = -1;
  605. if (match_tags)
  606. match_tags[i] = -1;
  607. }
  608. for (i = 0; i < tnfa->num_states; i++)
  609. states_seen[i] = 0;
  610. }
  611. state = NULL;
  612. pos = pos_start;
  613. GET_NEXT_WCHAR();
  614. pos_start = pos;
  615. next_c_start = next_c;
  616. str_byte_start = str_byte;
  617. #ifdef TRE_MBSTATE
  618. mbstate_start = mbstate;
  619. #endif /* TRE_MBSTATE */
  620. /* Handle initial states. */
  621. next_tags = NULL;
  622. for (trans_i = tnfa->initial; trans_i->state; trans_i++)
  623. {
  624. if (trans_i->assertions && CHECK_ASSERTIONS(trans_i->assertions))
  625. {
  626. continue;
  627. }
  628. if (state == NULL)
  629. {
  630. /* Start from this state. */
  631. state = trans_i->state;
  632. next_tags = trans_i->tags;
  633. }
  634. else
  635. {
  636. /* Backtrack to this state. */
  637. BT_STACK_PUSH(pos, str_byte, 0, trans_i->state,
  638. trans_i->state_id, next_c, tags, mbstate);
  639. {
  640. int *tmp = trans_i->tags;
  641. if (tmp)
  642. while (*tmp >= 0)
  643. stack->item.tags[*tmp++] = pos;
  644. }
  645. }
  646. }
  647. if (next_tags)
  648. for (; *next_tags >= 0; next_tags++)
  649. tags[*next_tags] = pos;
  650. if (state == NULL)
  651. goto backtrack;
  652. while (1)
  653. {
  654. tre_tnfa_transition_t *next_state;
  655. int empty_br_match;
  656. if (state == tnfa->final)
  657. {
  658. if (match_eo < pos
  659. || (match_eo == pos
  660. && match_tags
  661. && tre_tag_order(tnfa->num_tags, tnfa->tag_directions,
  662. tags, match_tags)))
  663. {
  664. int i;
  665. /* This match wins the previous match. */
  666. match_eo = pos;
  667. if (match_tags)
  668. for (i = 0; i < tnfa->num_tags; i++)
  669. match_tags[i] = tags[i];
  670. }
  671. /* Our TNFAs never have transitions leaving from the final state,
  672. so we jump right to backtracking. */
  673. goto backtrack;
  674. }
  675. /* Go to the next character in the input string. */
  676. empty_br_match = 0;
  677. trans_i = state;
  678. if (trans_i->state && trans_i->assertions & ASSERT_BACKREF)
  679. {
  680. /* This is a back reference state. All transitions leaving from
  681. this state have the same back reference "assertion". Instead
  682. of reading the next character, we match the back reference. */
  683. int so, eo, bt = trans_i->u.backref;
  684. int bt_len;
  685. int result;
  686. /* Get the substring we need to match against. Remember to
  687. turn off REG_NOSUB temporarily. */
  688. tre_fill_pmatch(bt + 1, pmatch, tnfa->cflags & ~REG_NOSUB,
  689. tnfa, tags, pos);
  690. so = pmatch[bt].rm_so;
  691. eo = pmatch[bt].rm_eo;
  692. bt_len = eo - so;
  693. result = strncmp((const char*)string + so, str_byte - 1,
  694. (size_t)bt_len);
  695. if (result == 0)
  696. {
  697. /* Back reference matched. Check for infinite loop. */
  698. if (bt_len == 0)
  699. empty_br_match = 1;
  700. if (empty_br_match && states_seen[trans_i->state_id])
  701. {
  702. goto backtrack;
  703. }
  704. states_seen[trans_i->state_id] = empty_br_match;
  705. /* Advance in input string and resync `prev_c', `next_c'
  706. and pos. */
  707. str_byte += bt_len - 1;
  708. pos += bt_len - 1;
  709. GET_NEXT_WCHAR();
  710. }
  711. else
  712. {
  713. goto backtrack;
  714. }
  715. }
  716. else
  717. {
  718. /* Check for end of string. */
  719. if (next_c == L'\0')
  720. goto backtrack;
  721. /* Read the next character. */
  722. GET_NEXT_WCHAR();
  723. }
  724. next_state = NULL;
  725. for (trans_i = state; trans_i->state; trans_i++)
  726. {
  727. if (trans_i->code_min <= (tre_cint_t)prev_c
  728. && trans_i->code_max >= (tre_cint_t)prev_c)
  729. {
  730. if (trans_i->assertions
  731. && (CHECK_ASSERTIONS(trans_i->assertions)
  732. || CHECK_CHAR_CLASSES(trans_i, tnfa, eflags)))
  733. {
  734. continue;
  735. }
  736. if (next_state == NULL)
  737. {
  738. /* First matching transition. */
  739. next_state = trans_i->state;
  740. next_tags = trans_i->tags;
  741. }
  742. else
  743. {
  744. /* Second matching transition. We may need to backtrack here
  745. to take this transition instead of the first one, so we
  746. push this transition in the backtracking stack so we can
  747. jump back here if needed. */
  748. BT_STACK_PUSH(pos, str_byte, 0, trans_i->state,
  749. trans_i->state_id, next_c, tags, mbstate);
  750. {
  751. int *tmp;
  752. for (tmp = trans_i->tags; tmp && *tmp >= 0; tmp++)
  753. stack->item.tags[*tmp] = pos;
  754. }
  755. #if 0 /* XXX - it's important not to look at all transitions here to keep
  756. the stack small! */
  757. break;
  758. #endif
  759. }
  760. }
  761. }
  762. if (next_state != NULL)
  763. {
  764. /* Matching transitions were found. Take the first one. */
  765. state = next_state;
  766. /* Update the tag values. */
  767. if (next_tags)
  768. while (*next_tags >= 0)
  769. tags[*next_tags++] = pos;
  770. }
  771. else
  772. {
  773. backtrack:
  774. /* A matching transition was not found. Try to backtrack. */
  775. if (stack->prev)
  776. {
  777. if (stack->item.state->assertions & ASSERT_BACKREF)
  778. {
  779. states_seen[stack->item.state_id] = 0;
  780. }
  781. BT_STACK_POP();
  782. }
  783. else if (match_eo < 0)
  784. {
  785. /* Try starting from a later position in the input string. */
  786. /* Check for end of string. */
  787. if (next_c == L'\0')
  788. {
  789. break;
  790. }
  791. next_c = next_c_start;
  792. #ifdef TRE_MBSTATE
  793. mbstate = mbstate_start;
  794. #endif /* TRE_MBSTATE */
  795. str_byte = str_byte_start;
  796. goto retry;
  797. }
  798. else
  799. {
  800. break;
  801. }
  802. }
  803. }
  804. ret = match_eo >= 0 ? REG_OK : REG_NOMATCH;
  805. *match_end_ofs = match_eo;
  806. error_exit:
  807. tre_bt_mem_destroy(mem);
  808. #ifndef TRE_USE_ALLOCA
  809. if (tags)
  810. xfree(tags);
  811. if (pmatch)
  812. xfree(pmatch);
  813. if (states_seen)
  814. xfree(states_seen);
  815. #endif /* !TRE_USE_ALLOCA */
  816. return ret;
  817. }
  818. /***********************************************************************
  819. from regexec.c
  820. ***********************************************************************/
  821. /* Fills the POSIX.2 regmatch_t array according to the TNFA tag and match
  822. endpoint values. */
  823. static void
  824. tre_fill_pmatch(size_t nmatch, regmatch_t pmatch[], int cflags,
  825. const tre_tnfa_t *tnfa, int *tags, int match_eo)
  826. {
  827. tre_submatch_data_t *submatch_data;
  828. unsigned int i, j;
  829. int *parents;
  830. i = 0;
  831. if (match_eo >= 0 && !(cflags & REG_NOSUB))
  832. {
  833. /* Construct submatch offsets from the tags. */
  834. submatch_data = tnfa->submatch_data;
  835. while (i < tnfa->num_submatches && i < nmatch)
  836. {
  837. if (submatch_data[i].so_tag == tnfa->end_tag)
  838. pmatch[i].rm_so = match_eo;
  839. else
  840. pmatch[i].rm_so = tags[submatch_data[i].so_tag];
  841. if (submatch_data[i].eo_tag == tnfa->end_tag)
  842. pmatch[i].rm_eo = match_eo;
  843. else
  844. pmatch[i].rm_eo = tags[submatch_data[i].eo_tag];
  845. /* If either of the endpoints were not used, this submatch
  846. was not part of the match. */
  847. if (pmatch[i].rm_so == -1 || pmatch[i].rm_eo == -1)
  848. pmatch[i].rm_so = pmatch[i].rm_eo = -1;
  849. i++;
  850. }
  851. /* Reset all submatches that are not within all of their parent
  852. submatches. */
  853. i = 0;
  854. while (i < tnfa->num_submatches && i < nmatch)
  855. {
  856. if (pmatch[i].rm_eo == -1)
  857. assert(pmatch[i].rm_so == -1);
  858. assert(pmatch[i].rm_so <= pmatch[i].rm_eo);
  859. parents = submatch_data[i].parents;
  860. if (parents != NULL)
  861. for (j = 0; parents[j] >= 0; j++)
  862. {
  863. if (pmatch[i].rm_so < pmatch[parents[j]].rm_so
  864. || pmatch[i].rm_eo > pmatch[parents[j]].rm_eo)
  865. pmatch[i].rm_so = pmatch[i].rm_eo = -1;
  866. }
  867. i++;
  868. }
  869. }
  870. while (i < nmatch)
  871. {
  872. pmatch[i].rm_so = -1;
  873. pmatch[i].rm_eo = -1;
  874. i++;
  875. }
  876. }
  877. /*
  878. Wrapper functions for POSIX compatible regexp matching.
  879. */
  880. int
  881. regexec(const regex_t *restrict preg, const char *restrict string,
  882. size_t nmatch, regmatch_t pmatch[restrict], int eflags)
  883. {
  884. tre_tnfa_t *tnfa = (void *)preg->TRE_REGEX_T_FIELD;
  885. reg_errcode_t status;
  886. int *tags = NULL, eo;
  887. if (tnfa->cflags & REG_NOSUB) nmatch = 0;
  888. if (tnfa->num_tags > 0 && nmatch > 0)
  889. {
  890. tags = xmalloc(sizeof(*tags) * tnfa->num_tags);
  891. if (tags == NULL)
  892. return REG_ESPACE;
  893. }
  894. /* Dispatch to the appropriate matcher. */
  895. if (tnfa->have_backrefs)
  896. {
  897. /* The regex has back references, use the backtracking matcher. */
  898. status = tre_tnfa_run_backtrack(tnfa, string, tags, eflags, &eo);
  899. }
  900. else
  901. {
  902. /* Exact matching, no back references, use the parallel matcher. */
  903. status = tre_tnfa_run_parallel(tnfa, string, tags, eflags, &eo);
  904. }
  905. if (status == REG_OK)
  906. /* A match was found, so fill the submatch registers. */
  907. tre_fill_pmatch(nmatch, pmatch, tnfa->cflags, tnfa, tags, eo);
  908. if (tags)
  909. xfree(tags);
  910. return status;
  911. }