1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34#ifdef PERL_EXT_RE_BUILD
35#include "re_top.h"
36#endif
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73#include "EXTERN.h"
74#define PERL_IN_REGCOMP_C
75#include "perl.h"
76
77#ifndef PERL_IN_XSUB_RE
78# include "INTERN.h"
79#endif
80
81#define REG_COMP_C
82#ifdef PERL_IN_XSUB_RE
83# include "re_comp.h"
84#else
85# include "regcomp.h"
86#endif
87
88#ifdef op
89#undef op
90#endif
91
92#ifdef MSDOS
93# if defined(BUGGY_MSC6)
94
95# pragma optimize("a",off)
96
97# pragma optimize("w",on )
98# endif
99#endif
100
101#ifndef STATIC
102#define STATIC static
103#endif
104
105typedef struct RExC_state_t {
106 U32 flags;
107 char *precomp;
108 regexp *rx;
109 regexp_internal *rxi;
110 char *start;
111 char *end;
112 char *parse;
113 I32 whilem_seen;
114 regnode *emit_start;
115 regnode *emit_bound;
116 regnode *emit;
117 I32 naughty;
118 I32 sawback;
119 U32 seen;
120 I32 size;
121 I32 npar;
122 I32 cpar;
123 I32 nestroot;
124 I32 extralen;
125 I32 seen_zerolen;
126 I32 seen_evals;
127 regnode **open_parens;
128 regnode **close_parens;
129 regnode *opend;
130 I32 utf8;
131 I32 orig_utf8;
132
133
134 HV *charnames;
135 HV *paren_names;
136
137 regnode **recurse;
138 I32 recurse_count;
139#if ADD_TO_REGEXEC
140 char *starttry;
141#define RExC_starttry (pRExC_state->starttry)
142#endif
143#ifdef DEBUGGING
144 const char *lastparse;
145 I32 lastnum;
146 AV *paren_name_list;
147#define RExC_lastparse (pRExC_state->lastparse)
148#define RExC_lastnum (pRExC_state->lastnum)
149#define RExC_paren_name_list (pRExC_state->paren_name_list)
150#endif
151} RExC_state_t;
152
153#define RExC_flags (pRExC_state->flags)
154#define RExC_precomp (pRExC_state->precomp)
155#define RExC_rx (pRExC_state->rx)
156#define RExC_rxi (pRExC_state->rxi)
157#define RExC_start (pRExC_state->start)
158#define RExC_end (pRExC_state->end)
159#define RExC_parse (pRExC_state->parse)
160#define RExC_whilem_seen (pRExC_state->whilem_seen)
161#ifdef RE_TRACK_PATTERN_OFFSETS
162#define RExC_offsets (pRExC_state->rxi->u.offsets)
163#endif
164#define RExC_emit (pRExC_state->emit)
165#define RExC_emit_start (pRExC_state->emit_start)
166#define RExC_emit_bound (pRExC_state->emit_bound)
167#define RExC_naughty (pRExC_state->naughty)
168#define RExC_sawback (pRExC_state->sawback)
169#define RExC_seen (pRExC_state->seen)
170#define RExC_size (pRExC_state->size)
171#define RExC_npar (pRExC_state->npar)
172#define RExC_nestroot (pRExC_state->nestroot)
173#define RExC_extralen (pRExC_state->extralen)
174#define RExC_seen_zerolen (pRExC_state->seen_zerolen)
175#define RExC_seen_evals (pRExC_state->seen_evals)
176#define RExC_utf8 (pRExC_state->utf8)
177#define RExC_orig_utf8 (pRExC_state->orig_utf8)
178#define RExC_charnames (pRExC_state->charnames)
179#define RExC_open_parens (pRExC_state->open_parens)
180#define RExC_close_parens (pRExC_state->close_parens)
181#define RExC_opend (pRExC_state->opend)
182#define RExC_paren_names (pRExC_state->paren_names)
183#define RExC_recurse (pRExC_state->recurse)
184#define RExC_recurse_count (pRExC_state->recurse_count)
185
186
187#define ISMULT1(c) ((c) == '*' || (c) == '+' || (c) == '?')
188#define ISMULT2(s) ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
189 ((*s) == '{' && regcurly(s)))
190
191#ifdef SPSTART
192#undef SPSTART
193#endif
194
195
196
197#define WORST 0
198#define HASWIDTH 0x01
199#define SIMPLE 0x02
200#define SPSTART 0x04
201#define TRYAGAIN 0x08
202#define POSTPONED 0x10
203
204#define REG_NODE_NUM(x) ((x) ? (int)((x)-RExC_emit_start) : -1)
205
206
207#if PERL_ENABLE_EXTENDED_TRIE_OPTIMISATION
208#define TRIE_STUDY_OPT
209#define FULL_TRIE_STUDY
210#define TRIE_STCLASS
211#endif
212
213
214
215#define PBYTE(u8str,paren) ((U8*)(u8str))[(paren) >> 3]
216#define PBITVAL(paren) (1 << ((paren) & 7))
217#define PAREN_TEST(u8str,paren) ( PBYTE(u8str,paren) & PBITVAL(paren))
218#define PAREN_SET(u8str,paren) PBYTE(u8str,paren) |= PBITVAL(paren)
219#define PAREN_UNSET(u8str,paren) PBYTE(u8str,paren) &= (~PBITVAL(paren))
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297typedef struct scan_data_t {
298
299
300 I32 pos_min;
301 I32 pos_delta;
302 SV *last_found;
303 I32 last_end;
304 I32 last_start_min;
305 I32 last_start_max;
306 SV **longest;
307 SV *longest_fixed;
308 I32 offset_fixed;
309 I32 *minlen_fixed;
310 I32 lookbehind_fixed;
311 SV *longest_float;
312 I32 offset_float_min;
313 I32 offset_float_max;
314 I32 *minlen_float;
315 I32 lookbehind_float;
316 I32 flags;
317 I32 whilem_c;
318 I32 *last_closep;
319 struct regnode_charclass_class *start_class;
320} scan_data_t;
321
322
323
324
325
326static const scan_data_t zero_scan_data =
327 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ,0};
328
329#define SF_BEFORE_EOL (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
330#define SF_BEFORE_SEOL 0x0001
331#define SF_BEFORE_MEOL 0x0002
332#define SF_FIX_BEFORE_EOL (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL)
333#define SF_FL_BEFORE_EOL (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL)
334
335#ifdef NO_UNARY_PLUS
336# define SF_FIX_SHIFT_EOL (0+2)
337# define SF_FL_SHIFT_EOL (0+4)
338#else
339# define SF_FIX_SHIFT_EOL (+2)
340# define SF_FL_SHIFT_EOL (+4)
341#endif
342
343#define SF_FIX_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL)
344#define SF_FIX_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL)
345
346#define SF_FL_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL)
347#define SF_FL_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL)
348#define SF_IS_INF 0x0040
349#define SF_HAS_PAR 0x0080
350#define SF_IN_PAR 0x0100
351#define SF_HAS_EVAL 0x0200
352#define SCF_DO_SUBSTR 0x0400
353#define SCF_DO_STCLASS_AND 0x0800
354#define SCF_DO_STCLASS_OR 0x1000
355#define SCF_DO_STCLASS (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
356#define SCF_WHILEM_VISITED_POS 0x2000
357
358#define SCF_TRIE_RESTUDY 0x4000
359#define SCF_SEEN_ACCEPT 0x8000
360
361#define UTF (RExC_utf8 != 0)
362#define LOC ((RExC_flags & RXf_PMf_LOCALE) != 0)
363#define FOLD ((RExC_flags & RXf_PMf_FOLD) != 0)
364
365#define OOB_UNICODE 12345678
366#define OOB_NAMEDCLASS -1
367
368#define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
369#define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
370
371
372
373#define RegexLengthToShowInErrorMessages 127
374
375
376
377
378
379
380#define MARKER1 "<-- HERE"
381#define MARKER2 " <-- HERE "
382
383#define REPORT_LOCATION " in regex; marked by " MARKER1 " in m/%.*s" MARKER2 "%s/"
384
385
386
387
388
389
390#define _FAIL(code) STMT_START { \
391 const char *ellipses = ""; \
392 IV len = RExC_end - RExC_precomp; \
393 \
394 if (!SIZE_ONLY) \
395 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
396 if (len > RegexLengthToShowInErrorMessages) { \
397 \
398 len = RegexLengthToShowInErrorMessages - 10; \
399 ellipses = "..."; \
400 } \
401 code; \
402} STMT_END
403
404#define FAIL(msg) _FAIL( \
405 Perl_croak(aTHX_ "%s in regex m/%.*s%s/", \
406 msg, (int)len, RExC_precomp, ellipses))
407
408#define FAIL2(msg,arg) _FAIL( \
409 Perl_croak(aTHX_ msg " in regex m/%.*s%s/", \
410 arg, (int)len, RExC_precomp, ellipses))
411
412
413
414
415#define Simple_vFAIL(m) STMT_START { \
416 const IV offset = RExC_parse - RExC_precomp; \
417 Perl_croak(aTHX_ "%s" REPORT_LOCATION, \
418 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
419} STMT_END
420
421
422
423
424#define vFAIL(m) STMT_START { \
425 if (!SIZE_ONLY) \
426 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
427 Simple_vFAIL(m); \
428} STMT_END
429
430
431
432
433#define Simple_vFAIL2(m,a1) STMT_START { \
434 const IV offset = RExC_parse - RExC_precomp; \
435 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, \
436 (int)offset, RExC_precomp, RExC_precomp + offset); \
437} STMT_END
438
439
440
441
442#define vFAIL2(m,a1) STMT_START { \
443 if (!SIZE_ONLY) \
444 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
445 Simple_vFAIL2(m, a1); \
446} STMT_END
447
448
449
450
451
452#define Simple_vFAIL3(m, a1, a2) STMT_START { \
453 const IV offset = RExC_parse - RExC_precomp; \
454 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, \
455 (int)offset, RExC_precomp, RExC_precomp + offset); \
456} STMT_END
457
458
459
460
461#define vFAIL3(m,a1,a2) STMT_START { \
462 if (!SIZE_ONLY) \
463 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
464 Simple_vFAIL3(m, a1, a2); \
465} STMT_END
466
467
468
469
470#define Simple_vFAIL4(m, a1, a2, a3) STMT_START { \
471 const IV offset = RExC_parse - RExC_precomp; \
472 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3, \
473 (int)offset, RExC_precomp, RExC_precomp + offset); \
474} STMT_END
475
476#define vWARN(loc,m) STMT_START { \
477 const IV offset = loc - RExC_precomp; \
478 Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s" REPORT_LOCATION, \
479 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
480} STMT_END
481
482#define vWARNdep(loc,m) STMT_START { \
483 const IV offset = loc - RExC_precomp; \
484 Perl_warner(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP), \
485 "%s" REPORT_LOCATION, \
486 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
487} STMT_END
488
489
490#define vWARN2(loc, m, a1) STMT_START { \
491 const IV offset = loc - RExC_precomp; \
492 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
493 a1, (int)offset, RExC_precomp, RExC_precomp + offset); \
494} STMT_END
495
496#define vWARN3(loc, m, a1, a2) STMT_START { \
497 const IV offset = loc - RExC_precomp; \
498 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
499 a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset); \
500} STMT_END
501
502#define vWARN4(loc, m, a1, a2, a3) STMT_START { \
503 const IV offset = loc - RExC_precomp; \
504 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
505 a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
506} STMT_END
507
508#define vWARN5(loc, m, a1, a2, a3, a4) STMT_START { \
509 const IV offset = loc - RExC_precomp; \
510 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
511 a1, a2, a3, a4, (int)offset, RExC_precomp, RExC_precomp + offset); \
512} STMT_END
513
514
515
516#define REGC(c,s) STMT_START { \
517 if (!SIZE_ONLY) *(s) = (c); else (void)(s); \
518} STMT_END
519
520
521
522
523
524
525
526#ifndef RE_TRACK_PATTERN_OFFSETS
527#define Set_Node_Offset_To_R(node,byte)
528#define Set_Node_Offset(node,byte)
529#define Set_Cur_Node_Offset
530#define Set_Node_Length_To_R(node,len)
531#define Set_Node_Length(node,len)
532#define Set_Node_Cur_Length(node)
533#define Node_Offset(n)
534#define Node_Length(n)
535#define Set_Node_Offset_Length(node,offset,len)
536#define ProgLen(ri) ri->u.proglen
537#define SetProgLen(ri,x) ri->u.proglen = x
538#else
539#define ProgLen(ri) ri->u.offsets[0]
540#define SetProgLen(ri,x) ri->u.offsets[0] = x
541#define Set_Node_Offset_To_R(node,byte) STMT_START { \
542 if (! SIZE_ONLY) { \
543 MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n", \
544 __LINE__, (int)(node), (int)(byte))); \
545 if((node) < 0) { \
546 Perl_croak(aTHX_ "value of node is %d in Offset macro", (int)(node)); \
547 } else { \
548 RExC_offsets[2*(node)-1] = (byte); \
549 } \
550 } \
551} STMT_END
552
553#define Set_Node_Offset(node,byte) \
554 Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start)
555#define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
556
557#define Set_Node_Length_To_R(node,len) STMT_START { \
558 if (! SIZE_ONLY) { \
559 MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n", \
560 __LINE__, (int)(node), (int)(len))); \
561 if((node) < 0) { \
562 Perl_croak(aTHX_ "value of node is %d in Length macro", (int)(node)); \
563 } else { \
564 RExC_offsets[2*(node)] = (len); \
565 } \
566 } \
567} STMT_END
568
569#define Set_Node_Length(node,len) \
570 Set_Node_Length_To_R((node)-RExC_emit_start, len)
571#define Set_Cur_Node_Length(len) Set_Node_Length(RExC_emit, len)
572#define Set_Node_Cur_Length(node) \
573 Set_Node_Length(node, RExC_parse - parse_start)
574
575
576#define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1])
577#define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)])
578
579#define Set_Node_Offset_Length(node,offset,len) STMT_START { \
580 Set_Node_Offset_To_R((node)-RExC_emit_start, (offset)); \
581 Set_Node_Length_To_R((node)-RExC_emit_start, (len)); \
582} STMT_END
583#endif
584
585#if PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS
586#define EXPERIMENTAL_INPLACESCAN
587#endif
588
589#define DEBUG_STUDYDATA(str,data,depth) \
590DEBUG_OPTIMISE_MORE_r(if(data){ \
591 PerlIO_printf(Perl_debug_log, \
592 "%*s" str "Pos:%"IVdf"/%"IVdf \
593 " Flags: 0x%"UVXf" Whilem_c: %"IVdf" Lcp: %"IVdf" %s", \
594 (int)(depth)*2, "", \
595 (IV)((data)->pos_min), \
596 (IV)((data)->pos_delta), \
597 (UV)((data)->flags), \
598 (IV)((data)->whilem_c), \
599 (IV)((data)->last_closep ? *((data)->last_closep) : -1), \
600 is_inf ? "INF " : "" \
601 ); \
602 if ((data)->last_found) \
603 PerlIO_printf(Perl_debug_log, \
604 "Last:'%s' %"IVdf":%"IVdf"/%"IVdf" %sFixed:'%s' @ %"IVdf \
605 " %sFloat: '%s' @ %"IVdf"/%"IVdf"", \
606 SvPVX_const((data)->last_found), \
607 (IV)((data)->last_end), \
608 (IV)((data)->last_start_min), \
609 (IV)((data)->last_start_max), \
610 ((data)->longest && \
611 (data)->longest==&((data)->longest_fixed)) ? "*" : "", \
612 SvPVX_const((data)->longest_fixed), \
613 (IV)((data)->offset_fixed), \
614 ((data)->longest && \
615 (data)->longest==&((data)->longest_float)) ? "*" : "", \
616 SvPVX_const((data)->longest_float), \
617 (IV)((data)->offset_float_min), \
618 (IV)((data)->offset_float_max) \
619 ); \
620 PerlIO_printf(Perl_debug_log,"\n"); \
621});
622
623static void clear_re(pTHX_ void *r);
624
625
626
627
628
629STATIC void
630S_scan_commit(pTHX_ const RExC_state_t *pRExC_state, scan_data_t *data, I32 *minlenp, int is_inf)
631{
632 const STRLEN l = CHR_SVLEN(data->last_found);
633 const STRLEN old_l = CHR_SVLEN(*data->longest);
634 GET_RE_DEBUG_FLAGS_DECL;
635
636 PERL_ARGS_ASSERT_SCAN_COMMIT;
637
638 if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
639 SvSetMagicSV(*data->longest, data->last_found);
640 if (*data->longest == data->longest_fixed) {
641 data->offset_fixed = l ? data->last_start_min : data->pos_min;
642 if (data->flags & SF_BEFORE_EOL)
643 data->flags
644 |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL);
645 else
646 data->flags &= ~SF_FIX_BEFORE_EOL;
647 data->minlen_fixed=minlenp;
648 data->lookbehind_fixed=0;
649 }
650 else {
651 data->offset_float_min = l ? data->last_start_min : data->pos_min;
652 data->offset_float_max = (l
653 ? data->last_start_max
654 : data->pos_min + data->pos_delta);
655 if (is_inf || (U32)data->offset_float_max > (U32)I32_MAX)
656 data->offset_float_max = I32_MAX;
657 if (data->flags & SF_BEFORE_EOL)
658 data->flags
659 |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
660 else
661 data->flags &= ~SF_FL_BEFORE_EOL;
662 data->minlen_float=minlenp;
663 data->lookbehind_float=0;
664 }
665 }
666 SvCUR_set(data->last_found, 0);
667 {
668 SV * const sv = data->last_found;
669 if (SvUTF8(sv) && SvMAGICAL(sv)) {
670 MAGIC * const mg = mg_find(sv, PERL_MAGIC_utf8);
671 if (mg)
672 mg->mg_len = 0;
673 }
674 }
675 data->last_end = -1;
676 data->flags &= ~SF_BEFORE_EOL;
677 DEBUG_STUDYDATA("commit: ",data,0);
678}
679
680
681STATIC void
682S_cl_anything(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
683{
684 PERL_ARGS_ASSERT_CL_ANYTHING;
685
686 ANYOF_CLASS_ZERO(cl);
687 ANYOF_BITMAP_SETALL(cl);
688 cl->flags = ANYOF_EOS|ANYOF_UNICODE_ALL;
689 if (LOC)
690 cl->flags |= ANYOF_LOCALE;
691}
692
693
694STATIC int
695S_cl_is_anything(const struct regnode_charclass_class *cl)
696{
697 int value;
698
699 PERL_ARGS_ASSERT_CL_IS_ANYTHING;
700
701 for (value = 0; value <= ANYOF_MAX; value += 2)
702 if (ANYOF_CLASS_TEST(cl, value) && ANYOF_CLASS_TEST(cl, value + 1))
703 return 1;
704 if (!(cl->flags & ANYOF_UNICODE_ALL))
705 return 0;
706 if (!ANYOF_BITMAP_TESTALLSET((const void*)cl))
707 return 0;
708 return 1;
709}
710
711
712STATIC void
713S_cl_init(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
714{
715 PERL_ARGS_ASSERT_CL_INIT;
716
717 Zero(cl, 1, struct regnode_charclass_class);
718 cl->type = ANYOF;
719 cl_anything(pRExC_state, cl);
720}
721
722STATIC void
723S_cl_init_zero(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
724{
725 PERL_ARGS_ASSERT_CL_INIT_ZERO;
726
727 Zero(cl, 1, struct regnode_charclass_class);
728 cl->type = ANYOF;
729 cl_anything(pRExC_state, cl);
730 if (LOC)
731 cl->flags |= ANYOF_LOCALE;
732}
733
734
735
736STATIC void
737S_cl_and(struct regnode_charclass_class *cl,
738 const struct regnode_charclass_class *and_with)
739{
740 PERL_ARGS_ASSERT_CL_AND;
741
742 assert(and_with->type == ANYOF);
743 if (!(and_with->flags & ANYOF_CLASS)
744 && !(cl->flags & ANYOF_CLASS)
745 && (and_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
746 && !(and_with->flags & ANYOF_FOLD)
747 && !(cl->flags & ANYOF_FOLD)) {
748 int i;
749
750 if (and_with->flags & ANYOF_INVERT)
751 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
752 cl->bitmap[i] &= ~and_with->bitmap[i];
753 else
754 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
755 cl->bitmap[i] &= and_with->bitmap[i];
756 }
757 if (!(and_with->flags & ANYOF_EOS))
758 cl->flags &= ~ANYOF_EOS;
759
760 if (cl->flags & ANYOF_UNICODE_ALL && and_with->flags & ANYOF_UNICODE &&
761 !(and_with->flags & ANYOF_INVERT)) {
762 cl->flags &= ~ANYOF_UNICODE_ALL;
763 cl->flags |= ANYOF_UNICODE;
764 ARG_SET(cl, ARG(and_with));
765 }
766 if (!(and_with->flags & ANYOF_UNICODE_ALL) &&
767 !(and_with->flags & ANYOF_INVERT))
768 cl->flags &= ~ANYOF_UNICODE_ALL;
769 if (!(and_with->flags & (ANYOF_UNICODE|ANYOF_UNICODE_ALL)) &&
770 !(and_with->flags & ANYOF_INVERT))
771 cl->flags &= ~ANYOF_UNICODE;
772}
773
774
775
776STATIC void
777S_cl_or(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl, const struct regnode_charclass_class *or_with)
778{
779 PERL_ARGS_ASSERT_CL_OR;
780
781 if (or_with->flags & ANYOF_INVERT) {
782
783
784
785
786
787
788
789
790
791 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
792 && !(or_with->flags & ANYOF_FOLD)
793 && !(cl->flags & ANYOF_FOLD) ) {
794 int i;
795
796 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
797 cl->bitmap[i] |= ~or_with->bitmap[i];
798 }
799 else {
800 cl_anything(pRExC_state, cl);
801 }
802 } else {
803
804 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
805 && (!(or_with->flags & ANYOF_FOLD)
806 || (cl->flags & ANYOF_FOLD)) ) {
807 int i;
808
809
810 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
811 cl->bitmap[i] |= or_with->bitmap[i];
812 if (or_with->flags & ANYOF_CLASS) {
813 for (i = 0; i < ANYOF_CLASSBITMAP_SIZE; i++)
814 cl->classflags[i] |= or_with->classflags[i];
815 cl->flags |= ANYOF_CLASS;
816 }
817 }
818 else {
819 cl_anything(pRExC_state, cl);
820 }
821 }
822 if (or_with->flags & ANYOF_EOS)
823 cl->flags |= ANYOF_EOS;
824
825 if (cl->flags & ANYOF_UNICODE && or_with->flags & ANYOF_UNICODE &&
826 ARG(cl) != ARG(or_with)) {
827 cl->flags |= ANYOF_UNICODE_ALL;
828 cl->flags &= ~ANYOF_UNICODE;
829 }
830 if (or_with->flags & ANYOF_UNICODE_ALL) {
831 cl->flags |= ANYOF_UNICODE_ALL;
832 cl->flags &= ~ANYOF_UNICODE;
833 }
834}
835
836#define TRIE_LIST_ITEM(state,idx) (trie->states[state].trans.list)[ idx ]
837#define TRIE_LIST_CUR(state) ( TRIE_LIST_ITEM( state, 0 ).forid )
838#define TRIE_LIST_LEN(state) ( TRIE_LIST_ITEM( state, 0 ).newstate )
839#define TRIE_LIST_USED(idx) ( trie->states[state].trans.list ? (TRIE_LIST_CUR( idx ) - 1) : 0 )
840
841
842#ifdef DEBUGGING
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863STATIC void
864S_dump_trie(pTHX_ const struct _reg_trie_data *trie, HV *widecharmap,
865 AV *revcharmap, U32 depth)
866{
867 U32 state;
868 SV *sv=sv_newmortal();
869 int colwidth= widecharmap ? 6 : 4;
870 GET_RE_DEBUG_FLAGS_DECL;
871
872 PERL_ARGS_ASSERT_DUMP_TRIE;
873
874 PerlIO_printf( Perl_debug_log, "%*sChar : %-6s%-6s%-4s ",
875 (int)depth * 2 + 2,"",
876 "Match","Base","Ofs" );
877
878 for( state = 0 ; state < trie->uniquecharcount ; state++ ) {
879 SV ** const tmp = av_fetch( revcharmap, state, 0);
880 if ( tmp ) {
881 PerlIO_printf( Perl_debug_log, "%*s",
882 colwidth,
883 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
884 PL_colors[0], PL_colors[1],
885 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
886 PERL_PV_ESCAPE_FIRSTCHAR
887 )
888 );
889 }
890 }
891 PerlIO_printf( Perl_debug_log, "\n%*sState|-----------------------",
892 (int)depth * 2 + 2,"");
893
894 for( state = 0 ; state < trie->uniquecharcount ; state++ )
895 PerlIO_printf( Perl_debug_log, "%.*s", colwidth, "--------");
896 PerlIO_printf( Perl_debug_log, "\n");
897
898 for( state = 1 ; state < trie->statecount ; state++ ) {
899 const U32 base = trie->states[ state ].trans.base;
900
901 PerlIO_printf( Perl_debug_log, "%*s#%4"UVXf"|", (int)depth * 2 + 2,"", (UV)state);
902
903 if ( trie->states[ state ].wordnum ) {
904 PerlIO_printf( Perl_debug_log, " W%4X", trie->states[ state ].wordnum );
905 } else {
906 PerlIO_printf( Perl_debug_log, "%6s", "" );
907 }
908
909 PerlIO_printf( Perl_debug_log, " @%4"UVXf" ", (UV)base );
910
911 if ( base ) {
912 U32 ofs = 0;
913
914 while( ( base + ofs < trie->uniquecharcount ) ||
915 ( base + ofs - trie->uniquecharcount < trie->lasttrans
916 && trie->trans[ base + ofs - trie->uniquecharcount ].check != state))
917 ofs++;
918
919 PerlIO_printf( Perl_debug_log, "+%2"UVXf"[ ", (UV)ofs);
920
921 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
922 if ( ( base + ofs >= trie->uniquecharcount ) &&
923 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
924 trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
925 {
926 PerlIO_printf( Perl_debug_log, "%*"UVXf,
927 colwidth,
928 (UV)trie->trans[ base + ofs - trie->uniquecharcount ].next );
929 } else {
930 PerlIO_printf( Perl_debug_log, "%*s",colwidth," ." );
931 }
932 }
933
934 PerlIO_printf( Perl_debug_log, "]");
935
936 }
937 PerlIO_printf( Perl_debug_log, "\n" );
938 }
939}
940
941
942
943
944
945
946STATIC void
947S_dump_trie_interim_list(pTHX_ const struct _reg_trie_data *trie,
948 HV *widecharmap, AV *revcharmap, U32 next_alloc,
949 U32 depth)
950{
951 U32 state;
952 SV *sv=sv_newmortal();
953 int colwidth= widecharmap ? 6 : 4;
954 GET_RE_DEBUG_FLAGS_DECL;
955
956 PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_LIST;
957
958
959 PerlIO_printf( Perl_debug_log, "%*sState :Word | Transition Data\n%*s%s",
960 (int)depth * 2 + 2,"", (int)depth * 2 + 2,"",
961 "------:-----+-----------------\n" );
962
963 for( state=1 ; state < next_alloc ; state ++ ) {
964 U16 charid;
965
966 PerlIO_printf( Perl_debug_log, "%*s %4"UVXf" :",
967 (int)depth * 2 + 2,"", (UV)state );
968 if ( ! trie->states[ state ].wordnum ) {
969 PerlIO_printf( Perl_debug_log, "%5s| ","");
970 } else {
971 PerlIO_printf( Perl_debug_log, "W%4x| ",
972 trie->states[ state ].wordnum
973 );
974 }
975 for( charid = 1 ; charid <= TRIE_LIST_USED( state ) ; charid++ ) {
976 SV ** const tmp = av_fetch( revcharmap, TRIE_LIST_ITEM(state,charid).forid, 0);
977 if ( tmp ) {
978 PerlIO_printf( Perl_debug_log, "%*s:%3X=%4"UVXf" | ",
979 colwidth,
980 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
981 PL_colors[0], PL_colors[1],
982 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
983 PERL_PV_ESCAPE_FIRSTCHAR
984 ) ,
985 TRIE_LIST_ITEM(state,charid).forid,
986 (UV)TRIE_LIST_ITEM(state,charid).newstate
987 );
988 if (!(charid % 10))
989 PerlIO_printf(Perl_debug_log, "\n%*s| ",
990 (int)((depth * 2) + 14), "");
991 }
992 }
993 PerlIO_printf( Perl_debug_log, "\n");
994 }
995}
996
997
998
999
1000
1001
1002
1003STATIC void
1004S_dump_trie_interim_table(pTHX_ const struct _reg_trie_data *trie,
1005 HV *widecharmap, AV *revcharmap, U32 next_alloc,
1006 U32 depth)
1007{
1008 U32 state;
1009 U16 charid;
1010 SV *sv=sv_newmortal();
1011 int colwidth= widecharmap ? 6 : 4;
1012 GET_RE_DEBUG_FLAGS_DECL;
1013
1014 PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_TABLE;
1015
1016
1017
1018
1019
1020
1021 PerlIO_printf( Perl_debug_log, "%*sChar : ",(int)depth * 2 + 2,"" );
1022
1023 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
1024 SV ** const tmp = av_fetch( revcharmap, charid, 0);
1025 if ( tmp ) {
1026 PerlIO_printf( Perl_debug_log, "%*s",
1027 colwidth,
1028 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
1029 PL_colors[0], PL_colors[1],
1030 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1031 PERL_PV_ESCAPE_FIRSTCHAR
1032 )
1033 );
1034 }
1035 }
1036
1037 PerlIO_printf( Perl_debug_log, "\n%*sState+-",(int)depth * 2 + 2,"" );
1038
1039 for( charid=0 ; charid < trie->uniquecharcount ; charid++ ) {
1040 PerlIO_printf( Perl_debug_log, "%.*s", colwidth,"--------");
1041 }
1042
1043 PerlIO_printf( Perl_debug_log, "\n" );
1044
1045 for( state=1 ; state < next_alloc ; state += trie->uniquecharcount ) {
1046
1047 PerlIO_printf( Perl_debug_log, "%*s%4"UVXf" : ",
1048 (int)depth * 2 + 2,"",
1049 (UV)TRIE_NODENUM( state ) );
1050
1051 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
1052 UV v=(UV)SAFE_TRIE_NODENUM( trie->trans[ state + charid ].next );
1053 if (v)
1054 PerlIO_printf( Perl_debug_log, "%*"UVXf, colwidth, v );
1055 else
1056 PerlIO_printf( Perl_debug_log, "%*s", colwidth, "." );
1057 }
1058 if ( ! trie->states[ TRIE_NODENUM( state ) ].wordnum ) {
1059 PerlIO_printf( Perl_debug_log, " (%4"UVXf")\n", (UV)trie->trans[ state ].check );
1060 } else {
1061 PerlIO_printf( Perl_debug_log, " (%4"UVXf") W%4X\n", (UV)trie->trans[ state ].check,
1062 trie->states[ TRIE_NODENUM( state ) ].wordnum );
1063 }
1064 }
1065}
1066
1067#endif
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186#define TRIE_STORE_REVCHAR \
1187 STMT_START { \
1188 if (UTF) { \
1189 SV *zlopp = newSV(2); \
1190 unsigned char *flrbbbbb = (unsigned char *) SvPVX(zlopp); \
1191 unsigned const char *const kapow = uvuni_to_utf8(flrbbbbb, uvc & 0xFF); \
1192 SvCUR_set(zlopp, kapow - flrbbbbb); \
1193 SvPOK_on(zlopp); \
1194 SvUTF8_on(zlopp); \
1195 av_push(revcharmap, zlopp); \
1196 } else { \
1197 char ooooff = (char)uvc; \
1198 av_push(revcharmap, newSVpvn(&ooooff, 1)); \
1199 } \
1200 } STMT_END
1201
1202#define TRIE_READ_CHAR STMT_START { \
1203 wordlen++; \
1204 if ( UTF ) { \
1205 if ( folder ) { \
1206 if ( foldlen > 0 ) { \
1207 uvc = utf8n_to_uvuni( scan, UTF8_MAXLEN, &len, uniflags ); \
1208 foldlen -= len; \
1209 scan += len; \
1210 len = 0; \
1211 } else { \
1212 uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);\
1213 uvc = to_uni_fold( uvc, foldbuf, &foldlen ); \
1214 foldlen -= UNISKIP( uvc ); \
1215 scan = foldbuf + UNISKIP( uvc ); \
1216 } \
1217 } else { \
1218 uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);\
1219 } \
1220 } else { \
1221 uvc = (U32)*uc; \
1222 len = 1; \
1223 } \
1224} STMT_END
1225
1226
1227
1228#define TRIE_LIST_PUSH(state,fid,ns) STMT_START { \
1229 if ( TRIE_LIST_CUR( state ) >=TRIE_LIST_LEN( state ) ) { \
1230 U32 ging = TRIE_LIST_LEN( state ) *= 2; \
1231 Renew( trie->states[ state ].trans.list, ging, reg_trie_trans_le ); \
1232 } \
1233 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).forid = fid; \
1234 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).newstate = ns; \
1235 TRIE_LIST_CUR( state )++; \
1236} STMT_END
1237
1238#define TRIE_LIST_NEW(state) STMT_START { \
1239 Newxz( trie->states[ state ].trans.list, \
1240 4, reg_trie_trans_le ); \
1241 TRIE_LIST_CUR( state ) = 1; \
1242 TRIE_LIST_LEN( state ) = 4; \
1243} STMT_END
1244
1245#define TRIE_HANDLE_WORD(state) STMT_START { \
1246 U16 dupe= trie->states[ state ].wordnum; \
1247 regnode * const noper_next = regnext( noper ); \
1248 \
1249 if (trie->wordlen) \
1250 trie->wordlen[ curword ] = wordlen; \
1251 DEBUG_r({ \
1252 \
1253 SV* tmp; \
1254 if (OP(noper) != NOTHING) \
1255 tmp = newSVpvn_utf8(STRING(noper), STR_LEN(noper), UTF); \
1256 else \
1257 tmp = newSVpvn_utf8( "", 0, UTF ); \
1258 av_push( trie_words, tmp ); \
1259 }); \
1260 \
1261 curword++; \
1262 \
1263 if ( noper_next < tail ) { \
1264 if (!trie->jump) \
1265 trie->jump = (U16 *) PerlMemShared_calloc( word_count + 1, sizeof(U16) ); \
1266 trie->jump[curword] = (U16)(noper_next - convert); \
1267 if (!jumper) \
1268 jumper = noper_next; \
1269 if (!nextbranch) \
1270 nextbranch= regnext(cur); \
1271 } \
1272 \
1273 if ( dupe ) { \
1274 \
1275 \
1276 \
1277 \
1278 if (!trie->nextword) \
1279 trie->nextword = (U16 *) \
1280 PerlMemShared_calloc( word_count + 1, sizeof(U16)); \
1281 while ( trie->nextword[dupe] ) \
1282 dupe= trie->nextword[dupe]; \
1283 trie->nextword[dupe]= curword; \
1284 } else { \
1285 \
1286 trie->states[ state ].wordnum = curword; \
1287 } \
1288} STMT_END
1289
1290
1291#define TRIE_TRANS_STATE(state,base,ucharcount,charid,special) \
1292 ( ( base + charid >= ucharcount \
1293 && base + charid < ubound \
1294 && state == trie->trans[ base - ucharcount + charid ].check \
1295 && trie->trans[ base - ucharcount + charid ].next ) \
1296 ? trie->trans[ base - ucharcount + charid ].next \
1297 : ( state==1 ? special : 0 ) \
1298 )
1299
1300#define MADE_TRIE 1
1301#define MADE_JUMP_TRIE 2
1302#define MADE_EXACT_TRIE 4
1303
1304STATIC I32
1305S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *first, regnode *last, regnode *tail, U32 word_count, U32 flags, U32 depth)
1306{
1307 dVAR;
1308
1309 reg_trie_data *trie;
1310 HV *widecharmap = NULL;
1311 AV *revcharmap = newAV();
1312 regnode *cur;
1313 const U32 uniflags = UTF8_ALLOW_DEFAULT;
1314 STRLEN len = 0;
1315 UV uvc = 0;
1316 U16 curword = 0;
1317 U32 next_alloc = 0;
1318 regnode *jumper = NULL;
1319 regnode *nextbranch = NULL;
1320 regnode *convert = NULL;
1321
1322 const U8 * const folder = ( flags == EXACTF
1323 ? PL_fold
1324 : ( flags == EXACTFL
1325 ? PL_fold_locale
1326 : NULL
1327 )
1328 );
1329
1330#ifdef DEBUGGING
1331 const U32 data_slot = add_data( pRExC_state, 4, "tuuu" );
1332 AV *trie_words = NULL;
1333
1334
1335
1336#else
1337 const U32 data_slot = add_data( pRExC_state, 2, "tu" );
1338 STRLEN trie_charcount=0;
1339#endif
1340 SV *re_trie_maxbuff;
1341 GET_RE_DEBUG_FLAGS_DECL;
1342
1343 PERL_ARGS_ASSERT_MAKE_TRIE;
1344#ifndef DEBUGGING
1345 PERL_UNUSED_ARG(depth);
1346#endif
1347
1348 trie = (reg_trie_data *) PerlMemShared_calloc( 1, sizeof(reg_trie_data) );
1349 trie->refcount = 1;
1350 trie->startstate = 1;
1351 trie->wordcount = word_count;
1352 RExC_rxi->data->data[ data_slot ] = (void*)trie;
1353 trie->charmap = (U16 *) PerlMemShared_calloc( 256, sizeof(U16) );
1354 if (!(UTF && folder))
1355 trie->bitmap = (char *) PerlMemShared_calloc( ANYOF_BITMAP_SIZE, 1 );
1356 DEBUG_r({
1357 trie_words = newAV();
1358 });
1359
1360 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
1361 if (!SvIOK(re_trie_maxbuff)) {
1362 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
1363 }
1364 DEBUG_OPTIMISE_r({
1365 PerlIO_printf( Perl_debug_log,
1366 "%*smake_trie start==%d, first==%d, last==%d, tail==%d depth=%d\n",
1367 (int)depth * 2 + 2, "",
1368 REG_NODE_NUM(startbranch),REG_NODE_NUM(first),
1369 REG_NODE_NUM(last), REG_NODE_NUM(tail),
1370 (int)depth);
1371 });
1372
1373
1374 if ( first == startbranch && OP( last ) != BRANCH ) {
1375
1376 convert = first;
1377 } else {
1378
1379 convert = NEXTOPER( first );
1380 }
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1406 regnode * const noper = NEXTOPER( cur );
1407 const U8 *uc = (U8*)STRING( noper );
1408 const U8 * const e = uc + STR_LEN( noper );
1409 STRLEN foldlen = 0;
1410 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1411 const U8 *scan = (U8*)NULL;
1412 U32 wordlen = 0;
1413 STRLEN chars = 0;
1414 bool set_bit = trie->bitmap ? 1 : 0;
1415
1416 if (OP(noper) == NOTHING) {
1417 trie->minlen= 0;
1418 continue;
1419 }
1420 if ( set_bit )
1421 TRIE_BITMAP_SET(trie,*uc);
1422
1423
1424 for ( ; uc < e ; uc += len ) {
1425 TRIE_CHARCOUNT(trie)++;
1426 TRIE_READ_CHAR;
1427 chars++;
1428 if ( uvc < 256 ) {
1429 if ( !trie->charmap[ uvc ] ) {
1430 trie->charmap[ uvc ]=( ++trie->uniquecharcount );
1431 if ( folder )
1432 trie->charmap[ folder[ uvc ] ] = trie->charmap[ uvc ];
1433 TRIE_STORE_REVCHAR;
1434 }
1435 if ( set_bit ) {
1436
1437
1438 TRIE_BITMAP_SET(trie,uvc);
1439
1440
1441 if ( folder ) TRIE_BITMAP_SET(trie,folder[ uvc ]);
1442
1443 if ( !UTF ) {
1444
1445
1446 if (127 < uvc && uvc < 192) {
1447 TRIE_BITMAP_SET(trie,194);
1448 } else if (191 < uvc ) {
1449 TRIE_BITMAP_SET(trie,195);
1450
1451 }
1452 }
1453 set_bit = 0;
1454 }
1455 } else {
1456 SV** svpp;
1457 if ( !widecharmap )
1458 widecharmap = newHV();
1459
1460 svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 1 );
1461
1462 if ( !svpp )
1463 Perl_croak( aTHX_ "error creating/fetching widecharmap entry for 0x%"UVXf, uvc );
1464
1465 if ( !SvTRUE( *svpp ) ) {
1466 sv_setiv( *svpp, ++trie->uniquecharcount );
1467 TRIE_STORE_REVCHAR;
1468 }
1469 }
1470 }
1471 if( cur == first ) {
1472 trie->minlen=chars;
1473 trie->maxlen=chars;
1474 } else if (chars < trie->minlen) {
1475 trie->minlen=chars;
1476 } else if (chars > trie->maxlen) {
1477 trie->maxlen=chars;
1478 }
1479
1480 }
1481 DEBUG_TRIE_COMPILE_r(
1482 PerlIO_printf( Perl_debug_log, "%*sTRIE(%s): W:%d C:%d Uq:%d Min:%d Max:%d\n",
1483 (int)depth * 2 + 2,"",
1484 ( widecharmap ? "UTF8" : "NATIVE" ), (int)word_count,
1485 (int)TRIE_CHARCOUNT(trie), trie->uniquecharcount,
1486 (int)trie->minlen, (int)trie->maxlen )
1487 );
1488 trie->wordlen = (U32 *) PerlMemShared_calloc( word_count, sizeof(U32) );
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512 if ( (IV)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1) > SvIV(re_trie_maxbuff) ) {
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525 STRLEN transcount = 1;
1526
1527 DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
1528 "%*sCompiling trie using list compiler\n",
1529 (int)depth * 2 + 2, ""));
1530
1531 trie->states = (reg_trie_state *)
1532 PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
1533 sizeof(reg_trie_state) );
1534 TRIE_LIST_NEW(1);
1535 next_alloc = 2;
1536
1537 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1538
1539 regnode * const noper = NEXTOPER( cur );
1540 U8 *uc = (U8*)STRING( noper );
1541 const U8 * const e = uc + STR_LEN( noper );
1542 U32 state = 1;
1543 U16 charid = 0;
1544 U8 *scan = (U8*)NULL;
1545 STRLEN foldlen = 0;
1546 U32 wordlen = 0;
1547 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1548
1549 if (OP(noper) != NOTHING) {
1550 for ( ; uc < e ; uc += len ) {
1551
1552 TRIE_READ_CHAR;
1553
1554 if ( uvc < 256 ) {
1555 charid = trie->charmap[ uvc ];
1556 } else {
1557 SV** const svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 0);
1558 if ( !svpp ) {
1559 charid = 0;
1560 } else {
1561 charid=(U16)SvIV( *svpp );
1562 }
1563 }
1564
1565 if ( charid ) {
1566
1567 U16 check;
1568 U32 newstate = 0;
1569
1570 charid--;
1571 if ( !trie->states[ state ].trans.list ) {
1572 TRIE_LIST_NEW( state );
1573 }
1574 for ( check = 1; check <= TRIE_LIST_USED( state ); check++ ) {
1575 if ( TRIE_LIST_ITEM( state, check ).forid == charid ) {
1576 newstate = TRIE_LIST_ITEM( state, check ).newstate;
1577 break;
1578 }
1579 }
1580 if ( ! newstate ) {
1581 newstate = next_alloc++;
1582 TRIE_LIST_PUSH( state, charid, newstate );
1583 transcount++;
1584 }
1585 state = newstate;
1586 } else {
1587 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
1588 }
1589 }
1590 }
1591 TRIE_HANDLE_WORD(state);
1592
1593 }
1594
1595
1596 trie->statecount = next_alloc;
1597 trie->states = (reg_trie_state *)
1598 PerlMemShared_realloc( trie->states,
1599 next_alloc
1600 * sizeof(reg_trie_state) );
1601
1602
1603 DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_list(trie, widecharmap,
1604 revcharmap, next_alloc,
1605 depth+1)
1606 );
1607
1608 trie->trans = (reg_trie_trans *)
1609 PerlMemShared_calloc( transcount, sizeof(reg_trie_trans) );
1610 {
1611 U32 state;
1612 U32 tp = 0;
1613 U32 zp = 0;
1614
1615
1616 for( state=1 ; state < next_alloc ; state ++ ) {
1617 U32 base=0;
1618
1619
1620
1621
1622
1623
1624
1625 if (trie->states[state].trans.list) {
1626 U16 minid=TRIE_LIST_ITEM( state, 1).forid;
1627 U16 maxid=minid;
1628 U16 idx;
1629
1630 for( idx = 2 ; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
1631 const U16 forid = TRIE_LIST_ITEM( state, idx).forid;
1632 if ( forid < minid ) {
1633 minid=forid;
1634 } else if ( forid > maxid ) {
1635 maxid=forid;
1636 }
1637 }
1638 if ( transcount < tp + maxid - minid + 1) {
1639 transcount *= 2;
1640 trie->trans = (reg_trie_trans *)
1641 PerlMemShared_realloc( trie->trans,
1642 transcount
1643 * sizeof(reg_trie_trans) );
1644 Zero( trie->trans + (transcount / 2), transcount / 2 , reg_trie_trans );
1645 }
1646 base = trie->uniquecharcount + tp - minid;
1647 if ( maxid == minid ) {
1648 U32 set = 0;
1649 for ( ; zp < tp ; zp++ ) {
1650 if ( ! trie->trans[ zp ].next ) {
1651 base = trie->uniquecharcount + zp - minid;
1652 trie->trans[ zp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1653 trie->trans[ zp ].check = state;
1654 set = 1;
1655 break;
1656 }
1657 }
1658 if ( !set ) {
1659 trie->trans[ tp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1660 trie->trans[ tp ].check = state;
1661 tp++;
1662 zp = tp;
1663 }
1664 } else {
1665 for ( idx=1; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
1666 const U32 tid = base - trie->uniquecharcount + TRIE_LIST_ITEM( state, idx ).forid;
1667 trie->trans[ tid ].next = TRIE_LIST_ITEM( state, idx ).newstate;
1668 trie->trans[ tid ].check = state;
1669 }
1670 tp += ( maxid - minid + 1 );
1671 }
1672 Safefree(trie->states[ state ].trans.list);
1673 }
1674
1675
1676
1677
1678
1679 trie->states[ state ].trans.base=base;
1680 }
1681 trie->lasttrans = tp + 1;
1682 }
1683 } else {
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719 DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
1720 "%*sCompiling trie using table compiler\n",
1721 (int)depth * 2 + 2, ""));
1722
1723 trie->trans = (reg_trie_trans *)
1724 PerlMemShared_calloc( ( TRIE_CHARCOUNT(trie) + 1 )
1725 * trie->uniquecharcount + 1,
1726 sizeof(reg_trie_trans) );
1727 trie->states = (reg_trie_state *)
1728 PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
1729 sizeof(reg_trie_state) );
1730 next_alloc = trie->uniquecharcount + 1;
1731
1732
1733 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1734
1735 regnode * const noper = NEXTOPER( cur );
1736 const U8 *uc = (U8*)STRING( noper );
1737 const U8 * const e = uc + STR_LEN( noper );
1738
1739 U32 state = 1;
1740
1741 U16 charid = 0;
1742 U32 accept_state = 0;
1743 U8 *scan = (U8*)NULL;
1744
1745 STRLEN foldlen = 0;
1746 U32 wordlen = 0;
1747 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1748
1749 if ( OP(noper) != NOTHING ) {
1750 for ( ; uc < e ; uc += len ) {
1751
1752 TRIE_READ_CHAR;
1753
1754 if ( uvc < 256 ) {
1755 charid = trie->charmap[ uvc ];
1756 } else {
1757 SV* const * const svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 0);
1758 charid = svpp ? (U16)SvIV(*svpp) : 0;
1759 }
1760 if ( charid ) {
1761 charid--;
1762 if ( !trie->trans[ state + charid ].next ) {
1763 trie->trans[ state + charid ].next = next_alloc;
1764 trie->trans[ state ].check++;
1765 next_alloc += trie->uniquecharcount;
1766 }
1767 state = trie->trans[ state + charid ].next;
1768 } else {
1769 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
1770 }
1771
1772 }
1773 }
1774 accept_state = TRIE_NODENUM( state );
1775 TRIE_HANDLE_WORD(accept_state);
1776
1777 }
1778
1779
1780 DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_table(trie, widecharmap,
1781 revcharmap,
1782 next_alloc, depth+1));
1783
1784 {
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846 const U32 laststate = TRIE_NODENUM( next_alloc );
1847 U32 state, charid;
1848 U32 pos = 0, zp=0;
1849 trie->statecount = laststate;
1850
1851 for ( state = 1 ; state < laststate ; state++ ) {
1852 U8 flag = 0;
1853 const U32 stateidx = TRIE_NODEIDX( state );
1854 const U32 o_used = trie->trans[ stateidx ].check;
1855 U32 used = trie->trans[ stateidx ].check;
1856 trie->trans[ stateidx ].check = 0;
1857
1858 for ( charid = 0 ; used && charid < trie->uniquecharcount ; charid++ ) {
1859 if ( flag || trie->trans[ stateidx + charid ].next ) {
1860 if ( trie->trans[ stateidx + charid ].next ) {
1861 if (o_used == 1) {
1862 for ( ; zp < pos ; zp++ ) {
1863 if ( ! trie->trans[ zp ].next ) {
1864 break;
1865 }
1866 }
1867 trie->states[ state ].trans.base = zp + trie->uniquecharcount - charid ;
1868 trie->trans[ zp ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
1869 trie->trans[ zp ].check = state;
1870 if ( ++zp > pos ) pos = zp;
1871 break;
1872 }
1873 used--;
1874 }
1875 if ( !flag ) {
1876 flag = 1;
1877 trie->states[ state ].trans.base = pos + trie->uniquecharcount - charid ;
1878 }
1879 trie->trans[ pos ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
1880 trie->trans[ pos ].check = state;
1881 pos++;
1882 }
1883 }
1884 }
1885 trie->lasttrans = pos + 1;
1886 trie->states = (reg_trie_state *)
1887 PerlMemShared_realloc( trie->states, laststate
1888 * sizeof(reg_trie_state) );
1889 DEBUG_TRIE_COMPILE_MORE_r(
1890 PerlIO_printf( Perl_debug_log,
1891 "%*sAlloc: %d Orig: %"IVdf" elements, Final:%"IVdf". Savings of %%%5.2f\n",
1892 (int)depth * 2 + 2,"",
1893 (int)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1 ),
1894 (IV)next_alloc,
1895 (IV)pos,
1896 ( ( next_alloc - pos ) * 100 ) / (double)next_alloc );
1897 );
1898
1899 }
1900 }
1901 DEBUG_TRIE_COMPILE_MORE_r(
1902 PerlIO_printf(Perl_debug_log, "%*sStatecount:%"UVxf" Lasttrans:%"UVxf"\n",
1903 (int)depth * 2 + 2, "",
1904 (UV)trie->statecount,
1905 (UV)trie->lasttrans)
1906 );
1907
1908 trie->trans = (reg_trie_trans *)
1909 PerlMemShared_realloc( trie->trans, trie->lasttrans
1910 * sizeof(reg_trie_trans) );
1911
1912
1913 DEBUG_TRIE_COMPILE_r(dump_trie(trie, widecharmap, revcharmap, depth+1));
1914
1915 {
1916 U8 nodetype =(U8)(flags & 0xFF);
1917 char *str=NULL;
1918
1919#ifdef DEBUGGING
1920 regnode *optimize = NULL;
1921#ifdef RE_TRACK_PATTERN_OFFSETS
1922
1923 U32 mjd_offset = 0;
1924 U32 mjd_nodelen = 0;
1925#endif
1926#endif
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936 if ( first != startbranch || OP( last ) == BRANCH ) {
1937
1938 NEXT_OFF( first ) = (U16)(last - first);
1939#ifdef RE_TRACK_PATTERN_OFFSETS
1940 DEBUG_r({
1941 mjd_offset= Node_Offset((convert));
1942 mjd_nodelen= Node_Length((convert));
1943 });
1944#endif
1945
1946 }
1947#ifdef RE_TRACK_PATTERN_OFFSETS
1948 else {
1949 DEBUG_r({
1950 const regnode *nop = NEXTOPER( convert );
1951 mjd_offset= Node_Offset((nop));
1952 mjd_nodelen= Node_Length((nop));
1953 });
1954 }
1955 DEBUG_OPTIMISE_r(
1956 PerlIO_printf(Perl_debug_log, "%*sMJD offset:%"UVuf" MJD length:%"UVuf"\n",
1957 (int)depth * 2 + 2, "",
1958 (UV)mjd_offset, (UV)mjd_nodelen)
1959 );
1960#endif
1961
1962
1963 trie->startstate= 1;
1964 if ( trie->bitmap && !widecharmap && !trie->jump ) {
1965 U32 state;
1966 for ( state = 1 ; state < trie->statecount-1 ; state++ ) {
1967 U32 ofs = 0;
1968 I32 idx = -1;
1969 U32 count = 0;
1970 const U32 base = trie->states[ state ].trans.base;
1971
1972 if ( trie->states[state].wordnum )
1973 count = 1;
1974
1975 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
1976 if ( ( base + ofs >= trie->uniquecharcount ) &&
1977 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
1978 trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
1979 {
1980 if ( ++count > 1 ) {
1981 SV **tmp = av_fetch( revcharmap, ofs, 0);
1982 const U8 *ch = (U8*)SvPV_nolen_const( *tmp );
1983 if ( state == 1 ) break;
1984 if ( count == 2 ) {
1985 Zero(trie->bitmap, ANYOF_BITMAP_SIZE, char);
1986 DEBUG_OPTIMISE_r(
1987 PerlIO_printf(Perl_debug_log,
1988 "%*sNew Start State=%"UVuf" Class: [",
1989 (int)depth * 2 + 2, "",
1990 (UV)state));
1991 if (idx >= 0) {
1992 SV ** const tmp = av_fetch( revcharmap, idx, 0);
1993 const U8 * const ch = (U8*)SvPV_nolen_const( *tmp );
1994
1995 TRIE_BITMAP_SET(trie,*ch);
1996 if ( folder )
1997 TRIE_BITMAP_SET(trie, folder[ *ch ]);
1998 DEBUG_OPTIMISE_r(
1999 PerlIO_printf(Perl_debug_log, "%s", (char*)ch)
2000 );
2001 }
2002 }
2003 TRIE_BITMAP_SET(trie,*ch);
2004 if ( folder )
2005 TRIE_BITMAP_SET(trie,folder[ *ch ]);
2006 DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"%s", ch));
2007 }
2008 idx = ofs;
2009 }
2010 }
2011 if ( count == 1 ) {
2012 SV **tmp = av_fetch( revcharmap, idx, 0);
2013 STRLEN len;
2014 char *ch = SvPV( *tmp, len );
2015 DEBUG_OPTIMISE_r({
2016 SV *sv=sv_newmortal();
2017 PerlIO_printf( Perl_debug_log,
2018 "%*sPrefix State: %"UVuf" Idx:%"UVuf" Char='%s'\n",
2019 (int)depth * 2 + 2, "",
2020 (UV)state, (UV)idx,
2021 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 6,
2022 PL_colors[0], PL_colors[1],
2023 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
2024 PERL_PV_ESCAPE_FIRSTCHAR
2025 )
2026 );
2027 });
2028 if ( state==1 ) {
2029 OP( convert ) = nodetype;
2030 str=STRING(convert);
2031 STR_LEN(convert)=0;
2032 }
2033 STR_LEN(convert) += len;
2034 while (len--)
2035 *str++ = *ch++;
2036 } else {
2037#ifdef DEBUGGING
2038 if (state>1)
2039 DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"]\n"));
2040#endif
2041 break;
2042 }
2043 }
2044 if (str) {
2045 regnode *n = convert+NODE_SZ_STR(convert);
2046 NEXT_OFF(convert) = NODE_SZ_STR(convert);
2047 trie->startstate = state;
2048 trie->minlen -= (state - 1);
2049 trie->maxlen -= (state - 1);
2050#ifdef DEBUGGING
2051
2052
2053
2054 if (
2055#ifdef PERL_EXT_RE_BUILD
2056 1
2057#else
2058 DEBUG_r_TEST
2059#endif
2060 ) {
2061 regnode *fix = convert;
2062 U32 word = trie->wordcount;
2063 mjd_nodelen++;
2064 Set_Node_Offset_Length(convert, mjd_offset, state - 1);
2065 while( ++fix < n ) {
2066 Set_Node_Offset_Length(fix, 0, 0);
2067 }
2068 while (word--) {
2069 SV ** const tmp = av_fetch( trie_words, word, 0 );
2070 if (tmp) {
2071 if ( STR_LEN(convert) <= SvCUR(*tmp) )
2072 sv_chop(*tmp, SvPV_nolen(*tmp) + STR_LEN(convert));
2073 else
2074 sv_chop(*tmp, SvPV_nolen(*tmp) + SvCUR(*tmp));
2075 }
2076 }
2077 }
2078#endif
2079 if (trie->maxlen) {
2080 convert = n;
2081 } else {
2082 NEXT_OFF(convert) = (U16)(tail - convert);
2083 DEBUG_r(optimize= n);
2084 }
2085 }
2086 }
2087 if (!jumper)
2088 jumper = last;
2089 if ( trie->maxlen ) {
2090 NEXT_OFF( convert ) = (U16)(tail - convert);
2091 ARG_SET( convert, data_slot );
2092
2093
2094
2095 if (trie->jump)
2096 trie->jump[0] = (U16)(nextbranch - convert);
2097
2098
2099 if ( !trie->states[trie->startstate].wordnum && trie->bitmap &&
2100 ( (char *)jumper - (char *)convert) >= (int)sizeof(struct regnode_charclass) )
2101 {
2102 OP( convert ) = TRIEC;
2103 Copy(trie->bitmap, ((struct regnode_charclass *)convert)->bitmap, ANYOF_BITMAP_SIZE, char);
2104 PerlMemShared_free(trie->bitmap);
2105 trie->bitmap= NULL;
2106 } else
2107 OP( convert ) = TRIE;
2108
2109
2110 convert->flags = nodetype;
2111 DEBUG_r({
2112 optimize = convert
2113 + NODE_STEP_REGNODE
2114 + regarglen[ OP( convert ) ];
2115 });
2116
2117
2118 }
2119
2120 DEBUG_r(if (optimize) {
2121 regnode *opt = convert;
2122
2123 while ( ++opt < optimize) {
2124 Set_Node_Offset_Length(opt,0,0);
2125 }
2126
2127
2128
2129
2130 while( optimize < jumper ) {
2131 mjd_nodelen += Node_Length((optimize));
2132 OP( optimize ) = OPTIMIZED;
2133 Set_Node_Offset_Length(optimize,0,0);
2134 optimize++;
2135 }
2136 Set_Node_Offset_Length(convert,mjd_offset,mjd_nodelen);
2137 });
2138 }
2139 RExC_rxi->data->data[ data_slot + 1 ] = (void*)widecharmap;
2140#ifdef DEBUGGING
2141 RExC_rxi->data->data[ data_slot + TRIE_WORDS_OFFSET ] = (void*)trie_words;
2142 RExC_rxi->data->data[ data_slot + 3 ] = (void*)revcharmap;
2143#else
2144 SvREFCNT_dec(revcharmap);
2145#endif
2146 return trie->jump
2147 ? MADE_JUMP_TRIE
2148 : trie->startstate>1
2149 ? MADE_EXACT_TRIE
2150 : MADE_TRIE;
2151}
2152
2153STATIC void
2154S_make_trie_failtable(pTHX_ RExC_state_t *pRExC_state, regnode *source, regnode *stclass, U32 depth)
2155{
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174 const U32 trie_offset = ARG(source);
2175 reg_trie_data *trie=(reg_trie_data *)RExC_rxi->data->data[trie_offset];
2176 U32 *q;
2177 const U32 ucharcount = trie->uniquecharcount;
2178 const U32 numstates = trie->statecount;
2179 const U32 ubound = trie->lasttrans + ucharcount;
2180 U32 q_read = 0;
2181 U32 q_write = 0;
2182 U32 charid;
2183 U32 base = trie->states[ 1 ].trans.base;
2184 U32 *fail;
2185 reg_ac_data *aho;
2186 const U32 data_slot = add_data( pRExC_state, 1, "T" );
2187 GET_RE_DEBUG_FLAGS_DECL;
2188
2189 PERL_ARGS_ASSERT_MAKE_TRIE_FAILTABLE;
2190#ifndef DEBUGGING
2191 PERL_UNUSED_ARG(depth);
2192#endif
2193
2194
2195 ARG_SET( stclass, data_slot );
2196 aho = (reg_ac_data *) PerlMemShared_calloc( 1, sizeof(reg_ac_data) );
2197 RExC_rxi->data->data[ data_slot ] = (void*)aho;
2198 aho->trie=trie_offset;
2199 aho->states=(reg_trie_state *)PerlMemShared_malloc( numstates * sizeof(reg_trie_state) );
2200 Copy( trie->states, aho->states, numstates, reg_trie_state );
2201 Newxz( q, numstates, U32);
2202 aho->fail = (U32 *) PerlMemShared_calloc( numstates, sizeof(U32) );
2203 aho->refcount = 1;
2204 fail = aho->fail;
2205
2206
2207 fail[ 0 ] = fail[ 1 ] = 1;
2208
2209 for ( charid = 0; charid < ucharcount ; charid++ ) {
2210 const U32 newstate = TRIE_TRANS_STATE( 1, base, ucharcount, charid, 0 );
2211 if ( newstate ) {
2212 q[ q_write ] = newstate;
2213
2214 fail[ q[ q_write++ ] ]=1;
2215 }
2216 }
2217 while ( q_read < q_write) {
2218 const U32 cur = q[ q_read++ % numstates ];
2219 base = trie->states[ cur ].trans.base;
2220
2221 for ( charid = 0 ; charid < ucharcount ; charid++ ) {
2222 const U32 ch_state = TRIE_TRANS_STATE( cur, base, ucharcount, charid, 1 );
2223 if (ch_state) {
2224 U32 fail_state = cur;
2225 U32 fail_base;
2226 do {
2227 fail_state = fail[ fail_state ];
2228 fail_base = aho->states[ fail_state ].trans.base;
2229 } while ( !TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 ) );
2230
2231 fail_state = TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 );
2232 fail[ ch_state ] = fail_state;
2233 if ( !aho->states[ ch_state ].wordnum && aho->states[ fail_state ].wordnum )
2234 {
2235 aho->states[ ch_state ].wordnum = aho->states[ fail_state ].wordnum;
2236 }
2237 q[ q_write++ % numstates] = ch_state;
2238 }
2239 }
2240 }
2241
2242
2243
2244
2245
2246
2247 fail[ 0 ] = fail[ 1 ] = 0;
2248 DEBUG_TRIE_COMPILE_r({
2249 PerlIO_printf(Perl_debug_log,
2250 "%*sStclass Failtable (%"UVuf" states): 0",
2251 (int)(depth * 2), "", (UV)numstates
2252 );
2253 for( q_read=1; q_read<numstates; q_read++ ) {
2254 PerlIO_printf(Perl_debug_log, ", %"UVuf, (UV)fail[q_read]);
2255 }
2256 PerlIO_printf(Perl_debug_log, "\n");
2257 });
2258 Safefree(q);
2259
2260}
2261
2262
2263
2264
2265
2266
2267#if defined(__sparc64__) && defined(__GNUC__)
2268# if __GNUC__ < 2 || (__GNUC__ == 2 && __GNUC_MINOR__ < 96)
2269# undef SPARC64_GCC_WORKAROUND
2270# define SPARC64_GCC_WORKAROUND 1
2271# endif
2272#endif
2273
2274#define DEBUG_PEEP(str,scan,depth) \
2275 DEBUG_OPTIMISE_r({if (scan){ \
2276 SV * const mysv=sv_newmortal(); \
2277 regnode *Next = regnext(scan); \
2278 regprop(RExC_rx, mysv, scan); \
2279 PerlIO_printf(Perl_debug_log, "%*s" str ">%3d: %s (%d)\n", \
2280 (int)depth*2, "", REG_NODE_NUM(scan), SvPV_nolen_const(mysv),\
2281 Next ? (REG_NODE_NUM(Next)) : 0 ); \
2282 }});
2283
2284
2285
2286
2287
2288#define JOIN_EXACT(scan,min,flags) \
2289 if (PL_regkind[OP(scan)] == EXACT) \
2290 join_exact(pRExC_state,(scan),(min),(flags),NULL,depth+1)
2291
2292STATIC U32
2293S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, I32 *min, U32 flags,regnode *val, U32 depth) {
2294
2295 regnode *n = regnext(scan);
2296 U32 stringok = 1;
2297 regnode *next = scan + NODE_SZ_STR(scan);
2298 U32 merged = 0;
2299 U32 stopnow = 0;
2300#ifdef DEBUGGING
2301 regnode *stop = scan;
2302 GET_RE_DEBUG_FLAGS_DECL;
2303#else
2304 PERL_UNUSED_ARG(depth);
2305#endif
2306
2307 PERL_ARGS_ASSERT_JOIN_EXACT;
2308#ifndef EXPERIMENTAL_INPLACESCAN
2309 PERL_UNUSED_ARG(flags);
2310 PERL_UNUSED_ARG(val);
2311#endif
2312 DEBUG_PEEP("join",scan,depth);
2313
2314
2315 while (n &&
2316 ( PL_regkind[OP(n)] == NOTHING ||
2317 (stringok && (OP(n) == OP(scan))))
2318 && NEXT_OFF(n)
2319 && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX) {
2320
2321 if (OP(n) == TAIL || n > next)
2322 stringok = 0;
2323 if (PL_regkind[OP(n)] == NOTHING) {
2324 DEBUG_PEEP("skip:",n,depth);
2325 NEXT_OFF(scan) += NEXT_OFF(n);
2326 next = n + NODE_STEP_REGNODE;
2327#ifdef DEBUGGING
2328 if (stringok)
2329 stop = n;
2330#endif
2331 n = regnext(n);
2332 }
2333 else if (stringok) {
2334 const unsigned int oldl = STR_LEN(scan);
2335 regnode * const nnext = regnext(n);
2336
2337 DEBUG_PEEP("merg",n,depth);
2338
2339 merged++;
2340 if (oldl + STR_LEN(n) > U8_MAX)
2341 break;
2342 NEXT_OFF(scan) += NEXT_OFF(n);
2343 STR_LEN(scan) += STR_LEN(n);
2344 next = n + NODE_SZ_STR(n);
2345
2346 Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
2347#ifdef DEBUGGING
2348 stop = next - 1;
2349#endif
2350 n = nnext;
2351 if (stopnow) break;
2352 }
2353
2354#ifdef EXPERIMENTAL_INPLACESCAN
2355 if (flags && !NEXT_OFF(n)) {
2356 DEBUG_PEEP("atch", val, depth);
2357 if (reg_off_by_arg[OP(n)]) {
2358 ARG_SET(n, val - n);
2359 }
2360 else {
2361 NEXT_OFF(n) = val - n;
2362 }
2363 stopnow = 1;
2364 }
2365#endif
2366 }
2367
2368 if (UTF && ( OP(scan) == EXACTF ) && ( STR_LEN(scan) >= 6 ) ) {
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397 char * const s0 = STRING(scan), *s, *t;
2398 char * const s1 = s0 + STR_LEN(scan) - 1;
2399 char * const s2 = s1 - 4;
2400#ifdef EBCDIC
2401 const char t0[] = "\xaf\x49\xaf\x42";
2402#else
2403 const char t0[] = "\xcc\x88\xcc\x81";
2404#endif
2405 const char * const t1 = t0 + 3;
2406
2407 for (s = s0 + 2;
2408 s < s2 && (t = ninstr(s, s1, t0, t1));
2409 s = t + 4) {
2410#ifdef EBCDIC
2411 if (((U8)t[-1] == 0x68 && (U8)t[-2] == 0xB4) ||
2412 ((U8)t[-1] == 0x46 && (U8)t[-2] == 0xB5))
2413#else
2414 if (((U8)t[-1] == 0xB9 && (U8)t[-2] == 0xCE) ||
2415 ((U8)t[-1] == 0x85 && (U8)t[-2] == 0xCF))
2416#endif
2417 *min -= 4;
2418 }
2419 }
2420
2421#ifdef DEBUGGING
2422
2423 n = scan + NODE_SZ_STR(scan);
2424 while (n <= stop) {
2425 if (PL_regkind[OP(n)] != NOTHING || OP(n) == NOTHING) {
2426 OP(n) = OPTIMIZED;
2427 NEXT_OFF(n) = 0;
2428 }
2429 n++;
2430 }
2431#endif
2432 DEBUG_OPTIMISE_r(if (merged){DEBUG_PEEP("finl",scan,depth)});
2433 return stopnow;
2434}
2435
2436
2437
2438
2439
2440
2441
2442#define INIT_AND_WITHP \
2443 assert(!and_withp); \
2444 Newx(and_withp,1,struct regnode_charclass_class); \
2445 SAVEFREEPV(and_withp)
2446
2447
2448
2449
2450struct scan_frame;
2451typedef struct scan_frame {
2452 regnode *last;
2453 regnode *next;
2454 struct scan_frame *prev;
2455 I32 stop;
2456} scan_frame;
2457
2458
2459#define SCAN_COMMIT(s, data, m) scan_commit(s, data, m, is_inf)
2460
2461#define CASE_SYNST_FNC(nAmE) \
2462case nAmE: \
2463 if (flags & SCF_DO_STCLASS_AND) { \
2464 for (value = 0; value < 256; value++) \
2465 if (!is_ ## nAmE ## _cp(value)) \
2466 ANYOF_BITMAP_CLEAR(data->start_class, value); \
2467 } \
2468 else { \
2469 for (value = 0; value < 256; value++) \
2470 if (is_ ## nAmE ## _cp(value)) \
2471 ANYOF_BITMAP_SET(data->start_class, value); \
2472 } \
2473 break; \
2474case N ## nAmE: \
2475 if (flags & SCF_DO_STCLASS_AND) { \
2476 for (value = 0; value < 256; value++) \
2477 if (is_ ## nAmE ## _cp(value)) \
2478 ANYOF_BITMAP_CLEAR(data->start_class, value); \
2479 } \
2480 else { \
2481 for (value = 0; value < 256; value++) \
2482 if (!is_ ## nAmE ## _cp(value)) \
2483 ANYOF_BITMAP_SET(data->start_class, value); \
2484 } \
2485 break
2486
2487
2488
2489STATIC I32
2490S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
2491 I32 *minlenp, I32 *deltap,
2492 regnode *last,
2493 scan_data_t *data,
2494 I32 stopparen,
2495 U8* recursed,
2496 struct regnode_charclass_class *and_withp,
2497 U32 flags, U32 depth)
2498
2499
2500
2501
2502
2503
2504
2505{
2506 dVAR;
2507 I32 min = 0, pars = 0, code;
2508 regnode *scan = *scanp, *next;
2509 I32 delta = 0;
2510 int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
2511 int is_inf_internal = 0;
2512 I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
2513 scan_data_t data_fake;
2514 SV *re_trie_maxbuff = NULL;
2515 regnode *first_non_open = scan;
2516 I32 stopmin = I32_MAX;
2517 scan_frame *frame = NULL;
2518 GET_RE_DEBUG_FLAGS_DECL;
2519
2520 PERL_ARGS_ASSERT_STUDY_CHUNK;
2521
2522#ifdef DEBUGGING
2523 StructCopy(&zero_scan_data, &data_fake, scan_data_t);
2524#endif
2525
2526 if ( depth == 0 ) {
2527 while (first_non_open && OP(first_non_open) == OPEN)
2528 first_non_open=regnext(first_non_open);
2529 }
2530
2531
2532 fake_study_recurse:
2533 while ( scan && OP(scan) != END && scan < last ){
2534
2535 DEBUG_STUDYDATA("Peep:", data,depth);
2536 DEBUG_PEEP("Peep",scan,depth);
2537 JOIN_EXACT(scan,&min,0);
2538
2539
2540
2541 if (OP(scan) != CURLYX) {
2542 const int max = (reg_off_by_arg[OP(scan)]
2543 ? I32_MAX
2544
2545 : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
2546 int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
2547 int noff;
2548 regnode *n = scan;
2549
2550
2551 while ((n = regnext(n))
2552 && ((PL_regkind[OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
2553 || ((OP(n) == LONGJMP) && (noff = ARG(n))))
2554 && off + noff < max)
2555 off += noff;
2556 if (reg_off_by_arg[OP(scan)])
2557 ARG(scan) = off;
2558 else
2559 NEXT_OFF(scan) = off;
2560 }
2561
2562
2563
2564
2565
2566 if (OP(scan) == BRANCH || OP(scan) == BRANCHJ
2567 || OP(scan) == IFTHEN) {
2568 next = regnext(scan);
2569 code = OP(scan);
2570
2571
2572 if (OP(next) == code || code == IFTHEN) {
2573
2574
2575
2576 I32 max1 = 0, min1 = I32_MAX, num = 0;
2577 struct regnode_charclass_class accum;
2578 regnode * const startbranch=scan;
2579
2580 if (flags & SCF_DO_SUBSTR)
2581 SCAN_COMMIT(pRExC_state, data, minlenp);
2582 if (flags & SCF_DO_STCLASS)
2583 cl_init_zero(pRExC_state, &accum);
2584
2585 while (OP(scan) == code) {
2586 I32 deltanext, minnext, f = 0, fake;
2587 struct regnode_charclass_class this_class;
2588
2589 num++;
2590 data_fake.flags = 0;
2591 if (data) {
2592 data_fake.whilem_c = data->whilem_c;
2593 data_fake.last_closep = data->last_closep;
2594 }
2595 else
2596 data_fake.last_closep = &fake;
2597
2598 data_fake.pos_delta = delta;
2599 next = regnext(scan);
2600 scan = NEXTOPER(scan);
2601 if (code != BRANCH)
2602 scan = NEXTOPER(scan);
2603 if (flags & SCF_DO_STCLASS) {
2604 cl_init(pRExC_state, &this_class);
2605 data_fake.start_class = &this_class;
2606 f = SCF_DO_STCLASS_AND;
2607 }
2608 if (flags & SCF_WHILEM_VISITED_POS)
2609 f |= SCF_WHILEM_VISITED_POS;
2610
2611
2612 minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
2613 next, &data_fake,
2614 stopparen, recursed, NULL, f,depth+1);
2615 if (min1 > minnext)
2616 min1 = minnext;
2617 if (max1 < minnext + deltanext)
2618 max1 = minnext + deltanext;
2619 if (deltanext == I32_MAX)
2620 is_inf = is_inf_internal = 1;
2621 scan = next;
2622 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
2623 pars++;
2624 if (data_fake.flags & SCF_SEEN_ACCEPT) {
2625 if ( stopmin > minnext)
2626 stopmin = min + min1;
2627 flags &= ~SCF_DO_SUBSTR;
2628 if (data)
2629 data->flags |= SCF_SEEN_ACCEPT;
2630 }
2631 if (data) {
2632 if (data_fake.flags & SF_HAS_EVAL)
2633 data->flags |= SF_HAS_EVAL;
2634 data->whilem_c = data_fake.whilem_c;
2635 }
2636 if (flags & SCF_DO_STCLASS)
2637 cl_or(pRExC_state, &accum, &this_class);
2638 }
2639 if (code == IFTHEN && num < 2)
2640 min1 = 0;
2641 if (flags & SCF_DO_SUBSTR) {
2642 data->pos_min += min1;
2643 data->pos_delta += max1 - min1;
2644 if (max1 != min1 || is_inf)
2645 data->longest = &(data->longest_float);
2646 }
2647 min += min1;
2648 delta += max1 - min1;
2649 if (flags & SCF_DO_STCLASS_OR) {
2650 cl_or(pRExC_state, data->start_class, &accum);
2651 if (min1) {
2652 cl_and(data->start_class, and_withp);
2653 flags &= ~SCF_DO_STCLASS;
2654 }
2655 }
2656 else if (flags & SCF_DO_STCLASS_AND) {
2657 if (min1) {
2658 cl_and(data->start_class, &accum);
2659 flags &= ~SCF_DO_STCLASS;
2660 }
2661 else {
2662
2663
2664 INIT_AND_WITHP;
2665 StructCopy(data->start_class, and_withp,
2666 struct regnode_charclass_class);
2667 flags &= ~SCF_DO_STCLASS_AND;
2668 StructCopy(&accum, data->start_class,
2669 struct regnode_charclass_class);
2670 flags |= SCF_DO_STCLASS_OR;
2671 data->start_class->flags |= ANYOF_EOS;
2672 }
2673 }
2674
2675 if (PERL_ENABLE_TRIE_OPTIMISATION && OP( startbranch ) == BRANCH ) {
2676
2677
2678
2679
2680
2681
2682
2683
2684
2685
2686
2687
2688
2689
2690
2691
2692
2693
2694
2695
2696
2697
2698
2699
2700
2701
2702
2703
2704
2705
2706
2707
2708
2709
2710
2711
2712
2713
2714
2715
2716
2717 int made=0;
2718 if (!re_trie_maxbuff) {
2719 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
2720 if (!SvIOK(re_trie_maxbuff))
2721 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
2722 }
2723 if ( SvIV(re_trie_maxbuff)>=0 ) {
2724 regnode *cur;
2725 regnode *first = (regnode *)NULL;
2726 regnode *last = (regnode *)NULL;
2727 regnode *tail = scan;
2728 U8 optype = 0;
2729 U32 count=0;
2730
2731#ifdef DEBUGGING
2732 SV * const mysv = sv_newmortal();
2733#endif
2734
2735
2736
2737
2738
2739
2740
2741
2742 while ( OP( tail ) == TAIL ) {
2743
2744 tail = regnext( tail );
2745 }
2746
2747
2748 DEBUG_OPTIMISE_r({
2749 regprop(RExC_rx, mysv, tail );
2750 PerlIO_printf( Perl_debug_log, "%*s%s%s\n",
2751 (int)depth * 2 + 2, "",
2752 "Looking for TRIE'able sequences. Tail node is: ",
2753 SvPV_nolen_const( mysv )
2754 );
2755 });
2756
2757
2758
2759
2760
2761
2762
2763
2764
2765
2766
2767
2768
2769
2770
2771
2772
2773
2774
2775
2776
2777
2778
2779
2780
2781
2782
2783 for ( cur = startbranch ; cur != scan ; cur = regnext( cur ) ) {
2784 regnode * const noper = NEXTOPER( cur );
2785#if defined(DEBUGGING) || defined(NOJUMPTRIE)
2786 regnode * const noper_next = regnext( noper );
2787#endif
2788
2789 DEBUG_OPTIMISE_r({
2790 regprop(RExC_rx, mysv, cur);
2791 PerlIO_printf( Perl_debug_log, "%*s- %s (%d)",
2792 (int)depth * 2 + 2,"", SvPV_nolen_const( mysv ), REG_NODE_NUM(cur) );
2793
2794 regprop(RExC_rx, mysv, noper);
2795 PerlIO_printf( Perl_debug_log, " -> %s",
2796 SvPV_nolen_const(mysv));
2797
2798 if ( noper_next ) {
2799 regprop(RExC_rx, mysv, noper_next );
2800 PerlIO_printf( Perl_debug_log,"\t=> %s\t",
2801 SvPV_nolen_const(mysv));
2802 }
2803 PerlIO_printf( Perl_debug_log, "(First==%d,Last==%d,Cur==%d)\n",
2804 REG_NODE_NUM(first), REG_NODE_NUM(last), REG_NODE_NUM(cur) );
2805 });
2806 if ( (((first && optype!=NOTHING) ? OP( noper ) == optype
2807 : PL_regkind[ OP( noper ) ] == EXACT )
2808 || OP(noper) == NOTHING )
2809#ifdef NOJUMPTRIE
2810 && noper_next == tail
2811#endif
2812 && count < U16_MAX)
2813 {
2814 count++;
2815 if ( !first || optype == NOTHING ) {
2816 if (!first) first = cur;
2817 optype = OP( noper );
2818 } else {
2819 last = cur;
2820 }
2821 } else {
2822
2823
2824
2825
2826
2827
2828
2829#define TRIE_TYPE_IS_SAFE 1
2830 if ( last && TRIE_TYPE_IS_SAFE ) {
2831 make_trie( pRExC_state,
2832 startbranch, first, cur, tail, count,
2833 optype, depth+1 );
2834 }
2835 if ( PL_regkind[ OP( noper ) ] == EXACT
2836#ifdef NOJUMPTRIE
2837 && noper_next == tail
2838#endif
2839 ){
2840 count = 1;
2841 first = cur;
2842 optype = OP( noper );
2843 } else {
2844 count = 0;
2845 first = NULL;
2846 optype = 0;
2847 }
2848 last = NULL;
2849 }
2850 }
2851 DEBUG_OPTIMISE_r({
2852 regprop(RExC_rx, mysv, cur);
2853 PerlIO_printf( Perl_debug_log,
2854 "%*s- %s (%d) <SCAN FINISHED>\n", (int)depth * 2 + 2,
2855 "", SvPV_nolen_const( mysv ),REG_NODE_NUM(cur));
2856
2857 });
2858
2859 if ( last && TRIE_TYPE_IS_SAFE ) {
2860 made= make_trie( pRExC_state, startbranch, first, scan, tail, count, optype, depth+1 );
2861#ifdef TRIE_STUDY_OPT
2862 if ( ((made == MADE_EXACT_TRIE &&
2863 startbranch == first)
2864 || ( first_non_open == first )) &&
2865 depth==0 ) {
2866 flags |= SCF_TRIE_RESTUDY;
2867 if ( startbranch == first
2868 && scan == tail )
2869 {
2870 RExC_seen &=~REG_TOP_LEVEL_BRANCHES;
2871 }
2872 }
2873#endif
2874 }
2875 }
2876
2877 }
2878
2879 }
2880 else if ( code == BRANCHJ ) {
2881 scan = NEXTOPER(NEXTOPER(scan));
2882 } else
2883 scan = NEXTOPER(scan);
2884 continue;
2885 } else if (OP(scan) == SUSPEND || OP(scan) == GOSUB || OP(scan) == GOSTART) {
2886 scan_frame *newframe = NULL;
2887 I32 paren;
2888 regnode *start;
2889 regnode *end;
2890
2891 if (OP(scan) != SUSPEND) {
2892
2893 if (OP(scan) == GOSUB) {
2894 paren = ARG(scan);
2895 RExC_recurse[ARG2L(scan)] = scan;
2896 start = RExC_open_parens[paren-1];
2897 end = RExC_close_parens[paren-1];
2898 } else {
2899 paren = 0;
2900 start = RExC_rxi->program + 1;
2901 end = RExC_opend;
2902 }
2903 if (!recursed) {
2904 Newxz(recursed, (((RExC_npar)>>3) +1), U8);
2905 SAVEFREEPV(recursed);
2906 }
2907 if (!PAREN_TEST(recursed,paren+1)) {
2908 PAREN_SET(recursed,paren+1);
2909 Newx(newframe,1,scan_frame);
2910 } else {
2911 if (flags & SCF_DO_SUBSTR) {
2912 SCAN_COMMIT(pRExC_state,data,minlenp);
2913 data->longest = &(data->longest_float);
2914 }
2915 is_inf = is_inf_internal = 1;
2916 if (flags & SCF_DO_STCLASS_OR)
2917 cl_anything(pRExC_state, data->start_class);
2918 flags &= ~SCF_DO_STCLASS;
2919 }
2920 } else {
2921 Newx(newframe,1,scan_frame);
2922 paren = stopparen;
2923 start = scan+2;
2924 end = regnext(scan);
2925 }
2926 if (newframe) {
2927 assert(start);
2928 assert(end);
2929 SAVEFREEPV(newframe);
2930 newframe->next = regnext(scan);
2931 newframe->last = last;
2932 newframe->stop = stopparen;
2933 newframe->prev = frame;
2934
2935 frame = newframe;
2936 scan = start;
2937 stopparen = paren;
2938 last = end;
2939
2940 continue;
2941 }
2942 }
2943 else if (OP(scan) == EXACT) {
2944 I32 l = STR_LEN(scan);
2945 UV uc;
2946 if (UTF) {
2947 const U8 * const s = (U8*)STRING(scan);
2948 l = utf8_length(s, s + l);
2949 uc = utf8_to_uvchr(s, NULL);
2950 } else {
2951 uc = *((U8*)STRING(scan));
2952 }
2953 min += l;
2954 if (flags & SCF_DO_SUBSTR) {
2955
2956
2957 if (data->last_end == -1) {
2958 data->last_start_min = data->pos_min;
2959 data->last_start_max = is_inf
2960 ? I32_MAX : data->pos_min + data->pos_delta;
2961 }
2962 sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
2963 if (UTF)
2964 SvUTF8_on(data->last_found);
2965 {
2966 SV * const sv = data->last_found;
2967 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
2968 mg_find(sv, PERL_MAGIC_utf8) : NULL;
2969 if (mg && mg->mg_len >= 0)
2970 mg->mg_len += utf8_length((U8*)STRING(scan),
2971 (U8*)STRING(scan)+STR_LEN(scan));
2972 }
2973 data->last_end = data->pos_min + l;
2974 data->pos_min += l;
2975 data->flags &= ~SF_BEFORE_EOL;
2976 }
2977 if (flags & SCF_DO_STCLASS_AND) {
2978
2979 int compat = 1;
2980
2981 if (uc >= 0x100 ||
2982 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
2983 && !ANYOF_BITMAP_TEST(data->start_class, uc)
2984 && (!(data->start_class->flags & ANYOF_FOLD)
2985 || !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
2986 )
2987 compat = 0;
2988 ANYOF_CLASS_ZERO(data->start_class);
2989 ANYOF_BITMAP_ZERO(data->start_class);
2990 if (compat)
2991 ANYOF_BITMAP_SET(data->start_class, uc);
2992 data->start_class->flags &= ~ANYOF_EOS;
2993 if (uc < 0x100)
2994 data->start_class->flags &= ~ANYOF_UNICODE_ALL;
2995 }
2996 else if (flags & SCF_DO_STCLASS_OR) {
2997
2998 if (uc < 0x100)
2999 ANYOF_BITMAP_SET(data->start_class, uc);
3000 else
3001 data->start_class->flags |= ANYOF_UNICODE_ALL;
3002 data->start_class->flags &= ~ANYOF_EOS;
3003 cl_and(data->start_class, and_withp);
3004 }
3005 flags &= ~SCF_DO_STCLASS;
3006 }
3007 else if (PL_regkind[OP(scan)] == EXACT) {
3008 I32 l = STR_LEN(scan);
3009 UV uc = *((U8*)STRING(scan));
3010
3011
3012 if (flags & SCF_DO_SUBSTR) {
3013 assert(data);
3014 SCAN_COMMIT(pRExC_state, data, minlenp);
3015 }
3016 if (UTF) {
3017 const U8 * const s = (U8 *)STRING(scan);
3018 l = utf8_length(s, s + l);
3019 uc = utf8_to_uvchr(s, NULL);
3020 }
3021 min += l;
3022 if (flags & SCF_DO_SUBSTR)
3023 data->pos_min += l;
3024 if (flags & SCF_DO_STCLASS_AND) {
3025
3026 int compat = 1;
3027
3028 if (uc >= 0x100 ||
3029 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
3030 && !ANYOF_BITMAP_TEST(data->start_class, uc)
3031 && !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
3032 compat = 0;
3033 ANYOF_CLASS_ZERO(data->start_class);
3034 ANYOF_BITMAP_ZERO(data->start_class);
3035 if (compat) {
3036 ANYOF_BITMAP_SET(data->start_class, uc);
3037 data->start_class->flags &= ~ANYOF_EOS;
3038 data->start_class->flags |= ANYOF_FOLD;
3039 if (OP(scan) == EXACTFL)
3040 data->start_class->flags |= ANYOF_LOCALE;
3041 }
3042 }
3043 else if (flags & SCF_DO_STCLASS_OR) {
3044 if (data->start_class->flags & ANYOF_FOLD) {
3045
3046
3047 if (uc < 0x100)
3048 ANYOF_BITMAP_SET(data->start_class, uc);
3049 data->start_class->flags &= ~ANYOF_EOS;
3050 }
3051 cl_and(data->start_class, and_withp);
3052 }
3053 flags &= ~SCF_DO_STCLASS;
3054 }
3055 else if (strchr((const char*)PL_varies,OP(scan))) {
3056 I32 mincount, maxcount, minnext, deltanext, fl = 0;
3057 I32 f = flags, pos_before = 0;
3058 regnode * const oscan = scan;
3059 struct regnode_charclass_class this_class;
3060 struct regnode_charclass_class *oclass = NULL;
3061 I32 next_is_eval = 0;
3062
3063 switch (PL_regkind[OP(scan)]) {
3064 case WHILEM:
3065 scan = NEXTOPER(scan);
3066 goto finish;
3067 case PLUS:
3068 if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
3069 next = NEXTOPER(scan);
3070 if (OP(next) == EXACT || (flags & SCF_DO_STCLASS)) {
3071 mincount = 1;
3072 maxcount = REG_INFTY;
3073 next = regnext(scan);
3074 scan = NEXTOPER(scan);
3075 goto do_curly;
3076 }
3077 }
3078 if (flags & SCF_DO_SUBSTR)
3079 data->pos_min++;
3080 min++;
3081
3082 case STAR:
3083 if (flags & SCF_DO_STCLASS) {
3084 mincount = 0;
3085 maxcount = REG_INFTY;
3086 next = regnext(scan);
3087 scan = NEXTOPER(scan);
3088 goto do_curly;
3089 }
3090 is_inf = is_inf_internal = 1;
3091 scan = regnext(scan);
3092 if (flags & SCF_DO_SUBSTR) {
3093 SCAN_COMMIT(pRExC_state, data, minlenp);
3094 data->longest = &(data->longest_float);
3095 }
3096 goto optimize_curly_tail;
3097 case CURLY:
3098 if (stopparen>0 && (OP(scan)==CURLYN || OP(scan)==CURLYM)
3099 && (scan->flags == stopparen))
3100 {
3101 mincount = 1;
3102 maxcount = 1;
3103 } else {
3104 mincount = ARG1(scan);
3105 maxcount = ARG2(scan);
3106 }
3107 next = regnext(scan);
3108 if (OP(scan) == CURLYX) {
3109 I32 lp = (data ? *(data->last_closep) : 0);
3110 scan->flags = ((lp <= (I32)U8_MAX) ? (U8)lp : U8_MAX);
3111 }
3112 scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
3113 next_is_eval = (OP(scan) == EVAL);
3114 do_curly:
3115 if (flags & SCF_DO_SUBSTR) {
3116 if (mincount == 0) SCAN_COMMIT(pRExC_state,data,minlenp);
3117 pos_before = data->pos_min;
3118 }
3119 if (data) {
3120 fl = data->flags;
3121 data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL);
3122 if (is_inf)
3123 data->flags |= SF_IS_INF;
3124 }
3125 if (flags & SCF_DO_STCLASS) {
3126 cl_init(pRExC_state, &this_class);
3127 oclass = data->start_class;
3128 data->start_class = &this_class;
3129 f |= SCF_DO_STCLASS_AND;
3130 f &= ~SCF_DO_STCLASS_OR;
3131 }
3132
3133
3134
3135
3136
3137
3138 if ((mincount != maxcount - 1) && (maxcount != REG_INFTY))
3139 f &= ~SCF_WHILEM_VISITED_POS;
3140
3141
3142 minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
3143 last, data, stopparen, recursed, NULL,
3144 (mincount == 0
3145 ? (f & ~SCF_DO_SUBSTR) : f),depth+1);
3146
3147 if (flags & SCF_DO_STCLASS)
3148 data->start_class = oclass;
3149 if (mincount == 0 || minnext == 0) {
3150 if (flags & SCF_DO_STCLASS_OR) {
3151 cl_or(pRExC_state, data->start_class, &this_class);
3152 }
3153 else if (flags & SCF_DO_STCLASS_AND) {
3154
3155
3156 INIT_AND_WITHP;
3157 StructCopy(data->start_class, and_withp,
3158 struct regnode_charclass_class);
3159 flags &= ~SCF_DO_STCLASS_AND;
3160 StructCopy(&this_class, data->start_class,
3161 struct regnode_charclass_class);
3162 flags |= SCF_DO_STCLASS_OR;
3163 data->start_class->flags |= ANYOF_EOS;
3164 }
3165 } else {
3166 if (flags & SCF_DO_STCLASS_OR) {
3167 cl_or(pRExC_state, data->start_class, &this_class);
3168 cl_and(data->start_class, and_withp);
3169 }
3170 else if (flags & SCF_DO_STCLASS_AND)
3171 cl_and(data->start_class, &this_class);
3172 flags &= ~SCF_DO_STCLASS;
3173 }
3174 if (!scan)
3175 scan = next;
3176 if (
3177 (next_is_eval || !(mincount == 0 && maxcount == 1))
3178 && (minnext == 0) && (deltanext == 0)
3179 && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
3180 && maxcount <= REG_INFTY/3
3181 && ckWARN(WARN_REGEXP))
3182 {
3183 vWARN(RExC_parse,
3184 "Quantifier unexpected on zero-length expression");
3185 }
3186
3187 min += minnext * mincount;
3188 is_inf_internal |= ((maxcount == REG_INFTY
3189 && (minnext + deltanext) > 0)
3190 || deltanext == I32_MAX);
3191 is_inf |= is_inf_internal;
3192 delta += (minnext + deltanext) * maxcount - minnext * mincount;
3193
3194
3195 if ( OP(oscan) == CURLYX && data
3196 && data->flags & SF_IN_PAR
3197 && !(data->flags & SF_HAS_EVAL)
3198 && !deltanext && minnext == 1 ) {
3199
3200 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
3201 regnode * const nxt1 = nxt;
3202#ifdef DEBUGGING
3203 regnode *nxt2;
3204#endif
3205
3206
3207 nxt = regnext(nxt);
3208 if (!strchr((const char*)PL_simple,OP(nxt))
3209 && !(PL_regkind[OP(nxt)] == EXACT
3210 && STR_LEN(nxt) == 1))
3211 goto nogo;
3212#ifdef DEBUGGING
3213 nxt2 = nxt;
3214#endif
3215 nxt = regnext(nxt);
3216 if (OP(nxt) != CLOSE)
3217 goto nogo;
3218 if (RExC_open_parens) {
3219 RExC_open_parens[ARG(nxt1)-1]=oscan;
3220 RExC_close_parens[ARG(nxt1)-1]=nxt+2;
3221 }
3222
3223 oscan->flags = (U8)ARG(nxt);
3224 OP(oscan) = CURLYN;
3225 OP(nxt1) = NOTHING;
3226
3227#ifdef DEBUGGING
3228 OP(nxt1 + 1) = OPTIMIZED;
3229 NEXT_OFF(nxt1+ 1) = 0;
3230 NEXT_OFF(nxt2) = 0;
3231 OP(nxt) = OPTIMIZED;
3232 OP(nxt + 1) = OPTIMIZED;
3233 NEXT_OFF(nxt+ 1) = 0;
3234#endif
3235 }
3236 nogo:
3237
3238
3239 if ( OP(oscan) == CURLYX && data
3240 && !(data->flags & SF_HAS_PAR)
3241 && !(data->flags & SF_HAS_EVAL)
3242 && !deltanext
3243 && minnext != 0
3244 ) {
3245
3246
3247 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
3248 regnode *nxt2;
3249
3250 OP(oscan) = CURLYM;
3251 while ( (nxt2 = regnext(nxt))
3252 && (OP(nxt2) != WHILEM))
3253 nxt = nxt2;
3254 OP(nxt2) = SUCCEED;
3255
3256 if (data->flags & SF_IN_PAR) {
3257
3258 regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
3259
3260 if (OP(nxt) != CLOSE)
3261 FAIL("Panic opt close");
3262 oscan->flags = (U8)ARG(nxt);
3263 if (RExC_open_parens) {
3264 RExC_open_parens[ARG(nxt1)-1]=oscan;
3265 RExC_close_parens[ARG(nxt1)-1]=nxt2+1;
3266 }
3267 OP(nxt1) = OPTIMIZED;
3268 OP(nxt) = OPTIMIZED;
3269
3270#ifdef DEBUGGING
3271 OP(nxt1 + 1) = OPTIMIZED;
3272 OP(nxt + 1) = OPTIMIZED;
3273 NEXT_OFF(nxt1 + 1) = 0;
3274 NEXT_OFF(nxt + 1) = 0;
3275#endif
3276#if 0
3277 while ( nxt1 && (OP(nxt1) != WHILEM)) {
3278 regnode *nnxt = regnext(nxt1);
3279
3280 if (nnxt == nxt) {
3281 if (reg_off_by_arg[OP(nxt1)])
3282 ARG_SET(nxt1, nxt2 - nxt1);
3283 else if (nxt2 - nxt1 < U16_MAX)
3284 NEXT_OFF(nxt1) = nxt2 - nxt1;
3285 else
3286 OP(nxt) = NOTHING;
3287 }
3288 nxt1 = nnxt;
3289 }
3290#endif
3291
3292 study_chunk(pRExC_state, &nxt1, minlenp, &deltanext, nxt,
3293 NULL, stopparen, recursed, NULL, 0,depth+1);
3294 }
3295 else
3296 oscan->flags = 0;
3297 }
3298 else if ((OP(oscan) == CURLYX)
3299 && (flags & SCF_WHILEM_VISITED_POS)
3300
3301
3302
3303 && (maxcount == REG_INFTY)
3304 && data && ++data->whilem_c < 16) {
3305
3306
3307 regnode *nxt = oscan + NEXT_OFF(oscan);
3308
3309 if (OP(PREVOPER(nxt)) == NOTHING)
3310 nxt += ARG(nxt);
3311 PREVOPER(nxt)->flags = (U8)(data->whilem_c
3312 | (RExC_whilem_seen << 4));
3313 }
3314 if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
3315 pars++;
3316 if (flags & SCF_DO_SUBSTR) {
3317 SV *last_str = NULL;
3318 int counted = mincount != 0;
3319
3320 if (data->last_end > 0 && mincount != 0) {
3321#if defined(SPARC64_GCC_WORKAROUND)
3322 I32 b = 0;
3323 STRLEN l = 0;
3324 const char *s = NULL;
3325 I32 old = 0;
3326
3327 if (pos_before >= data->last_start_min)
3328 b = pos_before;
3329 else
3330 b = data->last_start_min;
3331
3332 l = 0;
3333 s = SvPV_const(data->last_found, l);
3334 old = b - data->last_start_min;
3335
3336#else
3337 I32 b = pos_before >= data->last_start_min
3338 ? pos_before : data->last_start_min;
3339 STRLEN l;
3340 const char * const s = SvPV_const(data->last_found, l);
3341 I32 old = b - data->last_start_min;
3342#endif
3343
3344 if (UTF)
3345 old = utf8_hop((U8*)s, old) - (U8*)s;
3346
3347 l -= old;
3348
3349 last_str = newSVpvn_utf8(s + old, l, UTF);
3350 if (deltanext == 0 && pos_before == b) {
3351
3352 if (mincount > 1) {
3353 SvGROW(last_str, (mincount * l) + 1);
3354 repeatcpy(SvPVX(last_str) + l,
3355 SvPVX_const(last_str), l, mincount - 1);
3356 SvCUR_set(last_str, SvCUR(last_str) * mincount);
3357
3358 SvCUR_set(data->last_found,
3359 SvCUR(data->last_found) - l);
3360 sv_catsv(data->last_found, last_str);
3361 {
3362 SV * sv = data->last_found;
3363 MAGIC *mg =
3364 SvUTF8(sv) && SvMAGICAL(sv) ?
3365 mg_find(sv, PERL_MAGIC_utf8) : NULL;
3366 if (mg && mg->mg_len >= 0)
3367 mg->mg_len += CHR_SVLEN(last_str) - l;
3368 }
3369 data->last_end += l * (mincount - 1);
3370 }
3371 } else {
3372
3373 data->last_start_min += minnext * (mincount - 1);
3374 data->last_start_max += is_inf ? I32_MAX
3375 : (maxcount - 1) * (minnext + data->pos_delta);
3376 }
3377 }
3378
3379 data->pos_min += minnext * (mincount - counted);
3380 data->pos_delta += - counted * deltanext +
3381 (minnext + deltanext) * maxcount - minnext * mincount;
3382 if (mincount != maxcount) {
3383
3384
3385 SCAN_COMMIT(pRExC_state,data,minlenp);
3386 if (mincount && last_str) {
3387 SV * const sv = data->last_found;
3388 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
3389 mg_find(sv, PERL_MAGIC_utf8) : NULL;
3390
3391 if (mg)
3392 mg->mg_len = -1;
3393 sv_setsv(sv, last_str);
3394 data->last_end = data->pos_min;
3395 data->last_start_min =
3396 data->pos_min - CHR_SVLEN(last_str);
3397 data->last_start_max = is_inf
3398 ? I32_MAX
3399 : data->pos_min + data->pos_delta
3400 - CHR_SVLEN(last_str);
3401 }
3402 data->longest = &(data->longest_float);
3403 }
3404 SvREFCNT_dec(last_str);
3405 }
3406 if (data && (fl & SF_HAS_EVAL))
3407 data->flags |= SF_HAS_EVAL;
3408 optimize_curly_tail:
3409 if (OP(oscan) != CURLYX) {
3410 while (PL_regkind[OP(next = regnext(oscan))] == NOTHING
3411 && NEXT_OFF(next))
3412 NEXT_OFF(oscan) += NEXT_OFF(next);
3413 }
3414 continue;
3415 default:
3416 if (flags & SCF_DO_SUBSTR) {
3417 SCAN_COMMIT(pRExC_state,data,minlenp);
3418 data->longest = &(data->longest_float);
3419 }
3420 is_inf = is_inf_internal = 1;
3421 if (flags & SCF_DO_STCLASS_OR)
3422 cl_anything(pRExC_state, data->start_class);
3423 flags &= ~SCF_DO_STCLASS;
3424 break;
3425 }
3426 }
3427 else if (OP(scan) == LNBREAK) {
3428 if (flags & SCF_DO_STCLASS) {
3429 int value = 0;
3430 data->start_class->flags &= ~ANYOF_EOS;
3431 if (flags & SCF_DO_STCLASS_AND) {
3432 for (value = 0; value < 256; value++)
3433 if (!is_VERTWS_cp(value))
3434 ANYOF_BITMAP_CLEAR(data->start_class, value);
3435 }
3436 else {
3437 for (value = 0; value < 256; value++)
3438 if (is_VERTWS_cp(value))
3439 ANYOF_BITMAP_SET(data->start_class, value);
3440 }
3441 if (flags & SCF_DO_STCLASS_OR)
3442 cl_and(data->start_class, and_withp);
3443 flags &= ~SCF_DO_STCLASS;
3444 }
3445 min += 1;
3446 delta += 1;
3447 if (flags & SCF_DO_SUBSTR) {
3448 SCAN_COMMIT(pRExC_state,data,minlenp);
3449 data->pos_min += 1;
3450 data->pos_delta += 1;
3451 data->longest = &(data->longest_float);
3452 }
3453
3454 }
3455 else if (OP(scan) == FOLDCHAR) {
3456 int d = ARG(scan)==0xDF ? 1 : 2;
3457 flags &= ~SCF_DO_STCLASS;
3458 min += 1;
3459 delta += d;
3460 if (flags & SCF_DO_SUBSTR) {
3461 SCAN_COMMIT(pRExC_state,data,minlenp);
3462 data->pos_min += 1;
3463 data->pos_delta += d;
3464 data->longest = &(data->longest_float);
3465 }
3466 }
3467 else if (strchr((const char*)PL_simple,OP(scan))) {
3468 int value = 0;
3469
3470 if (flags & SCF_DO_SUBSTR) {
3471 SCAN_COMMIT(pRExC_state,data,minlenp);
3472 data->pos_min++;
3473 }
3474 min++;
3475 if (flags & SCF_DO_STCLASS) {
3476 data->start_class->flags &= ~ANYOF_EOS;
3477
3478
3479
3480 switch (PL_regkind[OP(scan)]) {
3481 case SANY:
3482 default:
3483 do_default:
3484
3485 if (flags & SCF_DO_STCLASS_OR)
3486 cl_anything(pRExC_state, data->start_class);
3487 break;
3488 case REG_ANY:
3489 if (OP(scan) == SANY)
3490 goto do_default;
3491 if (flags & SCF_DO_STCLASS_OR) {
3492 value = (ANYOF_BITMAP_TEST(data->start_class,'\n')
3493 || (data->start_class->flags & ANYOF_CLASS));
3494 cl_anything(pRExC_state, data->start_class);
3495 }
3496 if (flags & SCF_DO_STCLASS_AND || !value)
3497 ANYOF_BITMAP_CLEAR(data->start_class,'\n');
3498 break;
3499 case ANYOF:
3500 if (flags & SCF_DO_STCLASS_AND)
3501 cl_and(data->start_class,
3502 (struct regnode_charclass_class*)scan);
3503 else
3504 cl_or(pRExC_state, data->start_class,
3505 (struct regnode_charclass_class*)scan);
3506 break;
3507 case ALNUM:
3508 if (flags & SCF_DO_STCLASS_AND) {
3509 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3510 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
3511 for (value = 0; value < 256; value++)
3512 if (!isALNUM(value))
3513 ANYOF_BITMAP_CLEAR(data->start_class, value);
3514 }
3515 }
3516 else {
3517 if (data->start_class->flags & ANYOF_LOCALE)
3518 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
3519 else {
3520 for (value = 0; value < 256; value++)
3521 if (isALNUM(value))
3522 ANYOF_BITMAP_SET(data->start_class, value);
3523 }
3524 }
3525 break;
3526 case ALNUML:
3527 if (flags & SCF_DO_STCLASS_AND) {
3528 if (data->start_class->flags & ANYOF_LOCALE)
3529 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
3530 }
3531 else {
3532 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
3533 data->start_class->flags |= ANYOF_LOCALE;
3534 }
3535 break;
3536 case NALNUM:
3537 if (flags & SCF_DO_STCLASS_AND) {
3538 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3539 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
3540 for (value = 0; value < 256; value++)
3541 if (isALNUM(value))
3542 ANYOF_BITMAP_CLEAR(data->start_class, value);
3543 }
3544 }
3545 else {
3546 if (data->start_class->flags & ANYOF_LOCALE)
3547 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
3548 else {
3549 for (value = 0; value < 256; value++)
3550 if (!isALNUM(value))
3551 ANYOF_BITMAP_SET(data->start_class, value);
3552 }
3553 }
3554 break;
3555 case NALNUML:
3556 if (flags & SCF_DO_STCLASS_AND) {
3557 if (data->start_class->flags & ANYOF_LOCALE)
3558 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
3559 }
3560 else {
3561 data->start_class->flags |= ANYOF_LOCALE;
3562 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
3563 }
3564 break;
3565 case SPACE:
3566 if (flags & SCF_DO_STCLASS_AND) {
3567 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3568 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
3569 for (value = 0; value < 256; value++)
3570 if (!isSPACE(value))
3571 ANYOF_BITMAP_CLEAR(data->start_class, value);
3572 }
3573 }
3574 else {
3575 if (data->start_class->flags & ANYOF_LOCALE)
3576 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
3577 else {
3578 for (value = 0; value < 256; value++)
3579 if (isSPACE(value))
3580 ANYOF_BITMAP_SET(data->start_class, value);
3581 }
3582 }
3583 break;
3584 case SPACEL:
3585 if (flags & SCF_DO_STCLASS_AND) {
3586 if (data->start_class->flags & ANYOF_LOCALE)
3587 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
3588 }
3589 else {
3590 data->start_class->flags |= ANYOF_LOCALE;
3591 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
3592 }
3593 break;
3594 case NSPACE:
3595 if (flags & SCF_DO_STCLASS_AND) {
3596 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3597 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
3598 for (value = 0; value < 256; value++)
3599 if (isSPACE(value))
3600 ANYOF_BITMAP_CLEAR(data->start_class, value);
3601 }
3602 }
3603 else {
3604 if (data->start_class->flags & ANYOF_LOCALE)
3605 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
3606 else {
3607 for (value = 0; value < 256; value++)
3608 if (!isSPACE(value))
3609 ANYOF_BITMAP_SET(data->start_class, value);
3610 }
3611 }
3612 break;
3613 case NSPACEL:
3614 if (flags & SCF_DO_STCLASS_AND) {
3615 if (data->start_class->flags & ANYOF_LOCALE) {
3616 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
3617 for (value = 0; value < 256; value++)
3618 if (!isSPACE(value))
3619 ANYOF_BITMAP_CLEAR(data->start_class, value);
3620 }
3621 }
3622 else {
3623 data->start_class->flags |= ANYOF_LOCALE;
3624 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
3625 }
3626 break;
3627 case DIGIT:
3628 if (flags & SCF_DO_STCLASS_AND) {
3629 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NDIGIT);
3630 for (value = 0; value < 256; value++)
3631 if (!isDIGIT(value))
3632 ANYOF_BITMAP_CLEAR(data->start_class, value);
3633 }
3634 else {
3635 if (data->start_class->flags & ANYOF_LOCALE)
3636 ANYOF_CLASS_SET(data->start_class,ANYOF_DIGIT);
3637 else {
3638 for (value = 0; value < 256; value++)
3639 if (isDIGIT(value))
3640 ANYOF_BITMAP_SET(data->start_class, value);
3641 }
3642 }
3643 break;
3644 case NDIGIT:
3645 if (flags & SCF_DO_STCLASS_AND) {
3646 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_DIGIT);
3647 for (value = 0; value < 256; value++)
3648 if (isDIGIT(value))
3649 ANYOF_BITMAP_CLEAR(data->start_class, value);
3650 }
3651 else {
3652 if (data->start_class->flags & ANYOF_LOCALE)
3653 ANYOF_CLASS_SET(data->start_class,ANYOF_NDIGIT);
3654 else {
3655 for (value = 0; value < 256; value++)
3656 if (!isDIGIT(value))
3657 ANYOF_BITMAP_SET(data->start_class, value);
3658 }
3659 }
3660 break;
3661 CASE_SYNST_FNC(VERTWS);
3662 CASE_SYNST_FNC(HORIZWS);
3663
3664 }
3665 if (flags & SCF_DO_STCLASS_OR)
3666 cl_and(data->start_class, and_withp);
3667 flags &= ~SCF_DO_STCLASS;
3668 }
3669 }
3670 else if (PL_regkind[OP(scan)] == EOL && flags & SCF_DO_SUBSTR) {
3671 data->flags |= (OP(scan) == MEOL
3672 ? SF_BEFORE_MEOL
3673 : SF_BEFORE_SEOL);
3674 }
3675 else if ( PL_regkind[OP(scan)] == BRANCHJ
3676
3677 && (scan->flags || data || (flags & SCF_DO_STCLASS))
3678 && (OP(scan) == IFMATCH || OP(scan) == UNLESSM)) {
3679 if ( !PERL_ENABLE_POSITIVE_ASSERTION_STUDY
3680 || OP(scan) == UNLESSM )
3681 {
3682
3683
3684
3685
3686 I32 deltanext, minnext, fake = 0;
3687 regnode *nscan;
3688 struct regnode_charclass_class intrnl;
3689 int f = 0;
3690
3691 data_fake.flags = 0;
3692 if (data) {
3693 data_fake.whilem_c = data->whilem_c;
3694 data_fake.last_closep = data->last_closep;
3695 }
3696 else
3697 data_fake.last_closep = &fake;
3698 data_fake.pos_delta = delta;
3699 if ( flags & SCF_DO_STCLASS && !scan->flags
3700 && OP(scan) == IFMATCH ) {
3701 cl_init(pRExC_state, &intrnl);
3702 data_fake.start_class = &intrnl;
3703 f |= SCF_DO_STCLASS_AND;
3704 }
3705 if (flags & SCF_WHILEM_VISITED_POS)
3706 f |= SCF_WHILEM_VISITED_POS;
3707 next = regnext(scan);
3708 nscan = NEXTOPER(NEXTOPER(scan));
3709 minnext = study_chunk(pRExC_state, &nscan, minlenp, &deltanext,
3710 last, &data_fake, stopparen, recursed, NULL, f, depth+1);
3711 if (scan->flags) {
3712 if (deltanext) {
3713 FAIL("Variable length lookbehind not implemented");
3714 }
3715 else if (minnext > (I32)U8_MAX) {
3716 FAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
3717 }
3718 scan->flags = (U8)minnext;
3719 }
3720 if (data) {
3721 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3722 pars++;
3723 if (data_fake.flags & SF_HAS_EVAL)
3724 data->flags |= SF_HAS_EVAL;
3725 data->whilem_c = data_fake.whilem_c;
3726 }
3727 if (f & SCF_DO_STCLASS_AND) {
3728 if (flags & SCF_DO_STCLASS_OR) {
3729
3730
3731
3732
3733
3734
3735 cl_init(pRExC_state, data->start_class);
3736 } else {
3737
3738 const int was = (data->start_class->flags & ANYOF_EOS);
3739
3740 cl_and(data->start_class, &intrnl);
3741 if (was)
3742 data->start_class->flags |= ANYOF_EOS;
3743 }
3744 }
3745 }
3746#if PERL_ENABLE_POSITIVE_ASSERTION_STUDY
3747 else {
3748
3749
3750
3751
3752
3753
3754
3755 I32 deltanext, fake = 0;
3756 regnode *nscan;
3757 struct regnode_charclass_class intrnl;
3758 int f = 0;
3759
3760
3761
3762
3763
3764
3765 I32 *minnextp;
3766 Newx( minnextp, 1, I32 );
3767 SAVEFREEPV(minnextp);
3768
3769 if (data) {
3770 StructCopy(data, &data_fake, scan_data_t);
3771 if ((flags & SCF_DO_SUBSTR) && data->last_found) {
3772 f |= SCF_DO_SUBSTR;
3773 if (scan->flags)
3774 SCAN_COMMIT(pRExC_state, &data_fake,minlenp);
3775 data_fake.last_found=newSVsv(data->last_found);
3776 }
3777 }
3778 else
3779 data_fake.last_closep = &fake;
3780 data_fake.flags = 0;
3781 data_fake.pos_delta = delta;
3782 if (is_inf)
3783 data_fake.flags |= SF_IS_INF;
3784 if ( flags & SCF_DO_STCLASS && !scan->flags
3785 && OP(scan) == IFMATCH ) {
3786 cl_init(pRExC_state, &intrnl);
3787 data_fake.start_class = &intrnl;
3788 f |= SCF_DO_STCLASS_AND;
3789 }
3790 if (flags & SCF_WHILEM_VISITED_POS)
3791 f |= SCF_WHILEM_VISITED_POS;
3792 next = regnext(scan);
3793 nscan = NEXTOPER(NEXTOPER(scan));
3794
3795 *minnextp = study_chunk(pRExC_state, &nscan, minnextp, &deltanext,
3796 last, &data_fake, stopparen, recursed, NULL, f,depth+1);
3797 if (scan->flags) {
3798 if (deltanext) {
3799 FAIL("Variable length lookbehind not implemented");
3800 }
3801 else if (*minnextp > (I32)U8_MAX) {
3802 FAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
3803 }
3804 scan->flags = (U8)*minnextp;
3805 }
3806
3807 *minnextp += min;
3808
3809 if (f & SCF_DO_STCLASS_AND) {
3810 const int was = (data->start_class->flags & ANYOF_EOS);
3811
3812 cl_and(data->start_class, &intrnl);
3813 if (was)
3814 data->start_class->flags |= ANYOF_EOS;
3815 }
3816 if (data) {
3817 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3818 pars++;
3819 if (data_fake.flags & SF_HAS_EVAL)
3820 data->flags |= SF_HAS_EVAL;
3821 data->whilem_c = data_fake.whilem_c;
3822 if ((flags & SCF_DO_SUBSTR) && data_fake.last_found) {
3823 if (RExC_rx->minlen<*minnextp)
3824 RExC_rx->minlen=*minnextp;
3825 SCAN_COMMIT(pRExC_state, &data_fake, minnextp);
3826 SvREFCNT_dec(data_fake.last_found);
3827
3828 if ( data_fake.minlen_fixed != minlenp )
3829 {
3830 data->offset_fixed= data_fake.offset_fixed;
3831 data->minlen_fixed= data_fake.minlen_fixed;
3832 data->lookbehind_fixed+= scan->flags;
3833 }
3834 if ( data_fake.minlen_float != minlenp )
3835 {
3836 data->minlen_float= data_fake.minlen_float;
3837 data->offset_float_min=data_fake.offset_float_min;
3838 data->offset_float_max=data_fake.offset_float_max;
3839 data->lookbehind_float+= scan->flags;
3840 }
3841 }
3842 }
3843
3844
3845 }
3846#endif
3847 }
3848 else if (OP(scan) == OPEN) {
3849 if (stopparen != (I32)ARG(scan))
3850 pars++;
3851 }
3852 else if (OP(scan) == CLOSE) {
3853 if (stopparen == (I32)ARG(scan)) {
3854 break;
3855 }
3856 if ((I32)ARG(scan) == is_par) {
3857 next = regnext(scan);
3858
3859 if ( next && (OP(next) != WHILEM) && next < last)
3860 is_par = 0;
3861 }
3862 if (data)
3863 *(data->last_closep) = ARG(scan);
3864 }
3865 else if (OP(scan) == EVAL) {
3866 if (data)
3867 data->flags |= SF_HAS_EVAL;
3868 }
3869 else if ( PL_regkind[OP(scan)] == ENDLIKE ) {
3870 if (flags & SCF_DO_SUBSTR) {
3871 SCAN_COMMIT(pRExC_state,data,minlenp);
3872 flags &= ~SCF_DO_SUBSTR;
3873 }
3874 if (data && OP(scan)==ACCEPT) {
3875 data->flags |= SCF_SEEN_ACCEPT;
3876 if (stopmin > min)
3877 stopmin = min;
3878 }
3879 }
3880 else if (OP(scan) == LOGICAL && scan->flags == 2)
3881 {
3882 if (flags & SCF_DO_SUBSTR) {
3883 SCAN_COMMIT(pRExC_state,data,minlenp);
3884 data->longest = &(data->longest_float);
3885 }
3886 is_inf = is_inf_internal = 1;
3887 if (flags & SCF_DO_STCLASS_OR)
3888 cl_anything(pRExC_state, data->start_class);
3889 flags &= ~SCF_DO_STCLASS;
3890 }
3891 else if (OP(scan) == GPOS) {
3892 if (!(RExC_rx->extflags & RXf_GPOS_FLOAT) &&
3893 !(delta || is_inf || (data && data->pos_delta)))
3894 {
3895 if (!(RExC_rx->extflags & RXf_ANCH) && (flags & SCF_DO_SUBSTR))
3896 RExC_rx->extflags |= RXf_ANCH_GPOS;
3897 if (RExC_rx->gofs < (U32)min)
3898 RExC_rx->gofs = min;
3899 } else {
3900 RExC_rx->extflags |= RXf_GPOS_FLOAT;
3901 RExC_rx->gofs = 0;
3902 }
3903 }
3904#ifdef TRIE_STUDY_OPT
3905#ifdef FULL_TRIE_STUDY
3906 else if (PL_regkind[OP(scan)] == TRIE) {
3907
3908
3909
3910 regnode *trie_node= scan;
3911 regnode *tail= regnext(scan);
3912 reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
3913 I32 max1 = 0, min1 = I32_MAX;
3914 struct regnode_charclass_class accum;
3915
3916 if (flags & SCF_DO_SUBSTR)
3917 SCAN_COMMIT(pRExC_state, data,minlenp);
3918 if (flags & SCF_DO_STCLASS)
3919 cl_init_zero(pRExC_state, &accum);
3920
3921 if (!trie->jump) {
3922 min1= trie->minlen;
3923 max1= trie->maxlen;
3924 } else {
3925 const regnode *nextbranch= NULL;
3926 U32 word;
3927
3928 for ( word=1 ; word <= trie->wordcount ; word++)
3929 {
3930 I32 deltanext=0, minnext=0, f = 0, fake;
3931 struct regnode_charclass_class this_class;
3932
3933 data_fake.flags = 0;
3934 if (data) {
3935 data_fake.whilem_c = data->whilem_c;
3936 data_fake.last_closep = data->last_closep;
3937 }
3938 else
3939 data_fake.last_closep = &fake;
3940 data_fake.pos_delta = delta;
3941 if (flags & SCF_DO_STCLASS) {
3942 cl_init(pRExC_state, &this_class);
3943 data_fake.start_class = &this_class;
3944 f = SCF_DO_STCLASS_AND;
3945 }
3946 if (flags & SCF_WHILEM_VISITED_POS)
3947 f |= SCF_WHILEM_VISITED_POS;
3948
3949 if (trie->jump[word]) {
3950 if (!nextbranch)
3951 nextbranch = trie_node + trie->jump[0];
3952 scan= trie_node + trie->jump[word];
3953
3954
3955
3956
3957 minnext = study_chunk(pRExC_state, &scan, minlenp,
3958 &deltanext, (regnode *)nextbranch, &data_fake,
3959 stopparen, recursed, NULL, f,depth+1);
3960 }
3961 if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
3962 nextbranch= regnext((regnode*)nextbranch);
3963
3964 if (min1 > (I32)(minnext + trie->minlen))
3965 min1 = minnext + trie->minlen;
3966 if (max1 < (I32)(minnext + deltanext + trie->maxlen))
3967 max1 = minnext + deltanext + trie->maxlen;
3968 if (deltanext == I32_MAX)
3969 is_inf = is_inf_internal = 1;
3970
3971 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3972 pars++;
3973 if (data_fake.flags & SCF_SEEN_ACCEPT) {
3974 if ( stopmin > min + min1)
3975 stopmin = min + min1;
3976 flags &= ~SCF_DO_SUBSTR;
3977 if (data)
3978 data->flags |= SCF_SEEN_ACCEPT;
3979 }
3980 if (data) {
3981 if (data_fake.flags & SF_HAS_EVAL)
3982 data->flags |= SF_HAS_EVAL;
3983 data->whilem_c = data_fake.whilem_c;
3984 }
3985 if (flags & SCF_DO_STCLASS)
3986 cl_or(pRExC_state, &accum, &this_class);
3987 }
3988 }
3989 if (flags & SCF_DO_SUBSTR) {
3990 data->pos_min += min1;
3991 data->pos_delta += max1 - min1;
3992 if (max1 != min1 || is_inf)
3993 data->longest = &(data->longest_float);
3994 }
3995 min += min1;
3996 delta += max1 - min1;
3997 if (flags & SCF_DO_STCLASS_OR) {
3998 cl_or(pRExC_state, data->start_class, &accum);
3999 if (min1) {
4000 cl_and(data->start_class, and_withp);
4001 flags &= ~SCF_DO_STCLASS;
4002 }
4003 }
4004 else if (flags & SCF_DO_STCLASS_AND) {
4005 if (min1) {
4006 cl_and(data->start_class, &accum);
4007 flags &= ~SCF_DO_STCLASS;
4008 }
4009 else {
4010
4011
4012 INIT_AND_WITHP;
4013 StructCopy(data->start_class, and_withp,
4014 struct regnode_charclass_class);
4015 flags &= ~SCF_DO_STCLASS_AND;
4016 StructCopy(&accum, data->start_class,
4017 struct regnode_charclass_class);
4018 flags |= SCF_DO_STCLASS_OR;
4019 data->start_class->flags |= ANYOF_EOS;
4020 }
4021 }
4022 scan= tail;
4023 continue;
4024 }
4025#else
4026 else if (PL_regkind[OP(scan)] == TRIE) {
4027 reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
4028 U8*bang=NULL;
4029
4030 min += trie->minlen;
4031 delta += (trie->maxlen - trie->minlen);
4032 flags &= ~SCF_DO_STCLASS;
4033 if (flags & SCF_DO_SUBSTR) {
4034 SCAN_COMMIT(pRExC_state,data,minlenp);
4035 data->pos_min += trie->minlen;
4036 data->pos_delta += (trie->maxlen - trie->minlen);
4037 if (trie->maxlen != trie->minlen)
4038 data->longest = &(data->longest_float);
4039 }
4040 if (trie->jump)
4041 flags &= ~SCF_DO_SUBSTR;
4042 }
4043#endif
4044#endif
4045
4046
4047 scan = regnext(scan);
4048 }
4049 if (frame) {
4050 last = frame->last;
4051 scan = frame->next;
4052 stopparen = frame->stop;
4053 frame = frame->prev;
4054 goto fake_study_recurse;
4055 }
4056
4057 finish:
4058 assert(!frame);
4059 DEBUG_STUDYDATA("pre-fin:",data,depth);
4060
4061 *scanp = scan;
4062 *deltap = is_inf_internal ? I32_MAX : delta;
4063 if (flags & SCF_DO_SUBSTR && is_inf)
4064 data->pos_delta = I32_MAX - data->pos_min;
4065 if (is_par > (I32)U8_MAX)
4066 is_par = 0;
4067 if (is_par && pars==1 && data) {
4068 data->flags |= SF_IN_PAR;
4069 data->flags &= ~SF_HAS_PAR;
4070 }
4071 else if (pars && data) {
4072 data->flags |= SF_HAS_PAR;
4073 data->flags &= ~SF_IN_PAR;
4074 }
4075 if (flags & SCF_DO_STCLASS_OR)
4076 cl_and(data->start_class, and_withp);
4077 if (flags & SCF_TRIE_RESTUDY)
4078 data->flags |= SCF_TRIE_RESTUDY;
4079
4080 DEBUG_STUDYDATA("post-fin:",data,depth);
4081
4082 return min < stopmin ? min : stopmin;
4083}
4084
4085STATIC U32
4086S_add_data(RExC_state_t *pRExC_state, U32 n, const char *s)
4087{
4088 U32 count = RExC_rxi->data ? RExC_rxi->data->count : 0;
4089
4090 PERL_ARGS_ASSERT_ADD_DATA;
4091
4092 Renewc(RExC_rxi->data,
4093 sizeof(*RExC_rxi->data) + sizeof(void*) * (count + n - 1),
4094 char, struct reg_data);
4095 if(count)
4096 Renew(RExC_rxi->data->what, count + n, U8);
4097 else
4098 Newx(RExC_rxi->data->what, n, U8);
4099 RExC_rxi->data->count = count + n;
4100 Copy(s, RExC_rxi->data->what + count, n, U8);
4101 return count;
4102}
4103
4104
4105#ifndef PERL_IN_XSUB_RE
4106void
4107Perl_reginitcolors(pTHX)
4108{
4109 dVAR;
4110 const char * const s = PerlEnv_getenv("PERL_RE_COLORS");
4111 if (s) {
4112 char *t = savepv(s);
4113 int i = 0;
4114 PL_colors[0] = t;
4115 while (++i < 6) {
4116 t = strchr(t, '\t');
4117 if (t) {
4118 *t = '\0';
4119 PL_colors[i] = ++t;
4120 }
4121 else
4122 PL_colors[i] = t = (char *)"";
4123 }
4124 } else {
4125 int i = 0;
4126 while (i < 6)
4127 PL_colors[i++] = (char *)"";
4128 }
4129 PL_colorset = 1;
4130}
4131#endif
4132
4133
4134#ifdef TRIE_STUDY_OPT
4135#define CHECK_RESTUDY_GOTO \
4136 if ( \
4137 (data.flags & SCF_TRIE_RESTUDY) \
4138 && ! restudied++ \
4139 ) goto reStudy
4140#else
4141#define CHECK_RESTUDY_GOTO
4142#endif
4143
4144
4145
4146
4147
4148
4149
4150
4151
4152
4153
4154
4155
4156
4157
4158
4159
4160
4161
4162#ifndef PERL_IN_XSUB_RE
4163#define RE_ENGINE_PTR &PL_core_reg_engine
4164#else
4165extern const struct regexp_engine my_reg_engine;
4166#define RE_ENGINE_PTR &my_reg_engine
4167#endif
4168
4169#ifndef PERL_IN_XSUB_RE
4170REGEXP *
4171Perl_pregcomp(pTHX_ const SV * const pattern, const U32 flags)
4172{
4173 dVAR;
4174 HV * const table = GvHV(PL_hintgv);
4175
4176 PERL_ARGS_ASSERT_PREGCOMP;
4177
4178
4179
4180 if (table) {
4181 SV **ptr= hv_fetchs(table, "regcomp", FALSE);
4182 GET_RE_DEBUG_FLAGS_DECL;
4183 if (ptr && SvIOK(*ptr) && SvIV(*ptr)) {
4184 const regexp_engine *eng=INT2PTR(regexp_engine*,SvIV(*ptr));
4185 DEBUG_COMPILE_r({
4186 PerlIO_printf(Perl_debug_log, "Using engine %"UVxf"\n",
4187 SvIV(*ptr));
4188 });
4189 return CALLREGCOMP_ENG(eng, pattern, flags);
4190 }
4191 }
4192 return Perl_re_compile(aTHX_ pattern, flags);
4193}
4194#endif
4195
4196REGEXP *
4197Perl_re_compile(pTHX_ const SV * const pattern, const U32 pm_flags)
4198{
4199 dVAR;
4200 register REGEXP *r;
4201 register regexp_internal *ri;
4202 STRLEN plen;
4203 char* exp = SvPV((SV*)pattern, plen);
4204 char* xend = exp + plen;
4205 regnode *scan;
4206 I32 flags;
4207 I32 minlen = 0;
4208 I32 sawplus = 0;
4209 I32 sawopen = 0;
4210 scan_data_t data;
4211 RExC_state_t RExC_state;
4212 RExC_state_t * const pRExC_state = &RExC_state;
4213#ifdef TRIE_STUDY_OPT
4214 int restudied= 0;
4215 RExC_state_t copyRExC_state;
4216#endif
4217 GET_RE_DEBUG_FLAGS_DECL;
4218
4219 PERL_ARGS_ASSERT_RE_COMPILE;
4220
4221 DEBUG_r(if (!PL_colorset) reginitcolors());
4222
4223 RExC_utf8 = RExC_orig_utf8 = pm_flags & RXf_UTF8;
4224
4225 DEBUG_COMPILE_r({
4226 SV *dsv= sv_newmortal();
4227 RE_PV_QUOTED_DECL(s, RExC_utf8,
4228 dsv, exp, plen, 60);
4229 PerlIO_printf(Perl_debug_log, "%sCompiling REx%s %s\n",
4230 PL_colors[4],PL_colors[5],s);
4231 });
4232
4233redo_first_pass:
4234 RExC_precomp = exp;
4235 RExC_flags = pm_flags;
4236 RExC_sawback = 0;
4237
4238 RExC_seen = 0;
4239 RExC_seen_zerolen = *exp == '^' ? -1 : 0;
4240 RExC_seen_evals = 0;
4241 RExC_extralen = 0;
4242
4243
4244 RExC_parse = exp;
4245 RExC_start = exp;
4246 RExC_end = xend;
4247 RExC_naughty = 0;
4248 RExC_npar = 1;
4249 RExC_nestroot = 0;
4250 RExC_size = 0L;
4251 RExC_emit = &PL_regdummy;
4252 RExC_whilem_seen = 0;
4253 RExC_charnames = NULL;
4254 RExC_open_parens = NULL;
4255 RExC_close_parens = NULL;
4256 RExC_opend = NULL;
4257 RExC_paren_names = NULL;
4258#ifdef DEBUGGING
4259 RExC_paren_name_list = NULL;
4260#endif
4261 RExC_recurse = NULL;
4262 RExC_recurse_count = 0;
4263
4264#if 0
4265
4266 REGC((U8)REG_MAGIC, (char*)RExC_emit);
4267#endif
4268 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "Starting first pass (sizing)\n"));
4269 if (reg(pRExC_state, 0, &flags,1) == NULL) {
4270 RExC_precomp = NULL;
4271 return(NULL);
4272 }
4273 if (RExC_utf8 && !RExC_orig_utf8) {
4274
4275
4276
4277
4278
4279
4280
4281
4282
4283 STRLEN len = plen;
4284 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log,
4285 "UTF8 mismatch! Converting to utf8 for resizing and compile\n"));
4286 exp = (char*)Perl_bytes_to_utf8(aTHX_ (U8*)exp, &len);
4287 xend = exp + len;
4288 RExC_orig_utf8 = RExC_utf8;
4289 SAVEFREEPV(exp);
4290 goto redo_first_pass;
4291 }
4292 DEBUG_PARSE_r({
4293 PerlIO_printf(Perl_debug_log,
4294 "Required size %"IVdf" nodes\n"
4295 "Starting second pass (creation)\n",
4296 (IV)RExC_size);
4297 RExC_lastnum=0;
4298 RExC_lastparse=NULL;
4299 });
4300
4301
4302 if (RExC_size >= 0x10000L && RExC_extralen)
4303 RExC_size += RExC_extralen;
4304 else
4305 RExC_extralen = 0;
4306 if (RExC_whilem_seen > 15)
4307 RExC_whilem_seen = 15;
4308
4309
4310
4311
4312 Newxz(r, 1, regexp);
4313 Newxc(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode),
4314 char, regexp_internal);
4315 if ( r == NULL || ri == NULL )
4316 FAIL("Regexp out of space");
4317#ifdef DEBUGGING
4318
4319 Zero(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode), char);
4320#else
4321
4322 Zero(ri, sizeof(regexp_internal), char);
4323#endif
4324
4325
4326 RXi_SET( r, ri );
4327 r->engine= RE_ENGINE_PTR;
4328 r->refcnt = 1;
4329 RX_PRELEN(r) = plen;
4330 r->extflags = pm_flags;
4331 {
4332 bool has_p = ((r->extflags & RXf_PMf_KEEPCOPY) == RXf_PMf_KEEPCOPY);
4333 bool has_minus = ((r->extflags & RXf_PMf_STD_PMMOD) != RXf_PMf_STD_PMMOD);
4334 bool has_runon = ((RExC_seen & REG_SEEN_RUN_ON_COMMENT)==REG_SEEN_RUN_ON_COMMENT);
4335 U16 reganch = (U16)((r->extflags & RXf_PMf_STD_PMMOD)
4336 >> RXf_PMf_STD_PMMOD_SHIFT);
4337 const char *fptr = STD_PAT_MODS;
4338 char *p;
4339 RX_WRAPLEN(r) = plen + has_minus + has_p + has_runon
4340 + (sizeof(STD_PAT_MODS) - 1)
4341 + (sizeof("(?:)") - 1);
4342
4343 Newx(RX_WRAPPED(r), RX_WRAPLEN(r) + 1, char );
4344 p = RX_WRAPPED(r);
4345 *p++='('; *p++='?';
4346 if (has_p)
4347 *p++ = KEEPCOPY_PAT_MOD;
4348 {
4349 char *r = p + (sizeof(STD_PAT_MODS) - 1) + has_minus - 1;
4350 char *colon = r + 1;
4351 char ch;
4352
4353 while((ch = *fptr++)) {
4354 if(reganch & 1)
4355 *p++ = ch;
4356 else
4357 *r-- = ch;
4358 reganch >>= 1;
4359 }
4360 if(has_minus) {
4361 *r = '-';
4362 p = colon;
4363 }
4364 }
4365
4366 *p++ = ':';
4367 Copy(RExC_precomp, p, plen, char);
4368 RX_PRECOMP(r) = p;
4369 p += plen;
4370 if (has_runon)
4371 *p++ = '\n';
4372 *p++ = ')';
4373 *p = 0;
4374 }
4375
4376 r->intflags = 0;
4377 r->nparens = RExC_npar - 1;
4378
4379 if (RExC_seen & REG_SEEN_RECURSE) {
4380 Newxz(RExC_open_parens, RExC_npar,regnode *);
4381 SAVEFREEPV(RExC_open_parens);
4382 Newxz(RExC_close_parens,RExC_npar,regnode *);
4383 SAVEFREEPV(RExC_close_parens);
4384 }
4385
4386
4387#ifdef RE_TRACK_PATTERN_OFFSETS
4388 Newxz(ri->u.offsets, 2*RExC_size+1, U32);
4389 DEBUG_OFFSETS_r(PerlIO_printf(Perl_debug_log,
4390 "%s %"UVuf" bytes for offset annotations.\n",
4391 ri->u.offsets ? "Got" : "Couldn't get",
4392 (UV)((2*RExC_size+1) * sizeof(U32))));
4393#endif
4394 SetProgLen(ri,RExC_size);
4395 RExC_rx = r;
4396 RExC_rxi = ri;
4397
4398
4399 RExC_flags = pm_flags;
4400 RExC_parse = exp;
4401 RExC_end = xend;
4402 RExC_naughty = 0;
4403 RExC_npar = 1;
4404 RExC_emit_start = ri->program;
4405 RExC_emit = ri->program;
4406 RExC_emit_bound = ri->program + RExC_size + 1;
4407
4408
4409 RExC_rx->seen_evals = RExC_seen_evals;
4410 REGC((U8)REG_MAGIC, (char*) RExC_emit++);
4411 if (reg(pRExC_state, 0, &flags,1) == NULL) {
4412 ReREFCNT_dec(r);
4413 return(NULL);
4414 }
4415
4416
4417 Newx(r->substrs, 1, struct reg_substr_data);
4418 if (RExC_recurse_count) {
4419 Newxz(RExC_recurse,RExC_recurse_count,regnode *);
4420 SAVEFREEPV(RExC_recurse);
4421 }
4422
4423reStudy:
4424 r->minlen = minlen = sawplus = sawopen = 0;
4425 Zero(r->substrs, 1, struct reg_substr_data);
4426
4427#ifdef TRIE_STUDY_OPT
4428 if (!restudied) {
4429 StructCopy(&zero_scan_data, &data, scan_data_t);
4430 copyRExC_state = RExC_state;
4431 } else {
4432 U32 seen=RExC_seen;
4433 DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log,"Restudying\n"));
4434
4435 RExC_state = copyRExC_state;
4436 if (seen & REG_TOP_LEVEL_BRANCHES)
4437 RExC_seen |= REG_TOP_LEVEL_BRANCHES;
4438 else
4439 RExC_seen &= ~REG_TOP_LEVEL_BRANCHES;
4440 if (data.last_found) {
4441 SvREFCNT_dec(data.longest_fixed);
4442 SvREFCNT_dec(data.longest_float);
4443 SvREFCNT_dec(data.last_found);
4444 }
4445 StructCopy(&zero_scan_data, &data, scan_data_t);
4446 }
4447#else
4448 StructCopy(&zero_scan_data, &data, scan_data_t);
4449#endif
4450
4451
4452 r->extflags = RExC_flags;
4453
4454
4455 if (UTF)
4456 r->extflags |= RXf_UTF8;
4457 ri->regstclass = NULL;
4458 if (RExC_naughty >= 10)
4459 r->intflags |= PREGf_NAUGHTY;
4460 scan = ri->program + 1;
4461
4462
4463
4464 if (!(RExC_seen & REG_TOP_LEVEL_BRANCHES)) {
4465 I32 fake;
4466 STRLEN longest_float_length, longest_fixed_length;
4467 struct regnode_charclass_class ch_class;
4468 int stclass_flag;
4469 I32 last_close = 0;
4470 regnode *first= scan;
4471 regnode *first_next= regnext(first);
4472
4473
4474
4475
4476
4477
4478
4479
4480
4481
4482
4483
4484 while ((OP(first) == OPEN && (sawopen = 1)) ||
4485
4486 (OP(first) == BRANCH && OP(first_next) != BRANCH) ||
4487
4488 (OP(first) == IFMATCH && !first->flags) ||
4489 (OP(first) == PLUS) ||
4490 (OP(first) == MINMOD) ||
4491
4492 (PL_regkind[OP(first)] == CURLY && ARG1(first) > 0) ||
4493 (OP(first) == NOTHING && PL_regkind[OP(first_next)] != END ))
4494 {
4495
4496
4497
4498
4499
4500 if (OP(first) == PLUS)
4501 sawplus = 1;
4502 else
4503 first += regarglen[OP(first)];
4504
4505 first = NEXTOPER(first);
4506 first_next= regnext(first);
4507 }
4508
4509
4510 again:
4511 DEBUG_PEEP("first:",first,0);
4512
4513 if (PL_regkind[OP(first)] == EXACT) {
4514 if (OP(first) == EXACT)
4515 NOOP;
4516 else if ((OP(first) == EXACTF || OP(first) == EXACTFL))
4517 ri->regstclass = first;
4518 }
4519#ifdef TRIE_STCLASS
4520 else if (PL_regkind[OP(first)] == TRIE &&
4521 ((reg_trie_data *)ri->data->data[ ARG(first) ])->minlen>0)
4522 {
4523 regnode *trie_op;
4524
4525 if ( OP(first) == TRIE ) {
4526 struct regnode_1 *trieop = (struct regnode_1 *)
4527 PerlMemShared_calloc(1, sizeof(struct regnode_1));
4528 StructCopy(first,trieop,struct regnode_1);
4529 trie_op=(regnode *)trieop;
4530 } else {
4531 struct regnode_charclass *trieop = (struct regnode_charclass *)
4532 PerlMemShared_calloc(1, sizeof(struct regnode_charclass));
4533 StructCopy(first,trieop,struct regnode_charclass);
4534 trie_op=(regnode *)trieop;
4535 }
4536 OP(trie_op)+=2;
4537 make_trie_failtable(pRExC_state, (regnode *)first, trie_op, 0);
4538 ri->regstclass = trie_op;
4539 }
4540#endif
4541 else if (strchr((const char*)PL_simple,OP(first)))
4542 ri->regstclass = first;
4543 else if (PL_regkind[OP(first)] == BOUND ||
4544 PL_regkind[OP(first)] == NBOUND)
4545 ri->regstclass = first;
4546 else if (PL_regkind[OP(first)] == BOL) {
4547 r->extflags |= (OP(first) == MBOL
4548 ? RXf_ANCH_MBOL
4549 : (OP(first) == SBOL
4550 ? RXf_ANCH_SBOL
4551 : RXf_ANCH_BOL));
4552 first = NEXTOPER(first);
4553 goto again;
4554 }
4555 else if (OP(first) == GPOS) {
4556 r->extflags |= RXf_ANCH_GPOS;
4557 first = NEXTOPER(first);
4558 goto again;
4559 }
4560 else if ((!sawopen || !RExC_sawback) &&
4561 (OP(first) == STAR &&
4562 PL_regkind[OP(NEXTOPER(first))] == REG_ANY) &&
4563 !(r->extflags & RXf_ANCH) && !(RExC_seen & REG_SEEN_EVAL))
4564 {
4565
4566 const int type =
4567 (OP(NEXTOPER(first)) == REG_ANY)
4568 ? RXf_ANCH_MBOL
4569 : RXf_ANCH_SBOL;
4570 r->extflags |= type;
4571 r->intflags |= PREGf_IMPLICIT;
4572 first = NEXTOPER(first);
4573 goto again;
4574 }
4575 if (sawplus && (!sawopen || !RExC_sawback)
4576 && !(RExC_seen & REG_SEEN_EVAL))
4577
4578 r->intflags |= PREGf_SKIP;
4579
4580
4581#ifdef TRIE_STUDY_OPT
4582 DEBUG_PARSE_r(
4583 if (!restudied)
4584 PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
4585 (IV)(first - scan + 1))
4586 );
4587#else
4588 DEBUG_PARSE_r(
4589 PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
4590 (IV)(first - scan + 1))
4591 );
4592#endif
4593
4594
4595
4596
4597
4598
4599
4600
4601
4602
4603
4604
4605
4606
4607 data.longest_fixed = newSVpvs("");
4608 data.longest_float = newSVpvs("");
4609 data.last_found = newSVpvs("");
4610 data.longest = &(data.longest_fixed);
4611 first = scan;
4612 if (!ri->regstclass) {
4613 cl_init(pRExC_state, &ch_class);
4614 data.start_class = &ch_class;
4615 stclass_flag = SCF_DO_STCLASS_AND;
4616 } else
4617 stclass_flag = 0;
4618 data.last_closep = &last_close;
4619
4620 minlen = study_chunk(pRExC_state, &first, &minlen, &fake, scan + RExC_size,
4621 &data, -1, NULL, NULL,
4622 SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag,0);
4623
4624
4625 CHECK_RESTUDY_GOTO;
4626
4627
4628 if ( RExC_npar == 1 && data.longest == &(data.longest_fixed)
4629 && data.last_start_min == 0 && data.last_end > 0
4630 && !RExC_seen_zerolen
4631 && !(RExC_seen & REG_SEEN_VERBARG)
4632 && (!(RExC_seen & REG_SEEN_GPOS) || (r->extflags & RXf_ANCH_GPOS)))
4633 r->extflags |= RXf_CHECK_ALL;
4634 scan_commit(pRExC_state, &data,&minlen,0);
4635 SvREFCNT_dec(data.last_found);
4636
4637
4638
4639
4640
4641 longest_float_length = CHR_SVLEN(data.longest_float);
4642 if (longest_float_length
4643 || (data.flags & SF_FL_BEFORE_EOL
4644 && (!(data.flags & SF_FL_BEFORE_MEOL)
4645 || (RExC_flags & RXf_PMf_MULTILINE))))
4646 {
4647 I32 t,ml;
4648
4649 if (SvCUR(data.longest_fixed)
4650 && data.offset_fixed == data.offset_float_min
4651 && SvCUR(data.longest_fixed) == SvCUR(data.longest_float))
4652 goto remove_float;
4653
4654
4655
4656 if (SvUTF8(data.longest_float)) {
4657 r->float_utf8 = data.longest_float;
4658 r->float_substr = NULL;
4659 } else {
4660 r->float_substr = data.longest_float;
4661 r->float_utf8 = NULL;
4662 }
4663
4664
4665
4666
4667 ml = data.minlen_float ? *(data.minlen_float)
4668 : (I32)longest_float_length;
4669 r->float_end_shift = ml - data.offset_float_min
4670 - longest_float_length + (SvTAIL(data.longest_float) != 0)
4671 + data.lookbehind_float;
4672 r->float_min_offset = data.offset_float_min - data.lookbehind_float;
4673 r->float_max_offset = data.offset_float_max;
4674 if (data.offset_float_max < I32_MAX)
4675 r->float_max_offset -= data.lookbehind_float;
4676
4677 t = (data.flags & SF_FL_BEFORE_EOL
4678 && (!(data.flags & SF_FL_BEFORE_MEOL)
4679 || (RExC_flags & RXf_PMf_MULTILINE)));
4680 fbm_compile(data.longest_float, t ? FBMcf_TAIL : 0);
4681 }
4682 else {
4683 remove_float:
4684 r->float_substr = r->float_utf8 = NULL;
4685 SvREFCNT_dec(data.longest_float);
4686 longest_float_length = 0;
4687 }
4688
4689
4690
4691
4692
4693 longest_fixed_length = CHR_SVLEN(data.longest_fixed);
4694 if (longest_fixed_length
4695 || (data.flags & SF_FIX_BEFORE_EOL
4696 && (!(data.flags & SF_FIX_BEFORE_MEOL)
4697 || (RExC_flags & RXf_PMf_MULTILINE))))
4698 {
4699 I32 t,ml;
4700
4701
4702
4703 if (SvUTF8(data.longest_fixed)) {
4704 r->anchored_utf8 = data.longest_fixed;
4705 r->anchored_substr = NULL;
4706 } else {
4707 r->anchored_substr = data.longest_fixed;
4708 r->anchored_utf8 = NULL;
4709 }
4710
4711
4712
4713
4714 ml = data.minlen_fixed ? *(data.minlen_fixed)
4715 : (I32)longest_fixed_length;
4716 r->anchored_end_shift = ml - data.offset_fixed
4717 - longest_fixed_length + (SvTAIL(data.longest_fixed) != 0)
4718 + data.lookbehind_fixed;
4719 r->anchored_offset = data.offset_fixed - data.lookbehind_fixed;
4720
4721 t = (data.flags & SF_FIX_BEFORE_EOL
4722 && (!(data.flags & SF_FIX_BEFORE_MEOL)
4723 || (RExC_flags & RXf_PMf_MULTILINE)));
4724 fbm_compile(data.longest_fixed, t ? FBMcf_TAIL : 0);
4725 }
4726 else {
4727 r->anchored_substr = r->anchored_utf8 = NULL;
4728 SvREFCNT_dec(data.longest_fixed);
4729 longest_fixed_length = 0;
4730 }
4731 if (ri->regstclass
4732 && (OP(ri->regstclass) == REG_ANY || OP(ri->regstclass) == SANY))
4733 ri->regstclass = NULL;
4734 if ((!(r->anchored_substr || r->anchored_utf8) || r->anchored_offset)
4735 && stclass_flag
4736 && !(data.start_class->flags & ANYOF_EOS)
4737 && !cl_is_anything(data.start_class))
4738 {
4739 const U32 n = add_data(pRExC_state, 1, "f");
4740
4741 Newx(RExC_rxi->data->data[n], 1,
4742 struct regnode_charclass_class);
4743 StructCopy(data.start_class,
4744 (struct regnode_charclass_class*)RExC_rxi->data->data[n],
4745 struct regnode_charclass_class);
4746 ri->regstclass = (regnode*)RExC_rxi->data->data[n];
4747 r->intflags &= ~PREGf_SKIP;
4748 DEBUG_COMPILE_r({ SV *sv = sv_newmortal();
4749 regprop(r, sv, (regnode*)data.start_class);
4750 PerlIO_printf(Perl_debug_log,
4751 "synthetic stclass \"%s\".\n",
4752 SvPVX_const(sv));});
4753 }
4754
4755
4756 if (longest_fixed_length > longest_float_length) {
4757 r->check_end_shift = r->anchored_end_shift;
4758 r->check_substr = r->anchored_substr;
4759 r->check_utf8 = r->anchored_utf8;
4760 r->check_offset_min = r->check_offset_max = r->anchored_offset;
4761 if (r->extflags & RXf_ANCH_SINGLE)
4762 r->extflags |= RXf_NOSCAN;
4763 }
4764 else {
4765 r->check_end_shift = r->float_end_shift;
4766 r->check_substr = r->float_substr;
4767 r->check_utf8 = r->float_utf8;
4768 r->check_offset_min = r->float_min_offset;
4769 r->check_offset_max = r->float_max_offset;
4770 }
4771
4772
4773 if ((r->check_substr || r->check_utf8) && !(r->extflags & RXf_ANCH_GPOS)) {
4774 r->extflags |= RXf_USE_INTUIT;
4775 if (SvTAIL(r->check_substr ? r->check_substr : r->check_utf8))
4776 r->extflags |= RXf_INTUIT_TAIL;
4777 }
4778
4779
4780
4781
4782
4783
4784 }
4785 else {
4786
4787 I32 fake;
4788 struct regnode_charclass_class ch_class;
4789 I32 last_close = 0;
4790
4791 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "\nMulti Top Level\n"));
4792
4793 scan = ri->program + 1;
4794 cl_init(pRExC_state, &ch_class);
4795 data.start_class = &ch_class;
4796 data.last_closep = &last_close;
4797
4798
4799 minlen = study_chunk(pRExC_state, &scan, &minlen, &fake, scan + RExC_size,
4800 &data, -1, NULL, NULL, SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS,0);
4801
4802 CHECK_RESTUDY_GOTO;
4803
4804 r->check_substr = r->check_utf8 = r->anchored_substr = r->anchored_utf8
4805 = r->float_substr = r->float_utf8 = NULL;
4806 if (!(data.start_class->flags & ANYOF_EOS)
4807 && !cl_is_anything(data.start_class))
4808 {
4809 const U32 n = add_data(pRExC_state, 1, "f");
4810
4811 Newx(RExC_rxi->data->data[n], 1,
4812 struct regnode_charclass_class);
4813 StructCopy(data.start_class,
4814 (struct regnode_charclass_class*)RExC_rxi->data->data[n],
4815 struct regnode_charclass_class);
4816 ri->regstclass = (regnode*)RExC_rxi->data->data[n];
4817 r->intflags &= ~PREGf_SKIP;
4818 DEBUG_COMPILE_r({ SV* sv = sv_newmortal();
4819 regprop(r, sv, (regnode*)data.start_class);
4820 PerlIO_printf(Perl_debug_log,
4821 "synthetic stclass \"%s\".\n",
4822 SvPVX_const(sv));});
4823 }
4824 }
4825
4826
4827
4828 DEBUG_OPTIMISE_r({
4829 PerlIO_printf(Perl_debug_log,"minlen: %"IVdf" r->minlen:%"IVdf"\n",
4830 (IV)minlen, (IV)r->minlen);
4831 });
4832 r->minlenret = minlen;
4833 if (r->minlen < minlen)
4834 r->minlen = minlen;
4835
4836 if (RExC_seen & REG_SEEN_GPOS)
4837 r->extflags |= RXf_GPOS_SEEN;
4838 if (RExC_seen & REG_SEEN_LOOKBEHIND)
4839 r->extflags |= RXf_LOOKBEHIND_SEEN;
4840 if (RExC_seen & REG_SEEN_EVAL)
4841 r->extflags |= RXf_EVAL_SEEN;
4842 if (RExC_seen & REG_SEEN_CANY)
4843 r->extflags |= RXf_CANY_SEEN;
4844 if (RExC_seen & REG_SEEN_VERBARG)
4845 r->intflags |= PREGf_VERBARG_SEEN;
4846 if (RExC_seen & REG_SEEN_CUTGROUP)
4847 r->intflags |= PREGf_CUTGROUP_SEEN;
4848 if (RExC_paren_names)
4849 RXp_PAREN_NAMES(r) = MUTABLE_HV(SvREFCNT_inc(RExC_paren_names));
4850 else
4851 RXp_PAREN_NAMES(r) = NULL;
4852
4853#ifdef STUPID_PATTERN_CHECKS
4854 if (RX_PRELEN(r) == 0)
4855 r->extflags |= RXf_NULL;
4856 if (r->extflags & RXf_SPLIT && RX_PRELEN(r) == 1 && RX_PRECOMP(r)[0] == ' ')
4857
4858 r->extflags |= (RXf_SKIPWHITE|RXf_WHITE);
4859 else if (RX_PRELEN(r) == 3 && memEQ("\\s+", RX_PRECOMP(r), 3))
4860 r->extflags |= RXf_WHITE;
4861 else if (RX_PRELEN(r) == 1 && RXp_PRECOMP(r)[0] == '^')
4862 r->extflags |= RXf_START_ONLY;
4863#else
4864 if (r->extflags & RXf_SPLIT && RX_PRELEN(r) == 1 && RX_PRECOMP(r)[0] == ' ')
4865
4866 r->extflags |= (RXf_SKIPWHITE|RXf_WHITE);
4867 else {
4868 regnode *first = ri->program + 1;
4869 U8 fop = OP(first);
4870 U8 nop = OP(NEXTOPER(first));
4871
4872 if (PL_regkind[fop] == NOTHING && nop == END)
4873 r->extflags |= RXf_NULL;
4874 else if (PL_regkind[fop] == BOL && nop == END)
4875 r->extflags |= RXf_START_ONLY;
4876 else if (fop == PLUS && nop ==SPACE && OP(regnext(first))==END)
4877 r->extflags |= RXf_WHITE;
4878 }
4879#endif
4880#ifdef DEBUGGING
4881 if (RExC_paren_names) {
4882 ri->name_list_idx = add_data( pRExC_state, 1, "p" );
4883 ri->data->data[ri->name_list_idx] = (void*)SvREFCNT_inc(RExC_paren_name_list);
4884 } else
4885#endif
4886 ri->name_list_idx = 0;
4887
4888 if (RExC_recurse_count) {
4889 for ( ; RExC_recurse_count ; RExC_recurse_count-- ) {
4890 const regnode *scan = RExC_recurse[RExC_recurse_count-1];
4891 ARG2L_SET( scan, RExC_open_parens[ARG(scan)-1] - scan );
4892 }
4893 }
4894 Newxz(r->offs, RExC_npar, regexp_paren_pair);
4895
4896
4897 DEBUG_DUMP_r({
4898 PerlIO_printf(Perl_debug_log,"Final program:\n");
4899 regdump(r);
4900 });
4901#ifdef RE_TRACK_PATTERN_OFFSETS
4902 DEBUG_OFFSETS_r(if (ri->u.offsets) {
4903 const U32 len = ri->u.offsets[0];
4904 U32 i;
4905 GET_RE_DEBUG_FLAGS_DECL;
4906 PerlIO_printf(Perl_debug_log, "Offsets: [%"UVuf"]\n\t", (UV)ri->u.offsets[0]);
4907 for (i = 1; i <= len; i++) {
4908 if (ri->u.offsets[i*2-1] || ri->u.offsets[i*2])
4909 PerlIO_printf(Perl_debug_log, "%"UVuf":%"UVuf"[%"UVuf"] ",
4910 (UV)i, (UV)ri->u.offsets[i*2-1], (UV)ri->u.offsets[i*2]);
4911 }
4912 PerlIO_printf(Perl_debug_log, "\n");
4913 });
4914#endif
4915 return(r);
4916}
4917
4918#undef RE_ENGINE_PTR
4919
4920
4921SV*
4922Perl_reg_named_buff(pTHX_ REGEXP * const rx, SV * const key, SV * const value,
4923 const U32 flags)
4924{
4925 PERL_ARGS_ASSERT_REG_NAMED_BUFF;
4926
4927 PERL_UNUSED_ARG(value);
4928
4929 if (flags & RXapif_FETCH) {
4930 return reg_named_buff_fetch(rx, key, flags);
4931 } else if (flags & (RXapif_STORE | RXapif_DELETE | RXapif_CLEAR)) {
4932 Perl_croak(aTHX_ "%s", PL_no_modify);
4933 return NULL;
4934 } else if (flags & RXapif_EXISTS) {
4935 return reg_named_buff_exists(rx, key, flags)
4936 ? &PL_sv_yes
4937 : &PL_sv_no;
4938 } else if (flags & RXapif_REGNAMES) {
4939 return reg_named_buff_all(rx, flags);
4940 } else if (flags & (RXapif_SCALAR | RXapif_REGNAMES_COUNT)) {
4941 return reg_named_buff_scalar(rx, flags);
4942 } else {
4943 Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff", (int)flags);
4944 return NULL;
4945 }
4946}
4947
4948SV*
4949Perl_reg_named_buff_iter(pTHX_ REGEXP * const rx, const SV * const lastkey,
4950 const U32 flags)
4951{
4952 PERL_ARGS_ASSERT_REG_NAMED_BUFF_ITER;
4953 PERL_UNUSED_ARG(lastkey);
4954
4955 if (flags & RXapif_FIRSTKEY)
4956 return reg_named_buff_firstkey(rx, flags);
4957 else if (flags & RXapif_NEXTKEY)
4958 return reg_named_buff_nextkey(rx, flags);
4959 else {
4960 Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_iter", (int)flags);
4961 return NULL;
4962 }
4963}
4964
4965SV*
4966Perl_reg_named_buff_fetch(pTHX_ REGEXP * const rx, SV * const namesv, const U32 flags)
4967{
4968 AV *retarray = NULL;
4969 SV *ret;
4970
4971 PERL_ARGS_ASSERT_REG_NAMED_BUFF_FETCH;
4972
4973 if (flags & RXapif_ALL)
4974 retarray=newAV();
4975
4976 if (rx && RXp_PAREN_NAMES(rx)) {
4977 HE *he_str = hv_fetch_ent( RXp_PAREN_NAMES(rx), namesv, 0, 0 );
4978 if (he_str) {
4979 IV i;
4980 SV* sv_dat=HeVAL(he_str);
4981 I32 *nums=(I32*)SvPVX(sv_dat);
4982 for ( i=0; i<SvIVX(sv_dat); i++ ) {
4983 if ((I32)(rx->nparens) >= nums[i]
4984 && rx->offs[nums[i]].start != -1
4985 && rx->offs[nums[i]].end != -1)
4986 {
4987 ret = newSVpvs("");
4988 CALLREG_NUMBUF_FETCH(rx,nums[i],ret);
4989 if (!retarray)
4990 return ret;
4991 } else {
4992 ret = newSVsv(&PL_sv_undef);
4993 }
4994 if (retarray)
4995 av_push(retarray, ret);
4996 }
4997 if (retarray)
4998 return newRV_noinc(MUTABLE_SV(retarray));
4999 }
5000 }
5001 return NULL;
5002}
5003
5004bool
5005Perl_reg_named_buff_exists(pTHX_ REGEXP * const rx, SV * const key,
5006 const U32 flags)
5007{
5008
5009 PERL_ARGS_ASSERT_REG_NAMED_BUFF_EXISTS;
5010
5011 if (rx && RXp_PAREN_NAMES(rx)) {
5012 if (flags & RXapif_ALL) {
5013 return hv_exists_ent(RXp_PAREN_NAMES(rx), key, 0);
5014 } else {
5015 SV *sv = CALLREG_NAMED_BUFF_FETCH(rx, key, flags);
5016 if (sv) {
5017 SvREFCNT_dec(sv);
5018 return TRUE;
5019 } else {
5020 return FALSE;
5021 }
5022 }
5023 } else {
5024 return FALSE;
5025 }
5026}
5027
5028SV*
5029Perl_reg_named_buff_firstkey(pTHX_ REGEXP * const rx, const U32 flags)
5030{
5031
5032 PERL_ARGS_ASSERT_REG_NAMED_BUFF_FIRSTKEY;
5033
5034 if ( rx && RXp_PAREN_NAMES(rx) ) {
5035 (void)hv_iterinit(RXp_PAREN_NAMES(rx));
5036
5037 return CALLREG_NAMED_BUFF_NEXTKEY(rx, NULL, flags & ~RXapif_FIRSTKEY);
5038 } else {
5039 return FALSE;
5040 }
5041}
5042
5043SV*
5044Perl_reg_named_buff_nextkey(pTHX_ REGEXP * const rx, const U32 flags)
5045{
5046 GET_RE_DEBUG_FLAGS_DECL;
5047
5048 PERL_ARGS_ASSERT_REG_NAMED_BUFF_NEXTKEY;
5049
5050 if (rx && RXp_PAREN_NAMES(rx)) {
5051 HV *hv = RXp_PAREN_NAMES(rx);
5052 HE *temphe;
5053 while ( (temphe = hv_iternext_flags(hv,0)) ) {
5054 IV i;
5055 IV parno = 0;
5056 SV* sv_dat = HeVAL(temphe);
5057 I32 *nums = (I32*)SvPVX(sv_dat);
5058 for ( i = 0; i < SvIVX(sv_dat); i++ ) {
5059 if ((I32)(rx->lastparen) >= nums[i] &&
5060 rx->offs[nums[i]].start != -1 &&
5061 rx->offs[nums[i]].end != -1)
5062 {
5063 parno = nums[i];
5064 break;
5065 }
5066 }
5067 if (parno || flags & RXapif_ALL) {
5068 return newSVhek(HeKEY_hek(temphe));
5069 }
5070 }
5071 }
5072 return NULL;
5073}
5074
5075SV*
5076Perl_reg_named_buff_scalar(pTHX_ REGEXP * const rx, const U32 flags)
5077{
5078 SV *ret;
5079 AV *av;
5080 I32 length;
5081
5082 PERL_ARGS_ASSERT_REG_NAMED_BUFF_SCALAR;
5083
5084 if (rx && RXp_PAREN_NAMES(rx)) {
5085 if (flags & (RXapif_ALL | RXapif_REGNAMES_COUNT)) {
5086 return newSViv(HvTOTALKEYS(RXp_PAREN_NAMES(rx)));
5087 } else if (flags & RXapif_ONE) {
5088 ret = CALLREG_NAMED_BUFF_ALL(rx, (flags | RXapif_REGNAMES));
5089 av = MUTABLE_AV(SvRV(ret));
5090 length = av_len(av);
5091 SvREFCNT_dec(ret);
5092 return newSViv(length + 1);
5093 } else {
5094 Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_scalar", (int)flags);
5095 return NULL;
5096 }
5097 }
5098 return &PL_sv_undef;
5099}
5100
5101SV*
5102Perl_reg_named_buff_all(pTHX_ REGEXP * const rx, const U32 flags)
5103{
5104 AV *av = newAV();
5105
5106 PERL_ARGS_ASSERT_REG_NAMED_BUFF_ALL;
5107
5108 if (rx && RXp_PAREN_NAMES(rx)) {
5109 HV *hv= RXp_PAREN_NAMES(rx);
5110 HE *temphe;
5111 (void)hv_iterinit(hv);
5112 while ( (temphe = hv_iternext_flags(hv,0)) ) {
5113 IV i;
5114 IV parno = 0;
5115 SV* sv_dat = HeVAL(temphe);
5116 I32 *nums = (I32*)SvPVX(sv_dat);
5117 for ( i = 0; i < SvIVX(sv_dat); i++ ) {
5118 if ((I32)(rx->lastparen) >= nums[i] &&
5119 rx->offs[nums[i]].start != -1 &&
5120 rx->offs[nums[i]].end != -1)
5121 {
5122 parno = nums[i];
5123 break;
5124 }
5125 }
5126 if (parno || flags & RXapif_ALL) {
5127 av_push(av, newSVhek(HeKEY_hek(temphe)));
5128 }
5129 }
5130 }
5131
5132 return newRV_noinc(MUTABLE_SV(av));
5133}
5134
5135void
5136Perl_reg_numbered_buff_fetch(pTHX_ REGEXP * const rx, const I32 paren, SV * const sv)
5137{
5138 char *s = NULL;
5139 I32 i = 0;
5140 I32 s1, t1;
5141
5142 PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_FETCH;
5143
5144 if (!rx->subbeg) {
5145 sv_setsv(sv,&PL_sv_undef);
5146 return;
5147 }
5148 else
5149 if (paren == RX_BUFF_IDX_PREMATCH && rx->offs[0].start != -1) {
5150
5151 i = rx->offs[0].start;
5152 s = rx->subbeg;
5153 }
5154 else
5155 if (paren == RX_BUFF_IDX_POSTMATCH && rx->offs[0].end != -1) {
5156
5157 s = rx->subbeg + rx->offs[0].end;
5158 i = rx->sublen - rx->offs[0].end;
5159 }
5160 else
5161 if ( 0 <= paren && paren <= (I32)rx->nparens &&
5162 (s1 = rx->offs[paren].start) != -1 &&
5163 (t1 = rx->offs[paren].end) != -1)
5164 {
5165
5166 i = t1 - s1;
5167 s = rx->subbeg + s1;
5168 } else {
5169 sv_setsv(sv,&PL_sv_undef);
5170 return;
5171 }
5172 assert(rx->sublen >= (s - rx->subbeg) + i );
5173 if (i >= 0) {
5174 const int oldtainted = PL_tainted;
5175 TAINT_NOT;
5176 sv_setpvn(sv, s, i);
5177 PL_tainted = oldtainted;
5178 if ( (rx->extflags & RXf_CANY_SEEN)
5179 ? (RXp_MATCH_UTF8(rx)
5180 && (!i || is_utf8_string((U8*)s, i)))
5181 : (RXp_MATCH_UTF8(rx)) )
5182 {
5183 SvUTF8_on(sv);
5184 }
5185 else
5186 SvUTF8_off(sv);
5187 if (PL_tainting) {
5188 if (RXp_MATCH_TAINTED(rx)) {
5189 if (SvTYPE(sv) >= SVt_PVMG) {
5190 MAGIC* const mg = SvMAGIC(sv);
5191 MAGIC* mgt;
5192 PL_tainted = 1;
5193 SvMAGIC_set(sv, mg->mg_moremagic);
5194 SvTAINT(sv);
5195 if ((mgt = SvMAGIC(sv))) {
5196 mg->mg_moremagic = mgt;
5197 SvMAGIC_set(sv, mg);
5198 }
5199 } else {
5200 PL_tainted = 1;
5201 SvTAINT(sv);
5202 }
5203 } else
5204 SvTAINTED_off(sv);
5205 }
5206 } else {
5207 sv_setsv(sv,&PL_sv_undef);
5208 return;
5209 }
5210}
5211
5212void
5213Perl_reg_numbered_buff_store(pTHX_ REGEXP * const rx, const I32 paren,
5214 SV const * const value)
5215{
5216 PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_STORE;
5217
5218 PERL_UNUSED_ARG(rx);
5219 PERL_UNUSED_ARG(paren);
5220 PERL_UNUSED_ARG(value);
5221
5222 if (!PL_localizing)
5223 Perl_croak(aTHX_ "%s", PL_no_modify);
5224}
5225
5226I32
5227Perl_reg_numbered_buff_length(pTHX_ REGEXP * const rx, const SV * const sv,
5228 const I32 paren)
5229{
5230 I32 i;
5231 I32 s1, t1;
5232
5233 PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_LENGTH;
5234
5235
5236 switch (paren) {
5237
5238 case RX_BUFF_IDX_PREMATCH:
5239 if (rx->offs[0].start != -1) {
5240 i = rx->offs[0].start;
5241 if (i > 0) {
5242 s1 = 0;
5243 t1 = i;
5244 goto getlen;
5245 }
5246 }
5247 return 0;
5248
5249 case RX_BUFF_IDX_POSTMATCH:
5250 if (rx->offs[0].end != -1) {
5251 i = rx->sublen - rx->offs[0].end;
5252 if (i > 0) {
5253 s1 = rx->offs[0].end;
5254 t1 = rx->sublen;
5255 goto getlen;
5256 }
5257 }
5258 return 0;
5259
5260 default:
5261 if (paren <= (I32)rx->nparens &&
5262 (s1 = rx->offs[paren].start) != -1 &&
5263 (t1 = rx->offs[paren].end) != -1)
5264 {
5265 i = t1 - s1;
5266 goto getlen;
5267 } else {
5268 if (ckWARN(WARN_UNINITIALIZED))
5269 report_uninit((SV *)sv);
5270 return 0;
5271 }
5272 }
5273 getlen:
5274 if (i > 0 && RXp_MATCH_UTF8(rx)) {
5275 const char * const s = rx->subbeg + s1;
5276 const U8 *ep;
5277 STRLEN el;
5278
5279 i = t1 - s1;
5280 if (is_utf8_string_loclen((U8*)s, i, &ep, &el))
5281 i = el;
5282 }
5283 return i;
5284}
5285
5286SV*
5287Perl_reg_qr_package(pTHX_ REGEXP * const rx)
5288{
5289 PERL_ARGS_ASSERT_REG_QR_PACKAGE;
5290 PERL_UNUSED_ARG(rx);
5291 return newSVpvs("Regexp");
5292}
5293
5294
5295
5296
5297
5298
5299
5300
5301
5302#define REG_RSN_RETURN_NULL 0
5303#define REG_RSN_RETURN_NAME 1
5304#define REG_RSN_RETURN_DATA 2
5305
5306STATIC SV*
5307S_reg_scan_name(pTHX_ RExC_state_t *pRExC_state, U32 flags)
5308{
5309 char *name_start = RExC_parse;
5310
5311 PERL_ARGS_ASSERT_REG_SCAN_NAME;
5312
5313 if (isIDFIRST_lazy_if(RExC_parse, UTF)) {
5314
5315 if (UTF)
5316 do {
5317 RExC_parse += UTF8SKIP(RExC_parse);
5318 } while (isALNUM_utf8((U8*)RExC_parse));
5319 else
5320 do {
5321 RExC_parse++;
5322 } while (isALNUM(*RExC_parse));
5323 }
5324
5325 if ( flags ) {
5326 SV* sv_name
5327 = newSVpvn_flags(name_start, (int)(RExC_parse - name_start),
5328 SVs_TEMP | (UTF ? SVf_UTF8 : 0));
5329 if ( flags == REG_RSN_RETURN_NAME)
5330 return sv_name;
5331 else if (flags==REG_RSN_RETURN_DATA) {
5332 HE *he_str = NULL;
5333 SV *sv_dat = NULL;
5334 if ( ! sv_name )
5335 Perl_croak(aTHX_ "panic: no svname in reg_scan_name");
5336 if (RExC_paren_names)
5337 he_str = hv_fetch_ent( RExC_paren_names, sv_name, 0, 0 );
5338 if ( he_str )
5339 sv_dat = HeVAL(he_str);
5340 if ( ! sv_dat )
5341 vFAIL("Reference to nonexistent named group");
5342 return sv_dat;
5343 }
5344 else {
5345 Perl_croak(aTHX_ "panic: bad flag in reg_scan_name");
5346 }
5347
5348 }
5349 return NULL;
5350}
5351
5352#define DEBUG_PARSE_MSG(funcname) DEBUG_PARSE_r({ \
5353 int rem=(int)(RExC_end - RExC_parse); \
5354 int cut; \
5355 int num; \
5356 int iscut=0; \
5357 if (rem>10) { \
5358 rem=10; \
5359 iscut=1; \
5360 } \
5361 cut=10-rem; \
5362 if (RExC_lastparse!=RExC_parse) \
5363 PerlIO_printf(Perl_debug_log," >%.*s%-*s", \
5364 rem, RExC_parse, \
5365 cut + 4, \
5366 iscut ? "..." : "<" \
5367 ); \
5368 else \
5369 PerlIO_printf(Perl_debug_log,"%16s",""); \
5370 \
5371 if (SIZE_ONLY) \
5372 num = RExC_size + 1; \
5373 else \
5374 num=REG_NODE_NUM(RExC_emit); \
5375 if (RExC_lastnum!=num) \
5376 PerlIO_printf(Perl_debug_log,"|%4d",num); \
5377 else \
5378 PerlIO_printf(Perl_debug_log,"|%4s",""); \
5379 PerlIO_printf(Perl_debug_log,"|%*s%-4s", \
5380 (int)((depth*2)), "", \
5381 (funcname) \
5382 ); \
5383 RExC_lastnum=num; \
5384 RExC_lastparse=RExC_parse; \
5385})
5386
5387
5388
5389#define DEBUG_PARSE(funcname) DEBUG_PARSE_r({ \
5390 DEBUG_PARSE_MSG((funcname)); \
5391 PerlIO_printf(Perl_debug_log,"%4s","\n"); \
5392})
5393#define DEBUG_PARSE_FMT(funcname,fmt,args) DEBUG_PARSE_r({ \
5394 DEBUG_PARSE_MSG((funcname)); \
5395 PerlIO_printf(Perl_debug_log,fmt "\n",args); \
5396})
5397
5398
5399
5400
5401
5402
5403
5404
5405
5406#define REGTAIL(x,y,z) regtail((x),(y),(z),depth+1)
5407#ifdef DEBUGGING
5408#define REGTAIL_STUDY(x,y,z) regtail_study((x),(y),(z),depth+1)
5409#else
5410#define REGTAIL_STUDY(x,y,z) regtail((x),(y),(z),depth+1)
5411#endif
5412
5413STATIC regnode *
5414S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
5415
5416{
5417 dVAR;
5418 register regnode *ret;
5419 register regnode *br;
5420 register regnode *lastbr;
5421 register regnode *ender = NULL;
5422 register I32 parno = 0;
5423 I32 flags;
5424 U32 oregflags = RExC_flags;
5425 bool have_branch = 0;
5426 bool is_open = 0;
5427 I32 freeze_paren = 0;
5428 I32 after_freeze = 0;
5429
5430
5431
5432
5433#define WASTED_O 0x01
5434#define WASTED_G 0x02
5435#define WASTED_C 0x04