perl/regcomp.c
<<
>>
Prefs
   1/*    regcomp.c
   2 */
   3
   4/*
   5 * 'A fair jaw-cracker dwarf-language must be.'            --Samwise Gamgee
   6 *
   7 *     [p.285 of _The Lord of the Rings_, II/iii: "The Ring Goes South"]
   8 */
   9
  10/* This file contains functions for compiling a regular expression.  See
  11 * also regexec.c which funnily enough, contains functions for executing
  12 * a regular expression.
  13 *
  14 * This file is also copied at build time to ext/re/re_comp.c, where
  15 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
  16 * This causes the main functions to be compiled under new names and with
  17 * debugging support added, which makes "use re 'debug'" work.
  18 */
  19
  20/* NOTE: this is derived from Henry Spencer's regexp code, and should not
  21 * confused with the original package (see point 3 below).  Thanks, Henry!
  22 */
  23
  24/* Additional note: this code is very heavily munged from Henry's version
  25 * in places.  In some spots I've traded clarity for efficiency, so don't
  26 * blame Henry for some of the lack of readability.
  27 */
  28
  29/* The names of the functions have been changed from regcomp and
  30 * regexec to  pregcomp and pregexec in order to avoid conflicts
  31 * with the POSIX routines of the same names.
  32*/
  33
  34#ifdef PERL_EXT_RE_BUILD
  35#include "re_top.h"
  36#endif
  37
  38/*
  39 * pregcomp and pregexec -- regsub and regerror are not used in perl
  40 *
  41 *      Copyright (c) 1986 by University of Toronto.
  42 *      Written by Henry Spencer.  Not derived from licensed software.
  43 *
  44 *      Permission is granted to anyone to use this software for any
  45 *      purpose on any computer system, and to redistribute it freely,
  46 *      subject to the following restrictions:
  47 *
  48 *      1. The author is not responsible for the consequences of use of
  49 *              this software, no matter how awful, even if they arise
  50 *              from defects in it.
  51 *
  52 *      2. The origin of this software must not be misrepresented, either
  53 *              by explicit claim or by omission.
  54 *
  55 *      3. Altered versions must be plainly marked as such, and must not
  56 *              be misrepresented as being the original software.
  57 *
  58 *
  59 ****    Alterations to Henry's code are...
  60 ****
  61 ****    Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
  62 ****    2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
  63 ****    by Larry Wall and others
  64 ****
  65 ****    You may distribute under the terms of either the GNU General Public
  66 ****    License or the Artistic License, as specified in the README file.
  67
  68 *
  69 * Beware that some of this code is subtly aware of the way operator
  70 * precedence is structured in regular expressions.  Serious changes in
  71 * regular-expression syntax might require a total rethink.
  72 */
  73#include "EXTERN.h"
  74#define PERL_IN_REGCOMP_C
  75#include "perl.h"
  76
  77#ifndef PERL_IN_XSUB_RE
  78#  include "INTERN.h"
  79#endif
  80
  81#define REG_COMP_C
  82#ifdef PERL_IN_XSUB_RE
  83#  include "re_comp.h"
  84#else
  85#  include "regcomp.h"
  86#endif
  87
  88#ifdef op
  89#undef op
  90#endif /* op */
  91
  92#ifdef MSDOS
  93#  if defined(BUGGY_MSC6)
  94 /* MSC 6.00A breaks on op/regexp.t test 85 unless we turn this off */
  95#    pragma optimize("a",off)
  96 /* But MSC 6.00A is happy with 'w', for aliases only across function calls*/
  97#    pragma optimize("w",on )
  98#  endif /* BUGGY_MSC6 */
  99#endif /* MSDOS */
 100
 101#ifndef STATIC
 102#define STATIC  static
 103#endif
 104
 105typedef struct RExC_state_t {
 106    U32         flags;                  /* are we folding, multilining? */
 107    char        *precomp;               /* uncompiled string. */
 108    regexp      *rx;                    /* perl core regexp structure */
 109    regexp_internal     *rxi;           /* internal data for regexp object pprivate field */        
 110    char        *start;                 /* Start of input for compile */
 111    char        *end;                   /* End of input for compile */
 112    char        *parse;                 /* Input-scan pointer. */
 113    I32         whilem_seen;            /* number of WHILEM in this expr */
 114    regnode     *emit_start;            /* Start of emitted-code area */
 115    regnode     *emit_bound;            /* First regnode outside of the allocated space */
 116    regnode     *emit;                  /* Code-emit pointer; &regdummy = don't = compiling */
 117    I32         naughty;                /* How bad is this pattern? */
 118    I32         sawback;                /* Did we see \1, ...? */
 119    U32         seen;
 120    I32         size;                   /* Code size. */
 121    I32         npar;                   /* Capture buffer count, (OPEN). */
 122    I32         cpar;                   /* Capture buffer count, (CLOSE). */
 123    I32         nestroot;               /* root parens we are in - used by accept */
 124    I32         extralen;
 125    I32         seen_zerolen;
 126    I32         seen_evals;
 127    regnode     **open_parens;          /* pointers to open parens */
 128    regnode     **close_parens;         /* pointers to close parens */
 129    regnode     *opend;                 /* END node in program */
 130    I32         utf8;           /* whether the pattern is utf8 or not */
 131    I32         orig_utf8;      /* whether the pattern was originally in utf8 */
 132                                /* XXX use this for future optimisation of case
 133                                 * where pattern must be upgraded to utf8. */
 134    HV          *charnames;             /* cache of named sequences */
 135    HV          *paren_names;           /* Paren names */
 136    
 137    regnode     **recurse;              /* Recurse regops */
 138    I32         recurse_count;          /* Number of recurse regops */
 139#if ADD_TO_REGEXEC
 140    char        *starttry;              /* -Dr: where regtry was called. */
 141#define RExC_starttry   (pRExC_state->starttry)
 142#endif
 143#ifdef DEBUGGING
 144    const char  *lastparse;
 145    I32         lastnum;
 146    AV          *paren_name_list;       /* idx -> name */
 147#define RExC_lastparse  (pRExC_state->lastparse)
 148#define RExC_lastnum    (pRExC_state->lastnum)
 149#define RExC_paren_name_list    (pRExC_state->paren_name_list)
 150#endif
 151} RExC_state_t;
 152
 153#define RExC_flags      (pRExC_state->flags)
 154#define RExC_precomp    (pRExC_state->precomp)
 155#define RExC_rx         (pRExC_state->rx)
 156#define RExC_rxi        (pRExC_state->rxi)
 157#define RExC_start      (pRExC_state->start)
 158#define RExC_end        (pRExC_state->end)
 159#define RExC_parse      (pRExC_state->parse)
 160#define RExC_whilem_seen        (pRExC_state->whilem_seen)
 161#ifdef RE_TRACK_PATTERN_OFFSETS
 162#define RExC_offsets    (pRExC_state->rxi->u.offsets) /* I am not like the others */
 163#endif
 164#define RExC_emit       (pRExC_state->emit)
 165#define RExC_emit_start (pRExC_state->emit_start)
 166#define RExC_emit_bound (pRExC_state->emit_bound)
 167#define RExC_naughty    (pRExC_state->naughty)
 168#define RExC_sawback    (pRExC_state->sawback)
 169#define RExC_seen       (pRExC_state->seen)
 170#define RExC_size       (pRExC_state->size)
 171#define RExC_npar       (pRExC_state->npar)
 172#define RExC_nestroot   (pRExC_state->nestroot)
 173#define RExC_extralen   (pRExC_state->extralen)
 174#define RExC_seen_zerolen       (pRExC_state->seen_zerolen)
 175#define RExC_seen_evals (pRExC_state->seen_evals)
 176#define RExC_utf8       (pRExC_state->utf8)
 177#define RExC_orig_utf8  (pRExC_state->orig_utf8)
 178#define RExC_charnames  (pRExC_state->charnames)
 179#define RExC_open_parens        (pRExC_state->open_parens)
 180#define RExC_close_parens       (pRExC_state->close_parens)
 181#define RExC_opend      (pRExC_state->opend)
 182#define RExC_paren_names        (pRExC_state->paren_names)
 183#define RExC_recurse    (pRExC_state->recurse)
 184#define RExC_recurse_count      (pRExC_state->recurse_count)
 185
 186
 187#define ISMULT1(c)      ((c) == '*' || (c) == '+' || (c) == '?')
 188#define ISMULT2(s)      ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
 189        ((*s) == '{' && regcurly(s)))
 190
 191#ifdef SPSTART
 192#undef SPSTART          /* dratted cpp namespace... */
 193#endif
 194/*
 195 * Flags to be passed up and down.
 196 */
 197#define WORST           0       /* Worst case. */
 198#define HASWIDTH        0x01    /* Known to match non-null strings. */
 199#define SIMPLE          0x02    /* Simple enough to be STAR/PLUS operand. */
 200#define SPSTART         0x04    /* Starts with * or +. */
 201#define TRYAGAIN        0x08    /* Weeded out a declaration. */
 202#define POSTPONED       0x10    /* (?1),(?&name), (??{...}) or similar */
 203
 204#define REG_NODE_NUM(x) ((x) ? (int)((x)-RExC_emit_start) : -1)
 205
 206/* whether trie related optimizations are enabled */
 207#if PERL_ENABLE_EXTENDED_TRIE_OPTIMISATION
 208#define TRIE_STUDY_OPT
 209#define FULL_TRIE_STUDY
 210#define TRIE_STCLASS
 211#endif
 212
 213
 214
 215#define PBYTE(u8str,paren) ((U8*)(u8str))[(paren) >> 3]
 216#define PBITVAL(paren) (1 << ((paren) & 7))
 217#define PAREN_TEST(u8str,paren) ( PBYTE(u8str,paren) & PBITVAL(paren))
 218#define PAREN_SET(u8str,paren) PBYTE(u8str,paren) |= PBITVAL(paren)
 219#define PAREN_UNSET(u8str,paren) PBYTE(u8str,paren) &= (~PBITVAL(paren))
 220
 221
 222/* About scan_data_t.
 223
 224  During optimisation we recurse through the regexp program performing
 225  various inplace (keyhole style) optimisations. In addition study_chunk
 226  and scan_commit populate this data structure with information about
 227  what strings MUST appear in the pattern. We look for the longest 
 228  string that must appear for at a fixed location, and we look for the
 229  longest string that may appear at a floating location. So for instance
 230  in the pattern:
 231  
 232    /FOO[xX]A.*B[xX]BAR/
 233    
 234  Both 'FOO' and 'A' are fixed strings. Both 'B' and 'BAR' are floating
 235  strings (because they follow a .* construct). study_chunk will identify
 236  both FOO and BAR as being the longest fixed and floating strings respectively.
 237  
 238  The strings can be composites, for instance
 239  
 240     /(f)(o)(o)/
 241     
 242  will result in a composite fixed substring 'foo'.
 243  
 244  For each string some basic information is maintained:
 245  
 246  - offset or min_offset
 247    This is the position the string must appear at, or not before.
 248    It also implicitly (when combined with minlenp) tells us how many
 249    character must match before the string we are searching.
 250    Likewise when combined with minlenp and the length of the string
 251    tells us how many characters must appear after the string we have 
 252    found.
 253  
 254  - max_offset
 255    Only used for floating strings. This is the rightmost point that
 256    the string can appear at. Ifset to I32 max it indicates that the
 257    string can occur infinitely far to the right.
 258  
 259  - minlenp
 260    A pointer to the minimum length of the pattern that the string 
 261    was found inside. This is important as in the case of positive 
 262    lookahead or positive lookbehind we can have multiple patterns 
 263    involved. Consider
 264    
 265    /(?=FOO).*F/
 266    
 267    The minimum length of the pattern overall is 3, the minimum length
 268    of the lookahead part is 3, but the minimum length of the part that
 269    will actually match is 1. So 'FOO's minimum length is 3, but the 
 270    minimum length for the F is 1. This is important as the minimum length
 271    is used to determine offsets in front of and behind the string being 
 272    looked for.  Since strings can be composites this is the length of the
 273    pattern at the time it was commited with a scan_commit. Note that
 274    the length is calculated by study_chunk, so that the minimum lengths
 275    are not known until the full pattern has been compiled, thus the 
 276    pointer to the value.
 277  
 278  - lookbehind
 279  
 280    In the case of lookbehind the string being searched for can be
 281    offset past the start point of the final matching string. 
 282    If this value was just blithely removed from the min_offset it would
 283    invalidate some of the calculations for how many chars must match
 284    before or after (as they are derived from min_offset and minlen and
 285    the length of the string being searched for). 
 286    When the final pattern is compiled and the data is moved from the
 287    scan_data_t structure into the regexp structure the information
 288    about lookbehind is factored in, with the information that would 
 289    have been lost precalculated in the end_shift field for the 
 290    associated string.
 291
 292  The fields pos_min and pos_delta are used to store the minimum offset
 293  and the delta to the maximum offset at the current point in the pattern.    
 294
 295*/
 296
 297typedef struct scan_data_t {
 298    /*I32 len_min;      unused */
 299    /*I32 len_delta;    unused */
 300    I32 pos_min;
 301    I32 pos_delta;
 302    SV *last_found;
 303    I32 last_end;           /* min value, <0 unless valid. */
 304    I32 last_start_min;
 305    I32 last_start_max;
 306    SV **longest;           /* Either &l_fixed, or &l_float. */
 307    SV *longest_fixed;      /* longest fixed string found in pattern */
 308    I32 offset_fixed;       /* offset where it starts */
 309    I32 *minlen_fixed;      /* pointer to the minlen relevent to the string */
 310    I32 lookbehind_fixed;   /* is the position of the string modfied by LB */
 311    SV *longest_float;      /* longest floating string found in pattern */
 312    I32 offset_float_min;   /* earliest point in string it can appear */
 313    I32 offset_float_max;   /* latest point in string it can appear */
 314    I32 *minlen_float;      /* pointer to the minlen relevent to the string */
 315    I32 lookbehind_float;   /* is the position of the string modified by LB */
 316    I32 flags;
 317    I32 whilem_c;
 318    I32 *last_closep;
 319    struct regnode_charclass_class *start_class;
 320} scan_data_t;
 321
 322/*
 323 * Forward declarations for pregcomp()'s friends.
 324 */
 325
 326static const scan_data_t zero_scan_data =
 327  { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ,0};
 328
 329#define SF_BEFORE_EOL           (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
 330#define SF_BEFORE_SEOL          0x0001
 331#define SF_BEFORE_MEOL          0x0002
 332#define SF_FIX_BEFORE_EOL       (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL)
 333#define SF_FL_BEFORE_EOL        (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL)
 334
 335#ifdef NO_UNARY_PLUS
 336#  define SF_FIX_SHIFT_EOL      (0+2)
 337#  define SF_FL_SHIFT_EOL               (0+4)
 338#else
 339#  define SF_FIX_SHIFT_EOL      (+2)
 340#  define SF_FL_SHIFT_EOL               (+4)
 341#endif
 342
 343#define SF_FIX_BEFORE_SEOL      (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL)
 344#define SF_FIX_BEFORE_MEOL      (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL)
 345
 346#define SF_FL_BEFORE_SEOL       (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL)
 347#define SF_FL_BEFORE_MEOL       (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */
 348#define SF_IS_INF               0x0040
 349#define SF_HAS_PAR              0x0080
 350#define SF_IN_PAR               0x0100
 351#define SF_HAS_EVAL             0x0200
 352#define SCF_DO_SUBSTR           0x0400
 353#define SCF_DO_STCLASS_AND      0x0800
 354#define SCF_DO_STCLASS_OR       0x1000
 355#define SCF_DO_STCLASS          (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
 356#define SCF_WHILEM_VISITED_POS  0x2000
 357
 358#define SCF_TRIE_RESTUDY        0x4000 /* Do restudy? */
 359#define SCF_SEEN_ACCEPT         0x8000 
 360
 361#define UTF (RExC_utf8 != 0)
 362#define LOC ((RExC_flags & RXf_PMf_LOCALE) != 0)
 363#define FOLD ((RExC_flags & RXf_PMf_FOLD) != 0)
 364
 365#define OOB_UNICODE             12345678
 366#define OOB_NAMEDCLASS          -1
 367
 368#define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
 369#define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
 370
 371
 372/* length of regex to show in messages that don't mark a position within */
 373#define RegexLengthToShowInErrorMessages 127
 374
 375/*
 376 * If MARKER[12] are adjusted, be sure to adjust the constants at the top
 377 * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
 378 * op/pragma/warn/regcomp.
 379 */
 380#define MARKER1 "<-- HERE"    /* marker as it appears in the description */
 381#define MARKER2 " <-- HERE "  /* marker as it appears within the regex */
 382
 383#define REPORT_LOCATION " in regex; marked by " MARKER1 " in m/%.*s" MARKER2 "%s/"
 384
 385/*
 386 * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
 387 * arg. Show regex, up to a maximum length. If it's too long, chop and add
 388 * "...".
 389 */
 390#define _FAIL(code) STMT_START {                                        \
 391    const char *ellipses = "";                                          \
 392    IV len = RExC_end - RExC_precomp;                                   \
 393                                                                        \
 394    if (!SIZE_ONLY)                                                     \
 395        SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx);                      \
 396    if (len > RegexLengthToShowInErrorMessages) {                       \
 397        /* chop 10 shorter than the max, to ensure meaning of "..." */  \
 398        len = RegexLengthToShowInErrorMessages - 10;                    \
 399        ellipses = "...";                                               \
 400    }                                                                   \
 401    code;                                                               \
 402} STMT_END
 403
 404#define FAIL(msg) _FAIL(                            \
 405    Perl_croak(aTHX_ "%s in regex m/%.*s%s/",       \
 406            msg, (int)len, RExC_precomp, ellipses))
 407
 408#define FAIL2(msg,arg) _FAIL(                       \
 409    Perl_croak(aTHX_ msg " in regex m/%.*s%s/",     \
 410            arg, (int)len, RExC_precomp, ellipses))
 411
 412/*
 413 * Simple_vFAIL -- like FAIL, but marks the current location in the scan
 414 */
 415#define Simple_vFAIL(m) STMT_START {                                    \
 416    const IV offset = RExC_parse - RExC_precomp;                        \
 417    Perl_croak(aTHX_ "%s" REPORT_LOCATION,                              \
 418            m, (int)offset, RExC_precomp, RExC_precomp + offset);       \
 419} STMT_END
 420
 421/*
 422 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
 423 */
 424#define vFAIL(m) STMT_START {                           \
 425    if (!SIZE_ONLY)                                     \
 426        SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx);      \
 427    Simple_vFAIL(m);                                    \
 428} STMT_END
 429
 430/*
 431 * Like Simple_vFAIL(), but accepts two arguments.
 432 */
 433#define Simple_vFAIL2(m,a1) STMT_START {                        \
 434    const IV offset = RExC_parse - RExC_precomp;                        \
 435    S_re_croak2(aTHX_ m, REPORT_LOCATION, a1,                   \
 436            (int)offset, RExC_precomp, RExC_precomp + offset);  \
 437} STMT_END
 438
 439/*
 440 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
 441 */
 442#define vFAIL2(m,a1) STMT_START {                       \
 443    if (!SIZE_ONLY)                                     \
 444        SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx);      \
 445    Simple_vFAIL2(m, a1);                               \
 446} STMT_END
 447
 448
 449/*
 450 * Like Simple_vFAIL(), but accepts three arguments.
 451 */
 452#define Simple_vFAIL3(m, a1, a2) STMT_START {                   \
 453    const IV offset = RExC_parse - RExC_precomp;                \
 454    S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2,               \
 455            (int)offset, RExC_precomp, RExC_precomp + offset);  \
 456} STMT_END
 457
 458/*
 459 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
 460 */
 461#define vFAIL3(m,a1,a2) STMT_START {                    \
 462    if (!SIZE_ONLY)                                     \
 463        SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx);      \
 464    Simple_vFAIL3(m, a1, a2);                           \
 465} STMT_END
 466
 467/*
 468 * Like Simple_vFAIL(), but accepts four arguments.
 469 */
 470#define Simple_vFAIL4(m, a1, a2, a3) STMT_START {               \
 471    const IV offset = RExC_parse - RExC_precomp;                \
 472    S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3,           \
 473            (int)offset, RExC_precomp, RExC_precomp + offset);  \
 474} STMT_END
 475
 476#define vWARN(loc,m) STMT_START {                                       \
 477    const IV offset = loc - RExC_precomp;                               \
 478    Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s" REPORT_LOCATION,      \
 479            m, (int)offset, RExC_precomp, RExC_precomp + offset);       \
 480} STMT_END
 481
 482#define vWARNdep(loc,m) STMT_START {                                    \
 483    const IV offset = loc - RExC_precomp;                               \
 484    Perl_warner(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP),          \
 485            "%s" REPORT_LOCATION,                                       \
 486            m, (int)offset, RExC_precomp, RExC_precomp + offset);       \
 487} STMT_END
 488
 489
 490#define vWARN2(loc, m, a1) STMT_START {                                 \
 491    const IV offset = loc - RExC_precomp;                               \
 492    Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,         \
 493            a1, (int)offset, RExC_precomp, RExC_precomp + offset);      \
 494} STMT_END
 495
 496#define vWARN3(loc, m, a1, a2) STMT_START {                             \
 497    const IV offset = loc - RExC_precomp;                               \
 498    Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,         \
 499            a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset);  \
 500} STMT_END
 501
 502#define vWARN4(loc, m, a1, a2, a3) STMT_START {                         \
 503    const IV offset = loc - RExC_precomp;                               \
 504    Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,         \
 505            a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
 506} STMT_END
 507
 508#define vWARN5(loc, m, a1, a2, a3, a4) STMT_START {                     \
 509    const IV offset = loc - RExC_precomp;                               \
 510    Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,         \
 511            a1, a2, a3, a4, (int)offset, RExC_precomp, RExC_precomp + offset); \
 512} STMT_END
 513
 514
 515/* Allow for side effects in s */
 516#define REGC(c,s) STMT_START {                  \
 517    if (!SIZE_ONLY) *(s) = (c); else (void)(s); \
 518} STMT_END
 519
 520/* Macros for recording node offsets.   20001227 mjd@plover.com 
 521 * Nodes are numbered 1, 2, 3, 4.  Node #n's position is recorded in
 522 * element 2*n-1 of the array.  Element #2n holds the byte length node #n.
 523 * Element 0 holds the number n.
 524 * Position is 1 indexed.
 525 */
 526#ifndef RE_TRACK_PATTERN_OFFSETS
 527#define Set_Node_Offset_To_R(node,byte)
 528#define Set_Node_Offset(node,byte)
 529#define Set_Cur_Node_Offset
 530#define Set_Node_Length_To_R(node,len)
 531#define Set_Node_Length(node,len)
 532#define Set_Node_Cur_Length(node)
 533#define Node_Offset(n) 
 534#define Node_Length(n) 
 535#define Set_Node_Offset_Length(node,offset,len)
 536#define ProgLen(ri) ri->u.proglen
 537#define SetProgLen(ri,x) ri->u.proglen = x
 538#else
 539#define ProgLen(ri) ri->u.offsets[0]
 540#define SetProgLen(ri,x) ri->u.offsets[0] = x
 541#define Set_Node_Offset_To_R(node,byte) STMT_START {                    \
 542    if (! SIZE_ONLY) {                                                  \
 543        MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n",         \
 544                    __LINE__, (int)(node), (int)(byte)));               \
 545        if((node) < 0) {                                                \
 546            Perl_croak(aTHX_ "value of node is %d in Offset macro", (int)(node)); \
 547        } else {                                                        \
 548            RExC_offsets[2*(node)-1] = (byte);                          \
 549        }                                                               \
 550    }                                                                   \
 551} STMT_END
 552
 553#define Set_Node_Offset(node,byte) \
 554    Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start)
 555#define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
 556
 557#define Set_Node_Length_To_R(node,len) STMT_START {                     \
 558    if (! SIZE_ONLY) {                                                  \
 559        MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n",           \
 560                __LINE__, (int)(node), (int)(len)));                    \
 561        if((node) < 0) {                                                \
 562            Perl_croak(aTHX_ "value of node is %d in Length macro", (int)(node)); \
 563        } else {                                                        \
 564            RExC_offsets[2*(node)] = (len);                             \
 565        }                                                               \
 566    }                                                                   \
 567} STMT_END
 568
 569#define Set_Node_Length(node,len) \
 570    Set_Node_Length_To_R((node)-RExC_emit_start, len)
 571#define Set_Cur_Node_Length(len) Set_Node_Length(RExC_emit, len)
 572#define Set_Node_Cur_Length(node) \
 573    Set_Node_Length(node, RExC_parse - parse_start)
 574
 575/* Get offsets and lengths */
 576#define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1])
 577#define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)])
 578
 579#define Set_Node_Offset_Length(node,offset,len) STMT_START {    \
 580    Set_Node_Offset_To_R((node)-RExC_emit_start, (offset));     \
 581    Set_Node_Length_To_R((node)-RExC_emit_start, (len));        \
 582} STMT_END
 583#endif
 584
 585#if PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS
 586#define EXPERIMENTAL_INPLACESCAN
 587#endif /*RE_TRACK_PATTERN_OFFSETS*/
 588
 589#define DEBUG_STUDYDATA(str,data,depth)                              \
 590DEBUG_OPTIMISE_MORE_r(if(data){                                      \
 591    PerlIO_printf(Perl_debug_log,                                    \
 592        "%*s" str "Pos:%"IVdf"/%"IVdf                                \
 593        " Flags: 0x%"UVXf" Whilem_c: %"IVdf" Lcp: %"IVdf" %s",       \
 594        (int)(depth)*2, "",                                          \
 595        (IV)((data)->pos_min),                                       \
 596        (IV)((data)->pos_delta),                                     \
 597        (UV)((data)->flags),                                         \
 598        (IV)((data)->whilem_c),                                      \
 599        (IV)((data)->last_closep ? *((data)->last_closep) : -1),     \
 600        is_inf ? "INF " : ""                                         \
 601    );                                                               \
 602    if ((data)->last_found)                                          \
 603        PerlIO_printf(Perl_debug_log,                                \
 604            "Last:'%s' %"IVdf":%"IVdf"/%"IVdf" %sFixed:'%s' @ %"IVdf \
 605            " %sFloat: '%s' @ %"IVdf"/%"IVdf"",                      \
 606            SvPVX_const((data)->last_found),                         \
 607            (IV)((data)->last_end),                                  \
 608            (IV)((data)->last_start_min),                            \
 609            (IV)((data)->last_start_max),                            \
 610            ((data)->longest &&                                      \
 611             (data)->longest==&((data)->longest_fixed)) ? "*" : "",  \
 612            SvPVX_const((data)->longest_fixed),                      \
 613            (IV)((data)->offset_fixed),                              \
 614            ((data)->longest &&                                      \
 615             (data)->longest==&((data)->longest_float)) ? "*" : "",  \
 616            SvPVX_const((data)->longest_float),                      \
 617            (IV)((data)->offset_float_min),                          \
 618            (IV)((data)->offset_float_max)                           \
 619        );                                                           \
 620    PerlIO_printf(Perl_debug_log,"\n");                              \
 621});
 622
 623static void clear_re(pTHX_ void *r);
 624
 625/* Mark that we cannot extend a found fixed substring at this point.
 626   Update the longest found anchored substring and the longest found
 627   floating substrings if needed. */
 628
 629STATIC void
 630S_scan_commit(pTHX_ const RExC_state_t *pRExC_state, scan_data_t *data, I32 *minlenp, int is_inf)
 631{
 632    const STRLEN l = CHR_SVLEN(data->last_found);
 633    const STRLEN old_l = CHR_SVLEN(*data->longest);
 634    GET_RE_DEBUG_FLAGS_DECL;
 635
 636    PERL_ARGS_ASSERT_SCAN_COMMIT;
 637
 638    if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
 639        SvSetMagicSV(*data->longest, data->last_found);
 640        if (*data->longest == data->longest_fixed) {
 641            data->offset_fixed = l ? data->last_start_min : data->pos_min;
 642            if (data->flags & SF_BEFORE_EOL)
 643                data->flags
 644                    |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL);
 645            else
 646                data->flags &= ~SF_FIX_BEFORE_EOL;
 647            data->minlen_fixed=minlenp; 
 648            data->lookbehind_fixed=0;
 649        }
 650        else { /* *data->longest == data->longest_float */
 651            data->offset_float_min = l ? data->last_start_min : data->pos_min;
 652            data->offset_float_max = (l
 653                                      ? data->last_start_max
 654                                      : data->pos_min + data->pos_delta);
 655            if (is_inf || (U32)data->offset_float_max > (U32)I32_MAX)
 656                data->offset_float_max = I32_MAX;
 657            if (data->flags & SF_BEFORE_EOL)
 658                data->flags
 659                    |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
 660            else
 661                data->flags &= ~SF_FL_BEFORE_EOL;
 662            data->minlen_float=minlenp;
 663            data->lookbehind_float=0;
 664        }
 665    }
 666    SvCUR_set(data->last_found, 0);
 667    {
 668        SV * const sv = data->last_found;
 669        if (SvUTF8(sv) && SvMAGICAL(sv)) {
 670            MAGIC * const mg = mg_find(sv, PERL_MAGIC_utf8);
 671            if (mg)
 672                mg->mg_len = 0;
 673        }
 674    }
 675    data->last_end = -1;
 676    data->flags &= ~SF_BEFORE_EOL;
 677    DEBUG_STUDYDATA("commit: ",data,0);
 678}
 679
 680/* Can match anything (initialization) */
 681STATIC void
 682S_cl_anything(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
 683{
 684    PERL_ARGS_ASSERT_CL_ANYTHING;
 685
 686    ANYOF_CLASS_ZERO(cl);
 687    ANYOF_BITMAP_SETALL(cl);
 688    cl->flags = ANYOF_EOS|ANYOF_UNICODE_ALL;
 689    if (LOC)
 690        cl->flags |= ANYOF_LOCALE;
 691}
 692
 693/* Can match anything (initialization) */
 694STATIC int
 695S_cl_is_anything(const struct regnode_charclass_class *cl)
 696{
 697    int value;
 698
 699    PERL_ARGS_ASSERT_CL_IS_ANYTHING;
 700
 701    for (value = 0; value <= ANYOF_MAX; value += 2)
 702        if (ANYOF_CLASS_TEST(cl, value) && ANYOF_CLASS_TEST(cl, value + 1))
 703            return 1;
 704    if (!(cl->flags & ANYOF_UNICODE_ALL))
 705        return 0;
 706    if (!ANYOF_BITMAP_TESTALLSET((const void*)cl))
 707        return 0;
 708    return 1;
 709}
 710
 711/* Can match anything (initialization) */
 712STATIC void
 713S_cl_init(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
 714{
 715    PERL_ARGS_ASSERT_CL_INIT;
 716
 717    Zero(cl, 1, struct regnode_charclass_class);
 718    cl->type = ANYOF;
 719    cl_anything(pRExC_state, cl);
 720}
 721
 722STATIC void
 723S_cl_init_zero(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
 724{
 725    PERL_ARGS_ASSERT_CL_INIT_ZERO;
 726
 727    Zero(cl, 1, struct regnode_charclass_class);
 728    cl->type = ANYOF;
 729    cl_anything(pRExC_state, cl);
 730    if (LOC)
 731        cl->flags |= ANYOF_LOCALE;
 732}
 733
 734/* 'And' a given class with another one.  Can create false positives */
 735/* We assume that cl is not inverted */
 736STATIC void
 737S_cl_and(struct regnode_charclass_class *cl,
 738        const struct regnode_charclass_class *and_with)
 739{
 740    PERL_ARGS_ASSERT_CL_AND;
 741
 742    assert(and_with->type == ANYOF);
 743    if (!(and_with->flags & ANYOF_CLASS)
 744        && !(cl->flags & ANYOF_CLASS)
 745        && (and_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
 746        && !(and_with->flags & ANYOF_FOLD)
 747        && !(cl->flags & ANYOF_FOLD)) {
 748        int i;
 749
 750        if (and_with->flags & ANYOF_INVERT)
 751            for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
 752                cl->bitmap[i] &= ~and_with->bitmap[i];
 753        else
 754            for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
 755                cl->bitmap[i] &= and_with->bitmap[i];
 756    } /* XXXX: logic is complicated otherwise, leave it along for a moment. */
 757    if (!(and_with->flags & ANYOF_EOS))
 758        cl->flags &= ~ANYOF_EOS;
 759
 760    if (cl->flags & ANYOF_UNICODE_ALL && and_with->flags & ANYOF_UNICODE &&
 761        !(and_with->flags & ANYOF_INVERT)) {
 762        cl->flags &= ~ANYOF_UNICODE_ALL;
 763        cl->flags |= ANYOF_UNICODE;
 764        ARG_SET(cl, ARG(and_with));
 765    }
 766    if (!(and_with->flags & ANYOF_UNICODE_ALL) &&
 767        !(and_with->flags & ANYOF_INVERT))
 768        cl->flags &= ~ANYOF_UNICODE_ALL;
 769    if (!(and_with->flags & (ANYOF_UNICODE|ANYOF_UNICODE_ALL)) &&
 770        !(and_with->flags & ANYOF_INVERT))
 771        cl->flags &= ~ANYOF_UNICODE;
 772}
 773
 774/* 'OR' a given class with another one.  Can create false positives */
 775/* We assume that cl is not inverted */
 776STATIC void
 777S_cl_or(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl, const struct regnode_charclass_class *or_with)
 778{
 779    PERL_ARGS_ASSERT_CL_OR;
 780
 781    if (or_with->flags & ANYOF_INVERT) {
 782        /* We do not use
 783         * (B1 | CL1) | (!B2 & !CL2) = (B1 | !B2 & !CL2) | (CL1 | (!B2 & !CL2))
 784         *   <= (B1 | !B2) | (CL1 | !CL2)
 785         * which is wasteful if CL2 is small, but we ignore CL2:
 786         *   (B1 | CL1) | (!B2 & !CL2) <= (B1 | CL1) | !B2 = (B1 | !B2) | CL1
 787         * XXXX Can we handle case-fold?  Unclear:
 788         *   (OK1(i) | OK1(i')) | !(OK1(i) | OK1(i')) =
 789         *   (OK1(i) | OK1(i')) | (!OK1(i) & !OK1(i'))
 790         */
 791        if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
 792             && !(or_with->flags & ANYOF_FOLD)
 793             && !(cl->flags & ANYOF_FOLD) ) {
 794            int i;
 795
 796            for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
 797                cl->bitmap[i] |= ~or_with->bitmap[i];
 798        } /* XXXX: logic is complicated otherwise */
 799        else {
 800            cl_anything(pRExC_state, cl);
 801        }
 802    } else {
 803        /* (B1 | CL1) | (B2 | CL2) = (B1 | B2) | (CL1 | CL2)) */
 804        if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
 805             && (!(or_with->flags & ANYOF_FOLD)
 806                 || (cl->flags & ANYOF_FOLD)) ) {
 807            int i;
 808
 809            /* OR char bitmap and class bitmap separately */
 810            for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
 811                cl->bitmap[i] |= or_with->bitmap[i];
 812            if (or_with->flags & ANYOF_CLASS) {
 813                for (i = 0; i < ANYOF_CLASSBITMAP_SIZE; i++)
 814                    cl->classflags[i] |= or_with->classflags[i];
 815                cl->flags |= ANYOF_CLASS;
 816            }
 817        }
 818        else { /* XXXX: logic is complicated, leave it along for a moment. */
 819            cl_anything(pRExC_state, cl);
 820        }
 821    }
 822    if (or_with->flags & ANYOF_EOS)
 823        cl->flags |= ANYOF_EOS;
 824
 825    if (cl->flags & ANYOF_UNICODE && or_with->flags & ANYOF_UNICODE &&
 826        ARG(cl) != ARG(or_with)) {
 827        cl->flags |= ANYOF_UNICODE_ALL;
 828        cl->flags &= ~ANYOF_UNICODE;
 829    }
 830    if (or_with->flags & ANYOF_UNICODE_ALL) {
 831        cl->flags |= ANYOF_UNICODE_ALL;
 832        cl->flags &= ~ANYOF_UNICODE;
 833    }
 834}
 835
 836#define TRIE_LIST_ITEM(state,idx) (trie->states[state].trans.list)[ idx ]
 837#define TRIE_LIST_CUR(state)  ( TRIE_LIST_ITEM( state, 0 ).forid )
 838#define TRIE_LIST_LEN(state) ( TRIE_LIST_ITEM( state, 0 ).newstate )
 839#define TRIE_LIST_USED(idx)  ( trie->states[state].trans.list ? (TRIE_LIST_CUR( idx ) - 1) : 0 )
 840
 841
 842#ifdef DEBUGGING
 843/*
 844   dump_trie(trie,widecharmap,revcharmap)
 845   dump_trie_interim_list(trie,widecharmap,revcharmap,next_alloc)
 846   dump_trie_interim_table(trie,widecharmap,revcharmap,next_alloc)
 847
 848   These routines dump out a trie in a somewhat readable format.
 849   The _interim_ variants are used for debugging the interim
 850   tables that are used to generate the final compressed
 851   representation which is what dump_trie expects.
 852
 853   Part of the reason for their existance is to provide a form
 854   of documentation as to how the different representations function.
 855
 856*/
 857
 858/*
 859  Dumps the final compressed table form of the trie to Perl_debug_log.
 860  Used for debugging make_trie().
 861*/
 862 
 863STATIC void
 864S_dump_trie(pTHX_ const struct _reg_trie_data *trie, HV *widecharmap,
 865            AV *revcharmap, U32 depth)
 866{
 867    U32 state;
 868    SV *sv=sv_newmortal();
 869    int colwidth= widecharmap ? 6 : 4;
 870    GET_RE_DEBUG_FLAGS_DECL;
 871
 872    PERL_ARGS_ASSERT_DUMP_TRIE;
 873
 874    PerlIO_printf( Perl_debug_log, "%*sChar : %-6s%-6s%-4s ",
 875        (int)depth * 2 + 2,"",
 876        "Match","Base","Ofs" );
 877
 878    for( state = 0 ; state < trie->uniquecharcount ; state++ ) {
 879        SV ** const tmp = av_fetch( revcharmap, state, 0);
 880        if ( tmp ) {
 881            PerlIO_printf( Perl_debug_log, "%*s", 
 882                colwidth,
 883                pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth, 
 884                            PL_colors[0], PL_colors[1],
 885                            (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
 886                            PERL_PV_ESCAPE_FIRSTCHAR 
 887                ) 
 888            );
 889        }
 890    }
 891    PerlIO_printf( Perl_debug_log, "\n%*sState|-----------------------",
 892        (int)depth * 2 + 2,"");
 893
 894    for( state = 0 ; state < trie->uniquecharcount ; state++ )
 895        PerlIO_printf( Perl_debug_log, "%.*s", colwidth, "--------");
 896    PerlIO_printf( Perl_debug_log, "\n");
 897
 898    for( state = 1 ; state < trie->statecount ; state++ ) {
 899        const U32 base = trie->states[ state ].trans.base;
 900
 901        PerlIO_printf( Perl_debug_log, "%*s#%4"UVXf"|", (int)depth * 2 + 2,"", (UV)state);
 902
 903        if ( trie->states[ state ].wordnum ) {
 904            PerlIO_printf( Perl_debug_log, " W%4X", trie->states[ state ].wordnum );
 905        } else {
 906            PerlIO_printf( Perl_debug_log, "%6s", "" );
 907        }
 908
 909        PerlIO_printf( Perl_debug_log, " @%4"UVXf" ", (UV)base );
 910
 911        if ( base ) {
 912            U32 ofs = 0;
 913
 914            while( ( base + ofs  < trie->uniquecharcount ) ||
 915                   ( base + ofs - trie->uniquecharcount < trie->lasttrans
 916                     && trie->trans[ base + ofs - trie->uniquecharcount ].check != state))
 917                    ofs++;
 918
 919            PerlIO_printf( Perl_debug_log, "+%2"UVXf"[ ", (UV)ofs);
 920
 921            for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
 922                if ( ( base + ofs >= trie->uniquecharcount ) &&
 923                     ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
 924                     trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
 925                {
 926                   PerlIO_printf( Perl_debug_log, "%*"UVXf,
 927                    colwidth,
 928                    (UV)trie->trans[ base + ofs - trie->uniquecharcount ].next );
 929                } else {
 930                    PerlIO_printf( Perl_debug_log, "%*s",colwidth,"   ." );
 931                }
 932            }
 933
 934            PerlIO_printf( Perl_debug_log, "]");
 935
 936        }
 937        PerlIO_printf( Perl_debug_log, "\n" );
 938    }
 939}    
 940/*
 941  Dumps a fully constructed but uncompressed trie in list form.
 942  List tries normally only are used for construction when the number of 
 943  possible chars (trie->uniquecharcount) is very high.
 944  Used for debugging make_trie().
 945*/
 946STATIC void
 947S_dump_trie_interim_list(pTHX_ const struct _reg_trie_data *trie,
 948                         HV *widecharmap, AV *revcharmap, U32 next_alloc,
 949                         U32 depth)
 950{
 951    U32 state;
 952    SV *sv=sv_newmortal();
 953    int colwidth= widecharmap ? 6 : 4;
 954    GET_RE_DEBUG_FLAGS_DECL;
 955
 956    PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_LIST;
 957
 958    /* print out the table precompression.  */
 959    PerlIO_printf( Perl_debug_log, "%*sState :Word | Transition Data\n%*s%s",
 960        (int)depth * 2 + 2,"", (int)depth * 2 + 2,"",
 961        "------:-----+-----------------\n" );
 962    
 963    for( state=1 ; state < next_alloc ; state ++ ) {
 964        U16 charid;
 965    
 966        PerlIO_printf( Perl_debug_log, "%*s %4"UVXf" :",
 967            (int)depth * 2 + 2,"", (UV)state  );
 968        if ( ! trie->states[ state ].wordnum ) {
 969            PerlIO_printf( Perl_debug_log, "%5s| ","");
 970        } else {
 971            PerlIO_printf( Perl_debug_log, "W%4x| ",
 972                trie->states[ state ].wordnum
 973            );
 974        }
 975        for( charid = 1 ; charid <= TRIE_LIST_USED( state ) ; charid++ ) {
 976            SV ** const tmp = av_fetch( revcharmap, TRIE_LIST_ITEM(state,charid).forid, 0);
 977            if ( tmp ) {
 978                PerlIO_printf( Perl_debug_log, "%*s:%3X=%4"UVXf" | ",
 979                    colwidth,
 980                    pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth, 
 981                            PL_colors[0], PL_colors[1],
 982                            (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
 983                            PERL_PV_ESCAPE_FIRSTCHAR 
 984                    ) ,
 985                    TRIE_LIST_ITEM(state,charid).forid,
 986                    (UV)TRIE_LIST_ITEM(state,charid).newstate
 987                );
 988                if (!(charid % 10)) 
 989                    PerlIO_printf(Perl_debug_log, "\n%*s| ",
 990                        (int)((depth * 2) + 14), "");
 991            }
 992        }
 993        PerlIO_printf( Perl_debug_log, "\n");
 994    }
 995}    
 996
 997/*
 998  Dumps a fully constructed but uncompressed trie in table form.
 999  This is the normal DFA style state transition table, with a few 
1000  twists to facilitate compression later. 
1001  Used for debugging make_trie().
1002*/
1003STATIC void
1004S_dump_trie_interim_table(pTHX_ const struct _reg_trie_data *trie,
1005                          HV *widecharmap, AV *revcharmap, U32 next_alloc,
1006                          U32 depth)
1007{
1008    U32 state;
1009    U16 charid;
1010    SV *sv=sv_newmortal();
1011    int colwidth= widecharmap ? 6 : 4;
1012    GET_RE_DEBUG_FLAGS_DECL;
1013
1014    PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_TABLE;
1015    
1016    /*
1017       print out the table precompression so that we can do a visual check
1018       that they are identical.
1019     */
1020    
1021    PerlIO_printf( Perl_debug_log, "%*sChar : ",(int)depth * 2 + 2,"" );
1022
1023    for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
1024        SV ** const tmp = av_fetch( revcharmap, charid, 0);
1025        if ( tmp ) {
1026            PerlIO_printf( Perl_debug_log, "%*s", 
1027                colwidth,
1028                pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth, 
1029                            PL_colors[0], PL_colors[1],
1030                            (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1031                            PERL_PV_ESCAPE_FIRSTCHAR 
1032                ) 
1033            );
1034        }
1035    }
1036
1037    PerlIO_printf( Perl_debug_log, "\n%*sState+-",(int)depth * 2 + 2,"" );
1038
1039    for( charid=0 ; charid < trie->uniquecharcount ; charid++ ) {
1040        PerlIO_printf( Perl_debug_log, "%.*s", colwidth,"--------");
1041    }
1042
1043    PerlIO_printf( Perl_debug_log, "\n" );
1044
1045    for( state=1 ; state < next_alloc ; state += trie->uniquecharcount ) {
1046
1047        PerlIO_printf( Perl_debug_log, "%*s%4"UVXf" : ", 
1048            (int)depth * 2 + 2,"",
1049            (UV)TRIE_NODENUM( state ) );
1050
1051        for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
1052            UV v=(UV)SAFE_TRIE_NODENUM( trie->trans[ state + charid ].next );
1053            if (v)
1054                PerlIO_printf( Perl_debug_log, "%*"UVXf, colwidth, v );
1055            else
1056                PerlIO_printf( Perl_debug_log, "%*s", colwidth, "." );
1057        }
1058        if ( ! trie->states[ TRIE_NODENUM( state ) ].wordnum ) {
1059            PerlIO_printf( Perl_debug_log, " (%4"UVXf")\n", (UV)trie->trans[ state ].check );
1060        } else {
1061            PerlIO_printf( Perl_debug_log, " (%4"UVXf") W%4X\n", (UV)trie->trans[ state ].check,
1062            trie->states[ TRIE_NODENUM( state ) ].wordnum );
1063        }
1064    }
1065}
1066
1067#endif
1068
1069/* make_trie(startbranch,first,last,tail,word_count,flags,depth)
1070  startbranch: the first branch in the whole branch sequence
1071  first      : start branch of sequence of branch-exact nodes.
1072               May be the same as startbranch
1073  last       : Thing following the last branch.
1074               May be the same as tail.
1075  tail       : item following the branch sequence
1076  count      : words in the sequence
1077  flags      : currently the OP() type we will be building one of /EXACT(|F|Fl)/
1078  depth      : indent depth
1079
1080Inplace optimizes a sequence of 2 or more Branch-Exact nodes into a TRIE node.
1081
1082A trie is an N'ary tree where the branches are determined by digital
1083decomposition of the key. IE, at the root node you look up the 1st character and
1084follow that branch repeat until you find the end of the branches. Nodes can be
1085marked as "accepting" meaning they represent a complete word. Eg:
1086
1087  /he|she|his|hers/
1088
1089would convert into the following structure. Numbers represent states, letters
1090following numbers represent valid transitions on the letter from that state, if
1091the number is in square brackets it represents an accepting state, otherwise it
1092will be in parenthesis.
1093
1094      +-h->+-e->[3]-+-r->(8)-+-s->[9]
1095      |    |
1096      |   (2)
1097      |    |
1098     (1)   +-i->(6)-+-s->[7]
1099      |
1100      +-s->(3)-+-h->(4)-+-e->[5]
1101
1102      Accept Word Mapping: 3=>1 (he),5=>2 (she), 7=>3 (his), 9=>4 (hers)
1103
1104This shows that when matching against the string 'hers' we will begin at state 1
1105read 'h' and move to state 2, read 'e' and move to state 3 which is accepting,
1106then read 'r' and go to state 8 followed by 's' which takes us to state 9 which
1107is also accepting. Thus we know that we can match both 'he' and 'hers' with a
1108single traverse. We store a mapping from accepting to state to which word was
1109matched, and then when we have multiple possibilities we try to complete the
1110rest of the regex in the order in which they occured in the alternation.
1111
1112The only prior NFA like behaviour that would be changed by the TRIE support is
1113the silent ignoring of duplicate alternations which are of the form:
1114
1115 / (DUPE|DUPE) X? (?{ ... }) Y /x
1116
1117Thus EVAL blocks follwing a trie may be called a different number of times with
1118and without the optimisation. With the optimisations dupes will be silently
1119ignored. This inconsistant behaviour of EVAL type nodes is well established as
1120the following demonstrates:
1121
1122 'words'=~/(word|word|word)(?{ print $1 })[xyz]/
1123
1124which prints out 'word' three times, but
1125
1126 'words'=~/(word|word|word)(?{ print $1 })S/
1127
1128which doesnt print it out at all. This is due to other optimisations kicking in.
1129
1130Example of what happens on a structural level:
1131
1132The regexp /(ac|ad|ab)+/ will produce the folowing debug output:
1133
1134   1: CURLYM[1] {1,32767}(18)
1135   5:   BRANCH(8)
1136   6:     EXACT <ac>(16)
1137   8:   BRANCH(11)
1138   9:     EXACT <ad>(16)
1139  11:   BRANCH(14)
1140  12:     EXACT <ab>(16)
1141  16:   SUCCEED(0)
1142  17:   NOTHING(18)
1143  18: END(0)
1144
1145This would be optimizable with startbranch=5, first=5, last=16, tail=16
1146and should turn into:
1147
1148   1: CURLYM[1] {1,32767}(18)
1149   5:   TRIE(16)
1150        [Words:3 Chars Stored:6 Unique Chars:4 States:5 NCP:1]
1151          <ac>
1152          <ad>
1153          <ab>
1154  16:   SUCCEED(0)
1155  17:   NOTHING(18)
1156  18: END(0)
1157
1158Cases where tail != last would be like /(?foo|bar)baz/:
1159
1160   1: BRANCH(4)
1161   2:   EXACT <foo>(8)
1162   4: BRANCH(7)
1163   5:   EXACT <bar>(8)
1164   7: TAIL(8)
1165   8: EXACT <baz>(10)
1166  10: END(0)
1167
1168which would be optimizable with startbranch=1, first=1, last=7, tail=8
1169and would end up looking like:
1170
1171    1: TRIE(8)
1172      [Words:2 Chars Stored:6 Unique Chars:5 States:7 NCP:1]
1173        <foo>
1174        <bar>
1175   7: TAIL(8)
1176   8: EXACT <baz>(10)
1177  10: END(0)
1178
1179    d = uvuni_to_utf8_flags(d, uv, 0);
1180
1181is the recommended Unicode-aware way of saying
1182
1183    *(d++) = uv;
1184*/
1185
1186#define TRIE_STORE_REVCHAR                                                 \
1187    STMT_START {                                                           \
1188        if (UTF) {                                                         \
1189            SV *zlopp = newSV(2);                                          \
1190            unsigned char *flrbbbbb = (unsigned char *) SvPVX(zlopp);      \
1191            unsigned const char *const kapow = uvuni_to_utf8(flrbbbbb, uvc & 0xFF); \
1192            SvCUR_set(zlopp, kapow - flrbbbbb);                            \
1193            SvPOK_on(zlopp);                                               \
1194            SvUTF8_on(zlopp);                                              \
1195            av_push(revcharmap, zlopp);                                    \
1196        } else {                                                           \
1197            char ooooff = (char)uvc;                                               \
1198            av_push(revcharmap, newSVpvn(&ooooff, 1));                     \
1199        }                                                                  \
1200        } STMT_END
1201
1202#define TRIE_READ_CHAR STMT_START {                                           \
1203    wordlen++;                                                                \
1204    if ( UTF ) {                                                              \
1205        if ( folder ) {                                                       \
1206            if ( foldlen > 0 ) {                                              \
1207               uvc = utf8n_to_uvuni( scan, UTF8_MAXLEN, &len, uniflags );     \
1208               foldlen -= len;                                                \
1209               scan += len;                                                   \
1210               len = 0;                                                       \
1211            } else {                                                          \
1212                uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);\
1213                uvc = to_uni_fold( uvc, foldbuf, &foldlen );                  \
1214                foldlen -= UNISKIP( uvc );                                    \
1215                scan = foldbuf + UNISKIP( uvc );                              \
1216            }                                                                 \
1217        } else {                                                              \
1218            uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);\
1219        }                                                                     \
1220    } else {                                                                  \
1221        uvc = (U32)*uc;                                                       \
1222        len = 1;                                                              \
1223    }                                                                         \
1224} STMT_END
1225
1226
1227
1228#define TRIE_LIST_PUSH(state,fid,ns) STMT_START {               \
1229    if ( TRIE_LIST_CUR( state ) >=TRIE_LIST_LEN( state ) ) {    \
1230        U32 ging = TRIE_LIST_LEN( state ) *= 2;                 \
1231        Renew( trie->states[ state ].trans.list, ging, reg_trie_trans_le ); \
1232    }                                                           \
1233    TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).forid = fid;     \
1234    TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).newstate = ns;   \
1235    TRIE_LIST_CUR( state )++;                                   \
1236} STMT_END
1237
1238#define TRIE_LIST_NEW(state) STMT_START {                       \
1239    Newxz( trie->states[ state ].trans.list,               \
1240        4, reg_trie_trans_le );                                 \
1241     TRIE_LIST_CUR( state ) = 1;                                \
1242     TRIE_LIST_LEN( state ) = 4;                                \
1243} STMT_END
1244
1245#define TRIE_HANDLE_WORD(state) STMT_START {                    \
1246    U16 dupe= trie->states[ state ].wordnum;                    \
1247    regnode * const noper_next = regnext( noper );              \
1248                                                                \
1249    if (trie->wordlen)                                          \
1250        trie->wordlen[ curword ] = wordlen;                     \
1251    DEBUG_r({                                                   \
1252        /* store the word for dumping */                        \
1253        SV* tmp;                                                \
1254        if (OP(noper) != NOTHING)                               \
1255            tmp = newSVpvn_utf8(STRING(noper), STR_LEN(noper), UTF);    \
1256        else                                                    \
1257            tmp = newSVpvn_utf8( "", 0, UTF );                  \
1258        av_push( trie_words, tmp );                             \
1259    });                                                         \
1260                                                                \
1261    curword++;                                                  \
1262                                                                \
1263    if ( noper_next < tail ) {                                  \
1264        if (!trie->jump)                                        \
1265            trie->jump = (U16 *) PerlMemShared_calloc( word_count + 1, sizeof(U16) ); \
1266        trie->jump[curword] = (U16)(noper_next - convert);      \
1267        if (!jumper)                                            \
1268            jumper = noper_next;                                \
1269        if (!nextbranch)                                        \
1270            nextbranch= regnext(cur);                           \
1271    }                                                           \
1272                                                                \
1273    if ( dupe ) {                                               \
1274        /* So it's a dupe. This means we need to maintain a   */\
1275        /* linked-list from the first to the next.            */\
1276        /* we only allocate the nextword buffer when there    */\
1277        /* a dupe, so first time we have to do the allocation */\
1278        if (!trie->nextword)                                    \
1279            trie->nextword = (U16 *)                                    \
1280                PerlMemShared_calloc( word_count + 1, sizeof(U16));     \
1281        while ( trie->nextword[dupe] )                          \
1282            dupe= trie->nextword[dupe];                         \
1283        trie->nextword[dupe]= curword;                          \
1284    } else {                                                    \
1285        /* we haven't inserted this word yet.                */ \
1286        trie->states[ state ].wordnum = curword;                \
1287    }                                                           \
1288} STMT_END
1289
1290
1291#define TRIE_TRANS_STATE(state,base,ucharcount,charid,special)          \
1292     ( ( base + charid >=  ucharcount                                   \
1293         && base + charid < ubound                                      \
1294         && state == trie->trans[ base - ucharcount + charid ].check    \
1295         && trie->trans[ base - ucharcount + charid ].next )            \
1296           ? trie->trans[ base - ucharcount + charid ].next             \
1297           : ( state==1 ? special : 0 )                                 \
1298      )
1299
1300#define MADE_TRIE       1
1301#define MADE_JUMP_TRIE  2
1302#define MADE_EXACT_TRIE 4
1303
1304STATIC I32
1305S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *first, regnode *last, regnode *tail, U32 word_count, U32 flags, U32 depth)
1306{
1307    dVAR;
1308    /* first pass, loop through and scan words */
1309    reg_trie_data *trie;
1310    HV *widecharmap = NULL;
1311    AV *revcharmap = newAV();
1312    regnode *cur;
1313    const U32 uniflags = UTF8_ALLOW_DEFAULT;
1314    STRLEN len = 0;
1315    UV uvc = 0;
1316    U16 curword = 0;
1317    U32 next_alloc = 0;
1318    regnode *jumper = NULL;
1319    regnode *nextbranch = NULL;
1320    regnode *convert = NULL;
1321    /* we just use folder as a flag in utf8 */
1322    const U8 * const folder = ( flags == EXACTF
1323                       ? PL_fold
1324                       : ( flags == EXACTFL
1325                           ? PL_fold_locale
1326                           : NULL
1327                         )
1328                     );
1329
1330#ifdef DEBUGGING
1331    const U32 data_slot = add_data( pRExC_state, 4, "tuuu" );
1332    AV *trie_words = NULL;
1333    /* along with revcharmap, this only used during construction but both are
1334     * useful during debugging so we store them in the struct when debugging.
1335     */
1336#else
1337    const U32 data_slot = add_data( pRExC_state, 2, "tu" );
1338    STRLEN trie_charcount=0;
1339#endif
1340    SV *re_trie_maxbuff;
1341    GET_RE_DEBUG_FLAGS_DECL;
1342
1343    PERL_ARGS_ASSERT_MAKE_TRIE;
1344#ifndef DEBUGGING
1345    PERL_UNUSED_ARG(depth);
1346#endif
1347
1348    trie = (reg_trie_data *) PerlMemShared_calloc( 1, sizeof(reg_trie_data) );
1349    trie->refcount = 1;
1350    trie->startstate = 1;
1351    trie->wordcount = word_count;
1352    RExC_rxi->data->data[ data_slot ] = (void*)trie;
1353    trie->charmap = (U16 *) PerlMemShared_calloc( 256, sizeof(U16) );
1354    if (!(UTF && folder))
1355        trie->bitmap = (char *) PerlMemShared_calloc( ANYOF_BITMAP_SIZE, 1 );
1356    DEBUG_r({
1357        trie_words = newAV();
1358    });
1359
1360    re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
1361    if (!SvIOK(re_trie_maxbuff)) {
1362        sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
1363    }
1364    DEBUG_OPTIMISE_r({
1365                PerlIO_printf( Perl_debug_log,
1366                  "%*smake_trie start==%d, first==%d, last==%d, tail==%d depth=%d\n",
1367                  (int)depth * 2 + 2, "", 
1368                  REG_NODE_NUM(startbranch),REG_NODE_NUM(first), 
1369                  REG_NODE_NUM(last), REG_NODE_NUM(tail),
1370                  (int)depth);
1371    });
1372   
1373   /* Find the node we are going to overwrite */
1374    if ( first == startbranch && OP( last ) != BRANCH ) {
1375        /* whole branch chain */
1376        convert = first;
1377    } else {
1378        /* branch sub-chain */
1379        convert = NEXTOPER( first );
1380    }
1381        
1382    /*  -- First loop and Setup --
1383
1384       We first traverse the branches and scan each word to determine if it
1385       contains widechars, and how many unique chars there are, this is
1386       important as we have to build a table with at least as many columns as we
1387       have unique chars.
1388
1389       We use an array of integers to represent the character codes 0..255
1390       (trie->charmap) and we use a an HV* to store Unicode characters. We use the
1391       native representation of the character value as the key and IV's for the
1392       coded index.
1393
1394       *TODO* If we keep track of how many times each character is used we can
1395       remap the columns so that the table compression later on is more
1396       efficient in terms of memory by ensuring most common value is in the
1397       middle and the least common are on the outside.  IMO this would be better
1398       than a most to least common mapping as theres a decent chance the most
1399       common letter will share a node with the least common, meaning the node
1400       will not be compressable. With a middle is most common approach the worst
1401       case is when we have the least common nodes twice.
1402
1403     */
1404
1405    for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1406        regnode * const noper = NEXTOPER( cur );
1407        const U8 *uc = (U8*)STRING( noper );
1408        const U8 * const e  = uc + STR_LEN( noper );
1409        STRLEN foldlen = 0;
1410        U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1411        const U8 *scan = (U8*)NULL;
1412        U32 wordlen      = 0;         /* required init */
1413        STRLEN chars = 0;
1414        bool set_bit = trie->bitmap ? 1 : 0; /*store the first char in the bitmap?*/
1415
1416        if (OP(noper) == NOTHING) {
1417            trie->minlen= 0;
1418            continue;
1419        }
1420        if ( set_bit ) /* bitmap only alloced when !(UTF&&Folding) */
1421            TRIE_BITMAP_SET(trie,*uc); /* store the raw first byte
1422                                          regardless of encoding */
1423
1424        for ( ; uc < e ; uc += len ) {
1425            TRIE_CHARCOUNT(trie)++;
1426            TRIE_READ_CHAR;
1427            chars++;
1428            if ( uvc < 256 ) {
1429                if ( !trie->charmap[ uvc ] ) {
1430                    trie->charmap[ uvc ]=( ++trie->uniquecharcount );
1431                    if ( folder )
1432                        trie->charmap[ folder[ uvc ] ] = trie->charmap[ uvc ];
1433                    TRIE_STORE_REVCHAR;
1434                }
1435                if ( set_bit ) {
1436                    /* store the codepoint in the bitmap, and if its ascii
1437                       also store its folded equivelent. */
1438                    TRIE_BITMAP_SET(trie,uvc);
1439
1440                    /* store the folded codepoint */
1441                    if ( folder ) TRIE_BITMAP_SET(trie,folder[ uvc ]);
1442
1443                    if ( !UTF ) {
1444                        /* store first byte of utf8 representation of
1445                           codepoints in the 127 < uvc < 256 range */
1446                        if (127 < uvc && uvc < 192) {
1447                            TRIE_BITMAP_SET(trie,194);
1448                        } else if (191 < uvc ) {
1449                            TRIE_BITMAP_SET(trie,195);
1450                        /* && uvc < 256 -- we know uvc is < 256 already */
1451                        }
1452                    }
1453                    set_bit = 0; /* We've done our bit :-) */
1454                }
1455            } else {
1456                SV** svpp;
1457                if ( !widecharmap )
1458                    widecharmap = newHV();
1459
1460                svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 1 );
1461
1462                if ( !svpp )
1463                    Perl_croak( aTHX_ "error creating/fetching widecharmap entry for 0x%"UVXf, uvc );
1464
1465                if ( !SvTRUE( *svpp ) ) {
1466                    sv_setiv( *svpp, ++trie->uniquecharcount );
1467                    TRIE_STORE_REVCHAR;
1468                }
1469            }
1470        }
1471        if( cur == first ) {
1472            trie->minlen=chars;
1473            trie->maxlen=chars;
1474        } else if (chars < trie->minlen) {
1475            trie->minlen=chars;
1476        } else if (chars > trie->maxlen) {
1477            trie->maxlen=chars;
1478        }
1479
1480    } /* end first pass */
1481    DEBUG_TRIE_COMPILE_r(
1482        PerlIO_printf( Perl_debug_log, "%*sTRIE(%s): W:%d C:%d Uq:%d Min:%d Max:%d\n",
1483                (int)depth * 2 + 2,"",
1484                ( widecharmap ? "UTF8" : "NATIVE" ), (int)word_count,
1485                (int)TRIE_CHARCOUNT(trie), trie->uniquecharcount,
1486                (int)trie->minlen, (int)trie->maxlen )
1487    );
1488    trie->wordlen = (U32 *) PerlMemShared_calloc( word_count, sizeof(U32) );
1489
1490    /*
1491        We now know what we are dealing with in terms of unique chars and
1492        string sizes so we can calculate how much memory a naive
1493        representation using a flat table  will take. If it's over a reasonable
1494        limit (as specified by ${^RE_TRIE_MAXBUF}) we use a more memory
1495        conservative but potentially much slower representation using an array
1496        of lists.
1497
1498        At the end we convert both representations into the same compressed
1499        form that will be used in regexec.c for matching with. The latter
1500        is a form that cannot be used to construct with but has memory
1501        properties similar to the list form and access properties similar
1502        to the table form making it both suitable for fast searches and
1503        small enough that its feasable to store for the duration of a program.
1504
1505        See the comment in the code where the compressed table is produced
1506        inplace from the flat tabe representation for an explanation of how
1507        the compression works.
1508
1509    */
1510
1511
1512    if ( (IV)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1) > SvIV(re_trie_maxbuff) ) {
1513        /*
1514            Second Pass -- Array Of Lists Representation
1515
1516            Each state will be represented by a list of charid:state records
1517            (reg_trie_trans_le) the first such element holds the CUR and LEN
1518            points of the allocated array. (See defines above).
1519
1520            We build the initial structure using the lists, and then convert
1521            it into the compressed table form which allows faster lookups
1522            (but cant be modified once converted).
1523        */
1524
1525        STRLEN transcount = 1;
1526
1527        DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log, 
1528            "%*sCompiling trie using list compiler\n",
1529            (int)depth * 2 + 2, ""));
1530        
1531        trie->states = (reg_trie_state *)
1532            PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
1533                                  sizeof(reg_trie_state) );
1534        TRIE_LIST_NEW(1);
1535        next_alloc = 2;
1536
1537        for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1538
1539            regnode * const noper = NEXTOPER( cur );
1540            U8 *uc           = (U8*)STRING( noper );
1541            const U8 * const e = uc + STR_LEN( noper );
1542            U32 state        = 1;         /* required init */
1543            U16 charid       = 0;         /* sanity init */
1544            U8 *scan         = (U8*)NULL; /* sanity init */
1545            STRLEN foldlen   = 0;         /* required init */
1546            U32 wordlen      = 0;         /* required init */
1547            U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1548
1549            if (OP(noper) != NOTHING) {
1550                for ( ; uc < e ; uc += len ) {
1551
1552                    TRIE_READ_CHAR;
1553
1554                    if ( uvc < 256 ) {
1555                        charid = trie->charmap[ uvc ];
1556                    } else {
1557                        SV** const svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 0);
1558                        if ( !svpp ) {
1559                            charid = 0;
1560                        } else {
1561                            charid=(U16)SvIV( *svpp );
1562                        }
1563                    }
1564                    /* charid is now 0 if we dont know the char read, or nonzero if we do */
1565                    if ( charid ) {
1566
1567                        U16 check;
1568                        U32 newstate = 0;
1569
1570                        charid--;
1571                        if ( !trie->states[ state ].trans.list ) {
1572                            TRIE_LIST_NEW( state );
1573                        }
1574                        for ( check = 1; check <= TRIE_LIST_USED( state ); check++ ) {
1575                            if ( TRIE_LIST_ITEM( state, check ).forid == charid ) {
1576                                newstate = TRIE_LIST_ITEM( state, check ).newstate;
1577                                break;
1578                            }
1579                        }
1580                        if ( ! newstate ) {
1581                            newstate = next_alloc++;
1582                            TRIE_LIST_PUSH( state, charid, newstate );
1583                            transcount++;
1584                        }
1585                        state = newstate;
1586                    } else {
1587                        Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
1588                    }
1589                }
1590            }
1591            TRIE_HANDLE_WORD(state);
1592
1593        } /* end second pass */
1594
1595        /* next alloc is the NEXT state to be allocated */
1596        trie->statecount = next_alloc; 
1597        trie->states = (reg_trie_state *)
1598            PerlMemShared_realloc( trie->states,
1599                                   next_alloc
1600                                   * sizeof(reg_trie_state) );
1601
1602        /* and now dump it out before we compress it */
1603        DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_list(trie, widecharmap,
1604                                                         revcharmap, next_alloc,
1605                                                         depth+1)
1606        );
1607
1608        trie->trans = (reg_trie_trans *)
1609            PerlMemShared_calloc( transcount, sizeof(reg_trie_trans) );
1610        {
1611            U32 state;
1612            U32 tp = 0;
1613            U32 zp = 0;
1614
1615
1616            for( state=1 ; state < next_alloc ; state ++ ) {
1617                U32 base=0;
1618
1619                /*
1620                DEBUG_TRIE_COMPILE_MORE_r(
1621                    PerlIO_printf( Perl_debug_log, "tp: %d zp: %d ",tp,zp)
1622                );
1623                */
1624
1625                if (trie->states[state].trans.list) {
1626                    U16 minid=TRIE_LIST_ITEM( state, 1).forid;
1627                    U16 maxid=minid;
1628                    U16 idx;
1629
1630                    for( idx = 2 ; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
1631                        const U16 forid = TRIE_LIST_ITEM( state, idx).forid;
1632                        if ( forid < minid ) {
1633                            minid=forid;
1634                        } else if ( forid > maxid ) {
1635                            maxid=forid;
1636                        }
1637                    }
1638                    if ( transcount < tp + maxid - minid + 1) {
1639                        transcount *= 2;
1640                        trie->trans = (reg_trie_trans *)
1641                            PerlMemShared_realloc( trie->trans,
1642                                                     transcount
1643                                                     * sizeof(reg_trie_trans) );
1644                        Zero( trie->trans + (transcount / 2), transcount / 2 , reg_trie_trans );
1645                    }
1646                    base = trie->uniquecharcount + tp - minid;
1647                    if ( maxid == minid ) {
1648                        U32 set = 0;
1649                        for ( ; zp < tp ; zp++ ) {
1650                            if ( ! trie->trans[ zp ].next ) {
1651                                base = trie->uniquecharcount + zp - minid;
1652                                trie->trans[ zp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1653                                trie->trans[ zp ].check = state;
1654                                set = 1;
1655                                break;
1656                            }
1657                        }
1658                        if ( !set ) {
1659                            trie->trans[ tp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1660                            trie->trans[ tp ].check = state;
1661                            tp++;
1662                            zp = tp;
1663                        }
1664                    } else {
1665                        for ( idx=1; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
1666                            const U32 tid = base -  trie->uniquecharcount + TRIE_LIST_ITEM( state, idx ).forid;
1667                            trie->trans[ tid ].next = TRIE_LIST_ITEM( state, idx ).newstate;
1668                            trie->trans[ tid ].check = state;
1669                        }
1670                        tp += ( maxid - minid + 1 );
1671                    }
1672                    Safefree(trie->states[ state ].trans.list);
1673                }
1674                /*
1675                DEBUG_TRIE_COMPILE_MORE_r(
1676                    PerlIO_printf( Perl_debug_log, " base: %d\n",base);
1677                );
1678                */
1679                trie->states[ state ].trans.base=base;
1680            }
1681            trie->lasttrans = tp + 1;
1682        }
1683    } else {
1684        /*
1685           Second Pass -- Flat Table Representation.
1686
1687           we dont use the 0 slot of either trans[] or states[] so we add 1 to each.
1688           We know that we will need Charcount+1 trans at most to store the data
1689           (one row per char at worst case) So we preallocate both structures
1690           assuming worst case.
1691
1692           We then construct the trie using only the .next slots of the entry
1693           structs.
1694
1695           We use the .check field of the first entry of the node  temporarily to
1696           make compression both faster and easier by keeping track of how many non
1697           zero fields are in the node.
1698
1699           Since trans are numbered from 1 any 0 pointer in the table is a FAIL
1700           transition.
1701
1702           There are two terms at use here: state as a TRIE_NODEIDX() which is a
1703           number representing the first entry of the node, and state as a
1704           TRIE_NODENUM() which is the trans number. state 1 is TRIE_NODEIDX(1) and
1705           TRIE_NODENUM(1), state 2 is TRIE_NODEIDX(2) and TRIE_NODENUM(3) if there
1706           are 2 entrys per node. eg:
1707
1708             A B       A B
1709          1. 2 4    1. 3 7
1710          2. 0 3    3. 0 5
1711          3. 0 0    5. 0 0
1712          4. 0 0    7. 0 0
1713
1714           The table is internally in the right hand, idx form. However as we also
1715           have to deal with the states array which is indexed by nodenum we have to
1716           use TRIE_NODENUM() to convert.
1717
1718        */
1719        DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log, 
1720            "%*sCompiling trie using table compiler\n",
1721            (int)depth * 2 + 2, ""));
1722
1723        trie->trans = (reg_trie_trans *)
1724            PerlMemShared_calloc( ( TRIE_CHARCOUNT(trie) + 1 )
1725                                  * trie->uniquecharcount + 1,
1726                                  sizeof(reg_trie_trans) );
1727        trie->states = (reg_trie_state *)
1728            PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
1729                                  sizeof(reg_trie_state) );
1730        next_alloc = trie->uniquecharcount + 1;
1731
1732
1733        for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1734
1735            regnode * const noper   = NEXTOPER( cur );
1736            const U8 *uc     = (U8*)STRING( noper );
1737            const U8 * const e = uc + STR_LEN( noper );
1738
1739            U32 state        = 1;         /* required init */
1740
1741            U16 charid       = 0;         /* sanity init */
1742            U32 accept_state = 0;         /* sanity init */
1743            U8 *scan         = (U8*)NULL; /* sanity init */
1744
1745            STRLEN foldlen   = 0;         /* required init */
1746            U32 wordlen      = 0;         /* required init */
1747            U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1748
1749            if ( OP(noper) != NOTHING ) {
1750                for ( ; uc < e ; uc += len ) {
1751
1752                    TRIE_READ_CHAR;
1753
1754                    if ( uvc < 256 ) {
1755                        charid = trie->charmap[ uvc ];
1756                    } else {
1757                        SV* const * const svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 0);
1758                        charid = svpp ? (U16)SvIV(*svpp) : 0;
1759                    }
1760                    if ( charid ) {
1761                        charid--;
1762                        if ( !trie->trans[ state + charid ].next ) {
1763                            trie->trans[ state + charid ].next = next_alloc;
1764                            trie->trans[ state ].check++;
1765                            next_alloc += trie->uniquecharcount;
1766                        }
1767                        state = trie->trans[ state + charid ].next;
1768                    } else {
1769                        Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
1770                    }
1771                    /* charid is now 0 if we dont know the char read, or nonzero if we do */
1772                }
1773            }
1774            accept_state = TRIE_NODENUM( state );
1775            TRIE_HANDLE_WORD(accept_state);
1776
1777        } /* end second pass */
1778
1779        /* and now dump it out before we compress it */
1780        DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_table(trie, widecharmap,
1781                                                          revcharmap,
1782                                                          next_alloc, depth+1));
1783
1784        {
1785        /*
1786           * Inplace compress the table.*
1787
1788           For sparse data sets the table constructed by the trie algorithm will
1789           be mostly 0/FAIL transitions or to put it another way mostly empty.
1790           (Note that leaf nodes will not contain any transitions.)
1791
1792           This algorithm compresses the tables by eliminating most such
1793           transitions, at the cost of a modest bit of extra work during lookup:
1794
1795           - Each states[] entry contains a .base field which indicates the
1796           index in the state[] array wheres its transition data is stored.
1797
1798           - If .base is 0 there are no  valid transitions from that node.
1799
1800           - If .base is nonzero then charid is added to it to find an entry in
1801           the trans array.
1802
1803           -If trans[states[state].base+charid].check!=state then the
1804           transition is taken to be a 0/Fail transition. Thus if there are fail
1805           transitions at the front of the node then the .base offset will point
1806           somewhere inside the previous nodes data (or maybe even into a node
1807           even earlier), but the .check field determines if the transition is
1808           valid.
1809
1810           XXX - wrong maybe?
1811           The following process inplace converts the table to the compressed
1812           table: We first do not compress the root node 1,and mark its all its
1813           .check pointers as 1 and set its .base pointer as 1 as well. This
1814           allows to do a DFA construction from the compressed table later, and
1815           ensures that any .base pointers we calculate later are greater than
1816           0.
1817
1818           - We set 'pos' to indicate the first entry of the second node.
1819
1820           - We then iterate over the columns of the node, finding the first and
1821           last used entry at l and m. We then copy l..m into pos..(pos+m-l),
1822           and set the .check pointers accordingly, and advance pos
1823           appropriately and repreat for the next node. Note that when we copy
1824           the next pointers we have to convert them from the original
1825           NODEIDX form to NODENUM form as the former is not valid post
1826           compression.
1827
1828           - If a node has no transitions used we mark its base as 0 and do not
1829           advance the pos pointer.
1830
1831           - If a node only has one transition we use a second pointer into the
1832           structure to fill in allocated fail transitions from other states.
1833           This pointer is independent of the main pointer and scans forward
1834           looking for null transitions that are allocated to a state. When it
1835           finds one it writes the single transition into the "hole".  If the
1836           pointer doesnt find one the single transition is appended as normal.
1837
1838           - Once compressed we can Renew/realloc the structures to release the
1839           excess space.
1840
1841           See "Table-Compression Methods" in sec 3.9 of the Red Dragon,
1842           specifically Fig 3.47 and the associated pseudocode.
1843
1844           demq
1845        */
1846        const U32 laststate = TRIE_NODENUM( next_alloc );
1847        U32 state, charid;
1848        U32 pos = 0, zp=0;
1849        trie->statecount = laststate;
1850
1851        for ( state = 1 ; state < laststate ; state++ ) {
1852            U8 flag = 0;
1853            const U32 stateidx = TRIE_NODEIDX( state );
1854            const U32 o_used = trie->trans[ stateidx ].check;
1855            U32 used = trie->trans[ stateidx ].check;
1856            trie->trans[ stateidx ].check = 0;
1857
1858            for ( charid = 0 ; used && charid < trie->uniquecharcount ; charid++ ) {
1859                if ( flag || trie->trans[ stateidx + charid ].next ) {
1860                    if ( trie->trans[ stateidx + charid ].next ) {
1861                        if (o_used == 1) {
1862                            for ( ; zp < pos ; zp++ ) {
1863                                if ( ! trie->trans[ zp ].next ) {
1864                                    break;
1865                                }
1866                            }
1867                            trie->states[ state ].trans.base = zp + trie->uniquecharcount - charid ;
1868                            trie->trans[ zp ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
1869                            trie->trans[ zp ].check = state;
1870                            if ( ++zp > pos ) pos = zp;
1871                            break;
1872                        }
1873                        used--;
1874                    }
1875                    if ( !flag ) {
1876                        flag = 1;
1877                        trie->states[ state ].trans.base = pos + trie->uniquecharcount - charid ;
1878                    }
1879                    trie->trans[ pos ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
1880                    trie->trans[ pos ].check = state;
1881                    pos++;
1882                }
1883            }
1884        }
1885        trie->lasttrans = pos + 1;
1886        trie->states = (reg_trie_state *)
1887            PerlMemShared_realloc( trie->states, laststate
1888                                   * sizeof(reg_trie_state) );
1889        DEBUG_TRIE_COMPILE_MORE_r(
1890                PerlIO_printf( Perl_debug_log,
1891                    "%*sAlloc: %d Orig: %"IVdf" elements, Final:%"IVdf". Savings of %%%5.2f\n",
1892                    (int)depth * 2 + 2,"",
1893                    (int)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1 ),
1894                    (IV)next_alloc,
1895                    (IV)pos,
1896                    ( ( next_alloc - pos ) * 100 ) / (double)next_alloc );
1897            );
1898
1899        } /* end table compress */
1900    }
1901    DEBUG_TRIE_COMPILE_MORE_r(
1902            PerlIO_printf(Perl_debug_log, "%*sStatecount:%"UVxf" Lasttrans:%"UVxf"\n",
1903                (int)depth * 2 + 2, "",
1904                (UV)trie->statecount,
1905                (UV)trie->lasttrans)
1906    );
1907    /* resize the trans array to remove unused space */
1908    trie->trans = (reg_trie_trans *)
1909        PerlMemShared_realloc( trie->trans, trie->lasttrans
1910                               * sizeof(reg_trie_trans) );
1911
1912    /* and now dump out the compressed format */
1913    DEBUG_TRIE_COMPILE_r(dump_trie(trie, widecharmap, revcharmap, depth+1));
1914
1915    {   /* Modify the program and insert the new TRIE node*/ 
1916        U8 nodetype =(U8)(flags & 0xFF);
1917        char *str=NULL;
1918        
1919#ifdef DEBUGGING
1920        regnode *optimize = NULL;
1921#ifdef RE_TRACK_PATTERN_OFFSETS
1922
1923        U32 mjd_offset = 0;
1924        U32 mjd_nodelen = 0;
1925#endif /* RE_TRACK_PATTERN_OFFSETS */
1926#endif /* DEBUGGING */
1927        /*
1928           This means we convert either the first branch or the first Exact,
1929           depending on whether the thing following (in 'last') is a branch
1930           or not and whther first is the startbranch (ie is it a sub part of
1931           the alternation or is it the whole thing.)
1932           Assuming its a sub part we conver the EXACT otherwise we convert
1933           the whole branch sequence, including the first.
1934         */
1935        /* Find the node we are going to overwrite */
1936        if ( first != startbranch || OP( last ) == BRANCH ) {
1937            /* branch sub-chain */
1938            NEXT_OFF( first ) = (U16)(last - first);
1939#ifdef RE_TRACK_PATTERN_OFFSETS
1940            DEBUG_r({
1941                mjd_offset= Node_Offset((convert));
1942                mjd_nodelen= Node_Length((convert));
1943            });
1944#endif
1945            /* whole branch chain */
1946        }
1947#ifdef RE_TRACK_PATTERN_OFFSETS
1948        else {
1949            DEBUG_r({
1950                const  regnode *nop = NEXTOPER( convert );
1951                mjd_offset= Node_Offset((nop));
1952                mjd_nodelen= Node_Length((nop));
1953            });
1954        }
1955        DEBUG_OPTIMISE_r(
1956            PerlIO_printf(Perl_debug_log, "%*sMJD offset:%"UVuf" MJD length:%"UVuf"\n",
1957                (int)depth * 2 + 2, "",
1958                (UV)mjd_offset, (UV)mjd_nodelen)
1959        );
1960#endif
1961        /* But first we check to see if there is a common prefix we can 
1962           split out as an EXACT and put in front of the TRIE node.  */
1963        trie->startstate= 1;
1964        if ( trie->bitmap && !widecharmap && !trie->jump  ) {
1965            U32 state;
1966            for ( state = 1 ; state < trie->statecount-1 ; state++ ) {
1967                U32 ofs = 0;
1968                I32 idx = -1;
1969                U32 count = 0;
1970                const U32 base = trie->states[ state ].trans.base;
1971
1972                if ( trie->states[state].wordnum )
1973                        count = 1;
1974
1975                for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
1976                    if ( ( base + ofs >= trie->uniquecharcount ) &&
1977                         ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
1978                         trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
1979                    {
1980                        if ( ++count > 1 ) {
1981                            SV **tmp = av_fetch( revcharmap, ofs, 0);
1982                            const U8 *ch = (U8*)SvPV_nolen_const( *tmp );
1983                            if ( state == 1 ) break;
1984                            if ( count == 2 ) {
1985                                Zero(trie->bitmap, ANYOF_BITMAP_SIZE, char);
1986                                DEBUG_OPTIMISE_r(
1987                                    PerlIO_printf(Perl_debug_log,
1988                                        "%*sNew Start State=%"UVuf" Class: [",
1989                                        (int)depth * 2 + 2, "",
1990                                        (UV)state));
1991                                if (idx >= 0) {
1992                                    SV ** const tmp = av_fetch( revcharmap, idx, 0);
1993                                    const U8 * const ch = (U8*)SvPV_nolen_const( *tmp );
1994
1995                                    TRIE_BITMAP_SET(trie,*ch);
1996                                    if ( folder )
1997                                        TRIE_BITMAP_SET(trie, folder[ *ch ]);
1998                                    DEBUG_OPTIMISE_r(
1999                                        PerlIO_printf(Perl_debug_log, "%s", (char*)ch)
2000                                    );
2001                                }
2002                            }
2003                            TRIE_BITMAP_SET(trie,*ch);
2004                            if ( folder )
2005                                TRIE_BITMAP_SET(trie,folder[ *ch ]);
2006                            DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"%s", ch));
2007                        }
2008                        idx = ofs;
2009                    }
2010                }
2011                if ( count == 1 ) {
2012                    SV **tmp = av_fetch( revcharmap, idx, 0);
2013                    STRLEN len;
2014                    char *ch = SvPV( *tmp, len );
2015                    DEBUG_OPTIMISE_r({
2016                        SV *sv=sv_newmortal();
2017                        PerlIO_printf( Perl_debug_log,
2018                            "%*sPrefix State: %"UVuf" Idx:%"UVuf" Char='%s'\n",
2019                            (int)depth * 2 + 2, "",
2020                            (UV)state, (UV)idx, 
2021                            pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 6, 
2022                                PL_colors[0], PL_colors[1],
2023                                (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
2024                                PERL_PV_ESCAPE_FIRSTCHAR 
2025                            )
2026                        );
2027                    });
2028                    if ( state==1 ) {
2029                        OP( convert ) = nodetype;
2030                        str=STRING(convert);
2031                        STR_LEN(convert)=0;
2032                    }
2033                    STR_LEN(convert) += len;
2034                    while (len--)
2035                        *str++ = *ch++;
2036                } else {
2037#ifdef DEBUGGING            
2038                    if (state>1)
2039                        DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"]\n"));
2040#endif
2041                    break;
2042                }
2043            }
2044            if (str) {
2045                regnode *n = convert+NODE_SZ_STR(convert);
2046                NEXT_OFF(convert) = NODE_SZ_STR(convert);
2047                trie->startstate = state;
2048                trie->minlen -= (state - 1);
2049                trie->maxlen -= (state - 1);
2050#ifdef DEBUGGING
2051               /* At least the UNICOS C compiler choked on this
2052                * being argument to DEBUG_r(), so let's just have
2053                * it right here. */
2054               if (
2055#ifdef PERL_EXT_RE_BUILD
2056                   1
2057#else
2058                   DEBUG_r_TEST
2059#endif
2060                   ) {
2061                   regnode *fix = convert;
2062                   U32 word = trie->wordcount;
2063                   mjd_nodelen++;
2064                   Set_Node_Offset_Length(convert, mjd_offset, state - 1);
2065                   while( ++fix < n ) {
2066                       Set_Node_Offset_Length(fix, 0, 0);
2067                   }
2068                   while (word--) {
2069                       SV ** const tmp = av_fetch( trie_words, word, 0 );
2070                       if (tmp) {
2071                           if ( STR_LEN(convert) <= SvCUR(*tmp) )
2072                               sv_chop(*tmp, SvPV_nolen(*tmp) + STR_LEN(convert));
2073                           else
2074                               sv_chop(*tmp, SvPV_nolen(*tmp) + SvCUR(*tmp));
2075                       }
2076                   }
2077               }
2078#endif
2079                if (trie->maxlen) {
2080                    convert = n;
2081                } else {
2082                    NEXT_OFF(convert) = (U16)(tail - convert);
2083                    DEBUG_r(optimize= n);
2084                }
2085            }
2086        }
2087        if (!jumper) 
2088            jumper = last; 
2089        if ( trie->maxlen ) {
2090            NEXT_OFF( convert ) = (U16)(tail - convert);
2091            ARG_SET( convert, data_slot );
2092            /* Store the offset to the first unabsorbed branch in 
2093               jump[0], which is otherwise unused by the jump logic. 
2094               We use this when dumping a trie and during optimisation. */
2095            if (trie->jump) 
2096                trie->jump[0] = (U16)(nextbranch - convert);
2097            
2098            /* XXXX */
2099            if ( !trie->states[trie->startstate].wordnum && trie->bitmap && 
2100                 ( (char *)jumper - (char *)convert) >= (int)sizeof(struct regnode_charclass) )
2101            {
2102                OP( convert ) = TRIEC;
2103                Copy(trie->bitmap, ((struct regnode_charclass *)convert)->bitmap, ANYOF_BITMAP_SIZE, char);
2104                PerlMemShared_free(trie->bitmap);
2105                trie->bitmap= NULL;
2106            } else 
2107                OP( convert ) = TRIE;
2108
2109            /* store the type in the flags */
2110            convert->flags = nodetype;
2111            DEBUG_r({
2112            optimize = convert 
2113                      + NODE_STEP_REGNODE 
2114                      + regarglen[ OP( convert ) ];
2115            });
2116            /* XXX We really should free up the resource in trie now, 
2117                   as we won't use them - (which resources?) dmq */
2118        }
2119        /* needed for dumping*/
2120        DEBUG_r(if (optimize) {
2121            regnode *opt = convert;
2122
2123            while ( ++opt < optimize) {
2124                Set_Node_Offset_Length(opt,0,0);
2125            }
2126            /* 
2127                Try to clean up some of the debris left after the 
2128                optimisation.
2129             */
2130            while( optimize < jumper ) {
2131                mjd_nodelen += Node_Length((optimize));
2132                OP( optimize ) = OPTIMIZED;
2133                Set_Node_Offset_Length(optimize,0,0);
2134                optimize++;
2135            }
2136            Set_Node_Offset_Length(convert,mjd_offset,mjd_nodelen);
2137        });
2138    } /* end node insert */
2139    RExC_rxi->data->data[ data_slot + 1 ] = (void*)widecharmap;
2140#ifdef DEBUGGING
2141    RExC_rxi->data->data[ data_slot + TRIE_WORDS_OFFSET ] = (void*)trie_words;
2142    RExC_rxi->data->data[ data_slot + 3 ] = (void*)revcharmap;
2143#else
2144    SvREFCNT_dec(revcharmap);
2145#endif
2146    return trie->jump 
2147           ? MADE_JUMP_TRIE 
2148           : trie->startstate>1 
2149             ? MADE_EXACT_TRIE 
2150             : MADE_TRIE;
2151}
2152
2153STATIC void
2154S_make_trie_failtable(pTHX_ RExC_state_t *pRExC_state, regnode *source,  regnode *stclass, U32 depth)
2155{
2156/* The Trie is constructed and compressed now so we can build a fail array now if its needed
2157
2158   This is basically the Aho-Corasick algorithm. Its from exercise 3.31 and 3.32 in the
2159   "Red Dragon" -- Compilers, principles, techniques, and tools. Aho, Sethi, Ullman 1985/88
2160   ISBN 0-201-10088-6
2161
2162   We find the fail state for each state in the trie, this state is the longest proper
2163   suffix of the current states 'word' that is also a proper prefix of another word in our
2164   trie. State 1 represents the word '' and is the thus the default fail state. This allows
2165   the DFA not to have to restart after its tried and failed a word at a given point, it
2166   simply continues as though it had been matching the other word in the first place.
2167   Consider
2168      'abcdgu'=~/abcdefg|cdgu/
2169   When we get to 'd' we are still matching the first word, we would encounter 'g' which would
2170   fail, which would bring use to the state representing 'd' in the second word where we would
2171   try 'g' and succeed, prodceding to match 'cdgu'.
2172 */
2173 /* add a fail transition */
2174    const U32 trie_offset = ARG(source);
2175    reg_trie_data *trie=(reg_trie_data *)RExC_rxi->data->data[trie_offset];
2176    U32 *q;
2177    const U32 ucharcount = trie->uniquecharcount;
2178    const U32 numstates = trie->statecount;
2179    const U32 ubound = trie->lasttrans + ucharcount;
2180    U32 q_read = 0;
2181    U32 q_write = 0;
2182    U32 charid;
2183    U32 base = trie->states[ 1 ].trans.base;
2184    U32 *fail;
2185    reg_ac_data *aho;
2186    const U32 data_slot = add_data( pRExC_state, 1, "T" );
2187    GET_RE_DEBUG_FLAGS_DECL;
2188
2189    PERL_ARGS_ASSERT_MAKE_TRIE_FAILTABLE;
2190#ifndef DEBUGGING
2191    PERL_UNUSED_ARG(depth);
2192#endif
2193
2194
2195    ARG_SET( stclass, data_slot );
2196    aho = (reg_ac_data *) PerlMemShared_calloc( 1, sizeof(reg_ac_data) );
2197    RExC_rxi->data->data[ data_slot ] = (void*)aho;
2198    aho->trie=trie_offset;
2199    aho->states=(reg_trie_state *)PerlMemShared_malloc( numstates * sizeof(reg_trie_state) );
2200    Copy( trie->states, aho->states, numstates, reg_trie_state );
2201    Newxz( q, numstates, U32);
2202    aho->fail = (U32 *) PerlMemShared_calloc( numstates, sizeof(U32) );
2203    aho->refcount = 1;
2204    fail = aho->fail;
2205    /* initialize fail[0..1] to be 1 so that we always have
2206       a valid final fail state */
2207    fail[ 0 ] = fail[ 1 ] = 1;
2208
2209    for ( charid = 0; charid < ucharcount ; charid++ ) {
2210        const U32 newstate = TRIE_TRANS_STATE( 1, base, ucharcount, charid, 0 );
2211        if ( newstate ) {
2212            q[ q_write ] = newstate;
2213            /* set to point at the root */
2214            fail[ q[ q_write++ ] ]=1;
2215        }
2216    }
2217    while ( q_read < q_write) {
2218        const U32 cur = q[ q_read++ % numstates ];
2219        base = trie->states[ cur ].trans.base;
2220
2221        for ( charid = 0 ; charid < ucharcount ; charid++ ) {
2222            const U32 ch_state = TRIE_TRANS_STATE( cur, base, ucharcount, charid, 1 );
2223            if (ch_state) {
2224                U32 fail_state = cur;
2225                U32 fail_base;
2226                do {
2227                    fail_state = fail[ fail_state ];
2228                    fail_base = aho->states[ fail_state ].trans.base;
2229                } while ( !TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 ) );
2230
2231                fail_state = TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 );
2232                fail[ ch_state ] = fail_state;
2233                if ( !aho->states[ ch_state ].wordnum && aho->states[ fail_state ].wordnum )
2234                {
2235                        aho->states[ ch_state ].wordnum =  aho->states[ fail_state ].wordnum;
2236                }
2237                q[ q_write++ % numstates] = ch_state;
2238            }
2239        }
2240    }
2241    /* restore fail[0..1] to 0 so that we "fall out" of the AC loop
2242       when we fail in state 1, this allows us to use the
2243       charclass scan to find a valid start char. This is based on the principle
2244       that theres a good chance the string being searched contains lots of stuff
2245       that cant be a start char.
2246     */
2247    fail[ 0 ] = fail[ 1 ] = 0;
2248    DEBUG_TRIE_COMPILE_r({
2249        PerlIO_printf(Perl_debug_log,
2250                      "%*sStclass Failtable (%"UVuf" states): 0", 
2251                      (int)(depth * 2), "", (UV)numstates
2252        );
2253        for( q_read=1; q_read<numstates; q_read++ ) {
2254            PerlIO_printf(Perl_debug_log, ", %"UVuf, (UV)fail[q_read]);
2255        }
2256        PerlIO_printf(Perl_debug_log, "\n");
2257    });
2258    Safefree(q);
2259    /*RExC_seen |= REG_SEEN_TRIEDFA;*/
2260}
2261
2262
2263/*
2264 * There are strange code-generation bugs caused on sparc64 by gcc-2.95.2.
2265 * These need to be revisited when a newer toolchain becomes available.
2266 */
2267#if defined(__sparc64__) && defined(__GNUC__)
2268#   if __GNUC__ < 2 || (__GNUC__ == 2 && __GNUC_MINOR__ < 96)
2269#       undef  SPARC64_GCC_WORKAROUND
2270#       define SPARC64_GCC_WORKAROUND 1
2271#   endif
2272#endif
2273
2274#define DEBUG_PEEP(str,scan,depth) \
2275    DEBUG_OPTIMISE_r({if (scan){ \
2276       SV * const mysv=sv_newmortal(); \
2277       regnode *Next = regnext(scan); \
2278       regprop(RExC_rx, mysv, scan); \
2279       PerlIO_printf(Perl_debug_log, "%*s" str ">%3d: %s (%d)\n", \
2280       (int)depth*2, "", REG_NODE_NUM(scan), SvPV_nolen_const(mysv),\
2281       Next ? (REG_NODE_NUM(Next)) : 0 ); \
2282   }});
2283
2284
2285
2286
2287
2288#define JOIN_EXACT(scan,min,flags) \
2289    if (PL_regkind[OP(scan)] == EXACT) \
2290        join_exact(pRExC_state,(scan),(min),(flags),NULL,depth+1)
2291
2292STATIC U32
2293S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, I32 *min, U32 flags,regnode *val, U32 depth) {
2294    /* Merge several consecutive EXACTish nodes into one. */
2295    regnode *n = regnext(scan);
2296    U32 stringok = 1;
2297    regnode *next = scan + NODE_SZ_STR(scan);
2298    U32 merged = 0;
2299    U32 stopnow = 0;
2300#ifdef DEBUGGING
2301    regnode *stop = scan;
2302    GET_RE_DEBUG_FLAGS_DECL;
2303#else
2304    PERL_UNUSED_ARG(depth);
2305#endif
2306
2307    PERL_ARGS_ASSERT_JOIN_EXACT;
2308#ifndef EXPERIMENTAL_INPLACESCAN
2309    PERL_UNUSED_ARG(flags);
2310    PERL_UNUSED_ARG(val);
2311#endif
2312    DEBUG_PEEP("join",scan,depth);
2313    
2314    /* Skip NOTHING, merge EXACT*. */
2315    while (n &&
2316           ( PL_regkind[OP(n)] == NOTHING ||
2317             (stringok && (OP(n) == OP(scan))))
2318           && NEXT_OFF(n)
2319           && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX) {
2320        
2321        if (OP(n) == TAIL || n > next)
2322            stringok = 0;
2323        if (PL_regkind[OP(n)] == NOTHING) {
2324            DEBUG_PEEP("skip:",n,depth);
2325            NEXT_OFF(scan) += NEXT_OFF(n);
2326            next = n + NODE_STEP_REGNODE;
2327#ifdef DEBUGGING
2328            if (stringok)
2329                stop = n;
2330#endif
2331            n = regnext(n);
2332        }
2333        else if (stringok) {
2334            const unsigned int oldl = STR_LEN(scan);
2335            regnode * const nnext = regnext(n);
2336            
2337            DEBUG_PEEP("merg",n,depth);
2338            
2339            merged++;
2340            if (oldl + STR_LEN(n) > U8_MAX)
2341                break;
2342            NEXT_OFF(scan) += NEXT_OFF(n);
2343            STR_LEN(scan) += STR_LEN(n);
2344            next = n + NODE_SZ_STR(n);
2345            /* Now we can overwrite *n : */
2346            Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
2347#ifdef DEBUGGING
2348            stop = next - 1;
2349#endif
2350            n = nnext;
2351            if (stopnow) break;
2352        }
2353
2354#ifdef EXPERIMENTAL_INPLACESCAN
2355        if (flags && !NEXT_OFF(n)) {
2356            DEBUG_PEEP("atch", val, depth);
2357            if (reg_off_by_arg[OP(n)]) {
2358                ARG_SET(n, val - n);
2359            }
2360            else {
2361                NEXT_OFF(n) = val - n;
2362            }
2363            stopnow = 1;
2364        }
2365#endif
2366    }
2367    
2368    if (UTF && ( OP(scan) == EXACTF ) && ( STR_LEN(scan) >= 6 ) ) {
2369    /*
2370    Two problematic code points in Unicode casefolding of EXACT nodes:
2371    
2372    U+0390 - GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS
2373    U+03B0 - GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS
2374    
2375    which casefold to
2376    
2377    Unicode                      UTF-8
2378    
2379    U+03B9 U+0308 U+0301         0xCE 0xB9 0xCC 0x88 0xCC 0x81
2380    U+03C5 U+0308 U+0301         0xCF 0x85 0xCC 0x88 0xCC 0x81
2381    
2382    This means that in case-insensitive matching (or "loose matching",
2383    as Unicode calls it), an EXACTF of length six (the UTF-8 encoded byte
2384    length of the above casefolded versions) can match a target string
2385    of length two (the byte length of UTF-8 encoded U+0390 or U+03B0).
2386    This would rather mess up the minimum length computation.
2387    
2388    What we'll do is to look for the tail four bytes, and then peek
2389    at the preceding two bytes to see whether we need to decrease
2390    the minimum length by four (six minus two).
2391    
2392    Thanks to the design of UTF-8, there cannot be false matches:
2393    A sequence of valid UTF-8 bytes cannot be a subsequence of
2394    another valid sequence of UTF-8 bytes.
2395    
2396    */
2397         char * const s0 = STRING(scan), *s, *t;
2398         char * const s1 = s0 + STR_LEN(scan) - 1;
2399         char * const s2 = s1 - 4;
2400#ifdef EBCDIC /* RD tunifold greek 0390 and 03B0 */
2401         const char t0[] = "\xaf\x49\xaf\x42";
2402#else
2403         const char t0[] = "\xcc\x88\xcc\x81";
2404#endif
2405         const char * const t1 = t0 + 3;
2406    
2407         for (s = s0 + 2;
2408              s < s2 && (t = ninstr(s, s1, t0, t1));
2409              s = t + 4) {
2410#ifdef EBCDIC
2411              if (((U8)t[-1] == 0x68 && (U8)t[-2] == 0xB4) ||
2412                  ((U8)t[-1] == 0x46 && (U8)t[-2] == 0xB5))
2413#else
2414              if (((U8)t[-1] == 0xB9 && (U8)t[-2] == 0xCE) ||
2415                  ((U8)t[-1] == 0x85 && (U8)t[-2] == 0xCF))
2416#endif
2417                   *min -= 4;
2418         }
2419    }
2420    
2421#ifdef DEBUGGING
2422    /* Allow dumping */
2423    n = scan + NODE_SZ_STR(scan);
2424    while (n <= stop) {
2425        if (PL_regkind[OP(n)] != NOTHING || OP(n) == NOTHING) {
2426            OP(n) = OPTIMIZED;
2427            NEXT_OFF(n) = 0;
2428        }
2429        n++;
2430    }
2431#endif
2432    DEBUG_OPTIMISE_r(if (merged){DEBUG_PEEP("finl",scan,depth)});
2433    return stopnow;
2434}
2435
2436/* REx optimizer.  Converts nodes into quickier variants "in place".
2437   Finds fixed substrings.  */
2438
2439/* Stops at toplevel WHILEM as well as at "last". At end *scanp is set
2440   to the position after last scanned or to NULL. */
2441
2442#define INIT_AND_WITHP \
2443    assert(!and_withp); \
2444    Newx(and_withp,1,struct regnode_charclass_class); \
2445    SAVEFREEPV(and_withp)
2446
2447/* this is a chain of data about sub patterns we are processing that
2448   need to be handled seperately/specially in study_chunk. Its so
2449   we can simulate recursion without losing state.  */
2450struct scan_frame;
2451typedef struct scan_frame {
2452    regnode *last;  /* last node to process in this frame */
2453    regnode *next;  /* next node to process when last is reached */
2454    struct scan_frame *prev; /*previous frame*/
2455    I32 stop; /* what stopparen do we use */
2456} scan_frame;
2457
2458
2459#define SCAN_COMMIT(s, data, m) scan_commit(s, data, m, is_inf)
2460
2461#define CASE_SYNST_FNC(nAmE)                                       \
2462case nAmE:                                                         \
2463    if (flags & SCF_DO_STCLASS_AND) {                              \
2464            for (value = 0; value < 256; value++)                  \
2465                if (!is_ ## nAmE ## _cp(value))                       \
2466                    ANYOF_BITMAP_CLEAR(data->start_class, value);  \
2467    }                                                              \
2468    else {                                                         \
2469            for (value = 0; value < 256; value++)                  \
2470                if (is_ ## nAmE ## _cp(value))                        \
2471                    ANYOF_BITMAP_SET(data->start_class, value);    \
2472    }                                                              \
2473    break;                                                         \
2474case N ## nAmE:                                                    \
2475    if (flags & SCF_DO_STCLASS_AND) {                              \
2476            for (value = 0; value < 256; value++)                   \
2477                if (is_ ## nAmE ## _cp(value))                         \
2478                    ANYOF_BITMAP_CLEAR(data->start_class, value);   \
2479    }                                                               \
2480    else {                                                          \
2481            for (value = 0; value < 256; value++)                   \
2482                if (!is_ ## nAmE ## _cp(value))                        \
2483                    ANYOF_BITMAP_SET(data->start_class, value);     \
2484    }                                                               \
2485    break
2486
2487
2488
2489STATIC I32
2490S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
2491                        I32 *minlenp, I32 *deltap,
2492                        regnode *last,
2493                        scan_data_t *data,
2494                        I32 stopparen,
2495                        U8* recursed,
2496                        struct regnode_charclass_class *and_withp,
2497                        U32 flags, U32 depth)
2498                        /* scanp: Start here (read-write). */
2499                        /* deltap: Write maxlen-minlen here. */
2500                        /* last: Stop before this one. */
2501                        /* data: string data about the pattern */
2502                        /* stopparen: treat close N as END */
2503                        /* recursed: which subroutines have we recursed into */
2504                        /* and_withp: Valid if flags & SCF_DO_STCLASS_OR */
2505{
2506    dVAR;
2507    I32 min = 0, pars = 0, code;
2508    regnode *scan = *scanp, *next;
2509    I32 delta = 0;
2510    int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
2511    int is_inf_internal = 0;            /* The studied chunk is infinite */
2512    I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
2513    scan_data_t data_fake;
2514    SV *re_trie_maxbuff = NULL;
2515    regnode *first_non_open = scan;
2516    I32 stopmin = I32_MAX;
2517    scan_frame *frame = NULL;
2518    GET_RE_DEBUG_FLAGS_DECL;
2519
2520    PERL_ARGS_ASSERT_STUDY_CHUNK;
2521
2522#ifdef DEBUGGING
2523    StructCopy(&zero_scan_data, &data_fake, scan_data_t);
2524#endif
2525
2526    if ( depth == 0 ) {
2527        while (first_non_open && OP(first_non_open) == OPEN)
2528            first_non_open=regnext(first_non_open);
2529    }
2530
2531
2532  fake_study_recurse:
2533    while ( scan && OP(scan) != END && scan < last ){
2534        /* Peephole optimizer: */
2535        DEBUG_STUDYDATA("Peep:", data,depth);
2536        DEBUG_PEEP("Peep",scan,depth);
2537        JOIN_EXACT(scan,&min,0);
2538
2539        /* Follow the next-chain of the current node and optimize
2540           away all the NOTHINGs from it.  */
2541        if (OP(scan) != CURLYX) {
2542            const int max = (reg_off_by_arg[OP(scan)]
2543                       ? I32_MAX
2544                       /* I32 may be smaller than U16 on CRAYs! */
2545                       : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
2546            int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
2547            int noff;
2548            regnode *n = scan;
2549        
2550            /* Skip NOTHING and LONGJMP. */
2551            while ((n = regnext(n))
2552                   && ((PL_regkind[OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
2553                       || ((OP(n) == LONGJMP) && (noff = ARG(n))))
2554                   && off + noff < max)
2555                off += noff;
2556            if (reg_off_by_arg[OP(scan)])
2557                ARG(scan) = off;
2558            else
2559                NEXT_OFF(scan) = off;
2560        }
2561
2562
2563
2564        /* The principal pseudo-switch.  Cannot be a switch, since we
2565           look into several different things.  */
2566        if (OP(scan) == BRANCH || OP(scan) == BRANCHJ
2567                   || OP(scan) == IFTHEN) {
2568            next = regnext(scan);
2569            code = OP(scan);
2570            /* demq: the op(next)==code check is to see if we have "branch-branch" AFAICT */
2571        
2572            if (OP(next) == code || code == IFTHEN) {
2573                /* NOTE - There is similar code to this block below for handling
2574                   TRIE nodes on a re-study.  If you change stuff here check there
2575                   too. */
2576                I32 max1 = 0, min1 = I32_MAX, num = 0;
2577                struct regnode_charclass_class accum;
2578                regnode * const startbranch=scan;
2579                
2580                if (flags & SCF_DO_SUBSTR)
2581                    SCAN_COMMIT(pRExC_state, data, minlenp); /* Cannot merge strings after this. */
2582                if (flags & SCF_DO_STCLASS)
2583                    cl_init_zero(pRExC_state, &accum);
2584
2585                while (OP(scan) == code) {
2586                    I32 deltanext, minnext, f = 0, fake;
2587                    struct regnode_charclass_class this_class;
2588
2589                    num++;
2590                    data_fake.flags = 0;
2591                    if (data) {
2592                        data_fake.whilem_c = data->whilem_c;
2593                        data_fake.last_closep = data->last_closep;
2594                    }
2595                    else
2596                        data_fake.last_closep = &fake;
2597
2598                    data_fake.pos_delta = delta;
2599                    next = regnext(scan);
2600                    scan = NEXTOPER(scan);
2601                    if (code != BRANCH)
2602                        scan = NEXTOPER(scan);
2603                    if (flags & SCF_DO_STCLASS) {
2604                        cl_init(pRExC_state, &this_class);
2605                        data_fake.start_class = &this_class;
2606                        f = SCF_DO_STCLASS_AND;
2607                    }
2608                    if (flags & SCF_WHILEM_VISITED_POS)
2609                        f |= SCF_WHILEM_VISITED_POS;
2610
2611                    /* we suppose the run is continuous, last=next...*/
2612                    minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
2613                                          next, &data_fake,
2614                                          stopparen, recursed, NULL, f,depth+1);
2615                    if (min1 > minnext)
2616                        min1 = minnext;
2617                    if (max1 < minnext + deltanext)
2618                        max1 = minnext + deltanext;
2619                    if (deltanext == I32_MAX)
2620                        is_inf = is_inf_internal = 1;
2621                    scan = next;
2622                    if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
2623                        pars++;
2624                    if (data_fake.flags & SCF_SEEN_ACCEPT) {
2625                        if ( stopmin > minnext) 
2626                            stopmin = min + min1;
2627                        flags &= ~SCF_DO_SUBSTR;
2628                        if (data)
2629                            data->flags |= SCF_SEEN_ACCEPT;
2630                    }
2631                    if (data) {
2632                        if (data_fake.flags & SF_HAS_EVAL)
2633                            data->flags |= SF_HAS_EVAL;
2634                        data->whilem_c = data_fake.whilem_c;
2635                    }
2636                    if (flags & SCF_DO_STCLASS)
2637                        cl_or(pRExC_state, &accum, &this_class);
2638                }
2639                if (code == IFTHEN && num < 2) /* Empty ELSE branch */
2640                    min1 = 0;
2641                if (flags & SCF_DO_SUBSTR) {
2642                    data->pos_min += min1;
2643                    data->pos_delta += max1 - min1;
2644                    if (max1 != min1 || is_inf)
2645                        data->longest = &(data->longest_float);
2646                }
2647                min += min1;
2648                delta += max1 - min1;
2649                if (flags & SCF_DO_STCLASS_OR) {
2650                    cl_or(pRExC_state, data->start_class, &accum);
2651                    if (min1) {
2652                        cl_and(data->start_class, and_withp);
2653                        flags &= ~SCF_DO_STCLASS;
2654                    }
2655                }
2656                else if (flags & SCF_DO_STCLASS_AND) {
2657                    if (min1) {
2658                        cl_and(data->start_class, &accum);
2659                        flags &= ~SCF_DO_STCLASS;
2660                    }
2661                    else {
2662                        /* Switch to OR mode: cache the old value of
2663                         * data->start_class */
2664                        INIT_AND_WITHP;
2665                        StructCopy(data->start_class, and_withp,
2666                                   struct regnode_charclass_class);
2667                        flags &= ~SCF_DO_STCLASS_AND;
2668                        StructCopy(&accum, data->start_class,
2669                                   struct regnode_charclass_class);
2670                        flags |= SCF_DO_STCLASS_OR;
2671                        data->start_class->flags |= ANYOF_EOS;
2672                    }
2673                }
2674
2675                if (PERL_ENABLE_TRIE_OPTIMISATION && OP( startbranch ) == BRANCH ) {
2676                /* demq.
2677
2678                   Assuming this was/is a branch we are dealing with: 'scan' now
2679                   points at the item that follows the branch sequence, whatever
2680                   it is. We now start at the beginning of the sequence and look
2681                   for subsequences of
2682
2683                   BRANCH->EXACT=>x1
2684                   BRANCH->EXACT=>x2
2685                   tail
2686
2687                   which would be constructed from a pattern like /A|LIST|OF|WORDS/
2688
2689                   If we can find such a subseqence we need to turn the first
2690                   element into a trie and then add the subsequent branch exact
2691                   strings to the trie.
2692
2693                   We have two cases
2694
2695                     1. patterns where the whole set of branch can be converted. 
2696
2697                     2. patterns where only a subset can be converted.
2698
2699                   In case 1 we can replace the whole set with a single regop
2700                   for the trie. In case 2 we need to keep the start and end
2701                   branchs so
2702
2703                     'BRANCH EXACT; BRANCH EXACT; BRANCH X'
2704                     becomes BRANCH TRIE; BRANCH X;
2705
2706                  There is an additional case, that being where there is a 
2707                  common prefix, which gets split out into an EXACT like node
2708                  preceding the TRIE node.
2709
2710                  If x(1..n)==tail then we can do a simple trie, if not we make
2711                  a "jump" trie, such that when we match the appropriate word
2712                  we "jump" to the appopriate tail node. Essentailly we turn
2713                  a nested if into a case structure of sorts.
2714
2715                */
2716                
2717                    int made=0;
2718                    if (!re_trie_maxbuff) {
2719                        re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
2720                        if (!SvIOK(re_trie_maxbuff))
2721                            sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
2722                    }
2723                    if ( SvIV(re_trie_maxbuff)>=0  ) {
2724                        regnode *cur;
2725                        regnode *first = (regnode *)NULL;
2726                        regnode *last = (regnode *)NULL;
2727                        regnode *tail = scan;
2728                        U8 optype = 0;
2729                        U32 count=0;
2730
2731#ifdef DEBUGGING
2732                        SV * const mysv = sv_newmortal();       /* for dumping */
2733#endif
2734                        /* var tail is used because there may be a TAIL
2735                           regop in the way. Ie, the exacts will point to the
2736                           thing following the TAIL, but the last branch will
2737                           point at the TAIL. So we advance tail. If we
2738                           have nested (?:) we may have to move through several
2739                           tails.
2740                         */
2741
2742                        while ( OP( tail ) == TAIL ) {
2743                            /* this is the TAIL generated by (?:) */
2744                            tail = regnext( tail );
2745                        }
2746
2747                        
2748                        DEBUG_OPTIMISE_r({
2749                            regprop(RExC_rx, mysv, tail );
2750                            PerlIO_printf( Perl_debug_log, "%*s%s%s\n",
2751                                (int)depth * 2 + 2, "", 
2752                                "Looking for TRIE'able sequences. Tail node is: ", 
2753                                SvPV_nolen_const( mysv )
2754                            );
2755                        });
2756                        
2757                        /*
2758
2759                           step through the branches, cur represents each
2760                           branch, noper is the first thing to be matched
2761                           as part of that branch and noper_next is the
2762                           regnext() of that node. if noper is an EXACT
2763                           and noper_next is the same as scan (our current
2764                           position in the regex) then the EXACT branch is
2765                           a possible optimization target. Once we have
2766                           two or more consequetive such branches we can
2767                           create a trie of the EXACT's contents and stich
2768                           it in place. If the sequence represents all of
2769                           the branches we eliminate the whole thing and
2770                           replace it with a single TRIE. If it is a
2771                           subsequence then we need to stitch it in. This
2772                           means the first branch has to remain, and needs
2773                           to be repointed at the item on the branch chain
2774                           following the last branch optimized. This could
2775                           be either a BRANCH, in which case the
2776                           subsequence is internal, or it could be the
2777                           item following the branch sequence in which
2778                           case the subsequence is at the end.
2779
2780                        */
2781
2782                        /* dont use tail as the end marker for this traverse */
2783                        for ( cur = startbranch ; cur != scan ; cur = regnext( cur ) ) {
2784                            regnode * const noper = NEXTOPER( cur );
2785#if defined(DEBUGGING) || defined(NOJUMPTRIE)
2786                            regnode * const noper_next = regnext( noper );
2787#endif
2788
2789                            DEBUG_OPTIMISE_r({
2790                                regprop(RExC_rx, mysv, cur);
2791                                PerlIO_printf( Perl_debug_log, "%*s- %s (%d)",
2792                                   (int)depth * 2 + 2,"", SvPV_nolen_const( mysv ), REG_NODE_NUM(cur) );
2793
2794                                regprop(RExC_rx, mysv, noper);
2795                                PerlIO_printf( Perl_debug_log, " -> %s",
2796                                    SvPV_nolen_const(mysv));
2797
2798                                if ( noper_next ) {
2799                                  regprop(RExC_rx, mysv, noper_next );
2800                                  PerlIO_printf( Perl_debug_log,"\t=> %s\t",
2801                                    SvPV_nolen_const(mysv));
2802                                }
2803                                PerlIO_printf( Perl_debug_log, "(First==%d,Last==%d,Cur==%d)\n",
2804                                   REG_NODE_NUM(first), REG_NODE_NUM(last), REG_NODE_NUM(cur) );
2805                            });
2806                            if ( (((first && optype!=NOTHING) ? OP( noper ) == optype
2807                                         : PL_regkind[ OP( noper ) ] == EXACT )
2808                                  || OP(noper) == NOTHING )
2809#ifdef NOJUMPTRIE
2810                                  && noper_next == tail
2811#endif
2812                                  && count < U16_MAX)
2813                            {
2814                                count++;
2815                                if ( !first || optype == NOTHING ) {
2816                                    if (!first) first = cur;
2817                                    optype = OP( noper );
2818                                } else {
2819                                    last = cur;
2820                                }
2821                            } else {
2822/* 
2823    Currently we assume that the trie can handle unicode and ascii
2824    matches fold cased matches. If this proves true then the following
2825    define will prevent tries in this situation. 
2826    
2827    #define TRIE_TYPE_IS_SAFE (UTF || optype==EXACT)
2828*/
2829#define TRIE_TYPE_IS_SAFE 1
2830                                if ( last && TRIE_TYPE_IS_SAFE ) {
2831                                    make_trie( pRExC_state, 
2832                                            startbranch, first, cur, tail, count, 
2833                                            optype, depth+1 );
2834                                }
2835                                if ( PL_regkind[ OP( noper ) ] == EXACT
2836#ifdef NOJUMPTRIE
2837                                     && noper_next == tail
2838#endif
2839                                ){
2840                                    count = 1;
2841                                    first = cur;
2842                                    optype = OP( noper );
2843                                } else {
2844                                    count = 0;
2845                                    first = NULL;
2846                                    optype = 0;
2847                                }
2848                                last = NULL;
2849                            }
2850                        }
2851                        DEBUG_OPTIMISE_r({
2852                            regprop(RExC_rx, mysv, cur);
2853                            PerlIO_printf( Perl_debug_log,
2854                              "%*s- %s (%d) <SCAN FINISHED>\n", (int)depth * 2 + 2,
2855                              "", SvPV_nolen_const( mysv ),REG_NODE_NUM(cur));
2856
2857                        });
2858                        
2859                        if ( last && TRIE_TYPE_IS_SAFE ) {
2860                            made= make_trie( pRExC_state, startbranch, first, scan, tail, count, optype, depth+1 );
2861#ifdef TRIE_STUDY_OPT   
2862                            if ( ((made == MADE_EXACT_TRIE && 
2863                                 startbranch == first) 
2864                                 || ( first_non_open == first )) && 
2865                                 depth==0 ) {
2866                                flags |= SCF_TRIE_RESTUDY;
2867                                if ( startbranch == first 
2868                                     && scan == tail ) 
2869                                {
2870                                    RExC_seen &=~REG_TOP_LEVEL_BRANCHES;
2871                                }
2872                            }
2873#endif
2874                        }
2875                    }
2876                    
2877                } /* do trie */
2878                
2879            }
2880            else if ( code == BRANCHJ ) {  /* single branch is optimized. */
2881                scan = NEXTOPER(NEXTOPER(scan));
2882            } else                      /* single branch is optimized. */
2883                scan = NEXTOPER(scan);
2884            continue;
2885        } else if (OP(scan) == SUSPEND || OP(scan) == GOSUB || OP(scan) == GOSTART) {
2886            scan_frame *newframe = NULL;
2887            I32 paren;
2888            regnode *start;
2889            regnode *end;
2890
2891            if (OP(scan) != SUSPEND) {
2892            /* set the pointer */
2893                if (OP(scan) == GOSUB) {
2894                    paren = ARG(scan);
2895                    RExC_recurse[ARG2L(scan)] = scan;
2896                    start = RExC_open_parens[paren-1];
2897                    end   = RExC_close_parens[paren-1];
2898                } else {
2899                    paren = 0;
2900                    start = RExC_rxi->program + 1;
2901                    end   = RExC_opend;
2902                }
2903                if (!recursed) {
2904                    Newxz(recursed, (((RExC_npar)>>3) +1), U8);
2905                    SAVEFREEPV(recursed);
2906                }
2907                if (!PAREN_TEST(recursed,paren+1)) {
2908                    PAREN_SET(recursed,paren+1);
2909                    Newx(newframe,1,scan_frame);
2910                } else {
2911                    if (flags & SCF_DO_SUBSTR) {
2912                        SCAN_COMMIT(pRExC_state,data,minlenp);
2913                        data->longest = &(data->longest_float);
2914                    }
2915                    is_inf = is_inf_internal = 1;
2916                    if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
2917                        cl_anything(pRExC_state, data->start_class);
2918                    flags &= ~SCF_DO_STCLASS;
2919                }
2920            } else {
2921                Newx(newframe,1,scan_frame);
2922                paren = stopparen;
2923                start = scan+2;
2924                end = regnext(scan);
2925            }
2926            if (newframe) {
2927                assert(start);
2928                assert(end);
2929                SAVEFREEPV(newframe);
2930                newframe->next = regnext(scan);
2931                newframe->last = last;
2932                newframe->stop = stopparen;
2933                newframe->prev = frame;
2934
2935                frame = newframe;
2936                scan =  start;
2937                stopparen = paren;
2938                last = end;
2939
2940                continue;
2941            }
2942        }
2943        else if (OP(scan) == EXACT) {
2944            I32 l = STR_LEN(scan);
2945            UV uc;
2946            if (UTF) {
2947                const U8 * const s = (U8*)STRING(scan);
2948                l = utf8_length(s, s + l);
2949                uc = utf8_to_uvchr(s, NULL);
2950            } else {
2951                uc = *((U8*)STRING(scan));
2952            }
2953            min += l;
2954            if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
2955                /* The code below prefers earlier match for fixed
2956                   offset, later match for variable offset.  */
2957                if (data->last_end == -1) { /* Update the start info. */
2958                    data->last_start_min = data->pos_min;
2959                    data->last_start_max = is_inf
2960                        ? I32_MAX : data->pos_min + data->pos_delta;
2961                }
2962                sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
2963                if (UTF)
2964                    SvUTF8_on(data->last_found);
2965                {
2966                    SV * const sv = data->last_found;
2967                    MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
2968                        mg_find(sv, PERL_MAGIC_utf8) : NULL;
2969                    if (mg && mg->mg_len >= 0)
2970                        mg->mg_len += utf8_length((U8*)STRING(scan),
2971                                                  (U8*)STRING(scan)+STR_LEN(scan));
2972                }
2973                data->last_end = data->pos_min + l;
2974                data->pos_min += l; /* As in the first entry. */
2975                data->flags &= ~SF_BEFORE_EOL;
2976            }
2977            if (flags & SCF_DO_STCLASS_AND) {
2978                /* Check whether it is compatible with what we know already! */
2979                int compat = 1;
2980
2981                if (uc >= 0x100 ||
2982                    (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
2983                    && !ANYOF_BITMAP_TEST(data->start_class, uc)
2984                    && (!(data->start_class->flags & ANYOF_FOLD)
2985                        || !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
2986                    )
2987                    compat = 0;
2988                ANYOF_CLASS_ZERO(data->start_class);
2989                ANYOF_BITMAP_ZERO(data->start_class);
2990                if (compat)
2991                    ANYOF_BITMAP_SET(data->start_class, uc);
2992                data->start_class->flags &= ~ANYOF_EOS;
2993                if (uc < 0x100)
2994                  data->start_class->flags &= ~ANYOF_UNICODE_ALL;
2995            }
2996            else if (flags & SCF_DO_STCLASS_OR) {
2997                /* false positive possible if the class is case-folded */
2998                if (uc < 0x100)
2999                    ANYOF_BITMAP_SET(data->start_class, uc);
3000                else
3001                    data->start_class->flags |= ANYOF_UNICODE_ALL;
3002                data->start_class->flags &= ~ANYOF_EOS;
3003                cl_and(data->start_class, and_withp);
3004            }
3005            flags &= ~SCF_DO_STCLASS;
3006        }
3007        else if (PL_regkind[OP(scan)] == EXACT) { /* But OP != EXACT! */
3008            I32 l = STR_LEN(scan);
3009            UV uc = *((U8*)STRING(scan));
3010
3011            /* Search for fixed substrings supports EXACT only. */
3012            if (flags & SCF_DO_SUBSTR) {
3013                assert(data);
3014                SCAN_COMMIT(pRExC_state, data, minlenp);
3015            }
3016            if (UTF) {
3017                const U8 * const s = (U8 *)STRING(scan);
3018                l = utf8_length(s, s + l);
3019                uc = utf8_to_uvchr(s, NULL);
3020            }
3021            min += l;
3022            if (flags & SCF_DO_SUBSTR)
3023                data->pos_min += l;
3024            if (flags & SCF_DO_STCLASS_AND) {
3025                /* Check whether it is compatible with what we know already! */
3026                int compat = 1;
3027
3028                if (uc >= 0x100 ||
3029                    (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
3030                    && !ANYOF_BITMAP_TEST(data->start_class, uc)
3031                     && !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
3032                    compat = 0;
3033                ANYOF_CLASS_ZERO(data->start_class);
3034                ANYOF_BITMAP_ZERO(data->start_class);
3035                if (compat) {
3036                    ANYOF_BITMAP_SET(data->start_class, uc);
3037                    data->start_class->flags &= ~ANYOF_EOS;
3038                    data->start_class->flags |= ANYOF_FOLD;
3039                    if (OP(scan) == EXACTFL)
3040                        data->start_class->flags |= ANYOF_LOCALE;
3041                }
3042            }
3043            else if (flags & SCF_DO_STCLASS_OR) {
3044                if (data->start_class->flags & ANYOF_FOLD) {
3045                    /* false positive possible if the class is case-folded.
3046                       Assume that the locale settings are the same... */
3047                    if (uc < 0x100)
3048                        ANYOF_BITMAP_SET(data->start_class, uc);
3049                    data->start_class->flags &= ~ANYOF_EOS;
3050                }
3051                cl_and(data->start_class, and_withp);
3052            }
3053            flags &= ~SCF_DO_STCLASS;
3054        }
3055        else if (strchr((const char*)PL_varies,OP(scan))) {
3056            I32 mincount, maxcount, minnext, deltanext, fl = 0;
3057            I32 f = flags, pos_before = 0;
3058            regnode * const oscan = scan;
3059            struct regnode_charclass_class this_class;
3060            struct regnode_charclass_class *oclass = NULL;
3061            I32 next_is_eval = 0;
3062
3063            switch (PL_regkind[OP(scan)]) {
3064            case WHILEM:                /* End of (?:...)* . */
3065                scan = NEXTOPER(scan);
3066                goto finish;
3067            case PLUS:
3068                if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
3069                    next = NEXTOPER(scan);
3070                    if (OP(next) == EXACT || (flags & SCF_DO_STCLASS)) {
3071                        mincount = 1;
3072                        maxcount = REG_INFTY;
3073                        next = regnext(scan);
3074                        scan = NEXTOPER(scan);
3075                        goto do_curly;
3076                    }
3077                }
3078                if (flags & SCF_DO_SUBSTR)
3079                    data->pos_min++;
3080                min++;
3081                /* Fall through. */
3082            case STAR:
3083                if (flags & SCF_DO_STCLASS) {
3084                    mincount = 0;
3085                    maxcount = REG_INFTY;
3086                    next = regnext(scan);
3087                    scan = NEXTOPER(scan);
3088                    goto do_curly;
3089                }
3090                is_inf = is_inf_internal = 1;
3091                scan = regnext(scan);
3092                if (flags & SCF_DO_SUBSTR) {
3093                    SCAN_COMMIT(pRExC_state, data, minlenp); /* Cannot extend fixed substrings */
3094                    data->longest = &(data->longest_float);
3095                }
3096                goto optimize_curly_tail;
3097            case CURLY:
3098                if (stopparen>0 && (OP(scan)==CURLYN || OP(scan)==CURLYM)
3099                    && (scan->flags == stopparen))
3100                {
3101                    mincount = 1;
3102                    maxcount = 1;
3103                } else {
3104                    mincount = ARG1(scan);
3105                    maxcount = ARG2(scan);
3106                }
3107                next = regnext(scan);
3108                if (OP(scan) == CURLYX) {
3109                    I32 lp = (data ? *(data->last_closep) : 0);
3110                    scan->flags = ((lp <= (I32)U8_MAX) ? (U8)lp : U8_MAX);
3111                }
3112                scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
3113                next_is_eval = (OP(scan) == EVAL);
3114              do_curly:
3115                if (flags & SCF_DO_SUBSTR) {
3116                    if (mincount == 0) SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot extend fixed substrings */
3117                    pos_before = data->pos_min;
3118                }
3119                if (data) {
3120                    fl = data->flags;
3121                    data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL);
3122                    if (is_inf)
3123                        data->flags |= SF_IS_INF;
3124                }
3125                if (flags & SCF_DO_STCLASS) {
3126                    cl_init(pRExC_state, &this_class);
3127                    oclass = data->start_class;
3128                    data->start_class = &this_class;
3129                    f |= SCF_DO_STCLASS_AND;
3130                    f &= ~SCF_DO_STCLASS_OR;
3131                }
3132                /* These are the cases when once a subexpression
3133                   fails at a particular position, it cannot succeed
3134                   even after backtracking at the enclosing scope.
3135                
3136                   XXXX what if minimal match and we are at the
3137                        initial run of {n,m}? */
3138                if ((mincount != maxcount - 1) && (maxcount != REG_INFTY))
3139                    f &= ~SCF_WHILEM_VISITED_POS;
3140
3141                /* This will finish on WHILEM, setting scan, or on NULL: */
3142                minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext, 
3143                                      last, data, stopparen, recursed, NULL,
3144                                      (mincount == 0
3145                                        ? (f & ~SCF_DO_SUBSTR) : f),depth+1);
3146
3147                if (flags & SCF_DO_STCLASS)
3148                    data->start_class = oclass;
3149                if (mincount == 0 || minnext == 0) {
3150                    if (flags & SCF_DO_STCLASS_OR) {
3151                        cl_or(pRExC_state, data->start_class, &this_class);
3152                    }
3153                    else if (flags & SCF_DO_STCLASS_AND) {
3154                        /* Switch to OR mode: cache the old value of
3155                         * data->start_class */
3156                        INIT_AND_WITHP;
3157                        StructCopy(data->start_class, and_withp,
3158                                   struct regnode_charclass_class);
3159                        flags &= ~SCF_DO_STCLASS_AND;
3160                        StructCopy(&this_class, data->start_class,
3161                                   struct regnode_charclass_class);
3162                        flags |= SCF_DO_STCLASS_OR;
3163                        data->start_class->flags |= ANYOF_EOS;
3164                    }
3165                } else {                /* Non-zero len */
3166                    if (flags & SCF_DO_STCLASS_OR) {
3167                        cl_or(pRExC_state, data->start_class, &this_class);
3168                        cl_and(data->start_class, and_withp);
3169                    }
3170                    else if (flags & SCF_DO_STCLASS_AND)
3171                        cl_and(data->start_class, &this_class);
3172                    flags &= ~SCF_DO_STCLASS;
3173                }
3174                if (!scan)              /* It was not CURLYX, but CURLY. */
3175                    scan = next;
3176                if ( /* ? quantifier ok, except for (?{ ... }) */
3177                    (next_is_eval || !(mincount == 0 && maxcount == 1))
3178                    && (minnext == 0) && (deltanext == 0)
3179                    && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
3180                    && maxcount <= REG_INFTY/3 /* Complement check for big count */
3181                    && ckWARN(WARN_REGEXP))
3182                {
3183                    vWARN(RExC_parse,
3184                          "Quantifier unexpected on zero-length expression");
3185                }
3186
3187                min += minnext * mincount;
3188                is_inf_internal |= ((maxcount == REG_INFTY
3189                                     && (minnext + deltanext) > 0)
3190                                    || deltanext == I32_MAX);
3191                is_inf |= is_inf_internal;
3192                delta += (minnext + deltanext) * maxcount - minnext * mincount;
3193
3194                /* Try powerful optimization CURLYX => CURLYN. */
3195                if (  OP(oscan) == CURLYX && data
3196                      && data->flags & SF_IN_PAR
3197                      && !(data->flags & SF_HAS_EVAL)
3198                      && !deltanext && minnext == 1 ) {
3199                    /* Try to optimize to CURLYN.  */
3200                    regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
3201                    regnode * const nxt1 = nxt;
3202#ifdef DEBUGGING
3203                    regnode *nxt2;
3204#endif
3205
3206                    /* Skip open. */
3207                    nxt = regnext(nxt);
3208                    if (!strchr((const char*)PL_simple,OP(nxt))
3209                        && !(PL_regkind[OP(nxt)] == EXACT
3210                             && STR_LEN(nxt) == 1))
3211                        goto nogo;
3212#ifdef DEBUGGING
3213                    nxt2 = nxt;
3214#endif
3215                    nxt = regnext(nxt);
3216                    if (OP(nxt) != CLOSE)
3217                        goto nogo;
3218                    if (RExC_open_parens) {
3219                        RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
3220                        RExC_close_parens[ARG(nxt1)-1]=nxt+2; /*close->while*/
3221                    }
3222                    /* Now we know that nxt2 is the only contents: */
3223                    oscan->flags = (U8)ARG(nxt);
3224                    OP(oscan) = CURLYN;
3225                    OP(nxt1) = NOTHING; /* was OPEN. */
3226
3227#ifdef DEBUGGING
3228                    OP(nxt1 + 1) = OPTIMIZED; /* was count. */
3229                    NEXT_OFF(nxt1+ 1) = 0; /* just for consistancy. */
3230                    NEXT_OFF(nxt2) = 0; /* just for consistancy with CURLY. */
3231                    OP(nxt) = OPTIMIZED;        /* was CLOSE. */
3232                    OP(nxt + 1) = OPTIMIZED; /* was count. */
3233                    NEXT_OFF(nxt+ 1) = 0; /* just for consistancy. */
3234#endif
3235                }
3236              nogo:
3237
3238                /* Try optimization CURLYX => CURLYM. */
3239                if (  OP(oscan) == CURLYX && data
3240                      && !(data->flags & SF_HAS_PAR)
3241                      && !(data->flags & SF_HAS_EVAL)
3242                      && !deltanext     /* atom is fixed width */
3243                      && minnext != 0   /* CURLYM can't handle zero width */
3244                ) {
3245                    /* XXXX How to optimize if data == 0? */
3246                    /* Optimize to a simpler form.  */
3247                    regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */
3248                    regnode *nxt2;
3249
3250                    OP(oscan) = CURLYM;
3251                    while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
3252                            && (OP(nxt2) != WHILEM))
3253                        nxt = nxt2;
3254                    OP(nxt2)  = SUCCEED; /* Whas WHILEM */
3255                    /* Need to optimize away parenths. */
3256                    if (data->flags & SF_IN_PAR) {
3257                        /* Set the parenth number.  */
3258                        regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/
3259
3260                        if (OP(nxt) != CLOSE)
3261                            FAIL("Panic opt close");
3262                        oscan->flags = (U8)ARG(nxt);
3263                        if (RExC_open_parens) {
3264                            RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
3265                            RExC_close_parens[ARG(nxt1)-1]=nxt2+1; /*close->NOTHING*/
3266                        }
3267                        OP(nxt1) = OPTIMIZED;   /* was OPEN. */
3268                        OP(nxt) = OPTIMIZED;    /* was CLOSE. */
3269
3270#ifdef DEBUGGING
3271                        OP(nxt1 + 1) = OPTIMIZED; /* was count. */
3272                        OP(nxt + 1) = OPTIMIZED; /* was count. */
3273                        NEXT_OFF(nxt1 + 1) = 0; /* just for consistancy. */
3274                        NEXT_OFF(nxt + 1) = 0; /* just for consistancy. */
3275#endif
3276#if 0
3277                        while ( nxt1 && (OP(nxt1) != WHILEM)) {
3278                            regnode *nnxt = regnext(nxt1);
3279                        
3280                            if (nnxt == nxt) {
3281                                if (reg_off_by_arg[OP(nxt1)])
3282                                    ARG_SET(nxt1, nxt2 - nxt1);
3283                                else if (nxt2 - nxt1 < U16_MAX)
3284                                    NEXT_OFF(nxt1) = nxt2 - nxt1;
3285                                else
3286                                    OP(nxt) = NOTHING;  /* Cannot beautify */
3287                            }
3288                            nxt1 = nnxt;
3289                        }
3290#endif
3291                        /* Optimize again: */
3292                        study_chunk(pRExC_state, &nxt1, minlenp, &deltanext, nxt,
3293                                    NULL, stopparen, recursed, NULL, 0,depth+1);
3294                    }
3295                    else
3296                        oscan->flags = 0;
3297                }
3298                else if ((OP(oscan) == CURLYX)
3299                         && (flags & SCF_WHILEM_VISITED_POS)
3300                         /* See the comment on a similar expression above.
3301                            However, this time it not a subexpression
3302                            we care about, but the expression itself. */
3303                         && (maxcount == REG_INFTY)
3304                         && data && ++data->whilem_c < 16) {
3305                    /* This stays as CURLYX, we can put the count/of pair. */
3306                    /* Find WHILEM (as in regexec.c) */
3307                    regnode *nxt = oscan + NEXT_OFF(oscan);
3308
3309                    if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */
3310                        nxt += ARG(nxt);
3311                    PREVOPER(nxt)->flags = (U8)(data->whilem_c
3312                        | (RExC_whilem_seen << 4)); /* On WHILEM */
3313                }
3314                if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
3315                    pars++;
3316                if (flags & SCF_DO_SUBSTR) {
3317                    SV *last_str = NULL;
3318                    int counted = mincount != 0;
3319
3320                    if (data->last_end > 0 && mincount != 0) { /* Ends with a string. */
3321#if defined(SPARC64_GCC_WORKAROUND)
3322                        I32 b = 0;
3323                        STRLEN l = 0;
3324                        const char *s = NULL;
3325                        I32 old = 0;
3326
3327                        if (pos_before >= data->last_start_min)
3328                            b = pos_before;
3329                        else
3330                            b = data->last_start_min;
3331
3332                        l = 0;
3333                        s = SvPV_const(data->last_found, l);
3334                        old = b - data->last_start_min;
3335
3336#else
3337                        I32 b = pos_before >= data->last_start_min
3338                            ? pos_before : data->last_start_min;
3339                        STRLEN l;
3340                        const char * const s = SvPV_const(data->last_found, l);
3341                        I32 old = b - data->last_start_min;
3342#endif
3343
3344                        if (UTF)
3345                            old = utf8_hop((U8*)s, old) - (U8*)s;
3346                        
3347                        l -= old;
3348                        /* Get the added string: */
3349                        last_str = newSVpvn_utf8(s  + old, l, UTF);
3350                        if (deltanext == 0 && pos_before == b) {
3351                            /* What was added is a constant string */
3352                            if (mincount > 1) {
3353                                SvGROW(last_str, (mincount * l) + 1);
3354                                repeatcpy(SvPVX(last_str) + l,
3355                                          SvPVX_const(last_str), l, mincount - 1);
3356                                SvCUR_set(last_str, SvCUR(last_str) * mincount);
3357                                /* Add additional parts. */
3358                                SvCUR_set(data->last_found,
3359                                          SvCUR(data->last_found) - l);
3360                                sv_catsv(data->last_found, last_str);
3361                                {
3362                                    SV * sv = data->last_found;
3363                                    MAGIC *mg =
3364                                        SvUTF8(sv) && SvMAGICAL(sv) ?
3365                                        mg_find(sv, PERL_MAGIC_utf8) : NULL;
3366                                    if (mg && mg->mg_len >= 0)
3367                                        mg->mg_len += CHR_SVLEN(last_str) - l;
3368                                }
3369                                data->last_end += l * (mincount - 1);
3370                            }
3371                        } else {
3372                            /* start offset must point into the last copy */
3373                            data->last_start_min += minnext * (mincount - 1);
3374                            data->last_start_max += is_inf ? I32_MAX
3375                                : (maxcount - 1) * (minnext + data->pos_delta);
3376                        }
3377                    }
3378                    /* It is counted once already... */
3379                    data->pos_min += minnext * (mincount - counted);
3380                    data->pos_delta += - counted * deltanext +
3381                        (minnext + deltanext) * maxcount - minnext * mincount;
3382                    if (mincount != maxcount) {
3383                         /* Cannot extend fixed substrings found inside
3384                            the group.  */
3385                        SCAN_COMMIT(pRExC_state,data,minlenp);
3386                        if (mincount && last_str) {
3387                            SV * const sv = data->last_found;
3388                            MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
3389                                mg_find(sv, PERL_MAGIC_utf8) : NULL;
3390
3391                            if (mg)
3392                                mg->mg_len = -1;
3393                            sv_setsv(sv, last_str);
3394                            data->last_end = data->pos_min;
3395                            data->last_start_min =
3396                                data->pos_min - CHR_SVLEN(last_str);
3397                            data->last_start_max = is_inf
3398                                ? I32_MAX
3399                                : data->pos_min + data->pos_delta
3400                                - CHR_SVLEN(last_str);
3401                        }
3402                        data->longest = &(data->longest_float);
3403                    }
3404                    SvREFCNT_dec(last_str);
3405                }
3406                if (data && (fl & SF_HAS_EVAL))
3407                    data->flags |= SF_HAS_EVAL;
3408              optimize_curly_tail:
3409                if (OP(oscan) != CURLYX) {
3410                    while (PL_regkind[OP(next = regnext(oscan))] == NOTHING
3411                           && NEXT_OFF(next))
3412                        NEXT_OFF(oscan) += NEXT_OFF(next);
3413                }
3414                continue;
3415            default:                    /* REF and CLUMP only? */
3416                if (flags & SCF_DO_SUBSTR) {
3417                    SCAN_COMMIT(pRExC_state,data,minlenp);      /* Cannot expect anything... */
3418                    data->longest = &(data->longest_float);
3419                }
3420                is_inf = is_inf_internal = 1;
3421                if (flags & SCF_DO_STCLASS_OR)
3422                    cl_anything(pRExC_state, data->start_class);
3423                flags &= ~SCF_DO_STCLASS;
3424                break;
3425            }
3426        }
3427        else if (OP(scan) == LNBREAK) {
3428            if (flags & SCF_DO_STCLASS) {
3429                int value = 0;
3430                data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
3431                if (flags & SCF_DO_STCLASS_AND) {
3432                    for (value = 0; value < 256; value++)
3433                        if (!is_VERTWS_cp(value))
3434                            ANYOF_BITMAP_CLEAR(data->start_class, value);  
3435                }                                                              
3436                else {                                                         
3437                    for (value = 0; value < 256; value++)
3438                        if (is_VERTWS_cp(value))
3439                            ANYOF_BITMAP_SET(data->start_class, value);    
3440                }                                                              
3441                if (flags & SCF_DO_STCLASS_OR)
3442                    cl_and(data->start_class, and_withp);
3443                flags &= ~SCF_DO_STCLASS;
3444            }
3445            min += 1;
3446            delta += 1;
3447            if (flags & SCF_DO_SUBSTR) {
3448                SCAN_COMMIT(pRExC_state,data,minlenp);  /* Cannot expect anything... */
3449                data->pos_min += 1;
3450                data->pos_delta += 1;
3451                data->longest = &(data->longest_float);
3452            }
3453            
3454        }
3455        else if (OP(scan) == FOLDCHAR) {
3456            int d = ARG(scan)==0xDF ? 1 : 2;
3457            flags &= ~SCF_DO_STCLASS;
3458            min += 1;
3459            delta += d;
3460            if (flags & SCF_DO_SUBSTR) {
3461                SCAN_COMMIT(pRExC_state,data,minlenp);  /* Cannot expect anything... */
3462                data->pos_min += 1;
3463                data->pos_delta += d;
3464                data->longest = &(data->longest_float);
3465            }
3466        }
3467        else if (strchr((const char*)PL_simple,OP(scan))) {
3468            int value = 0;
3469
3470            if (flags & SCF_DO_SUBSTR) {
3471                SCAN_COMMIT(pRExC_state,data,minlenp);
3472                data->pos_min++;
3473            }
3474            min++;
3475            if (flags & SCF_DO_STCLASS) {
3476                data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
3477
3478                /* Some of the logic below assumes that switching
3479                   locale on will only add false positives. */
3480                switch (PL_regkind[OP(scan)]) {
3481                case SANY:
3482                default:
3483                  do_default:
3484                    /* Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d", OP(scan)); */
3485                    if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
3486                        cl_anything(pRExC_state, data->start_class);
3487                    break;
3488                case REG_ANY:
3489                    if (OP(scan) == SANY)
3490                        goto do_default;
3491                    if (flags & SCF_DO_STCLASS_OR) { /* Everything but \n */
3492                        value = (ANYOF_BITMAP_TEST(data->start_class,'\n')
3493                                 || (data->start_class->flags & ANYOF_CLASS));
3494                        cl_anything(pRExC_state, data->start_class);
3495                    }
3496                    if (flags & SCF_DO_STCLASS_AND || !value)
3497                        ANYOF_BITMAP_CLEAR(data->start_class,'\n');
3498                    break;
3499                case ANYOF:
3500                    if (flags & SCF_DO_STCLASS_AND)
3501                        cl_and(data->start_class,
3502                               (struct regnode_charclass_class*)scan);
3503                    else
3504                        cl_or(pRExC_state, data->start_class,
3505                              (struct regnode_charclass_class*)scan);
3506                    break;
3507                case ALNUM:
3508                    if (flags & SCF_DO_STCLASS_AND) {
3509                        if (!(data->start_class->flags & ANYOF_LOCALE)) {
3510                            ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
3511                            for (value = 0; value < 256; value++)
3512                                if (!isALNUM(value))
3513                                    ANYOF_BITMAP_CLEAR(data->start_class, value);
3514                        }
3515                    }
3516                    else {
3517                        if (data->start_class->flags & ANYOF_LOCALE)
3518                            ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
3519                        else {
3520                            for (value = 0; value < 256; value++)
3521                                if (isALNUM(value))
3522                                    ANYOF_BITMAP_SET(data->start_class, value);                 
3523                        }
3524                    }
3525                    break;
3526                case ALNUML:
3527                    if (flags & SCF_DO_STCLASS_AND) {
3528                        if (data->start_class->flags & ANYOF_LOCALE)
3529                            ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
3530                    }
3531                    else {
3532                        ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
3533                        data->start_class->flags |= ANYOF_LOCALE;
3534                    }
3535                    break;
3536                case NALNUM:
3537                    if (flags & SCF_DO_STCLASS_AND) {
3538                        if (!(data->start_class->flags & ANYOF_LOCALE)) {
3539                            ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
3540                            for (value = 0; value < 256; value++)
3541                                if (isALNUM(value))
3542                                    ANYOF_BITMAP_CLEAR(data->start_class, value);
3543                        }
3544                    }
3545                    else {
3546                        if (data->start_class->flags & ANYOF_LOCALE)
3547                            ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
3548                        else {
3549                            for (value = 0; value < 256; value++)
3550                                if (!isALNUM(value))
3551                                    ANYOF_BITMAP_SET(data->start_class, value);                 
3552                        }
3553                    }
3554                    break;
3555                case NALNUML:
3556                    if (flags & SCF_DO_STCLASS_AND) {
3557                        if (data->start_class->flags & ANYOF_LOCALE)
3558                            ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
3559                    }
3560                    else {
3561                        data->start_class->flags |= ANYOF_LOCALE;
3562                        ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
3563                    }
3564                    break;
3565                case SPACE:
3566                    if (flags & SCF_DO_STCLASS_AND) {
3567                        if (!(data->start_class->flags & ANYOF_LOCALE)) {
3568                            ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
3569                            for (value = 0; value < 256; value++)
3570                                if (!isSPACE(value))
3571                                    ANYOF_BITMAP_CLEAR(data->start_class, value);
3572                        }
3573                    }
3574                    else {
3575                        if (data->start_class->flags & ANYOF_LOCALE)
3576                            ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
3577                        else {
3578                            for (value = 0; value < 256; value++)
3579                                if (isSPACE(value))
3580                                    ANYOF_BITMAP_SET(data->start_class, value);                 
3581                        }
3582                    }
3583                    break;
3584                case SPACEL:
3585                    if (flags & SCF_DO_STCLASS_AND) {
3586                        if (data->start_class->flags & ANYOF_LOCALE)
3587                            ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
3588                    }
3589                    else {
3590                        data->start_class->flags |= ANYOF_LOCALE;
3591                        ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
3592                    }
3593                    break;
3594                case NSPACE:
3595                    if (flags & SCF_DO_STCLASS_AND) {
3596                        if (!(data->start_class->flags & ANYOF_LOCALE)) {
3597                            ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
3598                            for (value = 0; value < 256; value++)
3599                                if (isSPACE(value))
3600                                    ANYOF_BITMAP_CLEAR(data->start_class, value);
3601                        }
3602                    }
3603                    else {
3604                        if (data->start_class->flags & ANYOF_LOCALE)
3605                            ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
3606                        else {
3607                            for (value = 0; value < 256; value++)
3608                                if (!isSPACE(value))
3609                                    ANYOF_BITMAP_SET(data->start_class, value);                 
3610                        }
3611                    }
3612                    break;
3613                case NSPACEL:
3614                    if (flags & SCF_DO_STCLASS_AND) {
3615                        if (data->start_class->flags & ANYOF_LOCALE) {
3616                            ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
3617                            for (value = 0; value < 256; value++)
3618                                if (!isSPACE(value))
3619                                    ANYOF_BITMAP_CLEAR(data->start_class, value);
3620                        }
3621                    }
3622                    else {
3623                        data->start_class->flags |= ANYOF_LOCALE;
3624                        ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
3625                    }
3626                    break;
3627                case DIGIT:
3628                    if (flags & SCF_DO_STCLASS_AND) {
3629                        ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NDIGIT);
3630                        for (value = 0; value < 256; value++)
3631                            if (!isDIGIT(value))
3632                                ANYOF_BITMAP_CLEAR(data->start_class, value);
3633                    }
3634                    else {
3635                        if (data->start_class->flags & ANYOF_LOCALE)
3636                            ANYOF_CLASS_SET(data->start_class,ANYOF_DIGIT);
3637                        else {
3638                            for (value = 0; value < 256; value++)
3639                                if (isDIGIT(value))
3640                                    ANYOF_BITMAP_SET(data->start_class, value);                 
3641                        }
3642                    }
3643                    break;
3644                case NDIGIT:
3645                    if (flags & SCF_DO_STCLASS_AND) {
3646                        ANYOF_CLASS_CLEAR(data->start_class,ANYOF_DIGIT);
3647                        for (value = 0; value < 256; value++)
3648                            if (isDIGIT(value))
3649                                ANYOF_BITMAP_CLEAR(data->start_class, value);
3650                    }
3651                    else {
3652                        if (data->start_class->flags & ANYOF_LOCALE)
3653                            ANYOF_CLASS_SET(data->start_class,ANYOF_NDIGIT);
3654                        else {
3655                            for (value = 0; value < 256; value++)
3656                                if (!isDIGIT(value))
3657                                    ANYOF_BITMAP_SET(data->start_class, value);                 
3658                        }
3659                    }
3660                    break;
3661                CASE_SYNST_FNC(VERTWS);
3662                CASE_SYNST_FNC(HORIZWS);
3663                
3664                }
3665                if (flags & SCF_DO_STCLASS_OR)
3666                    cl_and(data->start_class, and_withp);
3667                flags &= ~SCF_DO_STCLASS;
3668            }
3669        }
3670        else if (PL_regkind[OP(scan)] == EOL && flags & SCF_DO_SUBSTR) {
3671            data->flags |= (OP(scan) == MEOL
3672                            ? SF_BEFORE_MEOL
3673                            : SF_BEFORE_SEOL);
3674        }
3675        else if (  PL_regkind[OP(scan)] == BRANCHJ
3676                 /* Lookbehind, or need to calculate parens/evals/stclass: */
3677                   && (scan->flags || data || (flags & SCF_DO_STCLASS))
3678                   && (OP(scan) == IFMATCH || OP(scan) == UNLESSM)) {
3679            if ( !PERL_ENABLE_POSITIVE_ASSERTION_STUDY 
3680                || OP(scan) == UNLESSM )
3681            {
3682                /* Negative Lookahead/lookbehind
3683                   In this case we can't do fixed string optimisation.
3684                */
3685
3686                I32 deltanext, minnext, fake = 0;
3687                regnode *nscan;
3688                struct regnode_charclass_class intrnl;
3689                int f = 0;
3690
3691                data_fake.flags = 0;
3692                if (data) {
3693                    data_fake.whilem_c = data->whilem_c;
3694                    data_fake.last_closep = data->last_closep;
3695                }
3696                else
3697                    data_fake.last_closep = &fake;
3698                data_fake.pos_delta = delta;
3699                if ( flags & SCF_DO_STCLASS && !scan->flags
3700                     && OP(scan) == IFMATCH ) { /* Lookahead */
3701                    cl_init(pRExC_state, &intrnl);
3702                    data_fake.start_class = &intrnl;
3703                    f |= SCF_DO_STCLASS_AND;
3704                }
3705                if (flags & SCF_WHILEM_VISITED_POS)
3706                    f |= SCF_WHILEM_VISITED_POS;
3707                next = regnext(scan);
3708                nscan = NEXTOPER(NEXTOPER(scan));
3709                minnext = study_chunk(pRExC_state, &nscan, minlenp, &deltanext, 
3710                    last, &data_fake, stopparen, recursed, NULL, f, depth+1);
3711                if (scan->flags) {
3712                    if (deltanext) {
3713                        FAIL("Variable length lookbehind not implemented");
3714                    }
3715                    else if (minnext > (I32)U8_MAX) {
3716                        FAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
3717                    }
3718                    scan->flags = (U8)minnext;
3719                }
3720                if (data) {
3721                    if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3722                        pars++;
3723                    if (data_fake.flags & SF_HAS_EVAL)
3724                        data->flags |= SF_HAS_EVAL;
3725                    data->whilem_c = data_fake.whilem_c;
3726                }
3727                if (f & SCF_DO_STCLASS_AND) {
3728                    if (flags & SCF_DO_STCLASS_OR) {
3729                        /* OR before, AND after: ideally we would recurse with
3730                         * data_fake to get the AND applied by study of the
3731                         * remainder of the pattern, and then derecurse;
3732                         * *** HACK *** for now just treat as "no information".
3733                         * See [perl #56690].
3734                         */
3735                        cl_init(pRExC_state, data->start_class);
3736                    }  else {
3737                        /* AND before and after: combine and continue */
3738                        const int was = (data->start_class->flags & ANYOF_EOS);
3739
3740                        cl_and(data->start_class, &intrnl);
3741                        if (was)
3742                            data->start_class->flags |= ANYOF_EOS;
3743                    }
3744                }
3745            }
3746#if PERL_ENABLE_POSITIVE_ASSERTION_STUDY
3747            else {
3748                /* Positive Lookahead/lookbehind
3749                   In this case we can do fixed string optimisation,
3750                   but we must be careful about it. Note in the case of
3751                   lookbehind the positions will be offset by the minimum
3752                   length of the pattern, something we won't know about
3753                   until after the recurse.
3754                */
3755                I32 deltanext, fake = 0;
3756                regnode *nscan;
3757                struct regnode_charclass_class intrnl;
3758                int f = 0;
3759                /* We use SAVEFREEPV so that when the full compile 
3760                    is finished perl will clean up the allocated 
3761                    minlens when its all done. This was we don't
3762                    have to worry about freeing them when we know
3763                    they wont be used, which would be a pain.
3764                 */
3765                I32 *minnextp;
3766                Newx( minnextp, 1, I32 );
3767                SAVEFREEPV(minnextp);
3768
3769                if (data) {
3770                    StructCopy(data, &data_fake, scan_data_t);
3771                    if ((flags & SCF_DO_SUBSTR) && data->last_found) {
3772                        f |= SCF_DO_SUBSTR;
3773                        if (scan->flags) 
3774                            SCAN_COMMIT(pRExC_state, &data_fake,minlenp);
3775                        data_fake.last_found=newSVsv(data->last_found);
3776                    }
3777                }
3778                else
3779                    data_fake.last_closep = &fake;
3780                data_fake.flags = 0;
3781                data_fake.pos_delta = delta;
3782                if (is_inf)
3783                    data_fake.flags |= SF_IS_INF;
3784                if ( flags & SCF_DO_STCLASS && !scan->flags
3785                     && OP(scan) == IFMATCH ) { /* Lookahead */
3786                    cl_init(pRExC_state, &intrnl);
3787                    data_fake.start_class = &intrnl;
3788                    f |= SCF_DO_STCLASS_AND;
3789                }
3790                if (flags & SCF_WHILEM_VISITED_POS)
3791                    f |= SCF_WHILEM_VISITED_POS;
3792                next = regnext(scan);
3793                nscan = NEXTOPER(NEXTOPER(scan));
3794
3795                *minnextp = study_chunk(pRExC_state, &nscan, minnextp, &deltanext, 
3796                    last, &data_fake, stopparen, recursed, NULL, f,depth+1);
3797                if (scan->flags) {
3798                    if (deltanext) {
3799                        FAIL("Variable length lookbehind not implemented");
3800                    }
3801                    else if (*minnextp > (I32)U8_MAX) {
3802                        FAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
3803                    }
3804                    scan->flags = (U8)*minnextp;
3805                }
3806
3807                *minnextp += min;
3808
3809                if (f & SCF_DO_STCLASS_AND) {
3810                    const int was = (data->start_class->flags & ANYOF_EOS);
3811
3812                    cl_and(data->start_class, &intrnl);
3813                    if (was)
3814                        data->start_class->flags |= ANYOF_EOS;
3815                }
3816                if (data) {
3817                    if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3818                        pars++;
3819                    if (data_fake.flags & SF_HAS_EVAL)
3820                        data->flags |= SF_HAS_EVAL;
3821                    data->whilem_c = data_fake.whilem_c;
3822                    if ((flags & SCF_DO_SUBSTR) && data_fake.last_found) {
3823                        if (RExC_rx->minlen<*minnextp)
3824                            RExC_rx->minlen=*minnextp;
3825                        SCAN_COMMIT(pRExC_state, &data_fake, minnextp);
3826                        SvREFCNT_dec(data_fake.last_found);
3827                        
3828                        if ( data_fake.minlen_fixed != minlenp ) 
3829                        {
3830                            data->offset_fixed= data_fake.offset_fixed;
3831                            data->minlen_fixed= data_fake.minlen_fixed;
3832                            data->lookbehind_fixed+= scan->flags;
3833                        }
3834                        if ( data_fake.minlen_float != minlenp )
3835                        {
3836                            data->minlen_float= data_fake.minlen_float;
3837                            data->offset_float_min=data_fake.offset_float_min;
3838                            data->offset_float_max=data_fake.offset_float_max;
3839                            data->lookbehind_float+= scan->flags;
3840                        }
3841                    }
3842                }
3843
3844
3845            }
3846#endif
3847        }
3848        else if (OP(scan) == OPEN) {
3849            if (stopparen != (I32)ARG(scan))
3850                pars++;
3851        }
3852        else if (OP(scan) == CLOSE) {
3853            if (stopparen == (I32)ARG(scan)) {
3854                break;
3855            }
3856            if ((I32)ARG(scan) == is_par) {
3857                next = regnext(scan);
3858
3859                if ( next && (OP(next) != WHILEM) && next < last)
3860                    is_par = 0;         /* Disable optimization */
3861            }
3862            if (data)
3863                *(data->last_closep) = ARG(scan);
3864        }
3865        else if (OP(scan) == EVAL) {
3866                if (data)
3867                    data->flags |= SF_HAS_EVAL;
3868        }
3869        else if ( PL_regkind[OP(scan)] == ENDLIKE ) {
3870            if (flags & SCF_DO_SUBSTR) {
3871                SCAN_COMMIT(pRExC_state,data,minlenp);
3872                flags &= ~SCF_DO_SUBSTR;
3873            }
3874            if (data && OP(scan)==ACCEPT) {
3875                data->flags |= SCF_SEEN_ACCEPT;
3876                if (stopmin > min)
3877                    stopmin = min;
3878            }
3879        }
3880        else if (OP(scan) == LOGICAL && scan->flags == 2) /* Embedded follows */
3881        {
3882                if (flags & SCF_DO_SUBSTR) {
3883                    SCAN_COMMIT(pRExC_state,data,minlenp);
3884                    data->longest = &(data->longest_float);
3885                }
3886                is_inf = is_inf_internal = 1;
3887                if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
3888                    cl_anything(pRExC_state, data->start_class);
3889                flags &= ~SCF_DO_STCLASS;
3890        }
3891        else if (OP(scan) == GPOS) {
3892            if (!(RExC_rx->extflags & RXf_GPOS_FLOAT) &&
3893                !(delta || is_inf || (data && data->pos_delta))) 
3894            {
3895                if (!(RExC_rx->extflags & RXf_ANCH) && (flags & SCF_DO_SUBSTR))
3896                    RExC_rx->extflags |= RXf_ANCH_GPOS;
3897                if (RExC_rx->gofs < (U32)min)
3898                    RExC_rx->gofs = min;
3899            } else {
3900                RExC_rx->extflags |= RXf_GPOS_FLOAT;
3901                RExC_rx->gofs = 0;
3902            }       
3903        }
3904#ifdef TRIE_STUDY_OPT
3905#ifdef FULL_TRIE_STUDY
3906        else if (PL_regkind[OP(scan)] == TRIE) {
3907            /* NOTE - There is similar code to this block above for handling
3908               BRANCH nodes on the initial study.  If you change stuff here
3909               check there too. */
3910            regnode *trie_node= scan;
3911            regnode *tail= regnext(scan);
3912            reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
3913            I32 max1 = 0, min1 = I32_MAX;
3914            struct regnode_charclass_class accum;
3915
3916            if (flags & SCF_DO_SUBSTR) /* XXXX Add !SUSPEND? */
3917                SCAN_COMMIT(pRExC_state, data,minlenp); /* Cannot merge strings after this. */
3918            if (flags & SCF_DO_STCLASS)
3919                cl_init_zero(pRExC_state, &accum);
3920                
3921            if (!trie->jump) {
3922                min1= trie->minlen;
3923                max1= trie->maxlen;
3924            } else {
3925                const regnode *nextbranch= NULL;
3926                U32 word;
3927                
3928                for ( word=1 ; word <= trie->wordcount ; word++) 
3929                {
3930                    I32 deltanext=0, minnext=0, f = 0, fake;
3931                    struct regnode_charclass_class this_class;
3932                    
3933                    data_fake.flags = 0;
3934                    if (data) {
3935                        data_fake.whilem_c = data->whilem_c;
3936                        data_fake.last_closep = data->last_closep;
3937                    }
3938                    else
3939                        data_fake.last_closep = &fake;
3940                    data_fake.pos_delta = delta;
3941                    if (flags & SCF_DO_STCLASS) {
3942                        cl_init(pRExC_state, &this_class);
3943                        data_fake.start_class = &this_class;
3944                        f = SCF_DO_STCLASS_AND;
3945                    }
3946                    if (flags & SCF_WHILEM_VISITED_POS)
3947                        f |= SCF_WHILEM_VISITED_POS;
3948    
3949                    if (trie->jump[word]) {
3950                        if (!nextbranch)
3951                            nextbranch = trie_node + trie->jump[0];
3952                        scan= trie_node + trie->jump[word];
3953                        /* We go from the jump point to the branch that follows
3954                           it. Note this means we need the vestigal unused branches
3955                           even though they arent otherwise used.
3956                         */
3957                        minnext = study_chunk(pRExC_state, &scan, minlenp, 
3958                            &deltanext, (regnode *)nextbranch, &data_fake, 
3959                            stopparen, recursed, NULL, f,depth+1);
3960                    }
3961                    if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
3962                        nextbranch= regnext((regnode*)nextbranch);
3963                    
3964                    if (min1 > (I32)(minnext + trie->minlen))
3965                        min1 = minnext + trie->minlen;
3966                    if (max1 < (I32)(minnext + deltanext + trie->maxlen))
3967                        max1 = minnext + deltanext + trie->maxlen;
3968                    if (deltanext == I32_MAX)
3969                        is_inf = is_inf_internal = 1;
3970                    
3971                    if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3972                        pars++;
3973                    if (data_fake.flags & SCF_SEEN_ACCEPT) {
3974                        if ( stopmin > min + min1) 
3975                            stopmin = min + min1;
3976                        flags &= ~SCF_DO_SUBSTR;
3977                        if (data)
3978                            data->flags |= SCF_SEEN_ACCEPT;
3979                    }
3980                    if (data) {
3981                        if (data_fake.flags & SF_HAS_EVAL)
3982                            data->flags |= SF_HAS_EVAL;
3983                        data->whilem_c = data_fake.whilem_c;
3984                    }
3985                    if (flags & SCF_DO_STCLASS)
3986                        cl_or(pRExC_state, &accum, &this_class);
3987                }
3988            }
3989            if (flags & SCF_DO_SUBSTR) {
3990                data->pos_min += min1;
3991                data->pos_delta += max1 - min1;
3992                if (max1 != min1 || is_inf)
3993                    data->longest = &(data->longest_float);
3994            }
3995            min += min1;
3996            delta += max1 - min1;
3997            if (flags & SCF_DO_STCLASS_OR) {
3998                cl_or(pRExC_state, data->start_class, &accum);
3999                if (min1) {
4000                    cl_and(data->start_class, and_withp);
4001                    flags &= ~SCF_DO_STCLASS;
4002                }
4003            }
4004            else if (flags & SCF_DO_STCLASS_AND) {
4005                if (min1) {
4006                    cl_and(data->start_class, &accum);
4007                    flags &= ~SCF_DO_STCLASS;
4008                }
4009                else {
4010                    /* Switch to OR mode: cache the old value of
4011                     * data->start_class */
4012                    INIT_AND_WITHP;
4013                    StructCopy(data->start_class, and_withp,
4014                               struct regnode_charclass_class);
4015                    flags &= ~SCF_DO_STCLASS_AND;
4016                    StructCopy(&accum, data->start_class,
4017                               struct regnode_charclass_class);
4018                    flags |= SCF_DO_STCLASS_OR;
4019                    data->start_class->flags |= ANYOF_EOS;
4020                }
4021            }
4022            scan= tail;
4023            continue;
4024        }
4025#else
4026        else if (PL_regkind[OP(scan)] == TRIE) {
4027            reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
4028            U8*bang=NULL;
4029            
4030            min += trie->minlen;
4031            delta += (trie->maxlen - trie->minlen);
4032            flags &= ~SCF_DO_STCLASS; /* xxx */
4033            if (flags & SCF_DO_SUBSTR) {
4034                SCAN_COMMIT(pRExC_state,data,minlenp);  /* Cannot expect anything... */
4035                data->pos_min += trie->minlen;
4036                data->pos_delta += (trie->maxlen - trie->minlen);
4037                if (trie->maxlen != trie->minlen)
4038                    data->longest = &(data->longest_float);
4039            }
4040            if (trie->jump) /* no more substrings -- for now /grr*/
4041                flags &= ~SCF_DO_SUBSTR; 
4042        }
4043#endif /* old or new */
4044#endif /* TRIE_STUDY_OPT */     
4045
4046        /* Else: zero-length, ignore. */
4047        scan = regnext(scan);
4048    }
4049    if (frame) {
4050        last = frame->last;
4051        scan = frame->next;
4052        stopparen = frame->stop;
4053        frame = frame->prev;
4054        goto fake_study_recurse;
4055    }
4056
4057  finish:
4058    assert(!frame);
4059    DEBUG_STUDYDATA("pre-fin:",data,depth);
4060
4061    *scanp = scan;
4062    *deltap = is_inf_internal ? I32_MAX : delta;
4063    if (flags & SCF_DO_SUBSTR && is_inf)
4064        data->pos_delta = I32_MAX - data->pos_min;
4065    if (is_par > (I32)U8_MAX)
4066        is_par = 0;
4067    if (is_par && pars==1 && data) {
4068        data->flags |= SF_IN_PAR;
4069        data->flags &= ~SF_HAS_PAR;
4070    }
4071    else if (pars && data) {
4072        data->flags |= SF_HAS_PAR;
4073        data->flags &= ~SF_IN_PAR;
4074    }
4075    if (flags & SCF_DO_STCLASS_OR)
4076        cl_and(data->start_class, and_withp);
4077    if (flags & SCF_TRIE_RESTUDY)
4078        data->flags |=  SCF_TRIE_RESTUDY;
4079    
4080    DEBUG_STUDYDATA("post-fin:",data,depth);
4081    
4082    return min < stopmin ? min : stopmin;
4083}
4084
4085STATIC U32
4086S_add_data(RExC_state_t *pRExC_state, U32 n, const char *s)
4087{
4088    U32 count = RExC_rxi->data ? RExC_rxi->data->count : 0;
4089
4090    PERL_ARGS_ASSERT_ADD_DATA;
4091
4092    Renewc(RExC_rxi->data,
4093           sizeof(*RExC_rxi->data) + sizeof(void*) * (count + n - 1),
4094           char, struct reg_data);
4095    if(count)
4096        Renew(RExC_rxi->data->what, count + n, U8);
4097    else
4098        Newx(RExC_rxi->data->what, n, U8);
4099    RExC_rxi->data->count = count + n;
4100    Copy(s, RExC_rxi->data->what + count, n, U8);
4101    return count;
4102}
4103
4104/*XXX: todo make this not included in a non debugging perl */
4105#ifndef PERL_IN_XSUB_RE
4106void
4107Perl_reginitcolors(pTHX)
4108{
4109    dVAR;
4110    const char * const s = PerlEnv_getenv("PERL_RE_COLORS");
4111    if (s) {
4112        char *t = savepv(s);
4113        int i = 0;
4114        PL_colors[0] = t;
4115        while (++i < 6) {
4116            t = strchr(t, '\t');
4117            if (t) {
4118                *t = '\0';
4119                PL_colors[i] = ++t;
4120            }
4121            else
4122                PL_colors[i] = t = (char *)"";
4123        }
4124    } else {
4125        int i = 0;
4126        while (i < 6)
4127            PL_colors[i++] = (char *)"";
4128    }
4129    PL_colorset = 1;
4130}
4131#endif
4132
4133
4134#ifdef TRIE_STUDY_OPT
4135#define CHECK_RESTUDY_GOTO                                  \
4136        if (                                                \
4137              (data.flags & SCF_TRIE_RESTUDY)               \
4138              && ! restudied++                              \
4139        )     goto reStudy
4140#else
4141#define CHECK_RESTUDY_GOTO
4142#endif        
4143
4144/*
4145 - pregcomp - compile a regular expression into internal code
4146 *
4147 * We can't allocate space until we know how big the compiled form will be,
4148 * but we can't compile it (and thus know how big it is) until we've got a
4149 * place to put the code.  So we cheat:  we compile it twice, once with code
4150 * generation turned off and size counting turned on, and once "for real".
4151 * This also means that we don't allocate space until we are sure that the
4152 * thing really will compile successfully, and we never have to move the
4153 * code and thus invalidate pointers into it.  (Note that it has to be in
4154 * one piece because free() must be able to free it all.) [NB: not true in perl]
4155 *
4156 * Beware that the optimization-preparation code in here knows about some
4157 * of the structure of the compiled regexp.  [I'll say.]
4158 */
4159
4160
4161
4162#ifndef PERL_IN_XSUB_RE
4163#define RE_ENGINE_PTR &PL_core_reg_engine
4164#else
4165extern const struct regexp_engine my_reg_engine;
4166#define RE_ENGINE_PTR &my_reg_engine
4167#endif
4168
4169#ifndef PERL_IN_XSUB_RE 
4170REGEXP *
4171Perl_pregcomp(pTHX_ const SV * const pattern, const U32 flags)
4172{
4173    dVAR;
4174    HV * const table = GvHV(PL_hintgv);
4175
4176    PERL_ARGS_ASSERT_PREGCOMP;
4177
4178    /* Dispatch a request to compile a regexp to correct 
4179       regexp engine. */
4180    if (table) {
4181        SV **ptr= hv_fetchs(table, "regcomp", FALSE);
4182        GET_RE_DEBUG_FLAGS_DECL;
4183        if (ptr && SvIOK(*ptr) && SvIV(*ptr)) {
4184            const regexp_engine *eng=INT2PTR(regexp_engine*,SvIV(*ptr));
4185            DEBUG_COMPILE_r({
4186                PerlIO_printf(Perl_debug_log, "Using engine %"UVxf"\n",
4187                    SvIV(*ptr));
4188            });            
4189            return CALLREGCOMP_ENG(eng, pattern, flags);
4190        } 
4191    }
4192    return Perl_re_compile(aTHX_ pattern, flags);
4193}
4194#endif
4195
4196REGEXP *
4197Perl_re_compile(pTHX_ const SV * const pattern, const U32 pm_flags)
4198{
4199    dVAR;
4200    register REGEXP *r;
4201    register regexp_internal *ri;
4202    STRLEN plen;
4203    char*  exp = SvPV((SV*)pattern, plen);
4204    char* xend = exp + plen;
4205    regnode *scan;
4206    I32 flags;
4207    I32 minlen = 0;
4208    I32 sawplus = 0;
4209    I32 sawopen = 0;
4210    scan_data_t data;
4211    RExC_state_t RExC_state;
4212    RExC_state_t * const pRExC_state = &RExC_state;
4213#ifdef TRIE_STUDY_OPT    
4214    int restudied= 0;
4215    RExC_state_t copyRExC_state;
4216#endif    
4217    GET_RE_DEBUG_FLAGS_DECL;
4218
4219    PERL_ARGS_ASSERT_RE_COMPILE;
4220
4221    DEBUG_r(if (!PL_colorset) reginitcolors());
4222
4223    RExC_utf8 = RExC_orig_utf8 = pm_flags & RXf_UTF8;
4224
4225    DEBUG_COMPILE_r({
4226        SV *dsv= sv_newmortal();
4227        RE_PV_QUOTED_DECL(s, RExC_utf8,
4228            dsv, exp, plen, 60);
4229        PerlIO_printf(Perl_debug_log, "%sCompiling REx%s %s\n",
4230                       PL_colors[4],PL_colors[5],s);
4231    });
4232
4233redo_first_pass:
4234    RExC_precomp = exp;
4235    RExC_flags = pm_flags;
4236    RExC_sawback = 0;
4237
4238    RExC_seen = 0;
4239    RExC_seen_zerolen = *exp == '^' ? -1 : 0;
4240    RExC_seen_evals = 0;
4241    RExC_extralen = 0;
4242
4243    /* First pass: determine size, legality. */
4244    RExC_parse = exp;
4245    RExC_start = exp;
4246    RExC_end = xend;
4247    RExC_naughty = 0;
4248    RExC_npar = 1;
4249    RExC_nestroot = 0;
4250    RExC_size = 0L;
4251    RExC_emit = &PL_regdummy;
4252    RExC_whilem_seen = 0;
4253    RExC_charnames = NULL;
4254    RExC_open_parens = NULL;
4255    RExC_close_parens = NULL;
4256    RExC_opend = NULL;
4257    RExC_paren_names = NULL;
4258#ifdef DEBUGGING
4259    RExC_paren_name_list = NULL;
4260#endif
4261    RExC_recurse = NULL;
4262    RExC_recurse_count = 0;
4263
4264#if 0 /* REGC() is (currently) a NOP at the first pass.
4265       * Clever compilers notice this and complain. --jhi */
4266    REGC((U8)REG_MAGIC, (char*)RExC_emit);
4267#endif
4268    DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "Starting first pass (sizing)\n"));
4269    if (reg(pRExC_state, 0, &flags,1) == NULL) {
4270        RExC_precomp = NULL;
4271        return(NULL);
4272    }
4273    if (RExC_utf8 && !RExC_orig_utf8) {
4274        /* It's possible to write a regexp in ascii that represents Unicode
4275        codepoints outside of the byte range, such as via \x{100}. If we
4276        detect such a sequence we have to convert the entire pattern to utf8
4277        and then recompile, as our sizing calculation will have been based
4278        on 1 byte == 1 character, but we will need to use utf8 to encode
4279        at least some part of the pattern, and therefore must convert the whole
4280        thing.
4281        XXX: somehow figure out how to make this less expensive...
4282        -- dmq */
4283        STRLEN len = plen;
4284        DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log,
4285            "UTF8 mismatch! Converting to utf8 for resizing and compile\n"));
4286        exp = (char*)Perl_bytes_to_utf8(aTHX_ (U8*)exp, &len);
4287        xend = exp + len;
4288        RExC_orig_utf8 = RExC_utf8;
4289        SAVEFREEPV(exp);
4290        goto redo_first_pass;
4291    }
4292    DEBUG_PARSE_r({
4293        PerlIO_printf(Perl_debug_log, 
4294            "Required size %"IVdf" nodes\n"
4295            "Starting second pass (creation)\n", 
4296            (IV)RExC_size);
4297        RExC_lastnum=0; 
4298        RExC_lastparse=NULL; 
4299    });
4300    /* Small enough for pointer-storage convention?
4301       If extralen==0, this means that we will not need long jumps. */
4302    if (RExC_size >= 0x10000L && RExC_extralen)
4303        RExC_size += RExC_extralen;
4304    else
4305        RExC_extralen = 0;
4306    if (RExC_whilem_seen > 15)
4307        RExC_whilem_seen = 15;
4308
4309    /* Allocate space and zero-initialize. Note, the two step process 
4310       of zeroing when in debug mode, thus anything assigned has to 
4311       happen after that */
4312    Newxz(r, 1, regexp);
4313    Newxc(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode),
4314         char, regexp_internal);
4315    if ( r == NULL || ri == NULL )
4316        FAIL("Regexp out of space");
4317#ifdef DEBUGGING
4318    /* avoid reading uninitialized memory in DEBUGGING code in study_chunk() */
4319    Zero(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode), char);
4320#else 
4321    /* bulk initialize base fields with 0. */
4322    Zero(ri, sizeof(regexp_internal), char);        
4323#endif
4324
4325    /* non-zero initialization begins here */
4326    RXi_SET( r, ri );
4327    r->engine= RE_ENGINE_PTR;
4328    r->refcnt = 1;
4329    RX_PRELEN(r) = plen;
4330    r->extflags = pm_flags;
4331    {
4332        bool has_p     = ((r->extflags & RXf_PMf_KEEPCOPY) == RXf_PMf_KEEPCOPY);
4333        bool has_minus = ((r->extflags & RXf_PMf_STD_PMMOD) != RXf_PMf_STD_PMMOD);
4334        bool has_runon = ((RExC_seen & REG_SEEN_RUN_ON_COMMENT)==REG_SEEN_RUN_ON_COMMENT);
4335        U16 reganch = (U16)((r->extflags & RXf_PMf_STD_PMMOD)
4336                            >> RXf_PMf_STD_PMMOD_SHIFT);
4337        const char *fptr = STD_PAT_MODS;        /*"msix"*/
4338        char *p;
4339        RX_WRAPLEN(r) = plen + has_minus + has_p + has_runon
4340            + (sizeof(STD_PAT_MODS) - 1)
4341            + (sizeof("(?:)") - 1);
4342
4343        Newx(RX_WRAPPED(r), RX_WRAPLEN(r) + 1, char );
4344        p = RX_WRAPPED(r);
4345        *p++='('; *p++='?';
4346        if (has_p)
4347            *p++ = KEEPCOPY_PAT_MOD; /*'p'*/
4348        {
4349            char *r = p + (sizeof(STD_PAT_MODS) - 1) + has_minus - 1;
4350            char *colon = r + 1;
4351            char ch;
4352
4353            while((ch = *fptr++)) {
4354                if(reganch & 1)
4355                    *p++ = ch;
4356                else
4357                    *r-- = ch;
4358                reganch >>= 1;
4359            }
4360            if(has_minus) {
4361                *r = '-';
4362                p = colon;
4363            }
4364        }
4365
4366        *p++ = ':';
4367        Copy(RExC_precomp, p, plen, char);
4368        RX_PRECOMP(r) = p;
4369        p += plen;
4370        if (has_runon)
4371            *p++ = '\n';
4372        *p++ = ')';
4373        *p = 0;
4374    }
4375
4376    r->intflags = 0;
4377    r->nparens = RExC_npar - 1; /* set early to validate backrefs */
4378    
4379    if (RExC_seen & REG_SEEN_RECURSE) {
4380        Newxz(RExC_open_parens, RExC_npar,regnode *);
4381        SAVEFREEPV(RExC_open_parens);
4382        Newxz(RExC_close_parens,RExC_npar,regnode *);
4383        SAVEFREEPV(RExC_close_parens);
4384    }
4385
4386    /* Useful during FAIL. */
4387#ifdef RE_TRACK_PATTERN_OFFSETS
4388    Newxz(ri->u.offsets, 2*RExC_size+1, U32); /* MJD 20001228 */
4389    DEBUG_OFFSETS_r(PerlIO_printf(Perl_debug_log,
4390                          "%s %"UVuf" bytes for offset annotations.\n",
4391                          ri->u.offsets ? "Got" : "Couldn't get",
4392                          (UV)((2*RExC_size+1) * sizeof(U32))));
4393#endif
4394    SetProgLen(ri,RExC_size);
4395    RExC_rx = r;
4396    RExC_rxi = ri;
4397
4398    /* Second pass: emit code. */
4399    RExC_flags = pm_flags;      /* don't let top level (?i) bleed */
4400    RExC_parse = exp;
4401    RExC_end = xend;
4402    RExC_naughty = 0;
4403    RExC_npar = 1;
4404    RExC_emit_start = ri->program;
4405    RExC_emit = ri->program;
4406    RExC_emit_bound = ri->program + RExC_size + 1;
4407
4408    /* Store the count of eval-groups for security checks: */
4409    RExC_rx->seen_evals = RExC_seen_evals;
4410    REGC((U8)REG_MAGIC, (char*) RExC_emit++);
4411    if (reg(pRExC_state, 0, &flags,1) == NULL) {
4412        ReREFCNT_dec(r);   
4413        return(NULL);
4414    }
4415    /* XXXX To minimize changes to RE engine we always allocate
4416       3-units-long substrs field. */
4417    Newx(r->substrs, 1, struct reg_substr_data);
4418    if (RExC_recurse_count) {
4419        Newxz(RExC_recurse,RExC_recurse_count,regnode *);
4420        SAVEFREEPV(RExC_recurse);
4421    }
4422
4423reStudy:
4424    r->minlen = minlen = sawplus = sawopen = 0;
4425    Zero(r->substrs, 1, struct reg_substr_data);
4426
4427#ifdef TRIE_STUDY_OPT
4428    if (!restudied) {
4429        StructCopy(&zero_scan_data, &data, scan_data_t);
4430        copyRExC_state = RExC_state;
4431    } else {
4432        U32 seen=RExC_seen;
4433        DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log,"Restudying\n"));
4434        
4435        RExC_state = copyRExC_state;
4436        if (seen & REG_TOP_LEVEL_BRANCHES) 
4437            RExC_seen |= REG_TOP_LEVEL_BRANCHES;
4438        else
4439            RExC_seen &= ~REG_TOP_LEVEL_BRANCHES;
4440        if (data.last_found) {
4441            SvREFCNT_dec(data.longest_fixed);
4442            SvREFCNT_dec(data.longest_float);
4443            SvREFCNT_dec(data.last_found);
4444        }
4445        StructCopy(&zero_scan_data, &data, scan_data_t);
4446    }
4447#else
4448    StructCopy(&zero_scan_data, &data, scan_data_t);
4449#endif    
4450
4451    /* Dig out information for optimizations. */
4452    r->extflags = RExC_flags; /* was pm_op */
4453    /*dmq: removed as part of de-PMOP: pm->op_pmflags = RExC_flags; */
4454 
4455    if (UTF)
4456        r->extflags |= RXf_UTF8;        /* Unicode in it? */
4457    ri->regstclass = NULL;
4458    if (RExC_naughty >= 10)     /* Probably an expensive pattern. */
4459        r->intflags |= PREGf_NAUGHTY;
4460    scan = ri->program + 1;             /* First BRANCH. */
4461
4462    /* testing for BRANCH here tells us whether there is "must appear"
4463       data in the pattern. If there is then we can use it for optimisations */
4464    if (!(RExC_seen & REG_TOP_LEVEL_BRANCHES)) { /*  Only one top-level choice. */
4465        I32 fake;
4466        STRLEN longest_float_length, longest_fixed_length;
4467        struct regnode_charclass_class ch_class; /* pointed to by data */
4468        int stclass_flag;
4469        I32 last_close = 0; /* pointed to by data */
4470        regnode *first= scan;
4471        regnode *first_next= regnext(first);
4472        
4473        /*
4474         * Skip introductions and multiplicators >= 1
4475         * so that we can extract the 'meat' of the pattern that must 
4476         * match in the large if() sequence following.
4477         * NOTE that EXACT is NOT covered here, as it is normally
4478         * picked up by the optimiser separately. 
4479         *
4480         * This is unfortunate as the optimiser isnt handling lookahead
4481         * properly currently.
4482         *
4483         */
4484        while ((OP(first) == OPEN && (sawopen = 1)) ||
4485               /* An OR of *one* alternative - should not happen now. */
4486            (OP(first) == BRANCH && OP(first_next) != BRANCH) ||
4487            /* for now we can't handle lookbehind IFMATCH*/
4488            (OP(first) == IFMATCH && !first->flags) || 
4489            (OP(first) == PLUS) ||
4490            (OP(first) == MINMOD) ||
4491               /* An {n,m} with n>0 */
4492            (PL_regkind[OP(first)] == CURLY && ARG1(first) > 0) ||
4493            (OP(first) == NOTHING && PL_regkind[OP(first_next)] != END ))
4494        {
4495                /* 
4496                 * the only op that could be a regnode is PLUS, all the rest
4497                 * will be regnode_1 or regnode_2.
4498                 *
4499                 */
4500                if (OP(first) == PLUS)
4501                    sawplus = 1;
4502                else
4503                    first += regarglen[OP(first)];
4504                
4505                first = NEXTOPER(first);
4506                first_next= regnext(first);
4507        }
4508
4509        /* Starting-point info. */
4510      again:
4511        DEBUG_PEEP("first:",first,0);
4512        /* Ignore EXACT as we deal with it later. */
4513        if (PL_regkind[OP(first)] == EXACT) {
4514            if (OP(first) == EXACT)
4515                NOOP;   /* Empty, get anchored substr later. */
4516            else if ((OP(first) == EXACTF || OP(first) == EXACTFL))
4517                ri->regstclass = first;
4518        }
4519#ifdef TRIE_STCLASS     
4520        else if (PL_regkind[OP(first)] == TRIE &&
4521                ((reg_trie_data *)ri->data->data[ ARG(first) ])->minlen>0) 
4522        {
4523            regnode *trie_op;
4524            /* this can happen only on restudy */
4525            if ( OP(first) == TRIE ) {
4526                struct regnode_1 *trieop = (struct regnode_1 *)
4527                    PerlMemShared_calloc(1, sizeof(struct regnode_1));
4528                StructCopy(first,trieop,struct regnode_1);
4529                trie_op=(regnode *)trieop;
4530            } else {
4531                struct regnode_charclass *trieop = (struct regnode_charclass *)
4532                    PerlMemShared_calloc(1, sizeof(struct regnode_charclass));
4533                StructCopy(first,trieop,struct regnode_charclass);
4534                trie_op=(regnode *)trieop;
4535            }
4536            OP(trie_op)+=2;
4537            make_trie_failtable(pRExC_state, (regnode *)first, trie_op, 0);
4538            ri->regstclass = trie_op;
4539        }
4540#endif  
4541        else if (strchr((const char*)PL_simple,OP(first)))
4542            ri->regstclass = first;
4543        else if (PL_regkind[OP(first)] == BOUND ||
4544                 PL_regkind[OP(first)] == NBOUND)
4545            ri->regstclass = first;
4546        else if (PL_regkind[OP(first)] == BOL) {
4547            r->extflags |= (OP(first) == MBOL
4548                           ? RXf_ANCH_MBOL
4549                           : (OP(first) == SBOL
4550                              ? RXf_ANCH_SBOL
4551                              : RXf_ANCH_BOL));
4552            first = NEXTOPER(first);
4553            goto again;
4554        }
4555        else if (OP(first) == GPOS) {
4556            r->extflags |= RXf_ANCH_GPOS;
4557            first = NEXTOPER(first);
4558            goto again;
4559        }
4560        else if ((!sawopen || !RExC_sawback) &&
4561            (OP(first) == STAR &&
4562            PL_regkind[OP(NEXTOPER(first))] == REG_ANY) &&
4563            !(r->extflags & RXf_ANCH) && !(RExC_seen & REG_SEEN_EVAL))
4564        {
4565            /* turn .* into ^.* with an implied $*=1 */
4566            const int type =
4567                (OP(NEXTOPER(first)) == REG_ANY)
4568                    ? RXf_ANCH_MBOL
4569                    : RXf_ANCH_SBOL;
4570            r->extflags |= type;
4571            r->intflags |= PREGf_IMPLICIT;
4572            first = NEXTOPER(first);
4573            goto again;
4574        }
4575        if (sawplus && (!sawopen || !RExC_sawback)
4576            && !(RExC_seen & REG_SEEN_EVAL)) /* May examine pos and $& */
4577            /* x+ must match at the 1st pos of run of x's */
4578            r->intflags |= PREGf_SKIP;
4579
4580        /* Scan is after the zeroth branch, first is atomic matcher. */
4581#ifdef TRIE_STUDY_OPT
4582        DEBUG_PARSE_r(
4583            if (!restudied)
4584                PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
4585                              (IV)(first - scan + 1))
4586        );
4587#else
4588        DEBUG_PARSE_r(
4589            PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
4590                (IV)(first - scan + 1))
4591        );
4592#endif
4593
4594
4595        /*
4596        * If there's something expensive in the r.e., find the
4597        * longest literal string that must appear and make it the
4598        * regmust.  Resolve ties in favor of later strings, since
4599        * the regstart check works with the beginning of the r.e.
4600        * and avoiding duplication strengthens checking.  Not a
4601        * strong reason, but sufficient in the absence of others.
4602        * [Now we resolve ties in favor of the earlier string if
4603        * it happens that c_offset_min has been invalidated, since the
4604        * earlier string may buy us something the later one won't.]
4605        */
4606        
4607        data.longest_fixed = newSVpvs("");
4608        data.longest_float = newSVpvs("");
4609        data.last_found = newSVpvs("");
4610        data.longest = &(data.longest_fixed);
4611        first = scan;
4612        if (!ri->regstclass) {
4613            cl_init(pRExC_state, &ch_class);
4614            data.start_class = &ch_class;
4615            stclass_flag = SCF_DO_STCLASS_AND;
4616        } else                          /* XXXX Check for BOUND? */
4617            stclass_flag = 0;
4618        data.last_closep = &last_close;
4619        
4620        minlen = study_chunk(pRExC_state, &first, &minlen, &fake, scan + RExC_size, /* Up to end */
4621            &data, -1, NULL, NULL,
4622            SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag,0);
4623
4624        
4625        CHECK_RESTUDY_GOTO;
4626
4627
4628        if ( RExC_npar == 1 && data.longest == &(data.longest_fixed)
4629             && data.last_start_min == 0 && data.last_end > 0
4630             && !RExC_seen_zerolen
4631             && !(RExC_seen & REG_SEEN_VERBARG)
4632             && (!(RExC_seen & REG_SEEN_GPOS) || (r->extflags & RXf_ANCH_GPOS)))
4633            r->extflags |= RXf_CHECK_ALL;
4634        scan_commit(pRExC_state, &data,&minlen,0);
4635        SvREFCNT_dec(data.last_found);
4636
4637        /* Note that code very similar to this but for anchored string 
4638           follows immediately below, changes may need to be made to both. 
4639           Be careful. 
4640         */
4641        longest_float_length = CHR_SVLEN(data.longest_float);
4642        if (longest_float_length
4643            || (data.flags & SF_FL_BEFORE_EOL
4644                && (!(data.flags & SF_FL_BEFORE_MEOL)
4645                    || (RExC_flags & RXf_PMf_MULTILINE)))) 
4646        {
4647            I32 t,ml;
4648
4649            if (SvCUR(data.longest_fixed)  /* ok to leave SvCUR */
4650                && data.offset_fixed == data.offset_float_min
4651                && SvCUR(data.longest_fixed) == SvCUR(data.longest_float))
4652                    goto remove_float;          /* As in (a)+. */
4653
4654            /* copy the information about the longest float from the reg_scan_data
4655               over to the program. */
4656            if (SvUTF8(data.longest_float)) {
4657                r->float_utf8 = data.longest_float;
4658                r->float_substr = NULL;
4659            } else {
4660                r->float_substr = data.longest_float;
4661                r->float_utf8 = NULL;
4662            }
4663            /* float_end_shift is how many chars that must be matched that 
4664               follow this item. We calculate it ahead of time as once the
4665               lookbehind offset is added in we lose the ability to correctly
4666               calculate it.*/
4667            ml = data.minlen_float ? *(data.minlen_float) 
4668                                   : (I32)longest_float_length;
4669            r->float_end_shift = ml - data.offset_float_min
4670                - longest_float_length + (SvTAIL(data.longest_float) != 0)
4671                + data.lookbehind_float;
4672            r->float_min_offset = data.offset_float_min - data.lookbehind_float;
4673            r->float_max_offset = data.offset_float_max;
4674            if (data.offset_float_max < I32_MAX) /* Don't offset infinity */
4675                r->float_max_offset -= data.lookbehind_float;
4676            
4677            t = (data.flags & SF_FL_BEFORE_EOL /* Can't have SEOL and MULTI */
4678                       && (!(data.flags & SF_FL_BEFORE_MEOL)
4679                           || (RExC_flags & RXf_PMf_MULTILINE)));
4680            fbm_compile(data.longest_float, t ? FBMcf_TAIL : 0);
4681        }
4682        else {
4683          remove_float:
4684            r->float_substr = r->float_utf8 = NULL;
4685            SvREFCNT_dec(data.longest_float);
4686            longest_float_length = 0;
4687        }
4688
4689        /* Note that code very similar to this but for floating string 
4690           is immediately above, changes may need to be made to both. 
4691           Be careful. 
4692         */
4693        longest_fixed_length = CHR_SVLEN(data.longest_fixed);
4694        if (longest_fixed_length
4695            || (data.flags & SF_FIX_BEFORE_EOL /* Cannot have SEOL and MULTI */
4696                && (!(data.flags & SF_FIX_BEFORE_MEOL)
4697                    || (RExC_flags & RXf_PMf_MULTILINE)))) 
4698        {
4699            I32 t,ml;
4700
4701            /* copy the information about the longest fixed 
4702               from the reg_scan_data over to the program. */
4703            if (SvUTF8(data.longest_fixed)) {
4704                r->anchored_utf8 = data.longest_fixed;
4705                r->anchored_substr = NULL;
4706            } else {
4707                r->anchored_substr = data.longest_fixed;
4708                r->anchored_utf8 = NULL;
4709            }
4710            /* fixed_end_shift is how many chars that must be matched that 
4711               follow this item. We calculate it ahead of time as once the
4712               lookbehind offset is added in we lose the ability to correctly
4713               calculate it.*/
4714            ml = data.minlen_fixed ? *(data.minlen_fixed) 
4715                                   : (I32)longest_fixed_length;
4716            r->anchored_end_shift = ml - data.offset_fixed
4717                - longest_fixed_length + (SvTAIL(data.longest_fixed) != 0)
4718                + data.lookbehind_fixed;
4719            r->anchored_offset = data.offset_fixed - data.lookbehind_fixed;
4720
4721            t = (data.flags & SF_FIX_BEFORE_EOL /* Can't have SEOL and MULTI */
4722                 && (!(data.flags & SF_FIX_BEFORE_MEOL)
4723                     || (RExC_flags & RXf_PMf_MULTILINE)));
4724            fbm_compile(data.longest_fixed, t ? FBMcf_TAIL : 0);
4725        }
4726        else {
4727            r->anchored_substr = r->anchored_utf8 = NULL;
4728            SvREFCNT_dec(data.longest_fixed);
4729            longest_fixed_length = 0;
4730        }
4731        if (ri->regstclass
4732            && (OP(ri->regstclass) == REG_ANY || OP(ri->regstclass) == SANY))
4733            ri->regstclass = NULL;
4734        if ((!(r->anchored_substr || r->anchored_utf8) || r->anchored_offset)
4735            && stclass_flag
4736            && !(data.start_class->flags & ANYOF_EOS)
4737            && !cl_is_anything(data.start_class))
4738        {
4739            const U32 n = add_data(pRExC_state, 1, "f");
4740
4741            Newx(RExC_rxi->data->data[n], 1,
4742                struct regnode_charclass_class);
4743            StructCopy(data.start_class,
4744                       (struct regnode_charclass_class*)RExC_rxi->data->data[n],
4745                       struct regnode_charclass_class);
4746            ri->regstclass = (regnode*)RExC_rxi->data->data[n];
4747            r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
4748            DEBUG_COMPILE_r({ SV *sv = sv_newmortal();
4749                      regprop(r, sv, (regnode*)data.start_class);
4750                      PerlIO_printf(Perl_debug_log,
4751                                    "synthetic stclass \"%s\".\n",
4752                                    SvPVX_const(sv));});
4753        }
4754
4755        /* A temporary algorithm prefers floated substr to fixed one to dig more info. */
4756        if (longest_fixed_length > longest_float_length) {
4757            r->check_end_shift = r->anchored_end_shift;
4758            r->check_substr = r->anchored_substr;
4759            r->check_utf8 = r->anchored_utf8;
4760            r->check_offset_min = r->check_offset_max = r->anchored_offset;
4761            if (r->extflags & RXf_ANCH_SINGLE)
4762                r->extflags |= RXf_NOSCAN;
4763        }
4764        else {
4765            r->check_end_shift = r->float_end_shift;
4766            r->check_substr = r->float_substr;
4767            r->check_utf8 = r->float_utf8;
4768            r->check_offset_min = r->float_min_offset;
4769            r->check_offset_max = r->float_max_offset;
4770        }
4771        /* XXXX Currently intuiting is not compatible with ANCH_GPOS.
4772           This should be changed ASAP!  */
4773        if ((r->check_substr || r->check_utf8) && !(r->extflags & RXf_ANCH_GPOS)) {
4774            r->extflags |= RXf_USE_INTUIT;
4775            if (SvTAIL(r->check_substr ? r->check_substr : r->check_utf8))
4776                r->extflags |= RXf_INTUIT_TAIL;
4777        }
4778        /* XXX Unneeded? dmq (shouldn't as this is handled elsewhere)
4779        if ( (STRLEN)minlen < longest_float_length )
4780            minlen= longest_float_length;
4781        if ( (STRLEN)minlen < longest_fixed_length )
4782            minlen= longest_fixed_length;     
4783        */
4784    }
4785    else {
4786        /* Several toplevels. Best we can is to set minlen. */
4787        I32 fake;
4788        struct regnode_charclass_class ch_class;
4789        I32 last_close = 0;
4790        
4791        DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "\nMulti Top Level\n"));
4792
4793        scan = ri->program + 1;
4794        cl_init(pRExC_state, &ch_class);
4795        data.start_class = &ch_class;
4796        data.last_closep = &last_close;
4797
4798        
4799        minlen = study_chunk(pRExC_state, &scan, &minlen, &fake, scan + RExC_size,
4800            &data, -1, NULL, NULL, SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS,0);
4801        
4802        CHECK_RESTUDY_GOTO;
4803
4804        r->check_substr = r->check_utf8 = r->anchored_substr = r->anchored_utf8
4805                = r->float_substr = r->float_utf8 = NULL;
4806        if (!(data.start_class->flags & ANYOF_EOS)
4807            && !cl_is_anything(data.start_class))
4808        {
4809            const U32 n = add_data(pRExC_state, 1, "f");
4810
4811            Newx(RExC_rxi->data->data[n], 1,
4812                struct regnode_charclass_class);
4813            StructCopy(data.start_class,
4814                       (struct regnode_charclass_class*)RExC_rxi->data->data[n],
4815                       struct regnode_charclass_class);
4816            ri->regstclass = (regnode*)RExC_rxi->data->data[n];
4817            r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
4818            DEBUG_COMPILE_r({ SV* sv = sv_newmortal();
4819                      regprop(r, sv, (regnode*)data.start_class);
4820                      PerlIO_printf(Perl_debug_log,
4821                                    "synthetic stclass \"%s\".\n",
4822                                    SvPVX_const(sv));});
4823        }
4824    }
4825
4826    /* Guard against an embedded (?=) or (?<=) with a longer minlen than
4827       the "real" pattern. */
4828    DEBUG_OPTIMISE_r({
4829        PerlIO_printf(Perl_debug_log,"minlen: %"IVdf" r->minlen:%"IVdf"\n",
4830                      (IV)minlen, (IV)r->minlen);
4831    });
4832    r->minlenret = minlen;
4833    if (r->minlen < minlen) 
4834        r->minlen = minlen;
4835    
4836    if (RExC_seen & REG_SEEN_GPOS)
4837        r->extflags |= RXf_GPOS_SEEN;
4838    if (RExC_seen & REG_SEEN_LOOKBEHIND)
4839        r->extflags |= RXf_LOOKBEHIND_SEEN;
4840    if (RExC_seen & REG_SEEN_EVAL)
4841        r->extflags |= RXf_EVAL_SEEN;
4842    if (RExC_seen & REG_SEEN_CANY)
4843        r->extflags |= RXf_CANY_SEEN;
4844    if (RExC_seen & REG_SEEN_VERBARG)
4845        r->intflags |= PREGf_VERBARG_SEEN;
4846    if (RExC_seen & REG_SEEN_CUTGROUP)
4847        r->intflags |= PREGf_CUTGROUP_SEEN;
4848    if (RExC_paren_names)
4849        RXp_PAREN_NAMES(r) = MUTABLE_HV(SvREFCNT_inc(RExC_paren_names));
4850    else
4851        RXp_PAREN_NAMES(r) = NULL;
4852
4853#ifdef STUPID_PATTERN_CHECKS            
4854    if (RX_PRELEN(r) == 0)
4855        r->extflags |= RXf_NULL;
4856    if (r->extflags & RXf_SPLIT && RX_PRELEN(r) == 1 && RX_PRECOMP(r)[0] == ' ')
4857        /* XXX: this should happen BEFORE we compile */
4858        r->extflags |= (RXf_SKIPWHITE|RXf_WHITE); 
4859    else if (RX_PRELEN(r) == 3 && memEQ("\\s+", RX_PRECOMP(r), 3))
4860        r->extflags |= RXf_WHITE;
4861    else if (RX_PRELEN(r) == 1 && RXp_PRECOMP(r)[0] == '^')
4862        r->extflags |= RXf_START_ONLY;
4863#else
4864    if (r->extflags & RXf_SPLIT && RX_PRELEN(r) == 1 && RX_PRECOMP(r)[0] == ' ')
4865            /* XXX: this should happen BEFORE we compile */
4866            r->extflags |= (RXf_SKIPWHITE|RXf_WHITE); 
4867    else {
4868        regnode *first = ri->program + 1;
4869        U8 fop = OP(first);
4870        U8 nop = OP(NEXTOPER(first));
4871        
4872        if (PL_regkind[fop] == NOTHING && nop == END)
4873            r->extflags |= RXf_NULL;
4874        else if (PL_regkind[fop] == BOL && nop == END)
4875            r->extflags |= RXf_START_ONLY;
4876        else if (fop == PLUS && nop ==SPACE && OP(regnext(first))==END)
4877            r->extflags |= RXf_WHITE;    
4878    }
4879#endif
4880#ifdef DEBUGGING
4881    if (RExC_paren_names) {
4882        ri->name_list_idx = add_data( pRExC_state, 1, "p" );
4883        ri->data->data[ri->name_list_idx] = (void*)SvREFCNT_inc(RExC_paren_name_list);
4884    } else
4885#endif
4886        ri->name_list_idx = 0;
4887
4888    if (RExC_recurse_count) {
4889        for ( ; RExC_recurse_count ; RExC_recurse_count-- ) {
4890            const regnode *scan = RExC_recurse[RExC_recurse_count-1];
4891            ARG2L_SET( scan, RExC_open_parens[ARG(scan)-1] - scan );
4892        }
4893    }
4894    Newxz(r->offs, RExC_npar, regexp_paren_pair);
4895    /* assume we don't need to swap parens around before we match */
4896
4897    DEBUG_DUMP_r({
4898        PerlIO_printf(Perl_debug_log,"Final program:\n");
4899        regdump(r);
4900    });
4901#ifdef RE_TRACK_PATTERN_OFFSETS
4902    DEBUG_OFFSETS_r(if (ri->u.offsets) {
4903        const U32 len = ri->u.offsets[0];
4904        U32 i;
4905        GET_RE_DEBUG_FLAGS_DECL;
4906        PerlIO_printf(Perl_debug_log, "Offsets: [%"UVuf"]\n\t", (UV)ri->u.offsets[0]);
4907        for (i = 1; i <= len; i++) {
4908            if (ri->u.offsets[i*2-1] || ri->u.offsets[i*2])
4909                PerlIO_printf(Perl_debug_log, "%"UVuf":%"UVuf"[%"UVuf"] ",
4910                (UV)i, (UV)ri->u.offsets[i*2-1], (UV)ri->u.offsets[i*2]);
4911            }
4912        PerlIO_printf(Perl_debug_log, "\n");
4913    });
4914#endif
4915    return(r);
4916}
4917
4918#undef RE_ENGINE_PTR
4919
4920
4921SV*
4922Perl_reg_named_buff(pTHX_ REGEXP * const rx, SV * const key, SV * const value,
4923                    const U32 flags)
4924{
4925    PERL_ARGS_ASSERT_REG_NAMED_BUFF;
4926
4927    PERL_UNUSED_ARG(value);
4928
4929    if (flags & RXapif_FETCH) {
4930        return reg_named_buff_fetch(rx, key, flags);
4931    } else if (flags & (RXapif_STORE | RXapif_DELETE | RXapif_CLEAR)) {
4932        Perl_croak(aTHX_ "%s", PL_no_modify);
4933        return NULL;
4934    } else if (flags & RXapif_EXISTS) {
4935        return reg_named_buff_exists(rx, key, flags)
4936            ? &PL_sv_yes
4937            : &PL_sv_no;
4938    } else if (flags & RXapif_REGNAMES) {
4939        return reg_named_buff_all(rx, flags);
4940    } else if (flags & (RXapif_SCALAR | RXapif_REGNAMES_COUNT)) {
4941        return reg_named_buff_scalar(rx, flags);
4942    } else {
4943        Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff", (int)flags);
4944        return NULL;
4945    }
4946}
4947
4948SV*
4949Perl_reg_named_buff_iter(pTHX_ REGEXP * const rx, const SV * const lastkey,
4950                         const U32 flags)
4951{
4952    PERL_ARGS_ASSERT_REG_NAMED_BUFF_ITER;
4953    PERL_UNUSED_ARG(lastkey);
4954
4955    if (flags & RXapif_FIRSTKEY)
4956        return reg_named_buff_firstkey(rx, flags);
4957    else if (flags & RXapif_NEXTKEY)
4958        return reg_named_buff_nextkey(rx, flags);
4959    else {
4960        Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_iter", (int)flags);
4961        return NULL;
4962    }
4963}
4964
4965SV*
4966Perl_reg_named_buff_fetch(pTHX_ REGEXP * const rx, SV * const namesv, const U32 flags)
4967{
4968    AV *retarray = NULL;
4969    SV *ret;
4970
4971    PERL_ARGS_ASSERT_REG_NAMED_BUFF_FETCH;
4972
4973    if (flags & RXapif_ALL)
4974        retarray=newAV();
4975
4976    if (rx && RXp_PAREN_NAMES(rx)) {
4977        HE *he_str = hv_fetch_ent( RXp_PAREN_NAMES(rx), namesv, 0, 0 );
4978        if (he_str) {
4979            IV i;
4980            SV* sv_dat=HeVAL(he_str);
4981            I32 *nums=(I32*)SvPVX(sv_dat);
4982            for ( i=0; i<SvIVX(sv_dat); i++ ) {
4983                if ((I32)(rx->nparens) >= nums[i]
4984                    && rx->offs[nums[i]].start != -1
4985                    && rx->offs[nums[i]].end != -1)
4986                {
4987                    ret = newSVpvs("");
4988                    CALLREG_NUMBUF_FETCH(rx,nums[i],ret);
4989                    if (!retarray)
4990                        return ret;
4991                } else {
4992                    ret = newSVsv(&PL_sv_undef);
4993                }
4994                if (retarray)
4995                    av_push(retarray, ret);
4996            }
4997            if (retarray)
4998                return newRV_noinc(MUTABLE_SV(retarray));
4999        }
5000    }
5001    return NULL;
5002}
5003
5004bool
5005Perl_reg_named_buff_exists(pTHX_ REGEXP * const rx, SV * const key,
5006                           const U32 flags)
5007{
5008
5009    PERL_ARGS_ASSERT_REG_NAMED_BUFF_EXISTS;
5010
5011    if (rx && RXp_PAREN_NAMES(rx)) {
5012        if (flags & RXapif_ALL) {
5013            return hv_exists_ent(RXp_PAREN_NAMES(rx), key, 0);
5014        } else {
5015            SV *sv = CALLREG_NAMED_BUFF_FETCH(rx, key, flags);
5016            if (sv) {
5017                SvREFCNT_dec(sv);
5018                return TRUE;
5019            } else {
5020                return FALSE;
5021            }
5022        }
5023    } else {
5024        return FALSE;
5025    }
5026}
5027
5028SV*
5029Perl_reg_named_buff_firstkey(pTHX_ REGEXP * const rx, const U32 flags)
5030{
5031
5032    PERL_ARGS_ASSERT_REG_NAMED_BUFF_FIRSTKEY;
5033
5034    if ( rx && RXp_PAREN_NAMES(rx) ) {
5035        (void)hv_iterinit(RXp_PAREN_NAMES(rx));
5036
5037        return CALLREG_NAMED_BUFF_NEXTKEY(rx, NULL, flags & ~RXapif_FIRSTKEY);
5038    } else {
5039        return FALSE;
5040    }
5041}
5042
5043SV*
5044Perl_reg_named_buff_nextkey(pTHX_ REGEXP * const rx, const U32 flags)
5045{
5046    GET_RE_DEBUG_FLAGS_DECL;
5047
5048    PERL_ARGS_ASSERT_REG_NAMED_BUFF_NEXTKEY;
5049
5050    if (rx && RXp_PAREN_NAMES(rx)) {
5051        HV *hv = RXp_PAREN_NAMES(rx);
5052        HE *temphe;
5053        while ( (temphe = hv_iternext_flags(hv,0)) ) {
5054            IV i;
5055            IV parno = 0;
5056            SV* sv_dat = HeVAL(temphe);
5057            I32 *nums = (I32*)SvPVX(sv_dat);
5058            for ( i = 0; i < SvIVX(sv_dat); i++ ) {
5059                if ((I32)(rx->lastparen) >= nums[i] &&
5060                    rx->offs[nums[i]].start != -1 &&
5061                    rx->offs[nums[i]].end != -1)
5062                {
5063                    parno = nums[i];
5064                    break;
5065                }
5066            }
5067            if (parno || flags & RXapif_ALL) {
5068                return newSVhek(HeKEY_hek(temphe));
5069            }
5070        }
5071    }
5072    return NULL;
5073}
5074
5075SV*
5076Perl_reg_named_buff_scalar(pTHX_ REGEXP * const rx, const U32 flags)
5077{
5078    SV *ret;
5079    AV *av;
5080    I32 length;
5081
5082    PERL_ARGS_ASSERT_REG_NAMED_BUFF_SCALAR;
5083
5084    if (rx && RXp_PAREN_NAMES(rx)) {
5085        if (flags & (RXapif_ALL | RXapif_REGNAMES_COUNT)) {
5086            return newSViv(HvTOTALKEYS(RXp_PAREN_NAMES(rx)));
5087        } else if (flags & RXapif_ONE) {
5088            ret = CALLREG_NAMED_BUFF_ALL(rx, (flags | RXapif_REGNAMES));
5089            av = MUTABLE_AV(SvRV(ret));
5090            length = av_len(av);
5091            SvREFCNT_dec(ret);
5092            return newSViv(length + 1);
5093        } else {
5094            Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_scalar", (int)flags);
5095            return NULL;
5096        }
5097    }
5098    return &PL_sv_undef;
5099}
5100
5101SV*
5102Perl_reg_named_buff_all(pTHX_ REGEXP * const rx, const U32 flags)
5103{
5104    AV *av = newAV();
5105
5106    PERL_ARGS_ASSERT_REG_NAMED_BUFF_ALL;
5107
5108    if (rx && RXp_PAREN_NAMES(rx)) {
5109        HV *hv= RXp_PAREN_NAMES(rx);
5110        HE *temphe;
5111        (void)hv_iterinit(hv);
5112        while ( (temphe = hv_iternext_flags(hv,0)) ) {
5113            IV i;
5114            IV parno = 0;
5115            SV* sv_dat = HeVAL(temphe);
5116            I32 *nums = (I32*)SvPVX(sv_dat);
5117            for ( i = 0; i < SvIVX(sv_dat); i++ ) {
5118                if ((I32)(rx->lastparen) >= nums[i] &&
5119                    rx->offs[nums[i]].start != -1 &&
5120                    rx->offs[nums[i]].end != -1)
5121                {
5122                    parno = nums[i];
5123                    break;
5124                }
5125            }
5126            if (parno || flags & RXapif_ALL) {
5127                av_push(av, newSVhek(HeKEY_hek(temphe)));
5128            }
5129        }
5130    }
5131
5132    return newRV_noinc(MUTABLE_SV(av));
5133}
5134
5135void
5136Perl_reg_numbered_buff_fetch(pTHX_ REGEXP * const rx, const I32 paren, SV * const sv)
5137{
5138    char *s = NULL;
5139    I32 i = 0;
5140    I32 s1, t1;
5141
5142    PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_FETCH;
5143        
5144    if (!rx->subbeg) {
5145        sv_setsv(sv,&PL_sv_undef);
5146        return;
5147    } 
5148    else               
5149    if (paren == RX_BUFF_IDX_PREMATCH && rx->offs[0].start != -1) {
5150        /* $` */
5151        i = rx->offs[0].start;
5152        s = rx->subbeg;
5153    }
5154    else 
5155    if (paren == RX_BUFF_IDX_POSTMATCH && rx->offs[0].end != -1) {
5156        /* $' */
5157        s = rx->subbeg + rx->offs[0].end;
5158        i = rx->sublen - rx->offs[0].end;
5159    } 
5160    else
5161    if ( 0 <= paren && paren <= (I32)rx->nparens &&
5162        (s1 = rx->offs[paren].start) != -1 &&
5163        (t1 = rx->offs[paren].end) != -1)
5164    {
5165        /* $& $1 ... */
5166        i = t1 - s1;
5167        s = rx->subbeg + s1;
5168    } else {
5169        sv_setsv(sv,&PL_sv_undef);
5170        return;
5171    }          
5172    assert(rx->sublen >= (s - rx->subbeg) + i );
5173    if (i >= 0) {
5174        const int oldtainted = PL_tainted;
5175        TAINT_NOT;
5176        sv_setpvn(sv, s, i);
5177        PL_tainted = oldtainted;
5178        if ( (rx->extflags & RXf_CANY_SEEN)
5179            ? (RXp_MATCH_UTF8(rx)
5180                        && (!i || is_utf8_string((U8*)s, i)))
5181            : (RXp_MATCH_UTF8(rx)) )
5182        {
5183            SvUTF8_on(sv);
5184        }
5185        else
5186            SvUTF8_off(sv);
5187        if (PL_tainting) {
5188            if (RXp_MATCH_TAINTED(rx)) {
5189                if (SvTYPE(sv) >= SVt_PVMG) {
5190                    MAGIC* const mg = SvMAGIC(sv);
5191                    MAGIC* mgt;
5192                    PL_tainted = 1;
5193                    SvMAGIC_set(sv, mg->mg_moremagic);
5194                    SvTAINT(sv);
5195                    if ((mgt = SvMAGIC(sv))) {
5196                        mg->mg_moremagic = mgt;
5197                        SvMAGIC_set(sv, mg);
5198                    }
5199                } else {
5200                    PL_tainted = 1;
5201                    SvTAINT(sv);
5202                }
5203            } else 
5204                SvTAINTED_off(sv);
5205        }
5206    } else {
5207        sv_setsv(sv,&PL_sv_undef);
5208        return;
5209    }
5210}
5211
5212void
5213Perl_reg_numbered_buff_store(pTHX_ REGEXP * const rx, const I32 paren,
5214                                                         SV const * const value)
5215{
5216    PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_STORE;
5217
5218    PERL_UNUSED_ARG(rx);
5219    PERL_UNUSED_ARG(paren);
5220    PERL_UNUSED_ARG(value);
5221
5222    if (!PL_localizing)
5223        Perl_croak(aTHX_ "%s", PL_no_modify);
5224}
5225
5226I32
5227Perl_reg_numbered_buff_length(pTHX_ REGEXP * const rx, const SV * const sv,
5228                              const I32 paren)
5229{
5230    I32 i;
5231    I32 s1, t1;
5232
5233    PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_LENGTH;
5234
5235    /* Some of this code was originally in C<Perl_magic_len> in F<mg.c> */
5236        switch (paren) {
5237      /* $` / ${^PREMATCH} */
5238      case RX_BUFF_IDX_PREMATCH:
5239        if (rx->offs[0].start != -1) {
5240                        i = rx->offs[0].start;
5241                        if (i > 0) {
5242                                s1 = 0;
5243                                t1 = i;
5244                                goto getlen;
5245                        }
5246            }
5247        return 0;
5248      /* $' / ${^POSTMATCH} */
5249      case RX_BUFF_IDX_POSTMATCH:
5250            if (rx->offs[0].end != -1) {
5251                        i = rx->sublen - rx->offs[0].end;
5252                        if (i > 0) {
5253                                s1 = rx->offs[0].end;
5254                                t1 = rx->sublen;
5255                                goto getlen;
5256                        }
5257            }
5258        return 0;
5259      /* $& / ${^MATCH}, $1, $2, ... */
5260      default:
5261            if (paren <= (I32)rx->nparens &&
5262            (s1 = rx->offs[paren].start) != -1 &&
5263            (t1 = rx->offs[paren].end) != -1)
5264            {
5265            i = t1 - s1;
5266            goto getlen;
5267        } else {
5268            if (ckWARN(WARN_UNINITIALIZED))
5269                report_uninit((SV *)sv);
5270            return 0;
5271        }
5272    }
5273  getlen:
5274    if (i > 0 && RXp_MATCH_UTF8(rx)) {
5275        const char * const s = rx->subbeg + s1;
5276        const U8 *ep;
5277        STRLEN el;
5278
5279        i = t1 - s1;
5280        if (is_utf8_string_loclen((U8*)s, i, &ep, &el))
5281                        i = el;
5282    }
5283    return i;
5284}
5285
5286SV*
5287Perl_reg_qr_package(pTHX_ REGEXP * const rx)
5288{
5289    PERL_ARGS_ASSERT_REG_QR_PACKAGE;
5290        PERL_UNUSED_ARG(rx);
5291        return newSVpvs("Regexp");
5292}
5293
5294/* Scans the name of a named buffer from the pattern.
5295 * If flags is REG_RSN_RETURN_NULL returns null.
5296 * If flags is REG_RSN_RETURN_NAME returns an SV* containing the name
5297 * If flags is REG_RSN_RETURN_DATA returns the data SV* corresponding
5298 * to the parsed name as looked up in the RExC_paren_names hash.
5299 * If there is an error throws a vFAIL().. type exception.
5300 */
5301
5302#define REG_RSN_RETURN_NULL    0
5303#define REG_RSN_RETURN_NAME    1
5304#define REG_RSN_RETURN_DATA    2
5305
5306STATIC SV*
5307S_reg_scan_name(pTHX_ RExC_state_t *pRExC_state, U32 flags)
5308{
5309    char *name_start = RExC_parse;
5310
5311    PERL_ARGS_ASSERT_REG_SCAN_NAME;
5312
5313    if (isIDFIRST_lazy_if(RExC_parse, UTF)) {
5314         /* skip IDFIRST by using do...while */
5315        if (UTF)
5316            do {
5317                RExC_parse += UTF8SKIP(RExC_parse);
5318            } while (isALNUM_utf8((U8*)RExC_parse));
5319        else
5320            do {
5321                RExC_parse++;
5322            } while (isALNUM(*RExC_parse));
5323    }
5324
5325    if ( flags ) {
5326        SV* sv_name
5327            = newSVpvn_flags(name_start, (int)(RExC_parse - name_start),
5328                             SVs_TEMP | (UTF ? SVf_UTF8 : 0));
5329        if ( flags == REG_RSN_RETURN_NAME)
5330            return sv_name;
5331        else if (flags==REG_RSN_RETURN_DATA) {
5332            HE *he_str = NULL;
5333            SV *sv_dat = NULL;
5334            if ( ! sv_name )      /* should not happen*/
5335                Perl_croak(aTHX_ "panic: no svname in reg_scan_name");
5336            if (RExC_paren_names)
5337                he_str = hv_fetch_ent( RExC_paren_names, sv_name, 0, 0 );
5338            if ( he_str )
5339                sv_dat = HeVAL(he_str);
5340            if ( ! sv_dat )
5341                vFAIL("Reference to nonexistent named group");
5342            return sv_dat;
5343        }
5344        else {
5345            Perl_croak(aTHX_ "panic: bad flag in reg_scan_name");
5346        }
5347        /* NOT REACHED */
5348    }
5349    return NULL;
5350}
5351
5352#define DEBUG_PARSE_MSG(funcname)     DEBUG_PARSE_r({           \
5353    int rem=(int)(RExC_end - RExC_parse);                       \
5354    int cut;                                                    \
5355    int num;                                                    \
5356    int iscut=0;                                                \
5357    if (rem>10) {                                               \
5358        rem=10;                                                 \
5359        iscut=1;                                                \
5360    }                                                           \
5361    cut=10-rem;                                                 \
5362    if (RExC_lastparse!=RExC_parse)                             \
5363        PerlIO_printf(Perl_debug_log," >%.*s%-*s",              \
5364            rem, RExC_parse,                                    \
5365            cut + 4,                                            \
5366            iscut ? "..." : "<"                                 \
5367        );                                                      \
5368    else                                                        \
5369        PerlIO_printf(Perl_debug_log,"%16s","");                \
5370                                                                \
5371    if (SIZE_ONLY)                                              \
5372       num = RExC_size + 1;                                     \
5373    else                                                        \
5374       num=REG_NODE_NUM(RExC_emit);                             \
5375    if (RExC_lastnum!=num)                                      \
5376       PerlIO_printf(Perl_debug_log,"|%4d",num);                \
5377    else                                                        \
5378       PerlIO_printf(Perl_debug_log,"|%4s","");                 \
5379    PerlIO_printf(Perl_debug_log,"|%*s%-4s",                    \
5380        (int)((depth*2)), "",                                   \
5381        (funcname)                                              \
5382    );                                                          \
5383    RExC_lastnum=num;                                           \
5384    RExC_lastparse=RExC_parse;                                  \
5385})
5386
5387
5388
5389#define DEBUG_PARSE(funcname)     DEBUG_PARSE_r({           \
5390    DEBUG_PARSE_MSG((funcname));                            \
5391    PerlIO_printf(Perl_debug_log,"%4s","\n");               \
5392})
5393#define DEBUG_PARSE_FMT(funcname,fmt,args)     DEBUG_PARSE_r({           \
5394    DEBUG_PARSE_MSG((funcname));                            \
5395    PerlIO_printf(Perl_debug_log,fmt "\n",args);               \
5396})
5397/*
5398 - reg - regular expression, i.e. main body or parenthesized thing
5399 *
5400 * Caller must absorb opening parenthesis.
5401 *
5402 * Combining parenthesis handling with the base level of regular expression
5403 * is a trifle forced, but the need to tie the tails of the branches to what
5404 * follows makes it hard to avoid.
5405 */
5406#define REGTAIL(x,y,z) regtail((x),(y),(z),depth+1)
5407#ifdef DEBUGGING
5408#define REGTAIL_STUDY(x,y,z) regtail_study((x),(y),(z),depth+1)
5409#else
5410#define REGTAIL_STUDY(x,y,z) regtail((x),(y),(z),depth+1)
5411#endif
5412
5413STATIC regnode *
5414S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
5415    /* paren: Parenthesized? 0=top, 1=(, inside: changed to letter. */
5416{
5417    dVAR;
5418    register regnode *ret;              /* Will be the head of the group. */
5419    register regnode *br;
5420    register regnode *lastbr;
5421    register regnode *ender = NULL;
5422    register I32 parno = 0;
5423    I32 flags;
5424    U32 oregflags = RExC_flags;
5425    bool have_branch = 0;
5426    bool is_open = 0;
5427    I32 freeze_paren = 0;
5428    I32 after_freeze = 0;
5429
5430    /* for (?g), (?gc), and (?o) warnings; warning
5431       about (?c) will warn about (?g) -- japhy    */
5432
5433#define WASTED_O  0x01
5434#define WASTED_G  0x02
5435#define WASTED_C  0x04