1
2
3
4
5
6
7
8
9
10
11
12
/* USE_UTF8_IN_NAMES expands to a truth value:
 *   - with USE_UTF8_SCRIPTS defined, it is the negation of IN_BYTES;
 *   - otherwise it is the HINT_UTF8 bit of PL_hints (non-zero when set). */
#ifdef USE_UTF8_SCRIPTS
#  define USE_UTF8_IN_NAMES (!IN_BYTES)
#else
#  define USE_UTF8_IN_NAMES (PL_hints & HINT_UTF8)
#endif
18
19
/* Convenience wrappers that supply a trailing 0 argument:
 *   uvuni_to_utf8(d, uv)            -> uvuni_to_utf8_flags(d, uv, 0)
 *   is_utf8_string_loc(s, len, ep)  -> is_utf8_string_loclen(s, len, ep, 0) */
#define uvuni_to_utf8(d, uv)            uvuni_to_utf8_flags(d, uv, 0)
#define is_utf8_string_loc(s, len, ep)  is_utf8_string_loclen(s, len, ep, 0)
22
23#ifdef EBCDIC
24
25
26
27
28#include "utfebcdic.h"
29
30#else
31START_EXTERN_C
32
33#ifdef DOINIT
34EXTCONST unsigned char PL_utf8skip[] = {
351,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
361,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
371,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
381,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
391,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
401,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
412,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
423,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,4,4,4,4,4,4,4,4,5,5,5,5,6,6,
437,13,
44};
45#else
46EXTCONST unsigned char PL_utf8skip[];
47#endif
48
49END_EXTERN_C
/* UTF8SKIP(s): look up PL_utf8skip using the byte that s points at,
 * read as an unsigned 8-bit value so bytes >= 0x80 index correctly
 * even when s is a plain (possibly signed) char pointer. */
#define UTF8SKIP(s) PL_utf8skip[*(const U8*)(s)]
51
52
/* Character-set conversion hooks.  In this (non-EBCDIC) branch of the
 * header every one of them is the identity: the argument is returned
 * unchanged, and the `enc` argument of the *_TO_NEED forms is ignored. */
#define NATIVE_TO_ASCII(ch)     (ch)
#define ASCII_TO_NATIVE(ch)     (ch)

#define NATIVE_TO_UTF(ch)       (ch)
#define UTF_TO_NATIVE(ch)       (ch)

#define UNI_TO_NATIVE(ch)       (ch)
#define NATIVE_TO_UNI(ch)       (ch)

#define NATIVE_TO_NEED(enc,ch)  (ch)
#define ASCII_TO_NEED(enc,ch)   (ch)
64
65
/* In this branch the "uvchr" entry points are plain aliases of the
 * "uvuni" ones. */
#define utf8n_to_uvchr  utf8n_to_uvuni
#define uvchr_to_utf8   uvuni_to_utf8
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
/* Byte / code point classification.
 *
 * The argument is parenthesized before it is cast, so an expression
 * argument (e.g. `b + off` or `x | y`) is converted as a whole instead
 * of only its first operand.
 *
 *   UNI_IS_INVARIANT(c)            value, as UV, is below 0x80
 *   UTF8_IS_INVARIANT(c)           same test after NATIVE_TO_UTF()
 *   NATIVE_IS_INVARIANT(c)         same test after NATIVE_TO_ASCII()
 *   UTF8_IS_START(c)               byte is in 0xC0..0xFD
 *   UTF8_IS_CONTINUATION(c)        byte is in 0x80..0xBF
 *   UTF8_IS_CONTINUED(c)           byte has its top bit set (yields 0x80 or 0)
 *   UTF8_IS_DOWNGRADEABLE_START(c) byte is in 0xC0..0xC3
 */
#define UNI_IS_INVARIANT(c)             (((UV)(c)) < 0x80)
#define UTF8_IS_INVARIANT(c)            UNI_IS_INVARIANT(NATIVE_TO_UTF(c))
#define NATIVE_IS_INVARIANT(c)          UNI_IS_INVARIANT(NATIVE_TO_ASCII(c))
#define UTF8_IS_START(c)                (((U8)(c)) >= 0xc0 && ((U8)(c)) <= 0xfd)
#define UTF8_IS_CONTINUATION(c)         (((U8)(c)) >= 0x80 && ((U8)(c)) <= 0xbf)
#define UTF8_IS_CONTINUED(c)            (((U8)(c)) & 0x80)
#define UTF8_IS_DOWNGRADEABLE_START(c)  ((((U8)(c)) & 0xfc) == 0xc0)
119
/* Building blocks for assembling and disassembling multi-byte sequences.
 *
 * UTF_START_MARK(len)  marker bits of the first byte of a len-byte
 *                      sequence: 0xC0 for 2, 0xE0 for 3, ... 0xFE for 7,
 *                      0xFF for anything above 7.  The shifted value is
 *                      masked to 8 bits so the result is always a byte
 *                      value, even before it is stored into a U8.
 * UTF_START_MASK(len)  mask of the payload bits in that first byte:
 *                      0x1F for 2, 0x0F for 3, ... 0 for len >= 7.
 *                      len is expected to be at least 2 (a smaller value
 *                      would produce a negative shift count).
 */
#define UTF_START_MARK(len) (((len) > 7) ? 0xFF : (0xFF & (0xFE << (7 - (len)))))
#define UTF_START_MASK(len) (((len) >= 7) ? 0x00 : (0x1F >> ((len) - 2)))

/* Each continuation byte is 0x80 plus 6 payload bits. */
#define UTF_CONTINUATION_MARK   0x80
#define UTF_ACCUMULATION_SHIFT  6
#define UTF_CONTINUATION_MASK   ((U8)0x3f)
/* Shift the value gathered so far left by 6 and append the low 6 bits
 * of the next byte. */
#define UTF8_ACCUMULATE(old, new) (((old) << UTF_ACCUMULATION_SHIFT) | (((U8)(new)) & UTF_CONTINUATION_MASK))

/* The two bytes that encode an 8-bit value c as a 2-byte sequence:
 * HI carries the top 2 bits under the 2-byte start marker, LO carries
 * the low 6 bits under the continuation marker. */
#define UTF8_EIGHT_BIT_HI(c) ((((U8)(c)) >> UTF_ACCUMULATION_SHIFT) | UTF_START_MARK(2))
#define UTF8_EIGHT_BIT_LO(c) ((((U8)(c)) & UTF_CONTINUATION_MASK) | UTF_CONTINUATION_MARK)
130
/* UNISKIP(uv): number of bytes in the encoding of code point uv.
 * The thresholds are the first value needing one more byte:
 * 0x80, 0x800, 0x10000, 0x200000, 0x4000000, 0x80000000.
 * With HAS_QUAD, values from UTF8_QUAD_MAX upward take 13 bytes;
 * without it everything from 0x80000000 upward is reported as 7.
 * Note that uv is evaluated several times. */
#ifdef HAS_QUAD
#  define UNISKIP(uv) ( (uv) < 0x80          ? 1 : \
                        (uv) < 0x800         ? 2 : \
                        (uv) < 0x10000       ? 3 : \
                        (uv) < 0x200000      ? 4 : \
                        (uv) < 0x4000000     ? 5 : \
                        (uv) < 0x80000000    ? 6 : \
                        (uv) < UTF8_QUAD_MAX ? 7 : 13 )
#else
#  define UNISKIP(uv) ( (uv) < 0x80          ? 1 : \
                        (uv) < 0x800         ? 2 : \
                        (uv) < 0x10000       ? 3 : \
                        (uv) < 0x200000      ? 4 : \
                        (uv) < 0x4000000     ? 5 : \
                        (uv) < 0x80000000    ? 6 : 7 )
#endif
148
149
150
151
152
153
154
/* isIDFIRST_lazy_if(p,c) / isALNUM_lazy_if(p,c)
 *
 * Classify the character at p.  The single-byte classifier
 * (isIDFIRST / isALNUM on *p) is used when any of these holds:
 *   - IN_BYTES is in effect,
 *   - c is false,
 *   - the byte at p is below 0xC0.
 * Otherwise the *_utf8 classifier is called with p as a const U8*.
 *
 * Both arguments are parenthesized before `!` and the cast are applied,
 * so expression arguments such as `s + 1` or `a && b` behave as written.
 * p may be evaluated more than once. */
#define isIDFIRST_lazy_if(p,c) ((IN_BYTES || !(c) || (*((const U8*)(p)) < 0xc0)) \
                                ? isIDFIRST(*(p)) \
                                : isIDFIRST_utf8((const U8*)(p)))
#define isALNUM_lazy_if(p,c)   ((IN_BYTES || !(c) || (*((const U8*)(p)) < 0xc0)) \
                                ? isALNUM(*(p)) \
                                : isALNUM_utf8((const U8*)(p)))
161
162
163#endif
164
165
166
/* Shorthands for the *_lazy_if forms with the condition fixed to 1. */
#define isIDFIRST_lazy(p)   isIDFIRST_lazy_if(p,1)
#define isALNUM_lazy(p)     isALNUM_lazy_if(p,1)
169
/* Longest encoded character, in bytes.  13 is also the largest entry
 * in PL_utf8skip and the largest result of UNISKIP. */
#define UTF8_MAXBYTES 13

/* Alternate spelling of UTF8_MAXBYTES. */
#define UTF8_MAXLEN UTF8_MAXBYTES

/* Size constants for case-mapping (UCLC) and case-folding (FOLD)
 * results; the values are fixed here and not derived from anything
 * else in this header. */
#define UTF8_MAXLEN_UCLC        3
#define UTF8_MAXLEN_UCLC_MULT   39
#define UTF8_MAXLEN_FOLD        3
#define UTF8_MAXLEN_FOLD_MULT   39

#define UTF8_MAXBYTES_CASE      6
190
/* IN_BYTES: the HINT_BYTES bit of the hints of PL_curcop (non-zero when
 * set).
 * DO_UTF8(sv): true when sv has its UTF8 flag set (SvUTF8) and IN_BYTES
 * is not in effect. */
#define IN_BYTES     (CopHINTS_get(PL_curcop) & HINT_BYTES)
#define DO_UTF8(sv)  (SvUTF8(sv) && !IN_BYTES)
193
/* UTF8_ALLOW_* flag bits.  Each individual flag is one bit in the low
 * byte; UTF8_ALLOW_ANY is all eight of them together. */
#define UTF8_ALLOW_EMPTY            0x0001
#define UTF8_ALLOW_CONTINUATION     0x0002
#define UTF8_ALLOW_NON_CONTINUATION 0x0004
#define UTF8_ALLOW_FE_FF            0x0008
#define UTF8_ALLOW_SHORT            0x0010
#define UTF8_ALLOW_SURROGATE        0x0020
#define UTF8_ALLOW_FFFF             0x0040
#define UTF8_ALLOW_LONG             0x0080
/* EMPTY | FE_FF | SURROGATE | FFFF */
#define UTF8_ALLOW_ANYUV            (UTF8_ALLOW_EMPTY|UTF8_ALLOW_FE_FF|\
                                     UTF8_ALLOW_SURROGATE|UTF8_ALLOW_FFFF)
#define UTF8_ALLOW_ANY              0x00FF
/* Separate bit, outside the UTF8_ALLOW_ANY mask. */
#define UTF8_CHECK_ONLY             0x0200
/* No allowances while WARN_UTF8 warnings are enabled; otherwise the
 * UTF8_ALLOW_ANYUV set. */
#define UTF8_ALLOW_DEFAULT          (ckWARN(WARN_UTF8) ? 0 : \
                                     UTF8_ALLOW_ANYUV)
208
/* Code points that get special treatment. */
#define UNICODE_SURROGATE_FIRST     0xD800
#define UNICODE_SURROGATE_LAST      0xDFFF
#define UNICODE_REPLACEMENT         0xFFFD
#define UNICODE_BYTE_ORDER_MARK     0xFEFF
#define UNICODE_ILLEGAL             0xFFFF

#define PERL_UNICODE_MAX            0x10FFFF

/* UNICODE_ALLOW_* flag bits; UNICODE_ALLOW_ANY is all four combined. */
#define UNICODE_ALLOW_SURROGATE     0x0001
#define UNICODE_ALLOW_FDD0          0x0002
#define UNICODE_ALLOW_FFFF          0x0004
#define UNICODE_ALLOW_SUPER         0x0008
#define UNICODE_ALLOW_ANY           0x000F

/* Predicates over a code point c.  UNICODE_IS_SURROGATE evaluates c
 * twice (inclusive range test); the others compare against a single
 * constant. */
#define UNICODE_IS_SURROGATE(c)         ((c) >= UNICODE_SURROGATE_FIRST && \
                                         (c) <= UNICODE_SURROGATE_LAST)
#define UNICODE_IS_REPLACEMENT(c)       ((c) == UNICODE_REPLACEMENT)
#define UNICODE_IS_BYTE_ORDER_MARK(c)   ((c) == UNICODE_BYTE_ORDER_MARK)
#define UNICODE_IS_ILLEGAL(c)           ((c) == UNICODE_ILLEGAL)
230
/* UTF8_QUAD_MAX (only with HAS_QUAD): 2**36, the bound UNISKIP uses to
 * separate 7-byte from 13-byte encodings. */
#ifdef HAS_QUAD
#  define UTF8_QUAD_MAX UINT64_C(0x1000000000)
#endif
234
/* Alias: UTF8_IS_ASCII(c) is exactly UTF8_IS_INVARIANT(c). */
#define UTF8_IS_ASCII(c) UTF8_IS_INVARIANT(c)
236
/* Named code points used by the case-folding code. */
#define UNICODE_LATIN_SMALL_LETTER_SHARP_S      0x00DF
#define UNICODE_GREEK_CAPITAL_LETTER_SIGMA      0x03A3
#define UNICODE_GREEK_SMALL_LETTER_FINAL_SIGMA  0x03C2
#define UNICODE_GREEK_SMALL_LETTER_SIGMA        0x03C3

/* Value used for sharp s in the EBCDIC variant of ANYOF_FOLD_SHARP_S. */
#define EBCDIC_LATIN_SMALL_LETTER_SHARP_S       0x0059
243
/* UNI_DISPLAY_* flag bits.  QQ and REGEX are both defined as
 * ISPRINT|BACKSLASH and therefore have the same value. */
#define UNI_DISPLAY_ISPRINT     0x0001
#define UNI_DISPLAY_BACKSLASH   0x0002
#define UNI_DISPLAY_QQ          (UNI_DISPLAY_ISPRINT|UNI_DISPLAY_BACKSLASH)
#define UNI_DISPLAY_REGEX       (UNI_DISPLAY_ISPRINT|UNI_DISPLAY_BACKSLASH)
248
/* ANYOF_FOLD_SHARP_S(node, input, end)
 *
 * True when all of the following hold:
 *   - the node's bitmap has the sharp-s bit set (the EBCDIC or the
 *     Unicode value, depending on the build),
 *   - the node's flags include both ANYOF_UNICODE and ANYOF_FOLD,
 *   - at least two bytes are available (end > input + 1),
 *   - input[0] and input[1] both lower-case to 's'.
 * The two variants differ only in the bitmap index that is tested.
 * node and input are evaluated more than once. */
#ifdef EBCDIC
#  define ANYOF_FOLD_SHARP_S(node, input, end) \
        (ANYOF_BITMAP_TEST(node, EBCDIC_LATIN_SMALL_LETTER_SHARP_S) && \
         (ANYOF_FLAGS(node) & ANYOF_UNICODE) && \
         (ANYOF_FLAGS(node) & ANYOF_FOLD) && \
         ((end) > (input) + 1) && \
         toLOWER((input)[0]) == 's' && \
         toLOWER((input)[1]) == 's')
#else
#  define ANYOF_FOLD_SHARP_S(node, input, end) \
        (ANYOF_BITMAP_TEST(node, UNICODE_LATIN_SMALL_LETTER_SHARP_S) && \
         (ANYOF_FLAGS(node) & ANYOF_UNICODE) && \
         (ANYOF_FLAGS(node) & ANYOF_FOLD) && \
         ((end) > (input) + 1) && \
         toLOWER((input)[0]) == 's' && \
         toLOWER((input)[1]) == 's')
#endif
/* Presumably the number of input bytes ("ss") to step over after a
 * successful ANYOF_FOLD_SHARP_S match -- confirm against callers. */
#define SHARP_S_SKIP 2
267
268#ifdef EBCDIC
269
270#else
/* Validity checks for 1- and 2-byte sequences starting at p.
 * p is expected to point at unsigned bytes; the comparisons against
 * values >= 0x80 rely on that.
 *   1 byte : 0x00..0x7F
 *   2 bytes: lead 0xC2..0xDF (0xC0 and 0xC1 are excluded),
 *            then one continuation byte 0x80..0xBF */
#define IS_UTF8_CHAR_1(p) \
        ((p)[0] <= 0x7F)
#define IS_UTF8_CHAR_2(p) \
        ((p)[0] >= 0xC2 && (p)[0] <= 0xDF && \
         (p)[1] >= 0x80 && (p)[1] <= 0xBF)
/* Validity checks for 3-byte sequences, split by lead byte.
 * p is expected to point at unsigned bytes.
 *   3a: lead 0xE0,       second byte 0xA0..0xBF (lower values rejected)
 *   3b: lead 0xE1..0xEC, second byte 0x80..0xBF
 *   3c: lead 0xED,       second byte 0x80..0xBF -- the full range, so
 *       the encodings of 0xD800..0xDFFF (0xED 0xA0..0xBF ..) pass
 *   3d: lead 0xEE..0xEF, second byte 0x80..0xBF
 * The third byte is always 0x80..0xBF. */
#define IS_UTF8_CHAR_3a(p) \
        ((p)[0] == 0xE0 && \
         (p)[1] >= 0xA0 && (p)[1] <= 0xBF && \
         (p)[2] >= 0x80 && (p)[2] <= 0xBF)
#define IS_UTF8_CHAR_3b(p) \
        ((p)[0] >= 0xE1 && (p)[0] <= 0xEC && \
         (p)[1] >= 0x80 && (p)[1] <= 0xBF && \
         (p)[2] >= 0x80 && (p)[2] <= 0xBF)
#define IS_UTF8_CHAR_3c(p) \
        ((p)[0] == 0xED && \
         (p)[1] >= 0x80 && (p)[1] <= 0xBF && \
         (p)[2] >= 0x80 && (p)[2] <= 0xBF)
#define IS_UTF8_CHAR_3d(p) \
        ((p)[0] >= 0xEE && (p)[0] <= 0xEF && \
         (p)[1] >= 0x80 && (p)[1] <= 0xBF && \
         (p)[2] >= 0x80 && (p)[2] <= 0xBF)
/* Validity checks for 4-byte sequences with lead bytes 0xF0..0xF3.
 * p is expected to point at unsigned bytes.
 *   4a: lead 0xF0,       second byte 0x90..0xBF (lower values rejected)
 *   4b: lead 0xF1..0xF3, second byte 0x80..0xBF
 * The third and fourth bytes are always 0x80..0xBF. */
#define IS_UTF8_CHAR_4a(p) \
        ((p)[0] == 0xF0 && \
         (p)[1] >= 0x90 && (p)[1] <= 0xBF && \
         (p)[2] >= 0x80 && (p)[2] <= 0xBF && \
         (p)[3] >= 0x80 && (p)[3] <= 0xBF)
#define IS_UTF8_CHAR_4b(p) \
        ((p)[0] >= 0xF1 && (p)[0] <= 0xF3 && \
         (p)[1] >= 0x80 && (p)[1] <= 0xBF && \
         (p)[2] >= 0x80 && (p)[2] <= 0xBF && \
         (p)[3] >= 0x80 && (p)[3] <= 0xBF)
305
306
307
308
309
310
311
312
/* Validity check for 4-byte sequences with lead bytes 0xF4..0xF7,
 * followed by three continuation bytes 0x80..0xBF.
 *
 * The lower bound must be `>=`: with `== 0xF4` the `<= 0xF7` test could
 * never matter and lead bytes 0xF5..0xF7 were rejected, although
 * PL_utf8skip gives them length 4 and UNISKIP assigns 4 bytes to every
 * value below 0x200000.  p is expected to point at unsigned bytes. */
#define IS_UTF8_CHAR_4c(p) \
        ((p)[0] >= 0xF4 && (p)[0] <= 0xF7 && \
         (p)[1] >= 0x80 && (p)[1] <= 0xBF && \
         (p)[2] >= 0x80 && (p)[2] <= 0xBF && \
         (p)[3] >= 0x80 && (p)[3] <= 0xBF)
318
/* A 3-byte (resp. 4-byte) sequence is valid when any one of the
 * per-lead-byte checks accepts it.  The lead-byte ranges of the
 * sub-checks do not overlap, so at most one of them can match. */
#define IS_UTF8_CHAR_3(p) \
        (IS_UTF8_CHAR_3a(p) || \
         IS_UTF8_CHAR_3b(p) || \
         IS_UTF8_CHAR_3c(p) || \
         IS_UTF8_CHAR_3d(p))
#define IS_UTF8_CHAR_4(p) \
        (IS_UTF8_CHAR_4a(p) || \
         IS_UTF8_CHAR_4b(p) || \
         IS_UTF8_CHAR_4c(p))
328
329
330
331
332
333
/* IS_UTF8_CHAR(p, n): check that the n bytes at p form one valid
 * character, dispatching on n to the fixed-length check.  Any n other
 * than 1, 2, 3 or 4 yields 0.  n may be evaluated up to four times. */
#define IS_UTF8_CHAR(p, n) \
        ((n) == 1 ? IS_UTF8_CHAR_1(p) : \
         (n) == 2 ? IS_UTF8_CHAR_2(p) : \
         (n) == 3 ? IS_UTF8_CHAR_3(p) : \
         (n) == 4 ? IS_UTF8_CHAR_4(p) : 0)
339
/* True when a length n is at most 4, the largest length IS_UTF8_CHAR
 * dispatches on. */
#define IS_UTF8_CHAR_FAST(n) ((n) <= 4)
341
342#endif
343