/*
 * errcpy(err, msg): copy the message string `msg` into the error buffer
 * `err` via strlcpy(), bounded by ONIG_MAX_ERROR_MESSAGE_LEN.
 * NOTE(review): callers are presumably expected to supply a buffer of at
 * least ONIG_MAX_ERROR_MESSAGE_LEN bytes -- confirm at the call sites.
 */
23 #define errcpy(err, msg) strlcpy((err), (msg), ONIG_MAX_ERROR_MESSAGE_LEN)
/*
 * BEG(no) / END(no): shorthand for the `beg` and `end` entries of capture
 * group `no`.  Both macros expand to an access through a variable named
 * `regs`, so they can only be used where such a pointer is in scope.
 * Code in this file compares BEG(n) against -1 before using a group;
 * NOTE(review): -1 appears to mean "group did not participate" -- confirm.
 */
25 #define BEG(no) (regs->beg[(no)])
26 #define END(no) (regs->end[(no)])
29 static const char casetable[] = {
30 '\000',
'\001',
'\002',
'\003',
'\004',
'\005',
'\006',
'\007',
31 '\010',
'\011',
'\012',
'\013',
'\014',
'\015',
'\016',
'\017',
32 '\020',
'\021',
'\022',
'\023',
'\024',
'\025',
'\026',
'\027',
33 '\030',
'\031',
'\032',
'\033',
'\034',
'\035',
'\036',
'\037',
35 '\040',
'\041',
'\042',
'\043',
'\044',
'\045',
'\046',
'\047',
37 '\050',
'\051',
'\052',
'\053',
'\054',
'\055',
'\056',
'\057',
39 '\060',
'\061',
'\062',
'\063',
'\064',
'\065',
'\066',
'\067',
41 '\070',
'\071',
'\072',
'\073',
'\074',
'\075',
'\076',
'\077',
43 '\100',
'\141',
'\142',
'\143',
'\144',
'\145',
'\146',
'\147',
45 '\150',
'\151',
'\152',
'\153',
'\154',
'\155',
'\156',
'\157',
47 '\160',
'\161',
'\162',
'\163',
'\164',
'\165',
'\166',
'\167',
49 '\170',
'\171',
'\172',
'\133',
'\134',
'\135',
'\136',
'\137',
51 '\140',
'\141',
'\142',
'\143',
'\144',
'\145',
'\146',
'\147',
53 '\150',
'\151',
'\152',
'\153',
'\154',
'\155',
'\156',
'\157',
55 '\160',
'\161',
'\162',
'\163',
'\164',
'\165',
'\166',
'\167',
57 '\170',
'\171',
'\172',
'\173',
'\174',
'\175',
'\176',
'\177',
58 '\200',
'\201',
'\202',
'\203',
'\204',
'\205',
'\206',
'\207',
59 '\210',
'\211',
'\212',
'\213',
'\214',
'\215',
'\216',
'\217',
60 '\220',
'\221',
'\222',
'\223',
'\224',
'\225',
'\226',
'\227',
61 '\230',
'\231',
'\232',
'\233',
'\234',
'\235',
'\236',
'\237',
62 '\240',
'\241',
'\242',
'\243',
'\244',
'\245',
'\246',
'\247',
63 '\250',
'\251',
'\252',
'\253',
'\254',
'\255',
'\256',
'\257',
64 '\260',
'\261',
'\262',
'\263',
'\264',
'\265',
'\266',
'\267',
65 '\270',
'\271',
'\272',
'\273',
'\274',
'\275',
'\276',
'\277',
66 '\300',
'\301',
'\302',
'\303',
'\304',
'\305',
'\306',
'\307',
67 '\310',
'\311',
'\312',
'\313',
'\314',
'\315',
'\316',
'\317',
68 '\320',
'\321',
'\322',
'\323',
'\324',
'\325',
'\326',
'\327',
69 '\330',
'\331',
'\332',
'\333',
'\334',
'\335',
'\336',
'\337',
70 '\340',
'\341',
'\342',
'\343',
'\344',
'\345',
'\346',
'\347',
71 '\350',
'\351',
'\352',
'\353',
'\354',
'\355',
'\356',
'\357',
72 '\360',
'\361',
'\362',
'\363',
'\364',
'\365',
'\366',
'\367',
73 '\370',
'\371',
'\372',
'\373',
'\374',
'\375',
'\376',
'\377',
76 # error >>> "You lose. You will need a translation table for your character set." <<<
82 const unsigned char *p1 = x, *p2 = y;
86 if ((tmp = casetable[(
unsigned)*p1++] - casetable[(
unsigned)*p2++]))
97 return memcmp(p1, p2, len);
103 const unsigned char *x = xs, *xe = xs + m;
104 const unsigned char *y = ys, *ye = ys + n;
106 # if SIZEOF_VALUE == 8
107 # define VALUE_MAX 0xFFFFFFFFFFFFFFFFULL
108 # elif SIZEOF_VALUE == 4
109 # define VALUE_MAX 0xFFFFFFFFUL
115 rb_bug(
"!!too long pattern string!!");
118 for (hx = *x++, hy = *y++; x < xe; ++x, ++y) {
139 const unsigned char *x = xs, *xe = xs + m;
140 const unsigned char *y = ys;
144 for (i = 0; i < 256; ++
i)
147 qstable[*x] = xe - x;
149 for (; y + m <= ys + n; y += *(qstable + y[m])) {
150 if (*xs == *y &&
memcmp(xs, y, m) == 0)
156 static inline unsigned int
159 register const unsigned int mix = 8353;
160 register unsigned int h = *x;
185 return (
unsigned char)h;
191 const unsigned char *x = xs, *xe = xs + m;
192 const unsigned char *y = ys;
196 for (i = 0; i < 512; ++
i) {
199 for (; x < xe; ++x) {
204 if (*xs == *y &&
memcmp(xs, y, m) == 0)
213 const unsigned char *x = x0, *y = y0;
215 if (m > n)
return -1;
217 return memcmp(x0, y0, m) == 0 ? 0 : -1;
223 const unsigned char *ys = y, *ye = ys + n;
224 for (; y < ye; ++y) {
/*
 * Per-object flag aliases mapped onto the generic FL_USER* bits.
 * NOTE(review): the names suggest "regexp created from a literal" and
 * "regexp has no fixed encoding" -- verify against the code that sets them.
 */
241 #define REG_LITERAL FL_USER5
242 #define REG_ENCODING_NONE FL_USER6
/* NOTE(review): presumably marks a regexp whose encoding is fixed -- confirm. */
244 #define KCODE_FIXED FL_USER4
/* Union of the three Oniguruma option bits grouped together as "regexp options". */
246 #define ARG_REG_OPTION_MASK \
247 (ONIG_OPTION_IGNORECASE|ONIG_OPTION_MULTILINE|ONIG_OPTION_EXTEND)
/*
 * Extra bits tested in the same `options` integer (e.g.
 * `options & ARG_ENCODING_FIXED`).
 * NOTE(review): assumed not to collide with the ONIG_OPTION_* bits in
 * ARG_REG_OPTION_MASK -- confirm against the Oniguruma header.
 */
248 #define ARG_ENCODING_FIXED 16
249 #define ARG_ENCODING_NONE 32
322 const char *
p, *pend;
327 p = s; pend = p +
len;
335 p +=
mbclen(p, pend, enc);
363 if (c ==
'\\' && p+clen < pend) {
364 int n = clen +
mbclen(p+clen, pend, enc);
377 c = (
unsigned char)*p;
397 snprintf(b,
sizeof(b),
"\\x%02X", c);
518 options =
RREGEXP(re)->ptr->options;
522 if (len >= 4 && ptr[0] ==
'(' && ptr[1] ==
'?') {
525 if ((len -= 2) > 0) {
537 if (len > 1 && *ptr ==
'-') {
556 if (*ptr ==
':' && ptr[len-1] ==
')') {
566 options =
RREGEXP(re)->ptr->options;
574 if ((options & embeddable) != embeddable) {
686 int back_num,
int *back_refs,
OnigRegex regex,
void *
arg)
720 int back_num,
int *back_refs,
OnigRegex regex,
void *
arg)
726 for(i = 0; i < back_num; i++)
768 OnigErrorInfo* einfo,
const char *sourcefile,
int sourceline)
778 r =
onig_compile(*reg, pattern, pattern_end, einfo, sourcefile, sourceline);
789 const char *sourcefile,
int sourceline)
850 #if SIZEOF_LONG > SIZEOF_INT
851 return diff ? diff > 0 ? 1 : -1 : 0;
901 for (i = 0; i < num_pos; i++) {
931 if (!
RMATCH(match)->regexp) {
942 if (obj == orig)
return obj;
984 return RMATCH(match)->regexp;
1037 switch(
TYPE(backref)) {
1051 (
const unsigned char*)name,
1052 (
const unsigned char*)name +
strlen(name),
/*
 * Flag alias on the FL_USER2 bit.
 * NOTE(review): presumably marks a match object that is still in use and
 * must not be recycled -- confirm where the flag is set and tested.
 */
1174 #define MATCH_BUSY FL_USER2
1229 "incompatible encoding regexp match (%s regexp with %s string)",
1241 "invalid byte sequence in %s",
1248 if (
RREGEXP(re)->ptr->enc != enc) {
1253 if (
RREGEXP(re)->ptr->enc != enc &&
1263 rb_warn(
"regexp match /.../n against to %s string",
1276 const char *pattern;
1281 if (reg->
enc == enc)
return reg;
1291 if (unescaped ==
Qnil) {
1355 tmpreg = reg !=
RREGEXP(re)->ptr;
1356 if (!tmpreg)
RREGEXP(re)->usecnt++;
1359 if (!
NIL_P(match)) {
1379 if (!tmpreg)
RREGEXP(re)->usecnt--;
1416 RMATCH(match)->regexp = re;
1417 RMATCH(match)->rmatch->char_offset_updated = 0;
1438 if (nth <= 0)
return Qnil;
1459 if (nth <= 0)
return Qnil;
1462 if (start == -1)
return Qnil;
1497 if (
BEG(0) == -1)
return Qnil;
1525 if (
BEG(0) == -1)
return Qnil;
1526 str =
RMATCH(match)->str;
1542 if (
BEG(0) == -1)
return Qnil;
1544 for (i=regs->
num_regs-1;
BEG(i) == -1 && i > 0; i--)
1546 if (i == 0)
return Qnil;
1586 target =
RMATCH(match)->str;
1588 for (i=start; i<regs->
num_regs; i++) {
1589 if (regs->
beg[i] == -1) {
1659 (
const unsigned char* )name, (
const unsigned char* )name_end, regs);
1714 switch (
TYPE(idx)) {
1717 goto name_to_backref;
1806 return RMATCH(match)->str;
1816 int back_num,
int *back_refs,
OnigRegex regex,
void *arg0)
1821 for (i = 0; i < back_num; i++) {
1823 arg[back_refs[
i]].
len = name_end -
name;
1860 return rb_sprintf(
"#<%s:%p>", cname, (
void*)match);
1872 for (i = 0; i < num_regs; i++) {
1899 const char *
p = *pp;
1901 int meta_prefix = 0, ctrl_prefix = 0;
1904 if (p == end || *p++ !=
'\\') {
1905 errcpy(err,
"too short escaped multibyte character");
1911 errcpy(err,
"too short escape sequence");
1915 case '\\': code =
'\\';
break;
1916 case 'n': code =
'\n';
break;
1917 case 't': code =
'\t';
break;
1918 case 'r': code =
'\r';
break;
1919 case 'f': code =
'\f';
break;
1920 case 'v': code =
'\013';
break;
1921 case 'a': code =
'\007';
break;
1922 case 'e': code =
'\033';
break;
1925 case '0':
case '1':
case '2':
case '3':
1926 case '4':
case '5':
case '6':
case '7':
1928 code =
scan_oct(p, end < p+3 ? end-p : 3, &len);
1933 code =
scan_hex(p, end < p+2 ? end-p : 2, &len);
1935 errcpy(err,
"invalid hex escape");
1943 errcpy(err,
"duplicate meta escape");
1947 if (p+1 < end && *p++ ==
'-' && (*p & 0x80) == 0) {
1957 errcpy(err,
"too short meta escape");
1961 if (p == end || *p++ !=
'-') {
1962 errcpy(err,
"too short control escape");
1967 errcpy(err,
"duplicate control escape");
1971 if (p < end && (*p & 0x80) == 0) {
1981 errcpy(err,
"too short control escape");
1985 errcpy(err,
"unexpected escape sequence");
1988 if (code < 0 || 0xff < code) {
1989 errcpy(err,
"invalid escape code");
2006 const char *
p = *pp;
2008 char *chbuf =
ALLOCA_N(
char, chmaxlen);
2013 memset(chbuf, 0, chmaxlen);
2020 chbuf[chlen++] = byte;
2021 while (chlen < chmaxlen &&
2027 chbuf[chlen++] = byte;
2032 errcpy(err,
"invalid multibyte escape");
2035 if (1 < chlen || (chbuf[0] & 0x80)) {
2040 else if (*encp != enc) {
2041 errcpy(err,
"escaped non ASCII character in UTF-8 regexp");
2047 snprintf(escbuf,
sizeof(escbuf),
"\\x%02X", chbuf[0]&0xff);
2057 if ((0xd800 <= code && code <= 0xdfff) ||
2059 errcpy(err,
"invalid Unicode range");
2073 snprintf(escbuf,
sizeof(escbuf),
"\\x%02X", (
int)uv);
2085 errcpy(err,
"UTF-8 character in non UTF-8 regexp");
2096 const char *
p = *pp;
2097 int has_unicode = 0;
2101 while (p < end &&
ISSPACE(*p)) p++;
2108 errcpy(err,
"invalid Unicode range");
2116 while (p < end &&
ISSPACE(*p)) p++;
2119 if (has_unicode == 0) {
2120 errcpy(err,
"invalid Unicode list");
2133 const char *
p = *pp;
2138 errcpy(err,
"invalid Unicode escape");
2143 errcpy(err,
"invalid Unicode escape");
2163 errcpy(err,
"invalid multibyte character");
2167 if (1 < chlen || (*p & 0x80)) {
2172 else if (*encp != enc) {
2173 errcpy(err,
"non ASCII character in UTF-8 regexp");
2182 errcpy(err,
"too short escape sequence");
2186 case '1':
case '2':
case '3':
2187 case '4':
case '5':
case '6':
case '7':
2212 errcpy(err,
"too short escape sequence");
2220 if (p == end || *p++ !=
'}') {
2221 errcpy(err,
"invalid Unicode list");
2263 int has_property = 0;
2277 if (has_property && !*fixed_enc) {
2333 src_enc != ascii8bit) {
2337 src_enc = ascii8bit;
2349 if (fixed_enc != 0) {
2350 if (regexp_enc != 0 && regexp_enc != fixed_enc) {
2354 regexp_enc = fixed_enc;
2372 const char *sourcefile,
int sourceline)
2389 errcpy(err,
"can't make regexp with dummy encoding");
2394 if (unescaped ==
Qnil)
2400 errcpy(err,
"incompatible character encoding");
2403 if (fixed_enc != a_enc) {
2413 if ((options & ARG_ENCODING_FIXED) || fixed_enc) {
2422 sourcefile, sourceline);
2423 if (!re->ptr)
return -1;
2432 const char *sourcefile,
int sourceline)
2438 if (enc != ascii8bit) {
2440 errcpy(err,
"/.../n has a non escaped non ASCII character in non ASCII-8BIT script");
2447 options, err, sourcefile, sourceline);
2535 volatile VALUE save_str = str;
2565 hashval =
RREGEXP(re)->ptr->options;
2589 if (re1 == re2)
return Qtrue;
2637 if (match1 == match2)
return Qtrue;
2657 if (check &&
NIL_P(tmp)) {
2740 if (pos < 0)
return Qnil;
2846 if (
rb_scan_args(argc, argv,
"11", &str, &initpos) == 2) {
2906 if (argc == 0 || argc > 3) {
2931 if (argc == 3 && !
NIL_P(argv[2])) {
2933 if (kcode[0] ==
'n' || kcode[0] ==
'N') {
2938 rb_warn(
"encoding option is ignored - %s", kcode);
2966 s +=
mbclen(s, send, enc);
2970 case '[':
case ']':
case '{':
case '}':
2971 case '(':
case ')':
case '|':
case '-':
2972 case '*':
case '.':
case '\\':
2973 case '?':
case '+':
case '^':
case '$':
2975 case '\t':
case '\f':
case '\v':
case '\n':
case '\r':
3002 int n =
mbclen(s, send, enc);
3010 case '[':
case ']':
case '{':
case '}':
3011 case '(':
case ')':
case '|':
case '-':
3012 case '*':
case '.':
case '\\':
3013 case '?':
case '+':
case '^':
case '$':
3121 else if (argc == 1) {
3137 int has_asciionly = 0;
3141 for (i = 0; i <
argc; i++) {
3152 if (!has_ascii_incompat)
3153 has_ascii_incompat = enc;
3154 else if (has_ascii_incompat != enc)
3159 if (!has_ascii_compat_fixed)
3160 has_ascii_compat_fixed = enc;
3161 else if (has_ascii_compat_fixed != enc)
3175 if (!has_ascii_incompat)
3176 has_ascii_incompat = enc;
3177 else if (has_ascii_incompat != enc)
3185 if (!has_ascii_compat_fixed)
3186 has_ascii_compat_fixed = enc;
3187 else if (has_ascii_compat_fixed != enc)
3193 if (has_ascii_incompat) {
3194 if (has_asciionly) {
3198 if (has_ascii_compat_fixed) {
3210 if (has_ascii_incompat) {
3211 result_enc = has_ascii_incompat;
3213 else if (has_ascii_compat_fixed) {
3214 result_enc = has_ascii_compat_fixed;
3262 if (copy == re)
return copy;
3272 err,
NULL, 0) != 0) {
/*
 * ASCGET(s, e, cl): fetch the character at `s` as an ASCII code, or -1.
 *
 * Relies on two names being in scope at the expansion site: `acompat` and
 * `str_enc`.
 *   - When `acompat` is true: stores 1 into *cl and yields (s)[0] if it
 *     satisfies ISASCII(), otherwise -1.
 *   - Otherwise: defers to rb_enc_ascget((s), (e), (cl), str_enc).
 *
 * `s` is evaluated more than once on the `acompat` path, so do not pass an
 * argument with side effects.
 */
3287 #define ASCGET(s,e,cl) (acompat ? (*(cl)=1,ISASCII((s)[0])?(s)[0]:-1) : rb_enc_ascget((s), (e), (cl), str_enc))
3293 int c =
ASCGET(s, e, &clen);
3297 s +=
mbclen(s, e, str_enc);
3303 if (c !=
'\\' || s == e)
continue;
3312 s +=
mbclen(s, e, str_enc);
3321 case '1':
case '2':
case '3':
case '4':
3322 case '5':
case '6':
case '7':
case '8':
case '9':
3332 if (s < e &&
ASCGET(s, e, &clen) ==
'<') {
3333 char *
name, *name_end;
3335 name_end = name = s + clen;
3336 while (name_end < e) {
3337 c =
ASCGET(name_end, e, &clen);
3338 if (c ==
'>')
break;
3339 name_end += c == -1 ?
mbclen(name_end, e, str_enc) : clen;
3343 p = s = name_end + clen;
3369 while (
BEG(no) == -1 && no > 0) no--;
3370 if (no == 0)
continue;
3383 if (no >= regs->
num_regs)
continue;
3384 if (
BEG(no) == -1)
continue;
3389 if (!val)
return str;
3400 rb_warn(
"variable $KCODE is no longer effective");
3407 rb_warn(
"variable $KCODE is no longer effective; ignored");
3413 rb_warn(
"variable $= is no longer effective");
3420 rb_warn(
"variable $= is no longer effective; ignored");
3473 if (argc > 0 &&
rb_scan_args(argc, argv,
"01", &nth) == 1) {