Ruby 1.9.3p392 (2013-02-22 revision 39386)
string.c
Go to the documentation of this file.
1 /**********************************************************************
2 
3  string.c -
4 
5  $Author: usa $
6  created at: Mon Aug 9 17:12:58 JST 1993
7 
8  Copyright (C) 1993-2007 Yukihiro Matsumoto
9  Copyright (C) 2000 Network Applied Communication Laboratory, Inc.
10  Copyright (C) 2000 Information-technology Promotion Agency, Japan
11 
12 **********************************************************************/
13 
14 #include "ruby/ruby.h"
15 #include "ruby/re.h"
16 #include "ruby/encoding.h"
17 #include "internal.h"
18 #include <assert.h>
19 
20 #define BEG(no) (regs->beg[(no)])
21 #define END(no) (regs->end[(no)])
22 
23 #include <math.h>
24 #include <ctype.h>
25 
26 #ifdef HAVE_UNISTD_H
27 #include <unistd.h>
28 #endif
29 
30 #define numberof(array) (int)(sizeof(array) / sizeof((array)[0]))
31 
32 #undef rb_str_new_cstr
33 #undef rb_tainted_str_new_cstr
34 #undef rb_usascii_str_new_cstr
35 #undef rb_external_str_new_cstr
36 #undef rb_locale_str_new_cstr
37 #undef rb_str_new2
38 #undef rb_str_new3
39 #undef rb_str_new4
40 #undef rb_str_new5
41 #undef rb_tainted_str_new2
42 #undef rb_usascii_str_new2
43 #undef rb_str_dup_frozen
44 #undef rb_str_buf_new_cstr
45 #undef rb_str_buf_new2
46 #undef rb_str_buf_cat2
47 #undef rb_str_cat2
48 
49 static VALUE rb_str_clear(VALUE str);
50 
53 
/* NOTE(review): presumably an upper bound on bytes per character; it is not
 * used anywhere in this part of the file -- confirm against its users. */
54 #define RUBY_MAX_CHAR_LEN 16
/* Flag tested by str_modifiable(): when set, modification raises
 * "can't modify string; temporarily locked". */
55 #define STR_TMPLOCK FL_USER7
/* Set when the string does not use the embedded representation
 * (STR_EMBED_P is simply the negation of this flag). */
56 #define STR_NOEMBED FL_USER1
57 #define STR_SHARED FL_USER2 /* = ELTS_SHARED */
/* Set when as.heap.aux.shared holds an associated object rather than a
 * capacity (see the STR_ASSOC_P users later in this file). */
58 #define STR_ASSOC FL_USER3
/* True only when the string is non-embedded AND flagged ELTS_SHARED. */
59 #define STR_SHARED_P(s) FL_ALL((s), STR_NOEMBED|ELTS_SHARED)
/* True only when the string is non-embedded AND flagged STR_ASSOC. */
60 #define STR_ASSOC_P(s) FL_ALL((s), STR_NOEMBED|STR_ASSOC)
/* Flag combination describing a heap string whose aux slot is not a capacity. */
61 #define STR_NOCAPA (STR_NOEMBED|ELTS_SHARED|STR_ASSOC)
/* True when the string is non-embedded and either shared or associated,
 * i.e. as.heap.aux.capa must not be read or written as a capacity. */
62 #define STR_NOCAPA_P(s) (FL_TEST((s),STR_NOEMBED) && FL_ANY((s),ELTS_SHARED|STR_ASSOC))
/* Clears the shared/assoc flags, but only on a non-embedded string. */
63 #define STR_UNSET_NOCAPA(s) do {\
64  if (FL_TEST((s),STR_NOEMBED)) FL_UNSET((s),(ELTS_SHARED|STR_ASSOC));\
65 } while (0)
66 
67 
/* Switch a string to the non-embedded representation: set STR_NOEMBED and
 * reset the embedded-length bits in the flags word to zero. */
68 #define STR_SET_NOEMBED(str) do {\
69  FL_SET((str), STR_NOEMBED);\
70  STR_SET_EMBED_LEN((str), 0);\
71 } while (0)
/* Switch a string back to the embedded representation (flag only; the
 * caller is responsible for content and length). */
72 #define STR_SET_EMBED(str) FL_UNSET((str), STR_NOEMBED)
/* True when the string uses the embedded representation. */
73 #define STR_EMBED_P(str) (!FL_TEST((str), STR_NOEMBED))
/* Store n as the embedded length, which lives in the object's flags word
 * under RSTRING_EMBED_LEN_MASK / RSTRING_EMBED_LEN_SHIFT.  n is evaluated
 * exactly once via the local tmp_n. */
74 #define STR_SET_EMBED_LEN(str, n) do { \
75  long tmp_n = (n);\
76  RBASIC(str)->flags &= ~RSTRING_EMBED_LEN_MASK;\
77  RBASIC(str)->flags |= (tmp_n) << RSTRING_EMBED_LEN_SHIFT;\
78 } while (0)
79 
/* Set the byte length of str to n, writing it to the flags word for an
 * embedded string or to as.heap.len otherwise. */
80 #define STR_SET_LEN(str, n) do { \
81  if (STR_EMBED_P(str)) {\
82  STR_SET_EMBED_LEN((str), (n));\
83  }\
84  else {\
85  RSTRING(str)->as.heap.len = (n);\
86  }\
87 } while (0)
88 
/* Decrement the byte length of str by one, in whichever place the current
 * representation stores it.  No underflow check is performed here. */
89 #define STR_DEC_LEN(str) do {\
90  if (STR_EMBED_P(str)) {\
91  long n = RSTRING_LEN(str);\
92  n--;\
93  STR_SET_EMBED_LEN((str), n);\
94  }\
95  else {\
96  RSTRING(str)->as.heap.len--;\
97  }\
98 } while (0)
99 
/*
 * Resize the buffer of str so it can hold `capacity' bytes plus one extra
 * byte (capacity+1 is what is actually allocated).
 *
 * - Embedded string: nothing happens unless capacity exceeds
 *   RSTRING_EMBED_LEN_MAX; in that case the content is copied to a freshly
 *   allocated heap buffer and the string is switched to non-embedded.  The
 *   length is read with RSTRING_LEN() before STR_SET_NOEMBED() is applied,
 *   i.e. while the string is still flagged as embedded.
 * - Heap string: the buffer is reallocated in place; aux.capa is updated
 *   only when the string is not shared/associated (STR_NOCAPA_P false).
 *
 * `capacity' is evaluated more than once, so it must be side-effect free.
 */
100 #define RESIZE_CAPA(str,capacity) do {\
101  if (STR_EMBED_P(str)) {\
102  if ((capacity) > RSTRING_EMBED_LEN_MAX) {\
103  char *tmp = ALLOC_N(char, (capacity)+1);\
104  memcpy(tmp, RSTRING_PTR(str), RSTRING_LEN(str));\
105  RSTRING(str)->as.heap.ptr = tmp;\
106  RSTRING(str)->as.heap.len = RSTRING_LEN(str);\
107  STR_SET_NOEMBED(str);\
108  RSTRING(str)->as.heap.aux.capa = (capacity);\
109  }\
110  }\
111  else {\
112  REALLOC_N(RSTRING(str)->as.heap.ptr, char, (capacity)+1);\
113  if (!STR_NOCAPA_P(str))\
114  RSTRING(str)->as.heap.aux.capa = (capacity);\
115  }\
116 } while (0)
117 
/* True when the (possibly freshly scanned) code range of str is 7-bit ASCII. */
118 #define is_ascii_string(str) (rb_enc_str_coderange(str) == ENC_CODERANGE_7BIT)
/* True when the (possibly freshly scanned) code range of str is BROKEN. */
119 #define is_broken_string(str) (rb_enc_str_coderange(str) == ENC_CODERANGE_BROKEN)
120 
/* Fetch the rb_encoding* for the encoding index stored on str. */
121 #define STR_ENC_GET(str) rb_enc_from_index(ENCODING_GET(str))
122 
123 static inline int
125 {
126  rb_encoding *enc;
127 
128  /* Conservative. It may be ENC_CODERANGE_UNKNOWN. */
129  if (ENC_CODERANGE(str) == ENC_CODERANGE_7BIT)
130  return 1;
131 
132  enc = STR_ENC_GET(str);
133  if (rb_enc_mbmaxlen(enc) == 1)
134  return 1;
135 
136  /* Conservative. Possibly single byte.
137  * "\xa1" in Shift_JIS for example. */
138  return 0;
139 }
140 
142 
/*
 * Scan the byte range [p, e) for the first byte that is not 7-bit ASCII.
 *
 * Returns a pointer to that byte, or NULL when every byte in the range is
 * ASCII.  When VALUE is 4 or 8 bytes wide and the range is longer than two
 * words, the aligned middle part is tested one word at a time against
 * NONASCII_MASK; the unaligned head and the tail are checked bytewise.
 *
 * Side effect at file level: defines NONASCII_MASK when SIZEOF_VALUE is
 * 4 or 8, which later code tests with #ifdef.
 */
static inline const char *
search_nonascii(const char *p, const char *e)
{
#if SIZEOF_VALUE == 8
# define NONASCII_MASK 0x8080808080808080ULL
#elif SIZEOF_VALUE == 4
# define NONASCII_MASK 0x80808080UL
#endif
#ifdef NONASCII_MASK
    if ((int)sizeof(VALUE) * 2 < e - p) {
        const VALUE align_mask = sizeof(VALUE) - 1;
        /* first word boundary at or after p, last word boundary at or before e */
        const VALUE *word = (const VALUE*)(~align_mask & ((VALUE)p + align_mask));
        const VALUE *word_end = (const VALUE*)(~align_mask & (VALUE)e);

        /* unaligned head, byte by byte */
        for (; p < (const char *)word; p++) {
            if (!ISASCII(*p))
                return p;
        }
        /* aligned body: stop at the first word with any high bit set */
        for (; word < word_end; word++) {
            if (*word & NONASCII_MASK)
                break;
        }
        /* resume the bytewise scan from that word (or from the tail) */
        p = (const char *)word;
    }
#endif
    for (; p < e; p++) {
        if (!ISASCII(*p))
            return p;
    }
    return NULL;
}
179 
180 static int
181 coderange_scan(const char *p, long len, rb_encoding *enc)
182 {
183  const char *e = p + len;
184 
185  if (rb_enc_to_index(enc) == 0) {
186  /* enc is ASCII-8BIT. ASCII-8BIT string never be broken. */
187  p = search_nonascii(p, e);
189  }
190 
191  if (rb_enc_asciicompat(enc)) {
192  p = search_nonascii(p, e);
193  if (!p) {
194  return ENC_CODERANGE_7BIT;
195  }
196  while (p < e) {
197  int ret = rb_enc_precise_mbclen(p, e, enc);
198  if (!MBCLEN_CHARFOUND_P(ret)) {
199  return ENC_CODERANGE_BROKEN;
200  }
201  p += MBCLEN_CHARFOUND_LEN(ret);
202  if (p < e) {
203  p = search_nonascii(p, e);
204  if (!p) {
205  return ENC_CODERANGE_VALID;
206  }
207  }
208  }
209  if (e < p) {
210  return ENC_CODERANGE_BROKEN;
211  }
212  return ENC_CODERANGE_VALID;
213  }
214 
215  while (p < e) {
216  int ret = rb_enc_precise_mbclen(p, e, enc);
217 
218  if (!MBCLEN_CHARFOUND_P(ret)) {
219  return ENC_CODERANGE_BROKEN;
220  }
221  p += MBCLEN_CHARFOUND_LEN(ret);
222  }
223  if (e < p) {
224  return ENC_CODERANGE_BROKEN;
225  }
226  return ENC_CODERANGE_VALID;
227 }
228 
229 long
230 rb_str_coderange_scan_restartable(const char *s, const char *e, rb_encoding *enc, int *cr)
231 {
232  const char *p = s;
233 
234  if (*cr == ENC_CODERANGE_BROKEN)
235  return e - s;
236 
237  if (rb_enc_to_index(enc) == 0) {
238  /* enc is ASCII-8BIT. ASCII-8BIT string never be broken. */
239  p = search_nonascii(p, e);
241  return e - s;
242  }
243  else if (rb_enc_asciicompat(enc)) {
244  p = search_nonascii(p, e);
245  if (!p) {
246  if (*cr != ENC_CODERANGE_VALID) *cr = ENC_CODERANGE_7BIT;
247  return e - s;
248  }
249  while (p < e) {
250  int ret = rb_enc_precise_mbclen(p, e, enc);
251  if (!MBCLEN_CHARFOUND_P(ret)) {
253  return p - s;
254  }
255  p += MBCLEN_CHARFOUND_LEN(ret);
256  if (p < e) {
257  p = search_nonascii(p, e);
258  if (!p) {
259  *cr = ENC_CODERANGE_VALID;
260  return e - s;
261  }
262  }
263  }
265  return p - s;
266  }
267  else {
268  while (p < e) {
269  int ret = rb_enc_precise_mbclen(p, e, enc);
270  if (!MBCLEN_CHARFOUND_P(ret)) {
272  return p - s;
273  }
274  p += MBCLEN_CHARFOUND_LEN(ret);
275  }
277  return p - s;
278  }
279 }
280 
281 static inline void
283 {
284  rb_enc_set_index(str1, ENCODING_GET(str2));
285 }
286 
287 static void
289 {
290  /* this function is designed for copying encoding and coderange
291  * from src to new string "dest" which is made from the part of src.
292  */
293  str_enc_copy(dest, src);
294  switch (ENC_CODERANGE(src)) {
295  case ENC_CODERANGE_7BIT:
297  break;
298  case ENC_CODERANGE_VALID:
299  if (!rb_enc_asciicompat(STR_ENC_GET(src)) ||
302  else
304  break;
305  default:
306  if (RSTRING_LEN(dest) == 0) {
307  if (!rb_enc_asciicompat(STR_ENC_GET(src)))
309  else
311  }
312  break;
313  }
314 }
315 
316 static void
318 {
319  str_enc_copy(dest, src);
320  ENC_CODERANGE_SET(dest, ENC_CODERANGE(src));
321 }
322 
323 int
325 {
326  int cr = ENC_CODERANGE(str);
327 
328  if (cr == ENC_CODERANGE_UNKNOWN) {
329  rb_encoding *enc = STR_ENC_GET(str);
330  cr = coderange_scan(RSTRING_PTR(str), RSTRING_LEN(str), enc);
331  ENC_CODERANGE_SET(str, cr);
332  }
333  return cr;
334 }
335 
336 int
338 {
339  rb_encoding *enc = STR_ENC_GET(str);
340 
341  if (!rb_enc_asciicompat(enc))
342  return FALSE;
343  else if (rb_enc_str_coderange(str) == ENC_CODERANGE_7BIT)
344  return TRUE;
345  return FALSE;
346 }
347 
348 static inline void
349 str_mod_check(VALUE s, const char *p, long len)
350 {
351  if (RSTRING_PTR(s) != p || RSTRING_LEN(s) != len){
352  rb_raise(rb_eRuntimeError, "string modified");
353  }
354 }
355 
356 size_t
358 {
359  if (STR_EMBED_P(str)) {
360  return RSTRING_EMBED_LEN_MAX;
361  }
362  else if (STR_NOCAPA_P(str)) {
363  return RSTRING(str)->as.heap.len;
364  }
365  else {
366  return RSTRING(str)->as.heap.aux.capa;
367  }
368 }
369 
370 static inline VALUE
372 {
373  NEWOBJ(str, struct RString);
374  OBJSETUP(str, klass, T_STRING);
375 
376  str->as.heap.ptr = 0;
377  str->as.heap.len = 0;
378  str->as.heap.aux.capa = 0;
379 
380  return (VALUE)str;
381 }
382 
383 static VALUE
384 str_new(VALUE klass, const char *ptr, long len)
385 {
386  VALUE str;
387 
388  if (len < 0) {
389  rb_raise(rb_eArgError, "negative string size (or size too big)");
390  }
391 
392  str = str_alloc(klass);
393  if (len > RSTRING_EMBED_LEN_MAX) {
394  RSTRING(str)->as.heap.aux.capa = len;
395  RSTRING(str)->as.heap.ptr = ALLOC_N(char,len+1);
396  STR_SET_NOEMBED(str);
397  }
398  else if (len == 0) {
400  }
401  if (ptr) {
402  memcpy(RSTRING_PTR(str), ptr, len);
403  }
404  STR_SET_LEN(str, len);
405  RSTRING_PTR(str)[len] = '\0';
406  return str;
407 }
408 
409 VALUE
410 rb_str_new(const char *ptr, long len)
411 {
412  return str_new(rb_cString, ptr, len);
413 }
414 
415 VALUE
416 rb_usascii_str_new(const char *ptr, long len)
417 {
418  VALUE str = rb_str_new(ptr, len);
420  return str;
421 }
422 
423 VALUE
424 rb_enc_str_new(const char *ptr, long len, rb_encoding *enc)
425 {
426  VALUE str = rb_str_new(ptr, len);
427  rb_enc_associate(str, enc);
428  return str;
429 }
430 
431 VALUE
432 rb_str_new_cstr(const char *ptr)
433 {
434  if (!ptr) {
435  rb_raise(rb_eArgError, "NULL pointer given");
436  }
437  return rb_str_new(ptr, strlen(ptr));
438 }
439 
441 #define rb_str_new2 rb_str_new_cstr
442 
443 VALUE
444 rb_usascii_str_new_cstr(const char *ptr)
445 {
446  VALUE str = rb_str_new2(ptr);
448  return str;
449 }
450 
452 #define rb_usascii_str_new2 rb_usascii_str_new_cstr
453 
454 VALUE
455 rb_tainted_str_new(const char *ptr, long len)
456 {
457  VALUE str = rb_str_new(ptr, len);
458 
459  OBJ_TAINT(str);
460  return str;
461 }
462 
463 VALUE
464 rb_tainted_str_new_cstr(const char *ptr)
465 {
466  VALUE str = rb_str_new2(ptr);
467 
468  OBJ_TAINT(str);
469  return str;
470 }
471 
473 #define rb_tainted_str_new2 rb_tainted_str_new_cstr
474 
475 VALUE
476 rb_str_conv_enc_opts(VALUE str, rb_encoding *from, rb_encoding *to, int ecflags, VALUE ecopts)
477 {
478  rb_econv_t *ec;
479  rb_econv_result_t ret;
480  long len;
481  VALUE newstr;
482  const unsigned char *sp;
483  unsigned char *dp;
484 
485  if (!to) return str;
486  if (from == to) return str;
487  if ((rb_enc_asciicompat(to) && ENC_CODERANGE(str) == ENC_CODERANGE_7BIT) ||
488  to == rb_ascii8bit_encoding()) {
489  if (STR_ENC_GET(str) != to) {
490  str = rb_str_dup(str);
491  rb_enc_associate(str, to);
492  }
493  return str;
494  }
495 
496  len = RSTRING_LEN(str);
497  newstr = rb_str_new(0, len);
498 
499  retry:
500  ec = rb_econv_open_opts(from->name, to->name, ecflags, ecopts);
501  if (!ec) return str;
502 
503  sp = (unsigned char*)RSTRING_PTR(str);
504  dp = (unsigned char*)RSTRING_PTR(newstr);
505  ret = rb_econv_convert(ec, &sp, (unsigned char*)RSTRING_END(str),
506  &dp, (unsigned char*)RSTRING_END(newstr), 0);
507  rb_econv_close(ec);
508  switch (ret) {
510  /* destination buffer short */
511  len = len < 2 ? 2 : len * 2;
512  rb_str_resize(newstr, len);
513  goto retry;
514 
515  case econv_finished:
516  len = dp - (unsigned char*)RSTRING_PTR(newstr);
517  rb_str_set_len(newstr, len);
518  rb_enc_associate(newstr, to);
519  return newstr;
520 
521  default:
522  /* some error, return original */
523  return str;
524  }
525 }
526 
527 VALUE
529 {
530  return rb_str_conv_enc_opts(str, from, to, 0, Qnil);
531 }
532 
533 VALUE
534 rb_external_str_new_with_enc(const char *ptr, long len, rb_encoding *eenc)
535 {
536  VALUE str;
537 
538  str = rb_tainted_str_new(ptr, len);
539  if (eenc == rb_usascii_encoding() &&
542  return str;
543  }
544  rb_enc_associate(str, eenc);
545  return rb_str_conv_enc(str, eenc, rb_default_internal_encoding());
546 }
547 
548 VALUE
549 rb_external_str_new(const char *ptr, long len)
550 {
552 }
553 
554 VALUE
555 rb_external_str_new_cstr(const char *ptr)
556 {
558 }
559 
560 VALUE
561 rb_locale_str_new(const char *ptr, long len)
562 {
564 }
565 
566 VALUE
567 rb_locale_str_new_cstr(const char *ptr)
568 {
570 }
571 
572 VALUE
573 rb_filesystem_str_new(const char *ptr, long len)
574 {
576 }
577 
578 VALUE
580 {
582 }
583 
584 VALUE
586 {
588 }
589 
590 VALUE
592 {
593  return rb_str_conv_enc(str, STR_ENC_GET(str), rb_locale_encoding());
594 }
595 
596 VALUE
598 {
599  return rb_str_conv_enc(str, STR_ENC_GET(str), enc);
600 }
601 
602 static VALUE
604 {
605  if (RSTRING_LEN(str) <= RSTRING_EMBED_LEN_MAX) {
606  STR_SET_EMBED(str2);
607  memcpy(RSTRING_PTR(str2), RSTRING_PTR(str), RSTRING_LEN(str)+1);
608  STR_SET_EMBED_LEN(str2, RSTRING_LEN(str));
609  }
610  else {
611  str = rb_str_new_frozen(str);
612  FL_SET(str2, STR_NOEMBED);
613  RSTRING(str2)->as.heap.len = RSTRING_LEN(str);
614  RSTRING(str2)->as.heap.ptr = RSTRING_PTR(str);
615  RSTRING(str2)->as.heap.aux.shared = str;
616  FL_SET(str2, ELTS_SHARED);
617  }
618  rb_enc_cr_str_exact_copy(str2, str);
619 
620  return str2;
621 }
622 
623 static VALUE
625 {
626  return str_replace_shared(str_alloc(klass), str);
627 }
628 
629 static VALUE
630 str_new3(VALUE klass, VALUE str)
631 {
632  return str_new_shared(klass, str);
633 }
634 
635 VALUE
637 {
638  VALUE str2 = str_new3(rb_obj_class(str), str);
639 
640  OBJ_INFECT(str2, str);
641  return str2;
642 }
643 
645 #define rb_str_new3 rb_str_new_shared
646 
647 static VALUE
648 str_new4(VALUE klass, VALUE str)
649 {
650  VALUE str2;
651 
652  str2 = str_alloc(klass);
653  STR_SET_NOEMBED(str2);
654  RSTRING(str2)->as.heap.len = RSTRING_LEN(str);
655  RSTRING(str2)->as.heap.ptr = RSTRING_PTR(str);
656  if (STR_SHARED_P(str)) {
657  VALUE shared = RSTRING(str)->as.heap.aux.shared;
658  assert(OBJ_FROZEN(shared));
659  FL_SET(str2, ELTS_SHARED);
660  RSTRING(str2)->as.heap.aux.shared = shared;
661  }
662  else {
663  FL_SET(str, ELTS_SHARED);
664  RSTRING(str)->as.heap.aux.shared = str2;
665  }
666  rb_enc_cr_str_exact_copy(str2, str);
667  OBJ_INFECT(str2, str);
668  return str2;
669 }
670 
671 VALUE
673 {
674  VALUE klass, str;
675 
676  if (OBJ_FROZEN(orig)) return orig;
677  klass = rb_obj_class(orig);
678  if (STR_SHARED_P(orig) && (str = RSTRING(orig)->as.heap.aux.shared)) {
679  long ofs;
680  assert(OBJ_FROZEN(str));
681  ofs = RSTRING_LEN(str) - RSTRING_LEN(orig);
682  if ((ofs > 0) || (klass != RBASIC(str)->klass) ||
683  (!OBJ_TAINTED(str) && OBJ_TAINTED(orig)) ||
684  ENCODING_GET(str) != ENCODING_GET(orig)) {
685  str = str_new3(klass, str);
686  RSTRING(str)->as.heap.ptr += ofs;
687  RSTRING(str)->as.heap.len -= ofs;
688  rb_enc_cr_str_exact_copy(str, orig);
689  OBJ_INFECT(str, orig);
690  }
691  }
692  else if (STR_EMBED_P(orig)) {
693  str = str_new(klass, RSTRING_PTR(orig), RSTRING_LEN(orig));
694  rb_enc_cr_str_exact_copy(str, orig);
695  OBJ_INFECT(str, orig);
696  }
697  else if (STR_ASSOC_P(orig)) {
698  VALUE assoc = RSTRING(orig)->as.heap.aux.shared;
699  FL_UNSET(orig, STR_ASSOC);
700  str = str_new4(klass, orig);
701  FL_SET(str, STR_ASSOC);
702  RSTRING(str)->as.heap.aux.shared = assoc;
703  }
704  else {
705  str = str_new4(klass, orig);
706  }
707  OBJ_FREEZE(str);
708  return str;
709 }
710 
712 #define rb_str_new4 rb_str_new_frozen
713 
714 VALUE
715 rb_str_new_with_class(VALUE obj, const char *ptr, long len)
716 {
717  return str_new(rb_obj_class(obj), ptr, len);
718 }
719 
720 RUBY_ALIAS_FUNCTION(rb_str_new5(VALUE obj, const char *ptr, long len),
721  rb_str_new_with_class, (obj, ptr, len))
722 #define rb_str_new5 rb_str_new_with_class
723 
724 static VALUE
725 str_new_empty(VALUE str)
726 {
727  VALUE v = rb_str_new5(str, 0, 0);
728  rb_enc_copy(v, str);
729  OBJ_INFECT(v, str);
730  return v;
731 }
732 
733 #define STR_BUF_MIN_SIZE 128
734 
735 VALUE
736 rb_str_buf_new(long capa)
737 {
738  VALUE str = str_alloc(rb_cString);
739 
740  if (capa < STR_BUF_MIN_SIZE) {
741  capa = STR_BUF_MIN_SIZE;
742  }
743  FL_SET(str, STR_NOEMBED);
744  RSTRING(str)->as.heap.aux.capa = capa;
745  RSTRING(str)->as.heap.ptr = ALLOC_N(char, capa+1);
746  RSTRING(str)->as.heap.ptr[0] = '\0';
747 
748  return str;
749 }
750 
751 VALUE
752 rb_str_buf_new_cstr(const char *ptr)
753 {
754  VALUE str;
755  long len = strlen(ptr);
756 
757  str = rb_str_buf_new(len);
758  rb_str_buf_cat(str, ptr, len);
759 
760  return str;
761 }
762 
764 #define rb_str_buf_new2 rb_str_buf_new_cstr
765 
766 VALUE
767 rb_str_tmp_new(long len)
768 {
769  return str_new(0, 0, len);
770 }
771 
772 void *
773 rb_alloc_tmp_buffer(volatile VALUE *store, long len)
774 {
775  VALUE s = rb_str_tmp_new(len);
776  *store = s;
777  return RSTRING_PTR(s);
778 }
779 
780 void
781 rb_free_tmp_buffer(volatile VALUE *store)
782 {
783  VALUE s = *store;
784  *store = 0;
785  if (s) rb_str_clear(s);
786 }
787 
788 void
790 {
791  if (!STR_EMBED_P(str) && !STR_SHARED_P(str)) {
792  xfree(RSTRING(str)->as.heap.ptr);
793  }
794 }
795 
796 RUBY_FUNC_EXPORTED size_t
798 {
799  if (!STR_EMBED_P(str) && !STR_SHARED_P(str)) {
800  return RSTRING(str)->as.heap.aux.capa;
801  }
802  else {
803  return 0;
804  }
805 }
806 
807 VALUE
809 {
810  return rb_convert_type(str, T_STRING, "String", "to_str");
811 }
812 
813 static inline void str_discard(VALUE str);
814 
815 void
817 {
818  rb_encoding *enc;
819  int cr;
820  if (str == str2) return;
821  enc = STR_ENC_GET(str2);
822  cr = ENC_CODERANGE(str2);
823  str_discard(str);
824  OBJ_INFECT(str, str2);
825  if (RSTRING_LEN(str2) <= RSTRING_EMBED_LEN_MAX) {
826  STR_SET_EMBED(str);
827  memcpy(RSTRING_PTR(str), RSTRING_PTR(str2), RSTRING_LEN(str2)+1);
828  STR_SET_EMBED_LEN(str, RSTRING_LEN(str2));
829  rb_enc_associate(str, enc);
830  ENC_CODERANGE_SET(str, cr);
831  return;
832  }
833  STR_SET_NOEMBED(str);
834  STR_UNSET_NOCAPA(str);
835  RSTRING(str)->as.heap.ptr = RSTRING_PTR(str2);
836  RSTRING(str)->as.heap.len = RSTRING_LEN(str2);
837  if (STR_NOCAPA_P(str2)) {
838  FL_SET(str, RBASIC(str2)->flags & STR_NOCAPA);
839  RSTRING(str)->as.heap.aux.shared = RSTRING(str2)->as.heap.aux.shared;
840  }
841  else {
842  RSTRING(str)->as.heap.aux.capa = RSTRING(str2)->as.heap.aux.capa;
843  }
844  STR_SET_EMBED(str2); /* abandon str2 */
845  RSTRING_PTR(str2)[0] = 0;
846  STR_SET_EMBED_LEN(str2, 0);
847  rb_enc_associate(str, enc);
848  ENC_CODERANGE_SET(str, cr);
849 }
850 
851 static ID id_to_s;
852 
853 VALUE
855 {
856  VALUE str;
857 
858  if (TYPE(obj) == T_STRING) {
859  return obj;
860  }
861  str = rb_funcall(obj, id_to_s, 0);
862  if (TYPE(str) != T_STRING)
863  return rb_any_to_s(obj);
864  if (OBJ_TAINTED(obj)) OBJ_TAINT(str);
865  return str;
866 }
867 
868 static VALUE
870 {
871  long len;
872 
873  len = RSTRING_LEN(str2);
874  if (STR_ASSOC_P(str2)) {
875  str2 = rb_str_new4(str2);
876  }
877  if (STR_SHARED_P(str2)) {
878  VALUE shared = RSTRING(str2)->as.heap.aux.shared;
879  assert(OBJ_FROZEN(shared));
880  STR_SET_NOEMBED(str);
881  RSTRING(str)->as.heap.len = len;
882  RSTRING(str)->as.heap.ptr = RSTRING_PTR(str2);
883  FL_SET(str, ELTS_SHARED);
884  FL_UNSET(str, STR_ASSOC);
885  RSTRING(str)->as.heap.aux.shared = shared;
886  }
887  else {
888  str_replace_shared(str, str2);
889  }
890 
891  OBJ_INFECT(str, str2);
892  rb_enc_cr_str_exact_copy(str, str2);
893  return str;
894 }
895 
896 static VALUE
898 {
899  VALUE dup = str_alloc(klass);
900  str_replace(dup, str);
901  return dup;
902 }
903 
904 VALUE
906 {
907  return str_duplicate(rb_obj_class(str), str);
908 }
909 
910 VALUE
912 {
913  return str_replace(str_alloc(rb_cString), str);
914 }
915 
916 /*
917  * call-seq:
918  * String.new(str="") -> new_str
919  *
920  * Returns a new string object containing a copy of <i>str</i>.
921  */
922 
923 static VALUE
925 {
926  VALUE orig;
927 
928  if (argc > 0 && rb_scan_args(argc, argv, "01", &orig) == 1)
929  rb_str_replace(str, orig);
930  return str;
931 }
932 
933 static inline long
934 enc_strlen(const char *p, const char *e, rb_encoding *enc, int cr)
935 {
936  long c;
937  const char *q;
938 
939  if (rb_enc_mbmaxlen(enc) == rb_enc_mbminlen(enc)) {
940  return (e - p + rb_enc_mbminlen(enc) - 1) / rb_enc_mbminlen(enc);
941  }
942  else if (rb_enc_asciicompat(enc)) {
943  c = 0;
944  if (cr == ENC_CODERANGE_7BIT || cr == ENC_CODERANGE_VALID) {
945  while (p < e) {
946  if (ISASCII(*p)) {
947  q = search_nonascii(p, e);
948  if (!q)
949  return c + (e - p);
950  c += q - p;
951  p = q;
952  }
953  p += rb_enc_fast_mbclen(p, e, enc);
954  c++;
955  }
956  }
957  else {
958  while (p < e) {
959  if (ISASCII(*p)) {
960  q = search_nonascii(p, e);
961  if (!q)
962  return c + (e - p);
963  c += q - p;
964  p = q;
965  }
966  p += rb_enc_mbclen(p, e, enc);
967  c++;
968  }
969  }
970  return c;
971  }
972 
973  for (c=0; p<e; c++) {
974  p += rb_enc_mbclen(p, e, enc);
975  }
976  return c;
977 }
978 
979 long
980 rb_enc_strlen(const char *p, const char *e, rb_encoding *enc)
981 {
982  return enc_strlen(p, e, enc, ENC_CODERANGE_UNKNOWN);
983 }
984 
985 long
986 rb_enc_strlen_cr(const char *p, const char *e, rb_encoding *enc, int *cr)
987 {
988  long c;
989  const char *q;
990  int ret;
991 
992  *cr = 0;
993  if (rb_enc_mbmaxlen(enc) == rb_enc_mbminlen(enc)) {
994  return (e - p + rb_enc_mbminlen(enc) - 1) / rb_enc_mbminlen(enc);
995  }
996  else if (rb_enc_asciicompat(enc)) {
997  c = 0;
998  while (p < e) {
999  if (ISASCII(*p)) {
1000  q = search_nonascii(p, e);
1001  if (!q) {
1002  if (!*cr) *cr = ENC_CODERANGE_7BIT;
1003  return c + (e - p);
1004  }
1005  c += q - p;
1006  p = q;
1007  }
1008  ret = rb_enc_precise_mbclen(p, e, enc);
1009  if (MBCLEN_CHARFOUND_P(ret)) {
1010  *cr |= ENC_CODERANGE_VALID;
1011  p += MBCLEN_CHARFOUND_LEN(ret);
1012  }
1013  else {
1014  *cr = ENC_CODERANGE_BROKEN;
1015  p++;
1016  }
1017  c++;
1018  }
1019  if (!*cr) *cr = ENC_CODERANGE_7BIT;
1020  return c;
1021  }
1022 
1023  for (c=0; p<e; c++) {
1024  ret = rb_enc_precise_mbclen(p, e, enc);
1025  if (MBCLEN_CHARFOUND_P(ret)) {
1026  *cr |= ENC_CODERANGE_VALID;
1027  p += MBCLEN_CHARFOUND_LEN(ret);
1028  }
1029  else {
1030  *cr = ENC_CODERANGE_BROKEN;
1031  if (p + rb_enc_mbminlen(enc) <= e)
1032  p += rb_enc_mbminlen(enc);
1033  else
1034  p = e;
1035  }
1036  }
1037  if (!*cr) *cr = ENC_CODERANGE_7BIT;
1038  return c;
1039 }
1040 
1041 #ifdef NONASCII_MASK
1042 #define is_utf8_lead_byte(c) (((c)&0xC0) != 0x80)
1043 
1044 /*
1045  * UTF-8 leading bytes have either 0xxxxxxx or 11xxxxxx
1046  * bit representation. (see http://en.wikipedia.org/wiki/UTF-8)
1047  * Therefore, following pseudo code can detect UTF-8 leading byte.
1048  *
1049  * if (!(byte & 0x80))
1050  * byte |= 0x40; // turn on bit6
1051  * return ((byte>>6) & 1); // bit6 represent it's leading byte or not.
1052  *
1053  * This function applies the above logic to every byte of the argument word `s'
1054  * in parallel, then sums the per-byte results.
1055  */
1056 static inline VALUE
1057 count_utf8_lead_bytes_with_word(const VALUE *s)
1058 {
1059  VALUE d = *s;
1060 
1061  /* Transform into bit0 represent UTF-8 leading or not. */
1062  d |= ~(d>>1);
1063  d >>= 6;
1064  d &= NONASCII_MASK >> 7;
1065 
1066  /* Gather every bytes. */
1067  d += (d>>8);
1068  d += (d>>16);
1069 #if SIZEOF_VALUE == 8
1070  d += (d>>32);
1071 #endif
1072  return (d&0xF);
1073 }
1074 #endif
1075 
1076 static long
1078 {
1079  const char *p, *e;
1080  long n;
1081  int cr;
1082 
1083  if (single_byte_optimizable(str)) return RSTRING_LEN(str);
1084  if (!enc) enc = STR_ENC_GET(str);
1085  p = RSTRING_PTR(str);
1086  e = RSTRING_END(str);
1087  cr = ENC_CODERANGE(str);
1088 #ifdef NONASCII_MASK
1089  if (ENC_CODERANGE(str) == ENC_CODERANGE_VALID &&
1090  enc == rb_utf8_encoding()) {
1091 
1092  VALUE len = 0;
1093  if ((int)sizeof(VALUE) * 2 < e - p) {
1094  const VALUE *s, *t;
1095  const VALUE lowbits = sizeof(VALUE) - 1;
1096  s = (const VALUE*)(~lowbits & ((VALUE)p + lowbits));
1097  t = (const VALUE*)(~lowbits & (VALUE)e);
1098  while (p < (const char *)s) {
1099  if (is_utf8_lead_byte(*p)) len++;
1100  p++;
1101  }
1102  while (s < t) {
1103  len += count_utf8_lead_bytes_with_word(s);
1104  s++;
1105  }
1106  p = (const char *)s;
1107  }
1108  while (p < e) {
1109  if (is_utf8_lead_byte(*p)) len++;
1110  p++;
1111  }
1112  return (long)len;
1113  }
1114 #endif
1115  n = rb_enc_strlen_cr(p, e, enc, &cr);
1116  if (cr) {
1117  ENC_CODERANGE_SET(str, cr);
1118  }
1119  return n;
1120 }
1121 
1122 long
1124 {
1125  return str_strlen(str, STR_ENC_GET(str));
1126 }
1127 
1128 /*
1129  * call-seq:
1130  * str.length -> integer
1131  * str.size -> integer
1132  *
1133  * Returns the character length of <i>str</i>.
1134  */
1135 
1136 VALUE
1138 {
1139  long len;
1140 
1141  len = str_strlen(str, STR_ENC_GET(str));
1142  return LONG2NUM(len);
1143 }
1144 
1145 /*
1146  * call-seq:
1147  * str.bytesize -> integer
1148  *
1149  * Returns the length of <i>str</i> in bytes.
1150  */
1151 
1152 static VALUE
1154 {
1155  return LONG2NUM(RSTRING_LEN(str));
1156 }
1157 
1158 /*
1159  * call-seq:
1160  * str.empty? -> true or false
1161  *
1162  * Returns <code>true</code> if <i>str</i> has a length of zero.
1163  *
1164  * "hello".empty? #=> false
1165  * "".empty? #=> true
1166  */
1167 
1168 static VALUE
1170 {
1171  if (RSTRING_LEN(str) == 0)
1172  return Qtrue;
1173  return Qfalse;
1174 }
1175 
1176 /*
1177  * call-seq:
1178  * str + other_str -> new_str
1179  *
1180  * Concatenation---Returns a new <code>String</code> containing
1181  * <i>other_str</i> concatenated to <i>str</i>.
1182  *
1183  * "Hello from " + self.to_s #=> "Hello from main"
1184  */
1185 
1186 VALUE
1188 {
1189  VALUE str3;
1190  rb_encoding *enc;
1191 
1192  StringValue(str2);
1193  enc = rb_enc_check(str1, str2);
1194  str3 = rb_str_new(0, RSTRING_LEN(str1)+RSTRING_LEN(str2));
1195  memcpy(RSTRING_PTR(str3), RSTRING_PTR(str1), RSTRING_LEN(str1));
1196  memcpy(RSTRING_PTR(str3) + RSTRING_LEN(str1),
1197  RSTRING_PTR(str2), RSTRING_LEN(str2));
1198  RSTRING_PTR(str3)[RSTRING_LEN(str3)] = '\0';
1199 
1200  if (OBJ_TAINTED(str1) || OBJ_TAINTED(str2))
1201  OBJ_TAINT(str3);
1204  return str3;
1205 }
1206 
1207 /*
1208  * call-seq:
1209  * str * integer -> new_str
1210  *
1211  * Copy---Returns a new <code>String</code> containing <i>integer</i> copies of
1212  * the receiver.
1213  *
1214  * "Ho! " * 3 #=> "Ho! Ho! Ho! "
1215  */
1216 
1217 VALUE
1219 {
1220  VALUE str2;
1221  long n, len;
1222  char *ptr2;
1223 
1224  len = NUM2LONG(times);
1225  if (len < 0) {
1226  rb_raise(rb_eArgError, "negative argument");
1227  }
1228  if (len && LONG_MAX/len < RSTRING_LEN(str)) {
1229  rb_raise(rb_eArgError, "argument too big");
1230  }
1231 
1232  str2 = rb_str_new5(str, 0, len *= RSTRING_LEN(str));
1233  ptr2 = RSTRING_PTR(str2);
1234  if (len) {
1235  n = RSTRING_LEN(str);
1236  memcpy(ptr2, RSTRING_PTR(str), n);
1237  while (n <= len/2) {
1238  memcpy(ptr2 + n, ptr2, n);
1239  n *= 2;
1240  }
1241  memcpy(ptr2 + n, ptr2, len-n);
1242  }
1243  ptr2[RSTRING_LEN(str2)] = '\0';
1244  OBJ_INFECT(str2, str);
1245  rb_enc_cr_str_copy_for_substr(str2, str);
1246 
1247  return str2;
1248 }
1249 
1250 /*
1251  * call-seq:
1252  * str % arg -> new_str
1253  *
1254  * Format---Uses <i>str</i> as a format specification, and returns the result
1255  * of applying it to <i>arg</i>. If the format specification contains more than
1256  * one substitution, then <i>arg</i> must be an <code>Array</code> or <code>Hash</code>
1257  * containing the values to be substituted. See <code>Kernel::sprintf</code> for
1258  * details of the format string.
1259  *
1260  * "%05d" % 123 #=> "00123"
1261  * "%-5s: %08x" % [ "ID", self.object_id ] #=> "ID : 200e14d6"
1262  * "foo = %{foo}" % { :foo => 'bar' } #=> "foo = bar"
1263  */
1264 
1265 static VALUE
1267 {
1268  volatile VALUE tmp = rb_check_array_type(arg);
1269 
1270  if (!NIL_P(tmp)) {
1271  return rb_str_format(RARRAY_LENINT(tmp), RARRAY_PTR(tmp), str);
1272  }
1273  return rb_str_format(1, &arg, str);
1274 }
1275 
1276 static inline void
1278 {
1279  if (FL_TEST(str, STR_TMPLOCK)) {
1280  rb_raise(rb_eRuntimeError, "can't modify string; temporarily locked");
1281  }
1282  rb_check_frozen(str);
1283  if (!OBJ_UNTRUSTED(str) && rb_safe_level() >= 4)
1284  rb_raise(rb_eSecurityError, "Insecure: can't modify string");
1285 }
1286 
1287 static inline int
1289 {
1290  str_modifiable(str);
1291  if (!STR_SHARED_P(str)) return 1;
1292  if (STR_EMBED_P(str)) return 1;
1293  return 0;
1294 }
1295 
1296 static void
1298 {
1299  char *ptr;
1300  long len = RSTRING_LEN(str);
1301  long capa = len + expand;
1302 
1303  if (len > capa) len = capa;
1304  ptr = ALLOC_N(char, capa + 1);
1305  if (RSTRING_PTR(str)) {
1306  memcpy(ptr, RSTRING_PTR(str), len);
1307  }
1308  STR_SET_NOEMBED(str);
1309  STR_UNSET_NOCAPA(str);
1310  ptr[len] = 0;
1311  RSTRING(str)->as.heap.ptr = ptr;
1312  RSTRING(str)->as.heap.len = len;
1313  RSTRING(str)->as.heap.aux.capa = capa;
1314 }
1315 
1316 #define str_make_independent(str) str_make_independent_expand((str), 0L)
1317 
1318 void
1320 {
1321  if (!str_independent(str))
1322  str_make_independent(str);
1323  ENC_CODERANGE_CLEAR(str);
1324 }
1325 
1326 void
1327 rb_str_modify_expand(VALUE str, long expand)
1328 {
1329  if (expand < 0) {
1330  rb_raise(rb_eArgError, "negative expanding string size");
1331  }
1332  if (!str_independent(str)) {
1333  str_make_independent_expand(str, expand);
1334  }
1335  else if (expand > 0) {
1336  long len = RSTRING_LEN(str);
1337  long capa = len + expand;
1338  if (!STR_EMBED_P(str)) {
1339  REALLOC_N(RSTRING(str)->as.heap.ptr, char, capa+1);
1340  RSTRING(str)->as.heap.aux.capa = capa;
1341  }
1342  else if (capa > RSTRING_EMBED_LEN_MAX) {
1343  str_make_independent_expand(str, expand);
1344  }
1345  }
1346  ENC_CODERANGE_CLEAR(str);
1347 }
1348 
1349 /* As rb_str_modify(), but don't clear coderange */
1350 static void
1352 {
1353  if (!str_independent(str))
1354  str_make_independent(str);
1355  if (ENC_CODERANGE(str) == ENC_CODERANGE_BROKEN)
1356  /* Force re-scan later */
1357  ENC_CODERANGE_CLEAR(str);
1358 }
1359 
1360 static inline void
1362 {
1363  str_modifiable(str);
1364  if (!STR_SHARED_P(str) && !STR_EMBED_P(str)) {
1365  xfree(RSTRING_PTR(str));
1366  RSTRING(str)->as.heap.ptr = 0;
1367  RSTRING(str)->as.heap.len = 0;
1368  }
1369 }
1370 
1371 void
1373 {
1374  /* sanity check */
1375  rb_check_frozen(str);
1376  if (STR_ASSOC_P(str)) {
1377  /* already associated */
1378  rb_ary_concat(RSTRING(str)->as.heap.aux.shared, add);
1379  }
1380  else {
1381  if (STR_SHARED_P(str)) {
1382  VALUE assoc = RSTRING(str)->as.heap.aux.shared;
1383  str_make_independent(str);
1384  if (STR_ASSOC_P(assoc)) {
1385  assoc = RSTRING(assoc)->as.heap.aux.shared;
1386  rb_ary_concat(assoc, add);
1387  add = assoc;
1388  }
1389  }
1390  else if (STR_EMBED_P(str)) {
1391  str_make_independent(str);
1392  }
1393  else if (RSTRING(str)->as.heap.aux.capa != RSTRING_LEN(str)) {
1394  RESIZE_CAPA(str, RSTRING_LEN(str));
1395  }
1396  FL_SET(str, STR_ASSOC);
1397  RBASIC(add)->klass = 0;
1398  RSTRING(str)->as.heap.aux.shared = add;
1399  }
1400 }
1401 
/* Return the array associated with `str`, or Qfalse when there is none.
 * For a shared string the lookup is done on the string it shares from.
 * NOTE(review): the declarator line (name and parameters) is absent from this
 * listing between the return type and the opening brace. */
1402 VALUE
1404 {
1405  if (STR_SHARED_P(str)) str = RSTRING(str)->as.heap.aux.shared;
1406  if (STR_ASSOC_P(str)) {
1407  return RSTRING(str)->as.heap.aux.shared;
1408  }
1409  return Qfalse;
1410 }
1411 
1412 VALUE
1413 rb_string_value(volatile VALUE *ptr)
1414 {
1415  VALUE s = *ptr;
1416  if (TYPE(s) != T_STRING) {
1417  s = rb_str_to_str(s);
1418  *ptr = s;
1419  }
1420  return s;
1421 }
1422 
/* Convert *ptr with rb_string_value() and return the resulting string's
 * byte pointer (RSTRING_PTR).
 * NOTE(review): the declarator line (name and parameters) is absent from this
 * listing between the return type and the opening brace. */
1423 char *
1425 {
1426  VALUE str = rb_string_value(ptr);
1427  return RSTRING_PTR(str);
1428 }
1429 
/* Convert *ptr with rb_string_value() and return its bytes as a
 * NUL-terminated C string.
 * Raises ArgumentError when the pointer is null or the contents include a
 * NUL byte.  When the byte just past the contents is not NUL, the string is
 * modified (rb_str_modify) and a terminator is written there.
 * NOTE(review): the declarator line (name and parameters) is absent from this
 * listing between the return type and the opening brace. */
1430 char *
1432 {
1433  VALUE str = rb_string_value(ptr);
1434  char *s = RSTRING_PTR(str);
1435  long len = RSTRING_LEN(str);
1436 
1437  if (!s || memchr(s, 0, len)) {
1438  rb_raise(rb_eArgError, "string contains null byte");
1439  }
1440  if (s[len]) {
      /* re-read the pointer: rb_str_modify() may have replaced the buffer */
1441  rb_str_modify(str);
1442  s = RSTRING_PTR(str);
1443  s[RSTRING_LEN(str)] = 0;
1444  }
1445  return s;
1446 }
1447 
/* Return the result of rb_check_convert_type(str, T_STRING, "String",
 * "to_str"), i.e. a conversion attempt through to_str.
 * NOTE(review): the declarator line (name and parameters) is absent from this
 * listing between the return type and the opening brace. */
1448 VALUE
1450 {
1451  str = rb_check_convert_type(str, T_STRING, "String", "to_str");
1452  return str;
1453 }
1454 
1455 /*
1456  * call-seq:
1457  * String.try_convert(obj) -> string or nil
1458  *
1459  * Try to convert <i>obj</i> into a String, using to_str method.
1460  * Returns converted string or nil if <i>obj</i> cannot be converted
1461  * for any reason.
1462  *
1463  * String.try_convert("str") #=> "str"
1464  * String.try_convert(/re/) #=> nil
1465  */
/* Thin wrapper that forwards `str` to rb_check_string_type().
 * NOTE(review): the declarator line (name and parameters) is absent from this
 * listing between the return type and the opening brace. */
1466 static VALUE
1468 {
1469  return rb_check_string_type(str);
1470 }
1471 
1472 static char*
1473 str_nth_len(const char *p, const char *e, long *nthp, rb_encoding *enc)
1474 {
1475  long nth = *nthp;
1476  if (rb_enc_mbmaxlen(enc) == 1) {
1477  p += nth;
1478  }
1479  else if (rb_enc_mbmaxlen(enc) == rb_enc_mbminlen(enc)) {
1480  p += nth * rb_enc_mbmaxlen(enc);
1481  }
1482  else if (rb_enc_asciicompat(enc)) {
1483  const char *p2, *e2;
1484  int n;
1485 
1486  while (p < e && 0 < nth) {
1487  e2 = p + nth;
1488  if (e < e2) {
1489  *nthp = nth;
1490  return (char *)e;
1491  }
1492  if (ISASCII(*p)) {
1493  p2 = search_nonascii(p, e2);
1494  if (!p2) {
1495  *nthp = nth;
1496  return (char *)e2;
1497  }
1498  nth -= p2 - p;
1499  p = p2;
1500  }
1501  n = rb_enc_mbclen(p, e, enc);
1502  p += n;
1503  nth--;
1504  }
1505  *nthp = nth;
1506  if (nth != 0) {
1507  return (char *)e;
1508  }
1509  return (char *)p;
1510  }
1511  else {
1512  while (p < e && nth--) {
1513  p += rb_enc_mbclen(p, e, enc);
1514  }
1515  }
1516  if (p > e) p = e;
1517  *nthp = nth;
1518  return (char*)p;
1519 }
1520 
1521 char*
1522 rb_enc_nth(const char *p, const char *e, long nth, rb_encoding *enc)
1523 {
1524  return str_nth_len(p, e, &nth, enc);
1525 }
1526 
1527 static char*
1528 str_nth(const char *p, const char *e, long nth, rb_encoding *enc, int singlebyte)
1529 {
1530  if (singlebyte)
1531  p += nth;
1532  else {
1533  p = str_nth_len(p, e, &nth, enc);
1534  }
1535  if (!p) return 0;
1536  if (p > e) p = e;
1537  return (char *)p;
1538 }
1539 
1540 /* char offset to byte offset */
1541 static long
1542 str_offset(const char *p, const char *e, long nth, rb_encoding *enc, int singlebyte)
1543 {
1544  const char *pp = str_nth(p, e, nth, enc, singlebyte);
1545  if (!pp) return e - p;
1546  return pp - p;
1547 }
1548 
/* Convert the character offset `pos` in `str` to a byte offset by calling
 * str_offset() over the string's byte range.
 * NOTE(review): the continuation line carrying the remaining str_offset()
 * arguments (listing line 1553) is absent from this listing. */
1549 long
1550 rb_str_offset(VALUE str, long pos)
1551 {
1552  return str_offset(RSTRING_PTR(str), RSTRING_END(str), pos,
1554 }
1555 
1556 #ifdef NONASCII_MASK
/* Find the position of the (*nthp)-th character in [p, e) by counting UTF-8
 * lead bytes.  Returns that position (or e when the range ends first) and
 * stores the count still outstanding in *nthp.
 * For long inputs and large counts, lead bytes are first counted one
 * VALUE-sized word at a time; the tail is then scanned byte by byte. */
1557 static char *
1558 str_utf8_nth(const char *p, const char *e, long *nthp)
1559 {
1560  long nth = *nthp;
1561  if ((int)SIZEOF_VALUE * 2 < e - p && (int)SIZEOF_VALUE * 2 < nth) {
1562  const VALUE *s, *t;
1563  const VALUE lowbits = sizeof(VALUE) - 1;
      /* s: p rounded up, t: e rounded down, both to a VALUE boundary */
1564  s = (const VALUE*)(~lowbits & ((VALUE)p + lowbits));
1565  t = (const VALUE*)(~lowbits & (VALUE)e);
      /* bytes before the first aligned word are counted individually */
1566  while (p < (const char *)s) {
1567  if (is_utf8_lead_byte(*p)) nth--;
1568  p++;
1569  }
      /* stop while nth is still at least one word's worth of bytes, so a
       * single word cannot take the count below zero */
1570  do {
1571  nth -= count_utf8_lead_bytes_with_word(s);
1572  s++;
1573  } while (s < t && (int)sizeof(VALUE) <= nth);
1574  p = (char *)s;
1575  }
1576  while (p < e) {
1577  if (is_utf8_lead_byte(*p)) {
1578  if (nth == 0) break;
1579  nth--;
1580  }
1581  p++;
1582  }
1583  *nthp = nth;
1584  return (char *)p;
1585 }
1586 
1587 static long
1588 str_utf8_offset(const char *p, const char *e, long nth)
1589 {
1590  const char *pp = str_utf8_nth(p, e, &nth);
1591  return pp - p;
1592 }
1593 #endif
1594 
1595 /* byte offset to char offset */
1596 long
1597 rb_str_sublen(VALUE str, long pos)
1598 {
1599  if (single_byte_optimizable(str) || pos < 0)
1600  return pos;
1601  else {
1602  char *p = RSTRING_PTR(str);
1603  return enc_strlen(p, p + pos, STR_ENC_GET(str), ENC_CODERANGE(str));
1604  }
1605 }
1606 
1607 VALUE
1608 rb_str_subseq(VALUE str, long beg, long len)
1609 {
1610  VALUE str2;
1611 
1612  if (RSTRING_LEN(str) == beg + len &&
1613  RSTRING_EMBED_LEN_MAX < len) {
1614  str2 = rb_str_new_shared(rb_str_new_frozen(str));
1615  rb_str_drop_bytes(str2, beg);
1616  }
1617  else {
1618  str2 = rb_str_new5(str, RSTRING_PTR(str)+beg, len);
1619  }
1620 
1621  rb_enc_cr_str_copy_for_substr(str2, str);
1622  OBJ_INFECT(str2, str);
1623 
1624  return str2;
1625 }
1626 
/* Return the substring of `str` starting at character `beg` and spanning up
 * to `len` characters, or Qnil when `len` is negative or `beg` is out of
 * range.  A negative `beg` counts from the end.
 * Each branch below turns the character range into a byte pointer `p` and a
 * byte length `len`, then jumps or falls through to `sub:`.
 * NOTE(review): `beg + len` and `len * char_sz` are evaluated in signed long
 * with no overflow guard visible in this function. */
1627 VALUE
1628 rb_str_substr(VALUE str, long beg, long len)
1629 {
1630  rb_encoding *enc = STR_ENC_GET(str);
1631  VALUE str2;
1632  char *p, *s = RSTRING_PTR(str), *e = s + RSTRING_LEN(str);
1633 
1634  if (len < 0) return Qnil;
1635  if (!RSTRING_LEN(str)) {
1636  len = 0;
1637  }
      /* one byte per character: plain byte arithmetic */
1638  if (single_byte_optimizable(str)) {
1639  if (beg > RSTRING_LEN(str)) return Qnil;
1640  if (beg < 0) {
1641  beg += RSTRING_LEN(str);
1642  if (beg < 0) return Qnil;
1643  }
1644  if (beg + len > RSTRING_LEN(str))
1645  len = RSTRING_LEN(str) - beg;
1646  if (len <= 0) {
1647  len = 0;
1648  p = 0;
1649  }
1650  else
1651  p = s + beg;
1652  goto sub;
1653  }
1654  if (beg < 0) {
1655  if (len > -beg) len = -beg;
      /* close to the end relative to the string size: walk backwards
       * from the end instead of counting the whole string */
1656  if (-beg * rb_enc_mbmaxlen(enc) < RSTRING_LEN(str) / 8) {
1657  beg = -beg;
1658  while (beg-- > len && (e = rb_enc_prev_char(s, e, e, enc)) != 0);
1659  p = e;
1660  if (!p) return Qnil;
1661  while (len-- > 0 && (p = rb_enc_prev_char(s, p, e, enc)) != 0);
1662  if (!p) return Qnil;
1663  len = e - p;
1664  goto sub;
1665  }
1666  else {
1667  beg += str_strlen(str, enc);
1668  if (beg < 0) return Qnil;
1669  }
1670  }
      /* cheap rejection: a character index cannot exceed the byte length */
1671  else if (beg > 0 && beg > RSTRING_LEN(str)) {
1672  return Qnil;
1673  }
1674  if (len == 0) {
1675  if (beg > str_strlen(str, enc)) return Qnil;
1676  p = 0;
1677  }
1678 #ifdef NONASCII_MASK
1679  else if (ENC_CODERANGE(str) == ENC_CODERANGE_VALID &&
1680  enc == rb_utf8_encoding()) {
      /* str_utf8_nth() leaves the count it could not consume in beg */
1681  p = str_utf8_nth(s, e, &beg);
1682  if (beg > 0) return Qnil;
1683  len = str_utf8_offset(p, e, len);
1684  }
1685 #endif
      /* fixed-width encoding: scale character counts to bytes */
1686  else if (rb_enc_mbmaxlen(enc) == rb_enc_mbminlen(enc)) {
1687  int char_sz = rb_enc_mbmaxlen(enc);
1688 
1689  p = s + beg * char_sz;
1690  if (p > e) {
1691  return Qnil;
1692  }
1693  else if (len * char_sz > e - p)
1694  len = e - p;
1695  else
1696  len *= char_sz;
1697  }
1698  else if ((p = str_nth_len(s, e, &beg, enc)) == e) {
1699  if (beg > 0) return Qnil;
1700  len = 0;
1701  }
1702  else {
1703  len = str_offset(p, e, len, enc, 0);
1704  }
1705  sub:
      /* here len is a byte length; a long result ending at the end of str
       * is built on a shared copy instead of being copied */
1706  if (len > RSTRING_EMBED_LEN_MAX && beg + len == RSTRING_LEN(str)) {
1707  str2 = rb_str_new4(str);
1708  str2 = str_new3(rb_obj_class(str2), str2);
1709  RSTRING(str2)->as.heap.ptr += RSTRING(str2)->as.heap.len - len;
1710  RSTRING(str2)->as.heap.len = len;
1711  }
1712  else {
1713  str2 = rb_str_new5(str, p, len);
1714  rb_enc_cr_str_copy_for_substr(str2, str);
1715  OBJ_INFECT(str2, str);
1716  }
1717 
1718  return str2;
1719 }
1720 
/* Freeze `str` via rb_obj_freeze(); when the string has an associated array
 * (STR_ASSOC), that array is frozen first.  Returns rb_obj_freeze()'s result.
 * NOTE(review): the declarator line (name and parameters) is absent from this
 * listing between the return type and the opening brace. */
1721 VALUE
1723 {
1724  if (STR_ASSOC_P(str)) {
1725  VALUE ary = RSTRING(str)->as.heap.aux.shared;
1726  OBJ_FREEZE(ary);
1727  }
1728  return rb_obj_freeze(str);
1729 }
1730 
/* From here on rb_str_dup_frozen expands to rb_str_new_frozen.
 * NOTE(review): listing line 1731, which precedes this macro, is absent. */
1732 #define rb_str_dup_frozen rb_str_new_frozen
1733 
1734 VALUE
1735 rb_str_locktmp(VALUE str)
1736 {
1737  if (FL_TEST(str, STR_TMPLOCK)) {
1738  rb_raise(rb_eRuntimeError, "temporal locking already locked string");
1739  }
1740  FL_SET(str, STR_TMPLOCK);
1741  return str;
1742 }
1743 
/* Clear the STR_TMPLOCK flag on `str` and return it.
 * Raises RuntimeError when the flag is not set.
 * NOTE(review): the declarator line (name and parameters) is absent from this
 * listing between the return type and the opening brace. */
1744 VALUE
1746 {
1747  if (!FL_TEST(str, STR_TMPLOCK)) {
1748  rb_raise(rb_eRuntimeError, "temporal unlocking already unlocked string");
1749  }
1750  FL_UNSET(str, STR_TMPLOCK);
1751  return str;
1752 }
1753 
1754 void
1755 rb_str_set_len(VALUE str, long len)
1756 {
1757  long capa;
1758 
1759  str_modifiable(str);
1760  if (STR_SHARED_P(str)) {
1761  rb_raise(rb_eRuntimeError, "can't set length of shared string");
1762  }
1763  if (len > (capa = (long)rb_str_capacity(str))) {
1764  rb_bug("probable buffer overflow: %ld for %ld", len, capa);
1765  }
1766  STR_SET_LEN(str, len);
1767  RSTRING_PTR(str)[len] = '\0';
1768 }
1769 
/* Change the byte length of `str` to `len` and return `str`.
 * The code range is cleared.  Depending on the old and new length the
 * string stays embedded, moves from heap to embedded storage, or has its
 * heap buffer made independent and/or reallocated.  A terminating NUL is
 * written after the new length.  Nothing else happens when the length is
 * unchanged.
 * Raises ArgumentError for a negative `len`. */
1770 VALUE
1771 rb_str_resize(VALUE str, long len)
1772 {
1773  long slen;
1774  int independent;
1775 
1776  if (len < 0) {
1777  rb_raise(rb_eArgError, "negative string size (or size too big)");
1778  }
1779 
1780  independent = str_independent(str);
1781  ENC_CODERANGE_CLEAR(str);
1782  slen = RSTRING_LEN(str);
1783  if (len != slen) {
1784  if (STR_EMBED_P(str)) {
1785  if (len <= RSTRING_EMBED_LEN_MAX) {
      /* embedded before and after: only the length changes */
1786  STR_SET_EMBED_LEN(str, len);
1787  RSTRING(str)->as.ary[len] = '\0';
1788  return str;
1789  }
1790  str_make_independent_expand(str, len - slen);
1791  STR_SET_NOEMBED(str);
1792  }
1793  else if (len <= RSTRING_EMBED_LEN_MAX) {
      /* heap string shrinking into the embedded area: copy what fits */
1794  char *ptr = RSTRING(str)->as.heap.ptr;
1795  STR_SET_EMBED(str);
1796  if (slen > len) slen = len;
1797  if (slen > 0) MEMCPY(RSTRING(str)->as.ary, ptr, char, slen);
1798  RSTRING(str)->as.ary[len] = '\0';
1799  STR_SET_EMBED_LEN(str, len);
      /* the old buffer is freed only when this string owned it */
1800  if (independent) xfree(ptr);
1801  return str;
1802  }
1803  else if (!independent) {
1804  str_make_independent_expand(str, len - slen);
1805  }
      /* reallocate when growing, or when shrinking by more than 1024 bytes */
1806  else if (slen < len || slen - len > 1024) {
1807  REALLOC_N(RSTRING(str)->as.heap.ptr, char, len+1);
1808  }
1809  if (!STR_NOCAPA_P(str)) {
1810  RSTRING(str)->as.heap.aux.capa = len;
1811  }
1812  RSTRING(str)->as.heap.len = len;
1813  RSTRING(str)->as.heap.ptr[len] = '\0'; /* sentinel */
1814  }
1815  return str;
1816 }
1817 
1818 static VALUE
1819 str_buf_cat(VALUE str, const char *ptr, long len)
1820 {
1821  long capa, total, off = -1;
1822 
1823  if (ptr >= RSTRING_PTR(str) && ptr <= RSTRING_END(str)) {
1824  off = ptr - RSTRING_PTR(str);
1825  }
1826  rb_str_modify(str);
1827  if (len == 0) return 0;
1828  if (STR_ASSOC_P(str)) {
1829  FL_UNSET(str, STR_ASSOC);
1830  capa = RSTRING(str)->as.heap.aux.capa = RSTRING_LEN(str);
1831  }
1832  else if (STR_EMBED_P(str)) {
1833  capa = RSTRING_EMBED_LEN_MAX;
1834  }
1835  else {
1836  capa = RSTRING(str)->as.heap.aux.capa;
1837  }
1838  if (RSTRING_LEN(str) >= LONG_MAX - len) {
1839  rb_raise(rb_eArgError, "string sizes too big");
1840  }
1841  total = RSTRING_LEN(str)+len;
1842  if (capa <= total) {
1843  while (total > capa) {
1844  if (capa + 1 >= LONG_MAX / 2) {
1845  capa = (total + 4095) / 4096;
1846  break;
1847  }
1848  capa = (capa + 1) * 2;
1849  }
1850  RESIZE_CAPA(str, capa);
1851  }
1852  if (off != -1) {
1853  ptr = RSTRING_PTR(str) + off;
1854  }
1855  memcpy(RSTRING_PTR(str) + RSTRING_LEN(str), ptr, len);
1856  STR_SET_LEN(str, total);
1857  RSTRING_PTR(str)[total] = '\0'; /* sentinel */
1858 
1859  return str;
1860 }
1861 
/* Append the NUL-terminated C string `ptr` to `str` through str_buf_cat().
 * `ptr` appears twice in the expansion, so an argument with side effects
 * would be evaluated twice. */
1862 #define str_buf_cat2(str, ptr) str_buf_cat((str), (ptr), strlen(ptr))
1863 
1864 VALUE
1865 rb_str_buf_cat(VALUE str, const char *ptr, long len)
1866 {
1867  if (len == 0) return str;
1868  if (len < 0) {
1869  rb_raise(rb_eArgError, "negative string size (or size too big)");
1870  }
1871  return str_buf_cat(str, ptr, len);
1872 }
1873 
1874 VALUE
1875 rb_str_buf_cat2(VALUE str, const char *ptr)
1876 {
1877  return rb_str_buf_cat(str, ptr, strlen(ptr));
1878 }
1879 
1880 VALUE
1881 rb_str_cat(VALUE str, const char *ptr, long len)
1882 {
1883  if (len < 0) {
1884  rb_raise(rb_eArgError, "negative string size (or size too big)");
1885  }
1886  if (STR_ASSOC_P(str)) {
1887  char *p;
1888  rb_str_modify_expand(str, len);
1889  p = RSTRING(str)->as.heap.ptr;
1890  memcpy(p + RSTRING(str)->as.heap.len, ptr, len);
1891  len = RSTRING(str)->as.heap.len += len;
1892  p[len] = '\0'; /* sentinel */
1893  return str;
1894  }
1895 
1896  return rb_str_buf_cat(str, ptr, len);
1897 }
1898 
1899 VALUE
1900 rb_str_cat2(VALUE str, const char *ptr)
1901 {
1902  return rb_str_cat(str, ptr, strlen(ptr));
1903 }
1904 
/* Append `len` bytes at `ptr`, tagged with encoding index `ptr_encindex`
 * and code range `ptr_cr`, to `str`, then set the encoding and code range of
 * the result.  Returns `str`.
 * When `ptr_cr_ret` is non-null it receives the (possibly freshly scanned)
 * code range of the appended bytes.
 * Raises Encoding::CompatibilityError when the encodings differ and neither
 * side is 7-bit (or either is not ASCII-compatible and both are non-empty);
 * raises ArgumentError for a negative `len`. */
1905 static VALUE
1906 rb_enc_cr_str_buf_cat(VALUE str, const char *ptr, long len,
1907  int ptr_encindex, int ptr_cr, int *ptr_cr_ret)
1908 {
1909  int str_encindex = ENCODING_GET(str);
1910  int res_encindex;
1911  int str_cr, res_cr;
1912 
1913  str_cr = ENC_CODERANGE(str);
1914 
1915  if (str_encindex == ptr_encindex) {
      /* same encoding: scan the new bytes only if str's range is known */
1916  if (str_cr == ENC_CODERANGE_UNKNOWN)
1917  ptr_cr = ENC_CODERANGE_UNKNOWN;
1918  else if (ptr_cr == ENC_CODERANGE_UNKNOWN) {
1919  ptr_cr = coderange_scan(ptr, len, rb_enc_from_index(ptr_encindex));
1920  }
1921  }
1922  else {
1923  rb_encoding *str_enc = rb_enc_from_index(str_encindex);
1924  rb_encoding *ptr_enc = rb_enc_from_index(ptr_encindex);
1925  if (!rb_enc_asciicompat(str_enc) || !rb_enc_asciicompat(ptr_enc)) {
      /* a non-ASCII-compatible side mixes only when one side is empty */
1926  if (len == 0)
1927  return str;
1928  if (RSTRING_LEN(str) == 0) {
1929  rb_str_buf_cat(str, ptr, len);
1930  ENCODING_CODERANGE_SET(str, ptr_encindex, ptr_cr);
1931  return str;
1932  }
1933  goto incompatible;
1934  }
1935  if (ptr_cr == ENC_CODERANGE_UNKNOWN) {
1936  ptr_cr = coderange_scan(ptr, len, ptr_enc);
1937  }
1938  if (str_cr == ENC_CODERANGE_UNKNOWN) {
1939  if (ENCODING_IS_ASCII8BIT(str) || ptr_cr != ENC_CODERANGE_7BIT) {
1940  str_cr = rb_enc_str_coderange(str);
1941  }
1942  }
1943  }
1944  if (ptr_cr_ret)
1945  *ptr_cr_ret = ptr_cr;
1946 
1947  if (str_encindex != ptr_encindex &&
1948  str_cr != ENC_CODERANGE_7BIT &&
1949  ptr_cr != ENC_CODERANGE_7BIT) {
      /* also the target of the goto above */
1950  incompatible:
1951  rb_raise(rb_eEncCompatError, "incompatible character encodings: %s and %s",
1952  rb_enc_name(rb_enc_from_index(str_encindex)),
1953  rb_enc_name(rb_enc_from_index(ptr_encindex)));
1954  }
1955 
      /* choose the encoding index and code range of the result */
1956  if (str_cr == ENC_CODERANGE_UNKNOWN) {
1957  res_encindex = str_encindex;
1958  res_cr = ENC_CODERANGE_UNKNOWN;
1959  }
1960  else if (str_cr == ENC_CODERANGE_7BIT) {
1961  if (ptr_cr == ENC_CODERANGE_7BIT) {
1962  res_encindex = str_encindex;
1963  res_cr = ENC_CODERANGE_7BIT;
1964  }
1965  else {
      /* 7-bit receiver takes on the encoding of the appended bytes */
1966  res_encindex = ptr_encindex;
1967  res_cr = ptr_cr;
1968  }
1969  }
1970  else if (str_cr == ENC_CODERANGE_VALID) {
1971  res_encindex = str_encindex;
1972  if (ptr_cr == ENC_CODERANGE_7BIT || ptr_cr == ENC_CODERANGE_VALID)
1973  res_cr = str_cr;
1974  else
1975  res_cr = ptr_cr;
1976  }
1977  else { /* str_cr == ENC_CODERANGE_BROKEN */
1978  res_encindex = str_encindex;
1979  res_cr = str_cr;
1980  if (0 < len) res_cr = ENC_CODERANGE_UNKNOWN;
1981  }
1982 
1983  if (len < 0) {
1984  rb_raise(rb_eArgError, "negative string size (or size too big)");
1985  }
1986  str_buf_cat(str, ptr, len);
1987  ENCODING_CODERANGE_SET(str, res_encindex, res_cr);
1988  return str;
1989 }
1990 
/* Append `len` bytes at `ptr`, in encoding `ptr_enc`, to `str` by delegating
 * to rb_enc_cr_str_buf_cat().
 * NOTE(review): the continuation line with the remaining arguments (listing
 * line 1995) is absent from this listing. */
1991 VALUE
1992 rb_enc_str_buf_cat(VALUE str, const char *ptr, long len, rb_encoding *ptr_enc)
1993 {
1994  return rb_enc_cr_str_buf_cat(str, ptr, len,
1996 }
1997 
1998 VALUE
1999 rb_str_buf_cat_ascii(VALUE str, const char *ptr)
2000 {
2001  /* ptr must reference NUL terminated ASCII string. */
2002  int encindex = ENCODING_GET(str);
2003  rb_encoding *enc = rb_enc_from_index(encindex);
2004  if (rb_enc_asciicompat(enc)) {
2005  return rb_enc_cr_str_buf_cat(str, ptr, strlen(ptr),
2006  encindex, ENC_CODERANGE_7BIT, 0);
2007  }
2008  else {
2009  char *buf = ALLOCA_N(char, rb_enc_mbmaxlen(enc));
2010  while (*ptr) {
2011  unsigned int c = (unsigned char)*ptr;
2012  int len = rb_enc_codelen(c, enc);
2013  rb_enc_mbcput(c, buf, enc);
2014  rb_enc_cr_str_buf_cat(str, buf, len,
2015  encindex, ENC_CODERANGE_VALID, 0);
2016  ptr++;
2017  }
2018  return str;
2019  }
2020 }
2021 
/* Append the contents of `str2` to `str` with encoding/code-range handling
 * (rb_enc_cr_str_buf_cat), propagate taint from `str2`, and return `str`.
 * The code range reported back for the appended bytes is stored on `str2`.
 * NOTE(review): the declarator line (name and parameters) is absent from this
 * listing between the return type and the opening brace. */
2022 VALUE
2024 {
2025  int str2_cr;
2026 
2027  str2_cr = ENC_CODERANGE(str2);
2028 
2029  rb_enc_cr_str_buf_cat(str, RSTRING_PTR(str2), RSTRING_LEN(str2),
2030  ENCODING_GET(str2), str2_cr, &str2_cr);
2031 
2032  OBJ_INFECT(str, str2);
2033  ENC_CODERANGE_SET(str2, str2_cr);
2034 
2035  return str;
2036 }
2037 
/* Append `str2` (converted with StringValue) to `str` and return `str`.
 * A non-empty `str2` appended to a string with an associated array
 * (STR_ASSOC) is copied in place after an exact expansion; every other case
 * goes through rb_str_buf_append().
 * NOTE(review): the declarator line (name and parameters) is absent from this
 * listing between the return type and the opening brace. */
2038 VALUE
2040 {
2041  rb_encoding *enc;
2042  int cr, cr2;
2043  long len2;
2044 
2045  StringValue(str2);
2046  if ((len2 = RSTRING_LEN(str2)) > 0 && STR_ASSOC_P(str)) {
2047  long len = RSTRING_LEN(str) + len2;
2048  enc = rb_enc_check(str, str2);
      /* the result takes the numerically larger of the two code ranges */
2049  cr = ENC_CODERANGE(str);
2050  if ((cr2 = ENC_CODERANGE(str2)) > cr) cr = cr2;
2051  rb_str_modify_expand(str, len2);
      /* len2+1: the byte following str2's contents is copied as well */
2052  memcpy(RSTRING(str)->as.heap.ptr + RSTRING(str)->as.heap.len,
2053  RSTRING_PTR(str2), len2+1);
2054  RSTRING(str)->as.heap.len = len;
2055  rb_enc_associate(str, enc);
2056  ENC_CODERANGE_SET(str, cr);
2057  OBJ_INFECT(str, str2);
2058  return str;
2059  }
2060  return rb_str_buf_append(str, str2);
2061 }
2062 
2063 /*
2064  * call-seq:
2065  * str << integer -> str
2066  * str.concat(integer) -> str
2067  * str << obj -> str
2068  * str.concat(obj) -> str
2069  *
2070  * Append---Concatenates the given object to <i>str</i>. If the object is a
2071  * <code>Integer</code>, it is considered as a codepoint, and is converted
2072  * to a character before concatenation.
2073  *
2074  * a = "hello "
2075  * a << "world" #=> "hello world"
2076  * a.concat(33) #=> "hello world!"
2077  */
2078 
2079 VALUE
2081 {
2082  unsigned int code;
2083  rb_encoding *enc = STR_ENC_GET(str1);
2084 
2085  if (FIXNUM_P(str2) || TYPE(str2) == T_BIGNUM) {
2086  if (rb_num_to_uint(str2, &code) == 0) {
2087  }
2088  else if (FIXNUM_P(str2)) {
2089  rb_raise(rb_eRangeError, "%ld out of char range", FIX2LONG(str2));
2090  }
2091  else {
2092  rb_raise(rb_eRangeError, "bignum out of char range");
2093  }
2094  }
2095  else {
2096  return rb_str_append(str1, str2);
2097  }
2098 
2099  if (enc == rb_usascii_encoding()) {
2100  /* US-ASCII automatically extended to ASCII-8BIT */
2101  char buf[1] = {(char)code};
2102  if (code > 0xFF) {
2103  rb_raise(rb_eRangeError, "%u out of char range", code);
2104  }
2105  rb_str_cat(str1, buf, 1);
2106  if (code > 127) {
2109  }
2110  }
2111  else {
2112  long pos = RSTRING_LEN(str1);
2113  int cr = ENC_CODERANGE(str1);
2114  int len;
2115  char *buf;
2116 
2117  switch (len = rb_enc_codelen(code, enc)) {
2119  rb_raise(rb_eRangeError, "invalid codepoint 0x%X in %s", code, rb_enc_name(enc));
2120  break;
2122  case 0:
2123  rb_raise(rb_eRangeError, "%u out of char range", code);
2124  break;
2125  }
2126  buf = ALLOCA_N(char, len + 1);
2127  rb_enc_mbcput(code, buf, enc);
2128  if (rb_enc_precise_mbclen(buf, buf + len + 1, enc) != len) {
2129  rb_raise(rb_eRangeError, "invalid codepoint 0x%X in %s", code, rb_enc_name(enc));
2130  }
2131  rb_str_resize(str1, pos+len);
2132  strncpy(RSTRING_PTR(str1) + pos, buf, len);
2133  if (cr == ENC_CODERANGE_7BIT && code > 127)
2134  cr = ENC_CODERANGE_VALID;
2135  ENC_CODERANGE_SET(str1, cr);
2136  }
2137  return str1;
2138 }
2139 
2140 /*
2141  * call-seq:
2142  * str.prepend(other_str) -> str
2143  *
2144  * Prepend---Prepend the given string to <i>str</i>.
2145  *
2146  * a = "world"
2147  * a.prepend("hello ") #=> "hello world"
2148  * a #=> "hello world"
2149  */
2150 
/* Insert `str2` at the front of `str` (rb_str_update at offset 0, replacing
 * zero characters) after converting both with StringValue.  Returns `str`.
 * NOTE(review): the declarator line (name and parameters) is absent from this
 * listing between the return type and the opening brace. */
2151 static VALUE
2153 {
2154  StringValue(str2);
2155  StringValue(str);
2156  rb_str_update(str, 0L, 0L, str2);
2157  return str;
2158 }
2159 
/* Hash the bytes of `str` with rb_memhash() and XOR in the encoding index.
 * The index is treated as 0 when the string's code range is 7-bit, so 7-bit
 * strings hash the same under any encoding index.
 * NOTE(review): the declarator line (name and parameters) is absent from this
 * listing between the return type and the opening brace. */
2160 st_index_t
2162 {
2163  int e = ENCODING_GET(str);
2164  if (e && rb_enc_str_coderange(str) == ENC_CODERANGE_7BIT) {
2165  e = 0;
2166  }
2167  return rb_memhash((const void *)RSTRING_PTR(str), RSTRING_LEN(str)) ^ e;
2168 }
2169 
/* Equality predicate in "0 means equal" form: returns 0 when the two
 * strings are comparable (rb_str_comparable) and have the same length and
 * bytes, 1 otherwise.
 * NOTE(review): the declarator line (name and parameters) is absent from this
 * listing between the return type and the opening brace. */
2170 int
2172 {
2173  long len;
2174 
2175  if (!rb_str_comparable(str1, str2)) return 1;
2176  if (RSTRING_LEN(str1) == (len = RSTRING_LEN(str2)) &&
2177  memcmp(RSTRING_PTR(str1), RSTRING_PTR(str2), len) == 0) {
2178  return 0;
2179  }
2180  return 1;
2181 }
2182 
2183 /*
2184  * call-seq:
2185  * str.hash -> fixnum
2186  *
2187  * Return a hash based on the string's length and content.
2188  */
2189 
/* Return rb_str_hash(str) converted to a Fixnum with INT2FIX.
 * NOTE(review): the declarator line (name and parameters) is absent from this
 * listing between the return type and the opening brace. */
2190 static VALUE
2192 {
2193  st_index_t hval = rb_str_hash(str);
2194  return INT2FIX(hval);
2195 }
2196 
/* Smaller of a and b.  The selected argument is evaluated twice, so do not
 * pass expressions with side effects. */
2197 #define lesser(a,b) (((a)>(b))?(b):(a))
2198 
/* Decide whether two strings may be compared byte-wise: TRUE when either is
 * empty, when both have the same encoding index, or when both are 7-bit;
 * FALSE when neither string is 7-bit and the encodings differ.
 * NOTE(review): the declarator line and listing lines 2214 and 2218 are
 * absent from this listing.  The two unconditional-looking `return TRUE`
 * statements below presumably sat under conditions on those missing lines --
 * confirm against the full source before relying on the mixed 7-bit case. */
2199 int
2201 {
2202  int idx1, idx2;
2203  int rc1, rc2;
2204 
2205  if (RSTRING_LEN(str1) == 0) return TRUE;
2206  if (RSTRING_LEN(str2) == 0) return TRUE;
2207  idx1 = ENCODING_GET(str1);
2208  idx2 = ENCODING_GET(str2);
2209  if (idx1 == idx2) return TRUE;
2210  rc1 = rb_enc_str_coderange(str1);
2211  rc2 = rb_enc_str_coderange(str2);
2212  if (rc1 == ENC_CODERANGE_7BIT) {
2213  if (rc2 == ENC_CODERANGE_7BIT) return TRUE;
2215  return TRUE;
2216  }
2217  if (rc2 == ENC_CODERANGE_7BIT) {
2219  return TRUE;
2220  }
2221  return FALSE;
2222 }
2223 
/* Three-way comparison returning -1, 0 or 1.
 * Bytes are compared over the shorter length; on a tie the longer string is
 * greater.  Strings with identical bytes that are not comparable
 * (rb_str_comparable) are ordered by encoding index instead of being equal.
 * NOTE(review): the declarator line (name and parameters) is absent from this
 * listing between the return type and the opening brace. */
2224 int
2226 {
2227  long len1, len2;
2228  const char *ptr1, *ptr2;
2229  int retval;
2230 
2231  if (str1 == str2) return 0;
2232  RSTRING_GETMEM(str1, ptr1, len1);
2233  RSTRING_GETMEM(str2, ptr2, len2);
      /* identical buffers skip the memcmp; retval is only read when the
       * memcmp ran and reported a difference */
2234  if (ptr1 == ptr2 || (retval = memcmp(ptr1, ptr2, lesser(len1, len2))) == 0) {
2235  if (len1 == len2) {
2236  if (!rb_str_comparable(str1, str2)) {
2237  if (ENCODING_GET(str1) > ENCODING_GET(str2))
2238  return 1;
2239  return -1;
2240  }
2241  return 0;
2242  }
2243  if (len1 > len2) return 1;
2244  return -1;
2245  }
2246  if (retval > 0) return 1;
2247  return -1;
2248 }
2249 
2250 /* expect tail call optimization */
2251 static VALUE
2252 str_eql(const VALUE str1, const VALUE str2)
2253 {
2254  const long len = RSTRING_LEN(str1);
2255  const char *ptr1, *ptr2;
2256 
2257  if (len != RSTRING_LEN(str2)) return Qfalse;
2258  if (!rb_str_comparable(str1, str2)) return Qfalse;
2259  if ((ptr1 = RSTRING_PTR(str1)) == (ptr2 = RSTRING_PTR(str2)))
2260  return Qtrue;
2261  if (memcmp(ptr1, ptr2, len) == 0)
2262  return Qtrue;
2263  return Qfalse;
2264 }
2265 /*
2266  * call-seq:
2267  * str == obj -> true or false
2268  *
2269  * Equality---If <i>obj</i> is not a <code>String</code>, returns
2270  * <code>false</code>. Otherwise, returns <code>true</code> if <i>str</i>
2271  * <code><=></code> <i>obj</i> returns zero.
2272  */
2273 
/* Equality test.  Identical objects are equal.  A non-String argument that
 * responds to to_str is asked via rb_equal(str2, str1); one that does not
 * yields Qfalse.  Two Strings are compared with str_eql().
 * NOTE(review): the declarator line (name and parameters) is absent from this
 * listing between the return type and the opening brace. */
2274 VALUE
2276 {
2277  if (str1 == str2) return Qtrue;
2278  if (TYPE(str2) != T_STRING) {
2279  if (!rb_respond_to(str2, rb_intern("to_str"))) {
2280  return Qfalse;
2281  }
2282  return rb_equal(str2, str1);
2283  }
2284  return str_eql(str1, str2);
2285 }
2286 
2287 /*
2288  * call-seq:
2289  * str.eql?(other) -> true or false
2290  *
2291  * Two strings are equal if they have the same length and content.
2292  */
2293 
/* Strict equality: Qtrue for the identical object, Qfalse for any
 * non-String argument (no conversion is attempted), str_eql() otherwise.
 * NOTE(review): the declarator line (name and parameters) is absent from this
 * listing between the return type and the opening brace. */
2294 static VALUE
2296 {
2297  if (str1 == str2) return Qtrue;
2298  if (TYPE(str2) != T_STRING) return Qfalse;
2299  return str_eql(str1, str2);
2300 }
2301 
2302 /*
2303  * call-seq:
2304  * str <=> other_str -> -1, 0, +1 or nil
2305  *
2306  * Comparison---Returns -1 if <i>other_str</i> is greater than, 0 if
2307  * <i>other_str</i> is equal to, and +1 if <i>other_str</i> is less than
2308  * <i>str</i>. If the strings are of different lengths, and the strings are
2309  * equal when compared up to the shortest length, then the longer string is
2310  * considered greater than the shorter one. In older versions of Ruby, setting
2311  * <code>$=</code> allowed case-insensitive comparisons; this is now deprecated
2312  * in favor of using <code>String#casecmp</code>.
2313  *
2314  * <code><=></code> is the basis for the methods <code><</code>,
2315  * <code><=</code>, <code>></code>, <code>>=</code>, and <code>between?</code>,
2316  * included from module <code>Comparable</code>. The method
2317  * <code>String#==</code> does not use <code>Comparable#==</code>.
2318  *
2319  * "abcdef" <=> "abcde" #=> 1
2320  * "abcdef" <=> "abcdef" #=> 0
2321  * "abcdef" <=> "abcdefg" #=> -1
2322  * "abcdef" <=> "ABCDEF" #=> 1
2323  */
2324 
/* Three-way comparison returning a Ruby number or Qnil.
 * Strings are compared with rb_str_cmp().  A non-String argument must
 * respond to both to_str and <=>; its own `<=>` is then called with the
 * operands swapped and the result negated.  Qnil is returned when the
 * argument lacks either method or its `<=>` returns nil.
 * NOTE(review): the declarator line (name and parameters) is absent from this
 * listing between the return type and the opening brace. */
2325 static VALUE
2327 {
2328  long result;
2329 
2330  if (TYPE(str2) != T_STRING) {
2331  if (!rb_respond_to(str2, rb_intern("to_str"))) {
2332  return Qnil;
2333  }
2334  else if (!rb_respond_to(str2, rb_intern("<=>"))) {
2335  return Qnil;
2336  }
2337  else {
2338  VALUE tmp = rb_funcall(str2, rb_intern("<=>"), 1, str1);
2339 
2340  if (NIL_P(tmp)) return Qnil;
2341  if (!FIXNUM_P(tmp)) {
      /* non-Fixnum result: negate it by computing 0 - tmp in Ruby */
2342  return rb_funcall(LONG2FIX(0), '-', 1, tmp);
2343  }
2344  result = -FIX2LONG(tmp);
2345  }
2346  }
2347  else {
2348  result = rb_str_cmp(str1, str2);
2349  }
2350  return LONG2NUM(result);
2351 }
2352 
2353 /*
2354  * call-seq:
2355  * str.casecmp(other_str) -> -1, 0, +1 or nil
2356  *
2357  * Case-insensitive version of <code>String#<=></code>.
2358  *
2359  * "abcdef".casecmp("abcde") #=> 1
2360  * "aBcDeF".casecmp("abcdef") #=> 0
2361  * "abcdef".casecmp("abcdefg") #=> -1
2362  * "abcdef".casecmp("ABCDEF") #=> 0
2363  */
2364 
/* Case-insensitive three-way comparison returning Fixnum -1, 0 or 1, or
 * Qnil when the two strings have no compatible encoding.
 * ASCII characters are compared after TOUPPER; other characters are
 * compared byte-wise.  When one string is a prefix of the other, the longer
 * one is greater.
 * NOTE(review): the declarator line (name and parameters) is absent from this
 * listing between the return type and the opening brace. */
2365 static VALUE
2367 {
2368  long len;
2369  rb_encoding *enc;
2370  char *p1, *p1end, *p2, *p2end;
2371 
2372  StringValue(str2);
2373  enc = rb_enc_compatible(str1, str2);
2374  if (!enc) {
2375  return Qnil;
2376  }
2377 
2378  p1 = RSTRING_PTR(str1); p1end = RSTRING_END(str1);
2379  p2 = RSTRING_PTR(str2); p2end = RSTRING_END(str2);
      /* both strings are one byte per character: compare byte by byte */
2380  if (single_byte_optimizable(str1) && single_byte_optimizable(str2)) {
2381  while (p1 < p1end && p2 < p2end) {
2382  if (*p1 != *p2) {
2383  unsigned int c1 = TOUPPER(*p1 & 0xff);
2384  unsigned int c2 = TOUPPER(*p2 & 0xff);
2385  if (c1 != c2)
2386  return INT2FIX(c1 < c2 ? -1 : 1);
2387  }
2388  p1++;
2389  p2++;
2390  }
2391  }
2392  else {
2393  while (p1 < p1end && p2 < p2end) {
2394  int l1, c1 = rb_enc_ascget(p1, p1end, &l1, enc);
2395  int l2, c2 = rb_enc_ascget(p2, p2end, &l2, enc);
2396 
      /* both characters were read as ASCII: fold case and compare */
2397  if (0 <= c1 && 0 <= c2) {
2398  c1 = TOUPPER(c1);
2399  c2 = TOUPPER(c2);
2400  if (c1 != c2)
2401  return INT2FIX(c1 < c2 ? -1 : 1);
2402  }
2403  else {
      /* otherwise compare the raw bytes of the two characters */
2404  int r;
2405  l1 = rb_enc_mbclen(p1, p1end, enc);
2406  l2 = rb_enc_mbclen(p2, p2end, enc);
2407  len = l1 < l2 ? l1 : l2;
2408  r = memcmp(p1, p2, len);
2409  if (r != 0)
2410  return INT2FIX(r < 0 ? -1 : 1);
2411  if (l1 != l2)
2412  return INT2FIX(l1 < l2 ? -1 : 1);
2413  }
2414  p1 += l1;
2415  p2 += l2;
2416  }
2417  }
2418  if (RSTRING_LEN(str1) == RSTRING_LEN(str2)) return INT2FIX(0);
2419  if (RSTRING_LEN(str1) > RSTRING_LEN(str2)) return INT2FIX(1);
2420  return INT2FIX(-1);
2421 }
2422 
/* Search `str` for `sub`, starting at character `offset` (negative counts
 * from the end).  Returns a byte offset from the start of `str` on success
 * (callers in this file convert it with rb_str_sublen()), or -1 when `sub`
 * is broken, the start is out of range, or nothing matches. */
2423 static long
2424 rb_str_index(VALUE str, VALUE sub, long offset)
2425 {
2426  long pos;
2427  char *s, *sptr, *e;
2428  long len, slen;
2429  rb_encoding *enc;
2430 
2431  enc = rb_enc_check(str, sub);
2432  if (is_broken_string(sub)) {
2433  return -1;
2434  }
      /* character lengths, used for the range checks */
2435  len = str_strlen(str, enc);
2436  slen = str_strlen(sub, enc);
2437  if (offset < 0) {
2438  offset += len;
2439  if (offset < 0) return -1;
2440  }
2441  if (len - offset < slen) return -1;
2442  s = RSTRING_PTR(str);
2443  e = s + RSTRING_LEN(str);
2444  if (offset) {
      /* from here on offset is a byte offset */
2445  offset = str_offset(s, RSTRING_END(str), offset, enc, single_byte_optimizable(str));
2446  s += offset;
2447  }
2448  if (slen == 0) return offset;
2449  /* need to proceed one character at a time */
2450  sptr = RSTRING_PTR(sub);
2451  slen = RSTRING_LEN(sub);
2452  len = RSTRING_LEN(str) - offset;
2453  for (;;) {
2454  char *t;
2455  pos = rb_memsearch(sptr, slen, s, len, enc);
2456  if (pos < 0) return pos;
      /* accept the hit only when it starts on a character boundary;
       * otherwise resume from the next character head */
2457  t = rb_enc_right_char_head(s, s+pos, e, enc);
2458  if (t == s + pos) break;
2459  if ((len -= t - s) <= 0) return -1;
2460  offset += t - s;
2461  s = t;
2462  }
2463  return pos + offset;
2464 }
2465 
2466 
2467 /*
2468  * call-seq:
2469  * str.index(substring [, offset]) -> fixnum or nil
2470  * str.index(regexp [, offset]) -> fixnum or nil
2471  *
2472  * Returns the index of the first occurrence of the given <i>substring</i> or
2473  * pattern (<i>regexp</i>) in <i>str</i>. Returns <code>nil</code> if not
2474  * found. If the second parameter is present, it specifies the position in the
2475  * string to begin the search.
2476  *
2477  * "hello".index('e') #=> 1
2478  * "hello".index('lo') #=> 3
2479  * "hello".index('a') #=> nil
2480  * "hello".index(?e) #=> 1
2481  * "hello".index(/[aeiou]/, -3) #=> 4
2482  */
2483 
/* Method body taking (argc, argv, str): argv holds the pattern and an
 * optional start position (default 0, negative counts from the end).
 * Returns the character index of the first match as a number, or Qnil.
 * A Regexp pattern is searched with rb_reg_search(); anything else must
 * convert to String (TypeError otherwise) and is searched with
 * rb_str_index().
 * NOTE(review): the declarator line and listing line 2501 (the body of the
 * T_REGEXP check for an out-of-range negative position) are absent from
 * this listing. */
2484 static VALUE
2486 {
2487  VALUE sub;
2488  VALUE initpos;
2489  long pos;
2490 
2491  if (rb_scan_args(argc, argv, "11", &sub, &initpos) == 2) {
2492  pos = NUM2LONG(initpos);
2493  }
2494  else {
2495  pos = 0;
2496  }
2497  if (pos < 0) {
2498  pos += str_strlen(str, STR_ENC_GET(str));
2499  if (pos < 0) {
2500  if (TYPE(sub) == T_REGEXP) {
2502  }
2503  return Qnil;
2504  }
2505  }
2506 
2507  switch (TYPE(sub)) {
2508  case T_REGEXP:
2509  if (pos > str_strlen(str, STR_ENC_GET(str)))
2510  return Qnil;
      /* character position -> byte position for the regexp engine */
2511  pos = str_offset(RSTRING_PTR(str), RSTRING_END(str), pos,
2512  rb_enc_check(str, sub), single_byte_optimizable(str));
2513 
2514  pos = rb_reg_search(sub, str, pos, 0);
2515  pos = rb_str_sublen(str, pos);
2516  break;
2517 
2518  default: {
2519  VALUE tmp;
2520 
2521  tmp = rb_check_string_type(sub);
2522  if (NIL_P(tmp)) {
2523  rb_raise(rb_eTypeError, "type mismatch: %s given",
2524  rb_obj_classname(sub));
2525  }
2526  sub = tmp;
2527  }
2528  /* fall through */
2529  case T_STRING:
2530  pos = rb_str_index(str, sub, pos);
2531  pos = rb_str_sublen(str, pos);
2532  break;
2533  }
2534 
2535  if (pos == -1) return Qnil;
2536  return LONG2NUM(pos);
2537 }
2538 
/* Search `str` backwards for `sub`, starting at character position `pos`.
 * Returns the character index of the match, or -1 when `sub` is broken, is
 * longer than `str`, or is not found.  An empty `str` returns `pos` as is. */
2539 static long
2540 rb_str_rindex(VALUE str, VALUE sub, long pos)
2541 {
2542  long len, slen;
2543  char *s, *sbeg, *e, *t;
2544  rb_encoding *enc;
2545  int singlebyte = single_byte_optimizable(str);
2546 
2547  enc = rb_enc_check(str, sub);
2548  if (is_broken_string(sub)) {
2549  return -1;
2550  }
2551  len = str_strlen(str, enc);
2552  slen = str_strlen(sub, enc);
2553  /* substring longer than string */
2554  if (len < slen) return -1;
      /* pull the start back so that sub still fits before the end */
2555  if (len - pos < slen) {
2556  pos = len - slen;
2557  }
2558  if (len == 0) {
2559  return pos;
2560  }
2561  sbeg = RSTRING_PTR(str);
2562  e = RSTRING_END(str);
2563  t = RSTRING_PTR(sub);
      /* slen switches from a character count to a byte count here */
2564  slen = RSTRING_LEN(sub);
2565  s = str_nth(sbeg, e, pos, enc, singlebyte);
2566  while (s) {
2567  if (memcmp(s, t, slen) == 0) {
2568  return pos;
2569  }
2570  if (pos == 0) break;
2571  pos--;
2572  s = rb_enc_prev_char(sbeg, s, e, enc);
2573  }
2574  return -1;
2575 }
2576 
2577 
2578 /*
2579  * call-seq:
2580  * str.rindex(substring [, fixnum]) -> fixnum or nil
2581  * str.rindex(regexp [, fixnum]) -> fixnum or nil
2582  *
2583  * Returns the index of the last occurrence of the given <i>substring</i> or
2584  * pattern (<i>regexp</i>) in <i>str</i>. Returns <code>nil</code> if not
2585  * found. If the second parameter is present, it specifies the position in the
2586  * string to end the search---characters beyond this point will not be
2587  * considered.
2588  *
2589  * "hello".rindex('e') #=> 1
2590  * "hello".rindex('l') #=> 3
2591  * "hello".rindex('a') #=> nil
2592  * "hello".rindex(?e) #=> 1
2593  * "hello".rindex(/[aeiou]/, -2) #=> 1
2594  */
2595 
/* Method body taking (argc, argv, str): argv holds the pattern and an
 * optional end position (default: the character length; negative counts
 * from the end; values past the end are clamped).
 * Returns the character index of the last match as a number, or Qnil.
 * A Regexp pattern is searched backwards with rb_reg_search(); anything
 * else must convert to String (TypeError otherwise) and is searched with
 * rb_str_rindex().
 * NOTE(review): the declarator line, listing line 2610 (the body of the
 * T_REGEXP check for an out-of-range negative position) and listing line
 * 2625 (the remaining str_offset() arguments) are absent from this
 * listing. */
2596 static VALUE
2598 {
2599  VALUE sub;
2600  VALUE vpos;
2601  rb_encoding *enc = STR_ENC_GET(str);
2602  long pos, len = str_strlen(str, enc);
2603 
2604  if (rb_scan_args(argc, argv, "11", &sub, &vpos) == 2) {
2605  pos = NUM2LONG(vpos);
2606  if (pos < 0) {
2607  pos += len;
2608  if (pos < 0) {
2609  if (TYPE(sub) == T_REGEXP) {
2611  }
2612  return Qnil;
2613  }
2614  }
2615  if (pos > len) pos = len;
2616  }
2617  else {
2618  pos = len;
2619  }
2620 
2621  switch (TYPE(sub)) {
2622  case T_REGEXP:
2623  /* enc = rb_get_check(str, sub); */
2624  pos = str_offset(RSTRING_PTR(str), RSTRING_END(str), pos,
2626 
      /* the search is skipped for a compiled regexp with empty source;
       * pos is then returned as computed above */
2627  if (!RREGEXP(sub)->ptr || RREGEXP_SRC_LEN(sub)) {
2628  pos = rb_reg_search(sub, str, pos, 1);
2629  pos = rb_str_sublen(str, pos);
2630  }
2631  if (pos >= 0) return LONG2NUM(pos);
2632  break;
2633 
2634  default: {
2635  VALUE tmp;
2636 
2637  tmp = rb_check_string_type(sub);
2638  if (NIL_P(tmp)) {
2639  rb_raise(rb_eTypeError, "type mismatch: %s given",
2640  rb_obj_classname(sub));
2641  }
2642  sub = tmp;
2643  }
2644  /* fall through */
2645  case T_STRING:
2646  pos = rb_str_rindex(str, sub, pos);
2647  if (pos >= 0) return LONG2NUM(pos);
2648  break;
2649  }
2650  return Qnil;
2651 }
2652 
2653 /*
2654  * call-seq:
2655  * str =~ obj -> fixnum or nil
2656  *
2657  * Match---If <i>obj</i> is a <code>Regexp</code>, use it as a pattern to match
2658  * against <i>str</i>, and returns the position the match starts, or
2659  * <code>nil</code> if there is no match. Otherwise, invokes
2660  * <i>obj.=~</i>, passing <i>str</i> as an argument. The default
2661  * <code>=~</code> in <code>Object</code> returns <code>nil</code>.
2662  *
2663  * "cat o' 9 tails" =~ /\d/ #=> 7
2664  * "cat o' 9 tails" =~ 9 #=> nil
2665  */
2666 
2667 static VALUE
2669 {
2670  switch (TYPE(y)) {
2671  case T_STRING:
2672  rb_raise(rb_eTypeError, "type mismatch: String given");
2673 
2674  case T_REGEXP:
2675  return rb_reg_match(y, x);
2676 
2677  default:
2678  return rb_funcall(y, rb_intern("=~"), 1, x);
2679  }
2680 }
2681 
2682 
2683 static VALUE get_pat(VALUE, int);
2684 
2685 
2686 /*
2687  * call-seq:
2688  * str.match(pattern) -> matchdata or nil
2689  * str.match(pattern, pos) -> matchdata or nil
2690  *
2691  * Converts <i>pattern</i> to a <code>Regexp</code> (if it isn't already one),
2692  * then invokes its <code>match</code> method on <i>str</i>. If the second
2693  * parameter is present, it specifies the position in the string to begin the
2694  * search.
2695  *
2696  * 'hello'.match('(.)\1') #=> #<MatchData "ll" 1:"l">
2697  * 'hello'.match('(.)\1')[0] #=> "ll"
2698  * 'hello'.match(/(.)\1/)[0] #=> "ll"
2699  * 'hello'.match('xx') #=> nil
2700  *
2701  * If a block is given, invoke the block with MatchData if match succeed, so
2702  * that you can write
2703  *
2704  * str.match(pat) {|m| ...}
2705  *
2706  * instead of
2707  *
2708  * if m = str.match(pat)
2709  * ...
2710  * end
2711  *
2712  * The return value is a value from block execution in this case.
2713  */
2714 
2715 static VALUE
2717 {
2718  VALUE re, result;
2719  if (argc < 1)
2720  rb_raise(rb_eArgError, "wrong number of arguments (%d for 1..2)", argc);
2721  re = argv[0];
2722  argv[0] = str;
2723  result = rb_funcall2(get_pat(re, 0), rb_intern("match"), argc, argv);
2724  if (!NIL_P(result) && rb_block_given_p()) {
2725  return rb_yield(result);
2726  }
2727  return result;
2728 }
2729 
/* Result codes of the enc_succ_char / enc_pred_char family of helpers. */
enum neighbor_char {
    NEIGHBOR_NOT_CHAR,
    NEIGHBOR_FOUND,
    NEIGHBOR_WRAPPED
};
2735 
2736 static enum neighbor_char
2737 enc_succ_char(char *p, long len, rb_encoding *enc)
2738 {
2739  long i;
2740  int l;
2741  while (1) {
2742  for (i = len-1; 0 <= i && (unsigned char)p[i] == 0xff; i--)
2743  p[i] = '\0';
2744  if (i < 0)
2745  return NEIGHBOR_WRAPPED;
2746  ++((unsigned char*)p)[i];
2747  l = rb_enc_precise_mbclen(p, p+len, enc);
2748  if (MBCLEN_CHARFOUND_P(l)) {
2749  l = MBCLEN_CHARFOUND_LEN(l);
2750  if (l == len) {
2751  return NEIGHBOR_FOUND;
2752  }
2753  else {
2754  memset(p+l, 0xff, len-l);
2755  }
2756  }
2757  if (MBCLEN_INVALID_P(l) && i < len-1) {
2758  long len2;
2759  int l2;
2760  for (len2 = len-1; 0 < len2; len2--) {
2761  l2 = rb_enc_precise_mbclen(p, p+len2, enc);
2762  if (!MBCLEN_INVALID_P(l2))
2763  break;
2764  }
2765  memset(p+len2+1, 0xff, len-(len2+1));
2766  }
2767  }
2768 }
2769 
2770 static enum neighbor_char
2771 enc_pred_char(char *p, long len, rb_encoding *enc)
2772 {
2773  long i;
2774  int l;
2775  while (1) {
2776  for (i = len-1; 0 <= i && (unsigned char)p[i] == 0; i--)
2777  p[i] = '\xff';
2778  if (i < 0)
2779  return NEIGHBOR_WRAPPED;
2780  --((unsigned char*)p)[i];
2781  l = rb_enc_precise_mbclen(p, p+len, enc);
2782  if (MBCLEN_CHARFOUND_P(l)) {
2783  l = MBCLEN_CHARFOUND_LEN(l);
2784  if (l == len) {
2785  return NEIGHBOR_FOUND;
2786  }
2787  else {
2788  memset(p+l, 0, len-l);
2789  }
2790  }
2791  if (MBCLEN_INVALID_P(l) && i < len-1) {
2792  long len2;
2793  int l2;
2794  for (len2 = len-1; 0 < len2; len2--) {
2795  l2 = rb_enc_precise_mbclen(p, p+len2, enc);
2796  if (!MBCLEN_INVALID_P(l2))
2797  break;
2798  }
2799  memset(p+len2+1, 0, len-(len2+1));
2800  }
2801  }
2802 }
2803 
2804 /*
2805  overwrite +p+ by succeeding letter in +enc+ and returns
2806  NEIGHBOR_FOUND or NEIGHBOR_WRAPPED.
2807  When NEIGHBOR_WRAPPED, carried-out letter is stored into carry.
2808  assuming each ranges are successive, and mbclen
2809  never change in each ranges.
2810  NEIGHBOR_NOT_CHAR is returned if invalid character or the range has only one
2811  character.
2812  */
2813 static enum neighbor_char
2814 enc_succ_alnum_char(char *p, long len, rb_encoding *enc, char *carry)
2815 {
2816  enum neighbor_char ret;
2817  unsigned int c;
2818  int ctype;
2819  int range;
2820  char save[ONIGENC_CODE_TO_MBC_MAXLEN];
2821 
2822  c = rb_enc_mbc_to_codepoint(p, p+len, enc);
2823  if (rb_enc_isctype(c, ONIGENC_CTYPE_DIGIT, enc))
2824  ctype = ONIGENC_CTYPE_DIGIT;
2825  else if (rb_enc_isctype(c, ONIGENC_CTYPE_ALPHA, enc))
2826  ctype = ONIGENC_CTYPE_ALPHA;
2827  else
2828  return NEIGHBOR_NOT_CHAR;
2829 
2830  MEMCPY(save, p, char, len);
2831  ret = enc_succ_char(p, len, enc);
2832  if (ret == NEIGHBOR_FOUND) {
2833  c = rb_enc_mbc_to_codepoint(p, p+len, enc);
2834  if (rb_enc_isctype(c, ctype, enc))
2835  return NEIGHBOR_FOUND;
2836  }
2837  MEMCPY(p, save, char, len);
2838  range = 1;
2839  while (1) {
2840  MEMCPY(save, p, char, len);
2841  ret = enc_pred_char(p, len, enc);
2842  if (ret == NEIGHBOR_FOUND) {
2843  c = rb_enc_mbc_to_codepoint(p, p+len, enc);
2844  if (!rb_enc_isctype(c, ctype, enc)) {
2845  MEMCPY(p, save, char, len);
2846  break;
2847  }
2848  }
2849  else {
2850  MEMCPY(p, save, char, len);
2851  break;
2852  }
2853  range++;
2854  }
2855  if (range == 1) {
2856  return NEIGHBOR_NOT_CHAR;
2857  }
2858 
2859  if (ctype != ONIGENC_CTYPE_DIGIT) {
2860  MEMCPY(carry, p, char, len);
2861  return NEIGHBOR_WRAPPED;
2862  }
2863 
2864  MEMCPY(carry, p, char, len);
2865  enc_succ_char(carry, len, enc);
2866  return NEIGHBOR_WRAPPED;
2867 }
2868 
2869 
2870 /*
2871  * call-seq:
2872  * str.succ -> new_str
2873  * str.next -> new_str
2874  *
2875  * Returns the successor to <i>str</i>. The successor is calculated by
2876  * incrementing characters starting from the rightmost alphanumeric (or
2877  * the rightmost character if there are no alphanumerics) in the
2878  * string. Incrementing a digit always results in another digit, and
2879  * incrementing a letter results in another letter of the same case.
2880  * Incrementing nonalphanumerics uses the underlying character set's
2881  * collating sequence.
2882  *
2883  * If the increment generates a ``carry,'' the character to the left of
2884  * it is incremented. This process repeats until there is no carry,
2885  * adding an additional character if necessary.
2886  *
2887  * "abcd".succ #=> "abce"
2888  * "THX1138".succ #=> "THX1139"
2889  * "<<koala>>".succ #=> "<<koalb>>"
2890  * "1999zzz".succ #=> "2000aaa"
2891  * "ZZZ9999".succ #=> "AAAA0000"
2892  * "***".succ #=> "**+"
2893  */
2894 
2895 VALUE
2897 {
2898  rb_encoding *enc;
2899  VALUE str;
2900  char *sbeg, *s, *e, *last_alnum = 0;
2901  int c = -1;
2902  long l;
2903  char carry[ONIGENC_CODE_TO_MBC_MAXLEN] = "\1";
2904  long carry_pos = 0, carry_len = 1;
2905  enum neighbor_char neighbor = NEIGHBOR_FOUND;
2906 
2907  str = rb_str_new5(orig, RSTRING_PTR(orig), RSTRING_LEN(orig));
2908  rb_enc_cr_str_copy_for_substr(str, orig);
2909  OBJ_INFECT(str, orig);
2910  if (RSTRING_LEN(str) == 0) return str;
2911 
2912  enc = STR_ENC_GET(orig);
2913  sbeg = RSTRING_PTR(str);
2914  s = e = sbeg + RSTRING_LEN(str);
2915 
2916  while ((s = rb_enc_prev_char(sbeg, s, e, enc)) != 0) {
2917  if (neighbor == NEIGHBOR_NOT_CHAR && last_alnum) {
2918  if (ISALPHA(*last_alnum) ? ISDIGIT(*s) :
2919  ISDIGIT(*last_alnum) ? ISALPHA(*s) : 0) {
2920  s = last_alnum;
2921  break;
2922  }
2923  }
2924  if ((l = rb_enc_precise_mbclen(s, e, enc)) <= 0) continue;
2925  neighbor = enc_succ_alnum_char(s, l, enc, carry);
2926  switch (neighbor) {
2927  case NEIGHBOR_NOT_CHAR:
2928  continue;
2929  case NEIGHBOR_FOUND:
2930  return str;
2931  case NEIGHBOR_WRAPPED:
2932  last_alnum = s;
2933  break;
2934  }
2935  c = 1;
2936  carry_pos = s - sbeg;
2937  carry_len = l;
2938  }
2939  if (c == -1) { /* str contains no alnum */
2940  s = e;
2941  while ((s = rb_enc_prev_char(sbeg, s, e, enc)) != 0) {
2942  enum neighbor_char neighbor;
2943  if ((l = rb_enc_precise_mbclen(s, e, enc)) <= 0) continue;
2944  neighbor = enc_succ_char(s, l, enc);
2945  if (neighbor == NEIGHBOR_FOUND)
2946  return str;
2947  if (rb_enc_precise_mbclen(s, s+l, enc) != l) {
2948  /* wrapped to \0...\0. search next valid char. */
2949  enc_succ_char(s, l, enc);
2950  }
2951  if (!rb_enc_asciicompat(enc)) {
2952  MEMCPY(carry, s, char, l);
2953  carry_len = l;
2954  }
2955  carry_pos = s - sbeg;
2956  }
2957  }
2958  RESIZE_CAPA(str, RSTRING_LEN(str) + carry_len);
2959  s = RSTRING_PTR(str) + carry_pos;
2960  memmove(s + carry_len, s, RSTRING_LEN(str) - carry_pos);
2961  memmove(s, carry, carry_len);
2962  STR_SET_LEN(str, RSTRING_LEN(str) + carry_len);
2963  RSTRING_PTR(str)[RSTRING_LEN(str)] = '\0';
2964  rb_enc_str_coderange(str);
2965  return str;
2966 }
2967 
2968 
2969 /*
2970  * call-seq:
2971  * str.succ! -> str
2972  * str.next! -> str
2973  *
2974  * Equivalent to <code>String#succ</code>, but modifies the receiver in
2975  * place.
2976  */
2977 
2978 static VALUE
2980 {
2982 
2983  return str;
2984 }
2985 
2986 
2987 /*
2988  * call-seq:
2989  * str.upto(other_str, exclusive=false) {|s| block } -> str
2990  * str.upto(other_str, exclusive=false) -> an_enumerator
2991  *
2992  * Iterates through successive values, starting at <i>str</i> and
2993  * ending at <i>other_str</i> inclusive, passing each value in turn to
2994  * the block. The <code>String#succ</code> method is used to generate
2995  * each value. If optional second argument exclusive is omitted or is false,
2996  * the last value will be included; otherwise it will be excluded.
2997  *
2998  * If no block is given, an enumerator is returned instead.
2999  *
3000  * "a8".upto("b6") {|s| print s, ' ' }
3001  * for s in "a8".."b6"
3002  * print s, ' '
3003  * end
3004  *
3005  * <em>produces:</em>
3006  *
3007  * a8 a9 b0 b1 b2 b3 b4 b5 b6
3008  * a8 a9 b0 b1 b2 b3 b4 b5 b6
3009  *
3010  * If <i>str</i> and <i>other_str</i> contains only ascii numeric characters,
3011  * both are recognized as decimal numbers. In addition, the width of
3012  * string (e.g. leading zeros) is handled appropriately.
3013  *
3014  * "9".upto("11").to_a #=> ["9", "10", "11"]
3015  * "25".upto("5").to_a #=> []
3016  * "07".upto("11").to_a #=> ["07", "08", "09", "10", "11"]
3017  */
3018 
3019 static VALUE
3021 {
3022  VALUE end, exclusive;
3023  VALUE current, after_end;
3024  ID succ;
3025  int n, excl, ascii;
3026  rb_encoding *enc;
3027 
3028  rb_scan_args(argc, argv, "11", &end, &exclusive);
3029  RETURN_ENUMERATOR(beg, argc, argv);
3030  excl = RTEST(exclusive);
3031  CONST_ID(succ, "succ");
3032  StringValue(end);
3033  enc = rb_enc_check(beg, end);
3034  ascii = (is_ascii_string(beg) && is_ascii_string(end));
3035  /* single character */
3036  if (RSTRING_LEN(beg) == 1 && RSTRING_LEN(end) == 1 && ascii) {
3037  char c = RSTRING_PTR(beg)[0];
3038  char e = RSTRING_PTR(end)[0];
3039 
3040  if (c > e || (excl && c == e)) return beg;
3041  for (;;) {
3042  rb_yield(rb_enc_str_new(&c, 1, enc));
3043  if (!excl && c == e) break;
3044  c++;
3045  if (excl && c == e) break;
3046  }
3047  return beg;
3048  }
3049  /* both edges are all digits */
3050  if (ascii && ISDIGIT(RSTRING_PTR(beg)[0]) && ISDIGIT(RSTRING_PTR(end)[0])) {
3051  char *s, *send;
3052  VALUE b, e;
3053  int width;
3054 
3055  s = RSTRING_PTR(beg); send = RSTRING_END(beg);
3056  width = rb_long2int(send - s);
3057  while (s < send) {
3058  if (!ISDIGIT(*s)) goto no_digits;
3059  s++;
3060  }
3061  s = RSTRING_PTR(end); send = RSTRING_END(end);
3062  while (s < send) {
3063  if (!ISDIGIT(*s)) goto no_digits;
3064  s++;
3065  }
3066  b = rb_str_to_inum(beg, 10, FALSE);
3067  e = rb_str_to_inum(end, 10, FALSE);
3068  if (FIXNUM_P(b) && FIXNUM_P(e)) {
3069  long bi = FIX2LONG(b);
3070  long ei = FIX2LONG(e);
3071  rb_encoding *usascii = rb_usascii_encoding();
3072 
3073  while (bi <= ei) {
3074  if (excl && bi == ei) break;
3075  rb_yield(rb_enc_sprintf(usascii, "%.*ld", width, bi));
3076  bi++;
3077  }
3078  }
3079  else {
3080  ID op = excl ? '<' : rb_intern("<=");
3081  VALUE args[2], fmt = rb_obj_freeze(rb_usascii_str_new_cstr("%.*d"));
3082 
3083  args[0] = INT2FIX(width);
3084  while (rb_funcall(b, op, 1, e)) {
3085  args[1] = b;
3086  rb_yield(rb_str_format(numberof(args), args, fmt));
3087  b = rb_funcall(b, succ, 0, 0);
3088  }
3089  }
3090  return beg;
3091  }
3092  /* normal case */
3093  no_digits:
3094  n = rb_str_cmp(beg, end);
3095  if (n > 0 || (excl && n == 0)) return beg;
3096 
3097  after_end = rb_funcall(end, succ, 0, 0);
3098  current = rb_str_dup(beg);
3099  while (!rb_str_equal(current, after_end)) {
3100  VALUE next = Qnil;
3101  if (excl || !rb_str_equal(current, end))
3102  next = rb_funcall(current, succ, 0, 0);
3103  rb_yield(current);
3104  if (NIL_P(next)) break;
3105  current = next;
3106  StringValue(current);
3107  if (excl && rb_str_equal(current, end)) break;
3108  if (RSTRING_LEN(current) > RSTRING_LEN(end) || RSTRING_LEN(current) == 0)
3109  break;
3110  }
3111 
3112  return beg;
3113 }
3114 
3115 static VALUE
3116 rb_str_subpat(VALUE str, VALUE re, VALUE backref)
3117 {
3118  if (rb_reg_search(re, str, 0, 0) >= 0) {
3120  int nth = rb_reg_backref_number(match, backref);
3121  return rb_reg_nth_match(nth, match);
3122  }
3123  return Qnil;
3124 }
3125 
3126 static VALUE
3128 {
3129  long idx;
3130 
3131  switch (TYPE(indx)) {
3132  case T_FIXNUM:
3133  idx = FIX2LONG(indx);
3134 
3135  num_index:
3136  str = rb_str_substr(str, idx, 1);
3137  if (!NIL_P(str) && RSTRING_LEN(str) == 0) return Qnil;
3138  return str;
3139 
3140  case T_REGEXP:
3141  return rb_str_subpat(str, indx, INT2FIX(0));
3142 
3143  case T_STRING:
3144  if (rb_str_index(str, indx, 0) != -1)
3145  return rb_str_dup(indx);
3146  return Qnil;
3147 
3148  default:
3149  /* check if indx is Range */
3150  {
3151  long beg, len;
3152  VALUE tmp;
3153 
3154  len = str_strlen(str, STR_ENC_GET(str));
3155  switch (rb_range_beg_len(indx, &beg, &len, len, 0)) {
3156  case Qfalse:
3157  break;
3158  case Qnil:
3159  return Qnil;
3160  default:
3161  tmp = rb_str_substr(str, beg, len);
3162  return tmp;
3163  }
3164  }
3165  idx = NUM2LONG(indx);
3166  goto num_index;
3167  }
3168  return Qnil; /* not reached */
3169 }
3170 
3171 
3172 /*
3173  * call-seq:
3174  * str[fixnum] -> new_str or nil
3175  * str[fixnum, fixnum] -> new_str or nil
3176  * str[range] -> new_str or nil
3177  * str[regexp] -> new_str or nil
3178  * str[regexp, fixnum] -> new_str or nil
3179  * str[other_str] -> new_str or nil
3180  * str.slice(fixnum) -> new_str or nil
3181  * str.slice(fixnum, fixnum) -> new_str or nil
3182  * str.slice(range) -> new_str or nil
3183  * str.slice(regexp) -> new_str or nil
3184  * str.slice(regexp, fixnum) -> new_str or nil
3185  * str.slice(regexp, capname) -> new_str or nil
3186  * str.slice(other_str) -> new_str or nil
3187  *
3188  * Element Reference---If passed a single <code>Fixnum</code>, returns a
3189  * substring of one character at that position. If passed two <code>Fixnum</code>
3190  * objects, returns a substring starting at the offset given by the first, and
3191  * with a length given by the second. If passed a range, its beginning and end
3192  * are interpreted as offsets delimiting the substring to be returned. In all
3193  * three cases, if an offset is negative, it is counted from the end of <i>str</i>.
3194  * Returns <code>nil</code> if the initial offset falls outside the string or
3195  * the length is negative.
3196  *
3197  * If a <code>Regexp</code> is supplied, the matching portion of <i>str</i> is
3198  * returned. If a numeric or name parameter follows the regular expression, that
3199  * component of the <code>MatchData</code> is returned instead. If a
3200  * <code>String</code> is given, that string is returned if it occurs in
3201  * <i>str</i>. In both cases, <code>nil</code> is returned if there is no
3202  * match.
3203  *
3204  * a = "hello there"
3205  * a[1] #=> "e"
3206  * a[2, 3] #=> "llo"
3207  * a[2..3] #=> "ll"
3208  * a[-3, 2] #=> "er"
3209  * a[7..-2] #=> "her"
3210  * a[-4..-2] #=> "her"
3211  * a[-2..-4] #=> ""
3212  * a[12..-1] #=> nil
3213  * a[/[aeiou](.)\1/] #=> "ell"
3214  * a[/[aeiou](.)\1/, 0] #=> "ell"
3215  * a[/[aeiou](.)\1/, 1] #=> "l"
3216  * a[/[aeiou](.)\1/, 2] #=> nil
3217  * a["lo"] #=> "lo"
3218  * a["bye"] #=> nil
3219  */
3220 
3221 static VALUE
3223 {
3224  if (argc == 2) {
3225  if (TYPE(argv[0]) == T_REGEXP) {
3226  return rb_str_subpat(str, argv[0], argv[1]);
3227  }
3228  return rb_str_substr(str, NUM2LONG(argv[0]), NUM2LONG(argv[1]));
3229  }
3230  if (argc != 1) {
3231  rb_raise(rb_eArgError, "wrong number of arguments (%d for 1..2)", argc);
3232  }
3233  return rb_str_aref(str, argv[0]);
3234 }
3235 
3236 VALUE
3237 rb_str_drop_bytes(VALUE str, long len)
3238 {
3239  char *ptr = RSTRING_PTR(str);
3240  long olen = RSTRING_LEN(str), nlen;
3241 
3242  str_modifiable(str);
3243  if (len > olen) len = olen;
3244  nlen = olen - len;
3245  if (nlen <= RSTRING_EMBED_LEN_MAX) {
3246  char *oldptr = ptr;
3247  int fl = (int)(RBASIC(str)->flags & (STR_NOEMBED|ELTS_SHARED));
3248  STR_SET_EMBED(str);
3249  STR_SET_EMBED_LEN(str, nlen);
3250  ptr = RSTRING(str)->as.ary;
3251  memmove(ptr, oldptr + len, nlen);
3252  if (fl == STR_NOEMBED) xfree(oldptr);
3253  }
3254  else {
3255  if (!STR_SHARED_P(str)) rb_str_new4(str);
3256  ptr = RSTRING(str)->as.heap.ptr += len;
3257  RSTRING(str)->as.heap.len = nlen;
3258  }
3259  ptr[nlen] = 0;
3260  ENC_CODERANGE_CLEAR(str);
3261  return str;
3262 }
3263 
3264 static void
3265 rb_str_splice_0(VALUE str, long beg, long len, VALUE val)
3266 {
3267  if (beg == 0 && RSTRING_LEN(val) == 0) {
3268  rb_str_drop_bytes(str, len);
3269  OBJ_INFECT(str, val);
3270  return;
3271  }
3272 
3273  rb_str_modify(str);
3274  if (len < RSTRING_LEN(val)) {
3275  /* expand string */
3276  RESIZE_CAPA(str, RSTRING_LEN(str) + RSTRING_LEN(val) - len + 1);
3277  }
3278 
3279  if (RSTRING_LEN(val) != len) {
3280  memmove(RSTRING_PTR(str) + beg + RSTRING_LEN(val),
3281  RSTRING_PTR(str) + beg + len,
3282  RSTRING_LEN(str) - (beg + len));
3283  }
3284  if (RSTRING_LEN(val) < beg && len < 0) {
3285  MEMZERO(RSTRING_PTR(str) + RSTRING_LEN(str), char, -len);
3286  }
3287  if (RSTRING_LEN(val) > 0) {
3288  memmove(RSTRING_PTR(str)+beg, RSTRING_PTR(val), RSTRING_LEN(val));
3289  }
3290  STR_SET_LEN(str, RSTRING_LEN(str) + RSTRING_LEN(val) - len);
3291  if (RSTRING_PTR(str)) {
3292  RSTRING_PTR(str)[RSTRING_LEN(str)] = '\0';
3293  }
3294  OBJ_INFECT(str, val);
3295 }
3296 
3297 static void
3298 rb_str_splice(VALUE str, long beg, long len, VALUE val)
3299 {
3300  long slen;
3301  char *p, *e;
3302  rb_encoding *enc;
3303  int singlebyte = single_byte_optimizable(str);
3304  int cr;
3305 
3306  if (len < 0) rb_raise(rb_eIndexError, "negative length %ld", len);
3307 
3308  StringValue(val);
3309  enc = rb_enc_check(str, val);
3310  slen = str_strlen(str, enc);
3311 
3312  if (slen < beg) {
3313  out_of_range:
3314  rb_raise(rb_eIndexError, "index %ld out of string", beg);
3315  }
3316  if (beg < 0) {
3317  if (-beg > slen) {
3318  goto out_of_range;
3319  }
3320  beg += slen;
3321  }
3322  if (slen < len || slen < beg + len) {
3323  len = slen - beg;
3324  }
3325  str_modify_keep_cr(str);
3326  p = str_nth(RSTRING_PTR(str), RSTRING_END(str), beg, enc, singlebyte);
3327  if (!p) p = RSTRING_END(str);
3328  e = str_nth(p, RSTRING_END(str), len, enc, singlebyte);
3329  if (!e) e = RSTRING_END(str);
3330  /* error check */
3331  beg = p - RSTRING_PTR(str); /* physical position */
3332  len = e - p; /* physical length */
3333  rb_str_splice_0(str, beg, len, val);
3334  rb_enc_associate(str, enc);
3336  if (cr != ENC_CODERANGE_BROKEN)
3337  ENC_CODERANGE_SET(str, cr);
3338 }
3339 
3340 void
3341 rb_str_update(VALUE str, long beg, long len, VALUE val)
3342 {
3343  rb_str_splice(str, beg, len, val);
3344 }
3345 
3346 static void
3347 rb_str_subpat_set(VALUE str, VALUE re, VALUE backref, VALUE val)
3348 {
3349  int nth;
3350  VALUE match;
3351  long start, end, len;
3352  rb_encoding *enc;
3353  struct re_registers *regs;
3354 
3355  if (rb_reg_search(re, str, 0, 0) < 0) {
3356  rb_raise(rb_eIndexError, "regexp not matched");
3357  }
3358  match = rb_backref_get();
3359  nth = rb_reg_backref_number(match, backref);
3360  regs = RMATCH_REGS(match);
3361  if (nth >= regs->num_regs) {
3362  out_of_range:
3363  rb_raise(rb_eIndexError, "index %d out of regexp", nth);
3364  }
3365  if (nth < 0) {
3366  if (-nth >= regs->num_regs) {
3367  goto out_of_range;
3368  }
3369  nth += regs->num_regs;
3370  }
3371 
3372  start = BEG(nth);
3373  if (start == -1) {
3374  rb_raise(rb_eIndexError, "regexp group %d not matched", nth);
3375  }
3376  end = END(nth);
3377  len = end - start;
3378  StringValue(val);
3379  enc = rb_enc_check(str, val);
3380  rb_str_splice_0(str, start, len, val);
3381  rb_enc_associate(str, enc);
3382 }
3383 
3384 static VALUE
3385 rb_str_aset(VALUE str, VALUE indx, VALUE val)
3386 {
3387  long idx, beg;
3388 
3389  switch (TYPE(indx)) {
3390  case T_FIXNUM:
3391  idx = FIX2LONG(indx);
3392  num_index:
3393  rb_str_splice(str, idx, 1, val);
3394  return val;
3395 
3396  case T_REGEXP:
3397  rb_str_subpat_set(str, indx, INT2FIX(0), val);
3398  return val;
3399 
3400  case T_STRING:
3401  beg = rb_str_index(str, indx, 0);
3402  if (beg < 0) {
3403  rb_raise(rb_eIndexError, "string not matched");
3404  }
3405  beg = rb_str_sublen(str, beg);
3406  rb_str_splice(str, beg, str_strlen(indx, 0), val);
3407  return val;
3408 
3409  default:
3410  /* check if indx is Range */
3411  {
3412  long beg, len;
3413  if (rb_range_beg_len(indx, &beg, &len, str_strlen(str, 0), 2)) {
3414  rb_str_splice(str, beg, len, val);
3415  return val;
3416  }
3417  }
3418  idx = NUM2LONG(indx);
3419  goto num_index;
3420  }
3421 }
3422 
3423 /*
3424  * call-seq:
3425  * str[fixnum] = new_str
3426  * str[fixnum, fixnum] = new_str
3427  * str[range] = new_str
3428  * str[regexp] = new_str
3429  * str[regexp, fixnum] = new_str
3430  * str[regexp, name] = new_str
3431  * str[other_str] = new_str
3432  *
3433  * Element Assignment---Replaces some or all of the content of <i>str</i>. The
3434  * portion of the string affected is determined using the same criteria as
3435  * <code>String#[]</code>. If the replacement string is not the same length as
3436  * the text it is replacing, the string will be adjusted accordingly. If the
3437  * regular expression or string is used as the index doesn't match a position
3438  * in the string, <code>IndexError</code> is raised. If the regular expression
3439  * form is used, the optional second <code>Fixnum</code> allows you to specify
3440  * which portion of the match to replace (effectively using the
3441  * <code>MatchData</code> indexing rules). The forms that take a
3442  * <code>Fixnum</code> will raise an <code>IndexError</code> if the value is
3443  * out of range; the <code>Range</code> form will raise a
3444  * <code>RangeError</code>, and the <code>Regexp</code> and <code>String</code>
3445  * forms will raise an <code>IndexError</code> when there is no match.
3446  */
3447 
3448 static VALUE
3450 {
3451  if (argc == 3) {
3452  if (TYPE(argv[0]) == T_REGEXP) {
3453  rb_str_subpat_set(str, argv[0], argv[1], argv[2]);
3454  }
3455  else {
3456  rb_str_splice(str, NUM2LONG(argv[0]), NUM2LONG(argv[1]), argv[2]);
3457  }
3458  return argv[2];
3459  }
3460  if (argc != 2) {
3461  rb_raise(rb_eArgError, "wrong number of arguments (%d for 2..3)", argc);
3462  }
3463  return rb_str_aset(str, argv[0], argv[1]);
3464 }
3465 
3466 /*
3467  * call-seq:
3468  * str.insert(index, other_str) -> str
3469  *
3470  * Inserts <i>other_str</i> before the character at the given
3471  * <i>index</i>, modifying <i>str</i>. Negative indices count from the
3472  * end of the string, and insert <em>after</em> the given character.
3473  * The intent is to insert <i>other_str</i> so that it starts at the given
3474  * <i>index</i>.
3475  *
3476  * "abcd".insert(0, 'X') #=> "Xabcd"
3477  * "abcd".insert(3, 'X') #=> "abcXd"
3478  * "abcd".insert(4, 'X') #=> "abcdX"
3479  * "abcd".insert(-3, 'X') #=> "abXcd"
3480  * "abcd".insert(-1, 'X') #=> "abcdX"
3481  */
3482 
3483 static VALUE
3485 {
3486  long pos = NUM2LONG(idx);
3487 
3488  if (pos == -1) {
3489  return rb_str_append(str, str2);
3490  }
3491  else if (pos < 0) {
3492  pos++;
3493  }
3494  rb_str_splice(str, pos, 0, str2);
3495  return str;
3496 }
3497 
3498 
3499 /*
3500  * call-seq:
3501  * str.slice!(fixnum) -> new_str or nil
3502  * str.slice!(fixnum, fixnum) -> new_str or nil
3503  * str.slice!(range) -> new_str or nil
3504  * str.slice!(regexp) -> new_str or nil
3505  * str.slice!(other_str) -> new_str or nil
3506  *
3507  * Deletes the specified portion from <i>str</i>, and returns the portion
3508  * deleted.
3509  *
3510  * string = "this is a string"
3511  * string.slice!(2) #=> "i"
3512  * string.slice!(3..6) #=> " is "
3513  * string.slice!(/s.*t/) #=> "sa st"
3514  * string.slice!("r") #=> "r"
3515  * string #=> "thing"
3516  */
3517 
3518 static VALUE
3520 {
3521  VALUE result;
3522  VALUE buf[3];
3523  int i;
3524 
3525  if (argc < 1 || 2 < argc) {
3526  rb_raise(rb_eArgError, "wrong number of arguments (%d for 1..2)", argc);
3527  }
3528  for (i=0; i<argc; i++) {
3529  buf[i] = argv[i];
3530  }
3531  str_modify_keep_cr(str);
3532  result = rb_str_aref_m(argc, buf, str);
3533  if (!NIL_P(result)) {
3534  buf[i] = rb_str_new(0,0);
3535  rb_str_aset_m(argc+1, buf, str);
3536  }
3537  return result;
3538 }
3539 
3540 static VALUE
3541 get_pat(VALUE pat, int quote)
3542 {
3543  VALUE val;
3544 
3545  switch (TYPE(pat)) {
3546  case T_REGEXP:
3547  return pat;
3548 
3549  case T_STRING:
3550  break;
3551 
3552  default:
3553  val = rb_check_string_type(pat);
3554  if (NIL_P(val)) {
3555  Check_Type(pat, T_REGEXP);
3556  }
3557  pat = val;
3558  }
3559 
3560  if (quote) {
3561  pat = rb_reg_quote(pat);
3562  }
3563 
3564  return rb_reg_regcomp(pat);
3565 }
3566 
3567 
3568 /*
3569  * call-seq:
3570  * str.sub!(pattern, replacement) -> str or nil
3571  * str.sub!(pattern) {|match| block } -> str or nil
3572  *
3573  * Performs the substitutions of <code>String#sub</code> in place,
3574  * returning <i>str</i>, or <code>nil</code> if no substitutions were
3575  * performed.
3576  */
3577 
3578 static VALUE
3580 {
3581  VALUE pat, repl, hash = Qnil;
3582  int iter = 0;
3583  int tainted = 0;
3584  int untrusted = 0;
3585  long plen;
3586 
3587  if (argc == 1 && rb_block_given_p()) {
3588  iter = 1;
3589  }
3590  else if (argc == 2) {
3591  repl = argv[1];
3592  hash = rb_check_convert_type(argv[1], T_HASH, "Hash", "to_hash");
3593  if (NIL_P(hash)) {
3594  StringValue(repl);
3595  }
3596  if (OBJ_TAINTED(repl)) tainted = 1;
3597  if (OBJ_UNTRUSTED(repl)) untrusted = 1;
3598  }
3599  else {
3600  rb_raise(rb_eArgError, "wrong number of arguments (%d for 1..2)", argc);
3601  }
3602 
3603  pat = get_pat(argv[0], 1);
3604  str_modifiable(str);
3605  if (rb_reg_search(pat, str, 0, 0) >= 0) {
3606  rb_encoding *enc;
3607  int cr = ENC_CODERANGE(str);
3609  struct re_registers *regs = RMATCH_REGS(match);
3610  long beg0 = BEG(0);
3611  long end0 = END(0);
3612  char *p, *rp;
3613  long len, rlen;
3614 
3615  if (iter || !NIL_P(hash)) {
3616  p = RSTRING_PTR(str); len = RSTRING_LEN(str);
3617 
3618  if (iter) {
3619  repl = rb_obj_as_string(rb_yield(rb_reg_nth_match(0, match)));
3620  }
3621  else {
3622  repl = rb_hash_aref(hash, rb_str_subseq(str, beg0, end0 - beg0));
3623  repl = rb_obj_as_string(repl);
3624  }
3625  str_mod_check(str, p, len);
3626  rb_check_frozen(str);
3627  }
3628  else {
3629  repl = rb_reg_regsub(repl, str, regs, pat);
3630  }
3631  enc = rb_enc_compatible(str, repl);
3632  if (!enc) {
3633  rb_encoding *str_enc = STR_ENC_GET(str);
3634  p = RSTRING_PTR(str); len = RSTRING_LEN(str);
3635  if (coderange_scan(p, beg0, str_enc) != ENC_CODERANGE_7BIT ||
3636  coderange_scan(p+end0, len-end0, str_enc) != ENC_CODERANGE_7BIT) {
3637  rb_raise(rb_eEncCompatError, "incompatible character encodings: %s and %s",
3638  rb_enc_name(str_enc),
3639  rb_enc_name(STR_ENC_GET(repl)));
3640  }
3641  enc = STR_ENC_GET(repl);
3642  }
3643  rb_str_modify(str);
3644  rb_enc_associate(str, enc);
3645  if (OBJ_TAINTED(repl)) tainted = 1;
3646  if (OBJ_UNTRUSTED(repl)) untrusted = 1;
3647  if (ENC_CODERANGE_UNKNOWN < cr && cr < ENC_CODERANGE_BROKEN) {
3648  int cr2 = ENC_CODERANGE(repl);
3649  if (cr2 == ENC_CODERANGE_BROKEN ||
3650  (cr == ENC_CODERANGE_VALID && cr2 == ENC_CODERANGE_7BIT))
3651  cr = ENC_CODERANGE_UNKNOWN;
3652  else
3653  cr = cr2;
3654  }
3655  plen = end0 - beg0;
3656  rp = RSTRING_PTR(repl); rlen = RSTRING_LEN(repl);
3657  len = RSTRING_LEN(str);
3658  if (rlen > plen) {
3659  RESIZE_CAPA(str, len + rlen - plen);
3660  }
3661  p = RSTRING_PTR(str);
3662  if (rlen != plen) {
3663  memmove(p + beg0 + rlen, p + beg0 + plen, len - beg0 - plen);
3664  }
3665  memcpy(p + beg0, rp, rlen);
3666  len += rlen - plen;
3667  STR_SET_LEN(str, len);
3668  RSTRING_PTR(str)[len] = '\0';
3669  ENC_CODERANGE_SET(str, cr);
3670  if (tainted) OBJ_TAINT(str);
3671  if (untrusted) OBJ_UNTRUST(str);
3672 
3673  return str;
3674  }
3675  return Qnil;
3676 }
3677 
3678 
3679 /*
3680  * call-seq:
3681  * str.sub(pattern, replacement) -> new_str
3682  * str.sub(pattern, hash) -> new_str
3683  * str.sub(pattern) {|match| block } -> new_str
3684  *
3685  * Returns a copy of <i>str</i> with the <em>first</em> occurrence of
3686  * <i>pattern</i> substituted for the second argument. The <i>pattern</i> is
3687  * typically a <code>Regexp</code>; if given as a <code>String</code>, any
3688  * regular expression metacharacters it contains will be interpreted
3689  * literally, e.g. <code>'\\\d'</code> will match a backslash followed by 'd',
3690  * instead of a digit.
3691  *
3692  * If <i>replacement</i> is a <code>String</code> it will be substituted for
3693  * the matched text. It may contain back-references to the pattern's capture
3694  * groups of the form <code>\\\d</code>, where <i>d</i> is a group number, or
3695  * <code>\\\k<n></code>, where <i>n</i> is a group name. If it is a
3696  * double-quoted string, both back-references must be preceded by an
3697  * additional backslash. However, within <i>replacement</i> the special match
3698  * variables, such as <code>$&</code>, will not refer to the current match.
3699  *
3700  * If the second argument is a <code>Hash</code>, and the matched text is one
3701  * of its keys, the corresponding value is the replacement string.
3702  *
3703  * In the block form, the current match string is passed in as a parameter,
3704  * and variables such as <code>$1</code>, <code>$2</code>, <code>$`</code>,
3705  * <code>$&</code>, and <code>$'</code> will be set appropriately. The value
3706  * returned by the block will be substituted for the match on each call.
3707  *
3708  * The result inherits any tainting in the original string or any supplied
3709  * replacement string.
3710  *
3711  * "hello".sub(/[aeiou]/, '*') #=> "h*llo"
3712  * "hello".sub(/([aeiou])/, '<\1>') #=> "h<e>llo"
3713  * "hello".sub(/./) {|s| s.ord.to_s + ' ' } #=> "104 ello"
3714  * "hello".sub(/(?<foo>[aeiou])/, '*\k<foo>*') #=> "h*e*llo"
3715  * 'Is SHELL your preferred shell?'.sub(/[[:upper:]]{2,}/, ENV)
3716  * #=> "Is /bin/bash your preferred shell?"
3717  */
3718 
3719 static VALUE
3721 {
3722  str = rb_str_dup(str);
3723  rb_str_sub_bang(argc, argv, str);
3724  return str;
3725 }
3726 
3727 static VALUE
3728 str_gsub(int argc, VALUE *argv, VALUE str, int bang)
3729 {
3730  VALUE pat, val, repl, match, dest, hash = Qnil;
3731  struct re_registers *regs;
3732  long beg, n;
3733  long beg0, end0;
3734  long offset, blen, slen, len, last;
3735  int iter = 0;
3736  char *sp, *cp;
3737  int tainted = 0;
3738  rb_encoding *str_enc;
3739 
3740  switch (argc) {
3741  case 1:
3742  RETURN_ENUMERATOR(str, argc, argv);
3743  iter = 1;
3744  break;
3745  case 2:
3746  repl = argv[1];
3747  hash = rb_check_convert_type(argv[1], T_HASH, "Hash", "to_hash");
3748  if (NIL_P(hash)) {
3749  StringValue(repl);
3750  }
3751  if (OBJ_TAINTED(repl)) tainted = 1;
3752  break;
3753  default:
3754  rb_raise(rb_eArgError, "wrong number of arguments (%d for 1..2)", argc);
3755  }
3756 
3757  pat = get_pat(argv[0], 1);
3758  beg = rb_reg_search(pat, str, 0, 0);
3759  if (beg < 0) {
3760  if (bang) return Qnil; /* no match, no substitution */
3761  return rb_str_dup(str);
3762  }
3763 
3764  offset = 0;
3765  n = 0;
3766  blen = RSTRING_LEN(str) + 30; /* len + margin */
3767  dest = rb_str_buf_new(blen);
3768  sp = RSTRING_PTR(str);
3769  slen = RSTRING_LEN(str);
3770  cp = sp;
3771  str_enc = STR_ENC_GET(str);
3772  rb_enc_associate(dest, str_enc);
3774 
3775  do {
3776  n++;
3777  match = rb_backref_get();
3778  regs = RMATCH_REGS(match);
3779  beg0 = BEG(0);
3780  end0 = END(0);
3781  if (iter || !NIL_P(hash)) {
3782  if (iter) {
3783  val = rb_obj_as_string(rb_yield(rb_reg_nth_match(0, match)));
3784  }
3785  else {
3786  val = rb_hash_aref(hash, rb_str_subseq(str, BEG(0), END(0) - BEG(0)));
3787  val = rb_obj_as_string(val);
3788  }
3789  str_mod_check(str, sp, slen);
3790  if (val == dest) { /* paranoid check [ruby-dev:24827] */
3791  rb_raise(rb_eRuntimeError, "block should not cheat");
3792  }
3793  }
3794  else {
3795  val = rb_reg_regsub(repl, str, regs, pat);
3796  }
3797 
3798  if (OBJ_TAINTED(val)) tainted = 1;
3799 
3800  len = beg - offset; /* copy pre-match substr */
3801  if (len) {
3802  rb_enc_str_buf_cat(dest, cp, len, str_enc);
3803  }
3804 
3805  rb_str_buf_append(dest, val);
3806 
3807  last = offset;
3808  offset = end0;
3809  if (beg0 == end0) {
3810  /*
3811  * Always consume at least one character of the input string
3812  * in order to prevent infinite loops.
3813  */
3814  if (RSTRING_LEN(str) <= end0) break;
3815  len = rb_enc_fast_mbclen(RSTRING_PTR(str)+end0, RSTRING_END(str), str_enc);
3816  rb_enc_str_buf_cat(dest, RSTRING_PTR(str)+end0, len, str_enc);
3817  offset = end0 + len;
3818  }
3819  cp = RSTRING_PTR(str) + offset;
3820  if (offset > RSTRING_LEN(str)) break;
3821  beg = rb_reg_search(pat, str, offset, 0);
3822  } while (beg >= 0);
3823  if (RSTRING_LEN(str) > offset) {
3824  rb_enc_str_buf_cat(dest, cp, RSTRING_LEN(str) - offset, str_enc);
3825  }
3826  rb_reg_search(pat, str, last, 0);
3827  if (bang) {
3828  rb_str_shared_replace(str, dest);
3829  }
3830  else {
3831  RBASIC(dest)->klass = rb_obj_class(str);
3832  OBJ_INFECT(dest, str);
3833  str = dest;
3834  }
3835 
3836  if (tainted) OBJ_TAINT(str);
3837  return str;
3838 }
3839 
3840 
3841 /*
3842  * call-seq:
3843  * str.gsub!(pattern, replacement) -> str or nil
3844  * str.gsub!(pattern) {|match| block } -> str or nil
3845  * str.gsub!(pattern) -> an_enumerator
3846  *
3847  * Performs the substitutions of <code>String#gsub</code> in place, returning
3848  * <i>str</i>, or <code>nil</code> if no substitutions were performed.
3849  * If no block and no <i>replacement</i> is given, an enumerator is returned instead.
3850  */
3851 
3852 static VALUE
3854 {
3855  str_modify_keep_cr(str);
3856  return str_gsub(argc, argv, str, 1);
3857 }
3858 
3859 
3860 /*
3861  * call-seq:
3862  * str.gsub(pattern, replacement) -> new_str
3863  * str.gsub(pattern, hash) -> new_str
3864  * str.gsub(pattern) {|match| block } -> new_str
3865  * str.gsub(pattern) -> enumerator
3866  *
3867  * Returns a copy of <i>str</i> with <em>all</em> occurrences of
3868  * <i>pattern</i> substituted for the second argument. The <i>pattern</i> is
3869  * typically a <code>Regexp</code>; if given as a <code>String</code>, any
3870  * regular expression metacharacters it contains will be interpreted
3871  * literally, e.g. <code>'\\\d'</code> will match a backslash followed by 'd',
3872  * instead of a digit.
3873  *
3874  * If <i>replacement</i> is a <code>String</code> it will be substituted for
3875  * the matched text. It may contain back-references to the pattern's capture
3876  * groups of the form <code>\\\d</code>, where <i>d</i> is a group number, or
3877  * <code>\\\k<n></code>, where <i>n</i> is a group name. If it is a
3878  * double-quoted string, both back-references must be preceded by an
3879  * additional backslash. However, within <i>replacement</i> the special match
3880  * variables, such as <code>$&</code>, will not refer to the current match.
3881  *
3882  * If the second argument is a <code>Hash</code>, and the matched text is one
3883  * of its keys, the corresponding value is the replacement string.
3884  *
3885  * In the block form, the current match string is passed in as a parameter,
3886  * and variables such as <code>$1</code>, <code>$2</code>, <code>$`</code>,
3887  * <code>$&</code>, and <code>$'</code> will be set appropriately. The value
3888  * returned by the block will be substituted for the match on each call.
3889  *
3890  * The result inherits any tainting in the original string or any supplied
3891  * replacement string.
3892  *
3893  * When neither a block nor a second argument is supplied, an
3894  * <code>Enumerator</code> is returned.
3895  *
3896  * "hello".gsub(/[aeiou]/, '*') #=> "h*ll*"
3897  * "hello".gsub(/([aeiou])/, '<\1>') #=> "h<e>ll<o>"
3898  * "hello".gsub(/./) {|s| s.ord.to_s + ' '} #=> "104 101 108 108 111 "
3899  * "hello".gsub(/(?<foo>[aeiou])/, '{\k<foo>}') #=> "h{e}ll{o}"
3900  * 'hello'.gsub(/[eo]/, 'e' => 3, 'o' => '*') #=> "h3ll*"
3901  */
3902 
3903 static VALUE
3905 {
3906  return str_gsub(argc, argv, str, 0);
3907 }
3908 
3909 
3910 /*
3911  * call-seq:
3912  * str.replace(other_str) -> str
3913  *
3914  * Replaces the contents and taintedness of <i>str</i> with the corresponding
3915  * values in <i>other_str</i>.
3916  *
3917  * s = "hello" #=> "hello"
3918  * s.replace "world" #=> "world"
3919  */
3920 
3921 VALUE
3923 {
3924  str_modifiable(str);
3925  if (str == str2) return str;
3926 
3927  StringValue(str2);
3928  str_discard(str);
3929  return str_replace(str, str2);
3930 }
3931 
3932 /*
3933  * call-seq:
3934  * string.clear -> string
3935  *
3936  * Makes string empty.
3937  *
3938  * a = "abcde"
3939  * a.clear #=> ""
3940  */
3941 
3942 static VALUE
3944 {
3945  str_discard(str);
3946  STR_SET_EMBED(str);
3947  STR_SET_EMBED_LEN(str, 0);
3948  RSTRING_PTR(str)[0] = 0;
3949  if (rb_enc_asciicompat(STR_ENC_GET(str)))
3951  else
3953  return str;
3954 }
3955 
3956 /*
3957  * call-seq:
3958  * string.chr -> string
3959  *
3960  * Returns a one-character string at the beginning of the string.
3961  *
3962  * a = "abcde"
3963  * a.chr #=> "a"
3964  */
3965 
3966 static VALUE
3968 {
3969  return rb_str_substr(str, 0, 1);
3970 }
3971 
3972 /*
3973  * call-seq:
3974  * str.getbyte(index) -> 0 .. 255
3975  *
3976  * returns the <i>index</i>th byte as an integer.
3977  */
3978 static VALUE
3980 {
3981  long pos = NUM2LONG(index);
3982 
3983  if (pos < 0)
3984  pos += RSTRING_LEN(str);
3985  if (pos < 0 || RSTRING_LEN(str) <= pos)
3986  return Qnil;
3987 
3988  return INT2FIX((unsigned char)RSTRING_PTR(str)[pos]);
3989 }
3990 
3991 /*
3992  * call-seq:
3993  * str.setbyte(index, int) -> int
3994  *
3995  * modifies the <i>index</i>th byte as <i>int</i>.
3996  */
3997 static VALUE
3998 rb_str_setbyte(VALUE str, VALUE index, VALUE value)
3999 {
4000  long pos = NUM2LONG(index);
4001  int byte = NUM2INT(value);
4002 
4003  rb_str_modify(str);
4004 
4005  if (pos < -RSTRING_LEN(str) || RSTRING_LEN(str) <= pos)
4006  rb_raise(rb_eIndexError, "index %ld out of string", pos);
4007  if (pos < 0)
4008  pos += RSTRING_LEN(str);
4009 
4010  RSTRING_PTR(str)[pos] = byte;
4011 
4012  return value;
4013 }
4014 
4015 static VALUE
4016 str_byte_substr(VALUE str, long beg, long len)
4017 {
4018  char *p, *s = RSTRING_PTR(str);
4019  long n = RSTRING_LEN(str);
4020  VALUE str2;
4021 
4022  if (beg > n || len < 0) return Qnil;
4023  if (beg < 0) {
4024  beg += n;
4025  if (beg < 0) return Qnil;
4026  }
4027  if (beg + len > n)
4028  len = n - beg;
4029  if (len <= 0) {
4030  len = 0;
4031  p = 0;
4032  }
4033  else
4034  p = s + beg;
4035 
4036  if (len > RSTRING_EMBED_LEN_MAX && beg + len == n) {
4037  str2 = rb_str_new4(str);
4038  str2 = str_new3(rb_obj_class(str2), str2);
4039  RSTRING(str2)->as.heap.ptr += RSTRING(str2)->as.heap.len - len;
4040  RSTRING(str2)->as.heap.len = len;
4041  }
4042  else {
4043  str2 = rb_str_new5(str, p, len);
4044  rb_enc_cr_str_copy_for_substr(str2, str);
4045  OBJ_INFECT(str2, str);
4046  }
4047 
4048  return str2;
4049 }
4050 
4051 static VALUE
4053 {
4054  long idx;
4055  switch (TYPE(indx)) {
4056  case T_FIXNUM:
4057  idx = FIX2LONG(indx);
4058 
4059  num_index:
4060  str = str_byte_substr(str, idx, 1);
4061  if (NIL_P(str) || RSTRING_LEN(str) == 0) return Qnil;
4062  return str;
4063 
4064  default:
4065  /* check if indx is Range */
4066  {
4067  long beg, len = RSTRING_LEN(str);
4068 
4069  switch (rb_range_beg_len(indx, &beg, &len, len, 0)) {
4070  case Qfalse:
4071  break;
4072  case Qnil:
4073  return Qnil;
4074  default:
4075  return str_byte_substr(str, beg, len);
4076  }
4077  }
4078  idx = NUM2LONG(indx);
4079  goto num_index;
4080  }
4081  return Qnil; /* not reached */
4082 }
4083 
4084 /*
4085  * call-seq:
4086  * str.byteslice(fixnum) -> new_str or nil
4087  * str.byteslice(fixnum, fixnum) -> new_str or nil
4088  * str.byteslice(range) -> new_str or nil
4089  *
4090  * Byte Reference---If passed a single <code>Fixnum</code>, returns a
4091  * substring of one byte at that position. If passed two <code>Fixnum</code>
4092  * objects, returns a substring starting at the offset given by the first, and
4093  * a length given by the second. If given a <code>Range</code>, a substring containing
4094  * bytes at offsets given by the range is returned. In all three cases, if
4095  * an offset is negative, it is counted from the end of <i>str</i>. Returns
4096  * <code>nil</code> if the initial offset falls outside the string, the length
4097  * is negative, or the beginning of the range is greater than the end.
4098  * The resulting string keeps the encoding of the original string.
4099  *
4100  * "hello".byteslice(1) #=> "e"
4101  * "hello".byteslice(-1) #=> "o"
4102  * "hello".byteslice(1, 2) #=> "el"
4103  * "\x80\u3042".byteslice(1, 3) #=> "\u3042"
4104  * "\x03\u3042\xff".byteslice(1..3) #=> "\u3042"
4105  */
4106 
4107 static VALUE
4109 {
4110  if (argc == 2) {
4111  return str_byte_substr(str, NUM2LONG(argv[0]), NUM2LONG(argv[1]));
4112  }
4113  if (argc != 1) {
4114  rb_raise(rb_eArgError, "wrong number of arguments (%d for 1..2)", argc);
4115  }
4116  return str_byte_aref(str, argv[0]);
4117 }
4118 
4119 /*
4120  * call-seq:
4121  * str.reverse -> new_str
4122  *
4123  * Returns a new string with the characters from <i>str</i> in reverse order.
4124  *
4125  * "stressed".reverse #=> "desserts"
4126  */
4127 
4128 static VALUE
4130 {
4131  rb_encoding *enc;
4132  VALUE rev;
4133  char *s, *e, *p;
4134  int single = 1;
4135 
4136  if (RSTRING_LEN(str) <= 1) return rb_str_dup(str);
4137  enc = STR_ENC_GET(str);
4138  rev = rb_str_new5(str, 0, RSTRING_LEN(str));
4139  s = RSTRING_PTR(str); e = RSTRING_END(str);
4140  p = RSTRING_END(rev);
4141 
4142  if (RSTRING_LEN(str) > 1) {
4143  if (single_byte_optimizable(str)) {
4144  while (s < e) {
4145  *--p = *s++;
4146  }
4147  }
4148  else if (ENC_CODERANGE(str) == ENC_CODERANGE_VALID) {
4149  while (s < e) {
4150  int clen = rb_enc_fast_mbclen(s, e, enc);
4151 
4152  if (clen > 1 || (*s & 0x80)) single = 0;
4153  p -= clen;
4154  memcpy(p, s, clen);
4155  s += clen;
4156  }
4157  }
4158  else {
4159  while (s < e) {
4160  int clen = rb_enc_mbclen(s, e, enc);
4161 
4162  if (clen > 1 || (*s & 0x80)) single = 0;
4163  p -= clen;
4164  memcpy(p, s, clen);
4165  s += clen;
4166  }
4167  }
4168  }
4169  STR_SET_LEN(rev, RSTRING_LEN(str));
4170  OBJ_INFECT(rev, str);
4171  if (ENC_CODERANGE(str) == ENC_CODERANGE_UNKNOWN) {
4172  if (single) {
4174  }
4175  else {
4177  }
4178  }
4180 
4181  return rev;
4182 }
4183 
4184 
4185 /*
4186  * call-seq:
4187  * str.reverse! -> str
4188  *
4189  * Reverses <i>str</i> in place.
4190  */
4191 
4192 static VALUE
4194 {
4195  if (RSTRING_LEN(str) > 1) {
4196  if (single_byte_optimizable(str)) {
4197  char *s, *e, c;
4198 
4199  str_modify_keep_cr(str);
4200  s = RSTRING_PTR(str);
4201  e = RSTRING_END(str) - 1;
4202  while (s < e) {
4203  c = *s;
4204  *s++ = *e;
4205  *e-- = c;
4206  }
4207  }
4208  else {
4210  }
4211  }
4212  else {
4213  str_modify_keep_cr(str);
4214  }
4215  return str;
4216 }
4217 
4218 
4219 /*
4220  * call-seq:
4221  * str.include? other_str -> true or false
4222  *
4223  * Returns <code>true</code> if <i>str</i> contains the given string or
4224  * character.
4225  *
4226  * "hello".include? "lo" #=> true
4227  * "hello".include? "ol" #=> false
4228  * "hello".include? ?h #=> true
4229  */
4230 
4231 static VALUE
4233 {
4234  long i;
4235 
4236  StringValue(arg);
4237  i = rb_str_index(str, arg, 0);
4238 
4239  if (i == -1) return Qfalse;
4240  return Qtrue;
4241 }
4242 
4243 
4244 /*
4245  * call-seq:
4246  * str.to_i(base=10) -> integer
4247  *
4248  * Returns the result of interpreting leading characters in <i>str</i> as an
4249  * integer base <i>base</i> (between 2 and 36). Extraneous characters past the
4250  * end of a valid number are ignored. If there is not a valid number at the
4251  * start of <i>str</i>, <code>0</code> is returned. This method never raises an
4252  * exception when <i>base</i> is valid.
4253  *
4254  * "12345".to_i #=> 12345
4255  * "99 red balloons".to_i #=> 99
4256  * "0a".to_i #=> 0
4257  * "0a".to_i(16) #=> 10
4258  * "hello".to_i #=> 0
4259  * "1100101".to_i(2) #=> 101
4260  * "1100101".to_i(8) #=> 294977
4261  * "1100101".to_i(10) #=> 1100101
4262  * "1100101".to_i(16) #=> 17826049
4263  */
4264 
4265 static VALUE
4267 {
4268  int base;
4269 
4270  if (argc == 0) base = 10;
4271  else {
4272  VALUE b;
4273 
4274  rb_scan_args(argc, argv, "01", &b);
4275  base = NUM2INT(b);
4276  }
4277  if (base < 0) {
4278  rb_raise(rb_eArgError, "invalid radix %d", base);
4279  }
4280  return rb_str_to_inum(str, base, FALSE);
4281 }
4282 
4283 
4284 /*
4285  * call-seq:
4286  * str.to_f -> float
4287  *
4288  * Returns the result of interpreting leading characters in <i>str</i> as a
4289  * floating point number. Extraneous characters past the end of a valid number
4290  * are ignored. If there is not a valid number at the start of <i>str</i>,
4291  * <code>0.0</code> is returned. This method never raises an exception.
4292  *
4293  * "123.45e1".to_f #=> 1234.5
4294  * "45.67 degrees".to_f #=> 45.67
4295  * "thx1138".to_f #=> 0.0
4296  */
4297 
4298 static VALUE
4300 {
4301  return DBL2NUM(rb_str_to_dbl(str, FALSE));
4302 }
4303 
4304 
4305 /*
4306  * call-seq:
4307  * str.to_s -> str
4308  * str.to_str -> str
4309  *
4310  * Returns the receiver.
4311  */
4312 
4313 static VALUE
4315 {
4316  if (rb_obj_class(str) != rb_cString) {
4317  return str_duplicate(rb_cString, str);
4318  }
4319  return str;
4320 }
4321 
4322 #if 0
4323 static void
4324 str_cat_char(VALUE str, unsigned int c, rb_encoding *enc)
4325 {
4326  char s[RUBY_MAX_CHAR_LEN];
4327  int n = rb_enc_codelen(c, enc);
4328 
4329  rb_enc_mbcput(c, s, enc);
4330  rb_enc_str_buf_cat(str, s, n, enc);
4331 }
4332 #endif
4333 
4334 #define CHAR_ESC_LEN 13 /* sizeof(\x{ hex of 32bit unsigned int } \0) */
4335 
4336 int
4337 rb_str_buf_cat_escaped_char(VALUE result, unsigned int c, int unicode_p)
4338 {
4339  char buf[CHAR_ESC_LEN + 1];
4340  int l;
4341 
4342 #if SIZEOF_INT > 4
4343  c &= 0xffffffff;
4344 #endif
4345  if (unicode_p) {
4346  if (c < 0x7F && ISPRINT(c)) {
4347  snprintf(buf, CHAR_ESC_LEN, "%c", c);
4348  }
4349  else if (c < 0x10000) {
4350  snprintf(buf, CHAR_ESC_LEN, "\\u%04X", c);
4351  }
4352  else {
4353  snprintf(buf, CHAR_ESC_LEN, "\\u{%X}", c);
4354  }
4355  }
4356  else {
4357  if (c < 0x100) {
4358  snprintf(buf, CHAR_ESC_LEN, "\\x%02X", c);
4359  }
4360  else {
4361  snprintf(buf, CHAR_ESC_LEN, "\\x{%X}", c);
4362  }
4363  }
4364  l = (int)strlen(buf); /* CHAR_ESC_LEN cannot exceed INT_MAX */
4365  rb_str_buf_cat(result, buf, l);
4366  return l;
4367 }
4368 
4369 /*
4370  * call-seq:
4371  * str.inspect -> string
4372  *
4373  * Returns a printable version of _str_, surrounded by quote marks,
4374  * with special characters escaped.
4375  *
4376  * str = "hello"
4377  * str[3] = "\b"
4378  * str.inspect #=> "\"hel\\bo\""
4379  */
4380 
4381 VALUE
4383 {
4384  rb_encoding *enc = STR_ENC_GET(str);
4385  const char *p, *pend, *prev;
4386  char buf[CHAR_ESC_LEN + 1];
4389  int unicode_p = rb_enc_unicode_p(enc);
4390  int asciicompat = rb_enc_asciicompat(enc);
4391  static rb_encoding *utf16, *utf32;
4392 
4393  if (!utf16) utf16 = rb_enc_find("UTF-16");
4394  if (!utf32) utf32 = rb_enc_find("UTF-32");
4395  if (resenc == NULL) resenc = rb_default_external_encoding();
4396  if (!rb_enc_asciicompat(resenc)) resenc = rb_usascii_encoding();
4397  rb_enc_associate(result, resenc);
4398  str_buf_cat2(result, "\"");
4399 
4400  p = RSTRING_PTR(str); pend = RSTRING_END(str);
4401  prev = p;
4402  if (enc == utf16) {
4403  const unsigned char *q = (const unsigned char *)p;
4404  if (q[0] == 0xFE && q[1] == 0xFF)
4405  enc = rb_enc_find("UTF-16BE");
4406  else if (q[0] == 0xFF && q[1] == 0xFE)
4407  enc = rb_enc_find("UTF-16LE");
4408  else
4409  unicode_p = 0;
4410  }
4411  else if (enc == utf32) {
4412  const unsigned char *q = (const unsigned char *)p;
4413  if (q[0] == 0 && q[1] == 0 && q[2] == 0xFE && q[3] == 0xFF)
4414  enc = rb_enc_find("UTF-32BE");
4415  else if (q[3] == 0 && q[2] == 0 && q[1] == 0xFE && q[0] == 0xFF)
4416  enc = rb_enc_find("UTF-32LE");
4417  else
4418  unicode_p = 0;
4419  }
4420  while (p < pend) {
4421  unsigned int c, cc;
4422  int n;
4423 
4424  n = rb_enc_precise_mbclen(p, pend, enc);
4425  if (!MBCLEN_CHARFOUND_P(n)) {
4426  if (p > prev) str_buf_cat(result, prev, p - prev);
4427  n = rb_enc_mbminlen(enc);
4428  if (pend < p + n)
4429  n = (int)(pend - p);
4430  while (n--) {
4431  snprintf(buf, CHAR_ESC_LEN, "\\x%02X", *p & 0377);
4432  str_buf_cat(result, buf, strlen(buf));
4433  prev = ++p;
4434  }
4435  continue;
4436  }
4437  n = MBCLEN_CHARFOUND_LEN(n);
4438  c = rb_enc_mbc_to_codepoint(p, pend, enc);
4439  p += n;
4440  if ((asciicompat || unicode_p) &&
4441  (c == '"'|| c == '\\' ||
4442  (c == '#' &&
4443  p < pend &&
4445  (cc = rb_enc_codepoint(p,pend,enc),
4446  (cc == '$' || cc == '@' || cc == '{'))))) {
4447  if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
4448  str_buf_cat2(result, "\\");
4449  if (asciicompat || enc == resenc) {
4450  prev = p - n;
4451  continue;
4452  }
4453  }
4454  switch (c) {
4455  case '\n': cc = 'n'; break;
4456  case '\r': cc = 'r'; break;
4457  case '\t': cc = 't'; break;
4458  case '\f': cc = 'f'; break;
4459  case '\013': cc = 'v'; break;
4460  case '\010': cc = 'b'; break;
4461  case '\007': cc = 'a'; break;
4462  case 033: cc = 'e'; break;
4463  default: cc = 0; break;
4464  }
4465  if (cc) {
4466  if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
4467  buf[0] = '\\';
4468  buf[1] = (char)cc;
4469  str_buf_cat(result, buf, 2);
4470  prev = p;
4471  continue;
4472  }
4473  if ((enc == resenc && rb_enc_isprint(c, enc)) ||
4474  (asciicompat && rb_enc_isascii(c, enc) && ISPRINT(c))) {
4475  continue;
4476  }
4477  else {
4478  if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
4479  rb_str_buf_cat_escaped_char(result, c, unicode_p);
4480  prev = p;
4481  continue;
4482  }
4483  }
4484  if (p > prev) str_buf_cat(result, prev, p - prev);
4485  str_buf_cat2(result, "\"");
4486 
4487  OBJ_INFECT(result, str);
4488  return result;
4489 }
4490 
4491 #define IS_EVSTR(p,e) ((p) < (e) && (*(p) == '$' || *(p) == '@' || *(p) == '{'))
4492 
4493 /*
4494  * call-seq:
4495  * str.dump -> new_str
4496  *
4497  * Produces a version of <i>str</i> with all nonprinting characters replaced by
4498  * <code>\nnn</code> notation and all special characters escaped.
4499  */
4500 
4501 VALUE
4503 {
4504  rb_encoding *enc = rb_enc_get(str);
4505  long len;
4506  const char *p, *pend;
4507  char *q, *qend;
4508  VALUE result;
4509  int u8 = (enc == rb_utf8_encoding());
4510 
4511  len = 2; /* "" */
4512  p = RSTRING_PTR(str); pend = p + RSTRING_LEN(str);
4513  while (p < pend) {
4514  unsigned char c = *p++;
4515  switch (c) {
4516  case '"': case '\\':
4517  case '\n': case '\r':
4518  case '\t': case '\f':
4519  case '\013': case '\010': case '\007': case '\033':
4520  len += 2;
4521  break;
4522 
4523  case '#':
4524  len += IS_EVSTR(p, pend) ? 2 : 1;
4525  break;
4526 
4527  default:
4528  if (ISPRINT(c)) {
4529  len++;
4530  }
4531  else {
4532  if (u8) { /* \u{NN} */
4533  int n = rb_enc_precise_mbclen(p-1, pend, enc);
4534  if (MBCLEN_CHARFOUND_P(n-1)) {
4535  unsigned int cc = rb_enc_mbc_to_codepoint(p-1, pend, enc);
4536  while (cc >>= 4) len++;
4537  len += 5;
4538  p += MBCLEN_CHARFOUND_LEN(n)-1;
4539  break;
4540  }
4541  }
4542  len += 4; /* \xNN */
4543  }
4544  break;
4545  }
4546  }
4547  if (!rb_enc_asciicompat(enc)) {
4548  len += 19; /* ".force_encoding('')" */
4549  len += strlen(enc->name);
4550  }
4551 
4552  result = rb_str_new5(str, 0, len);
4553  p = RSTRING_PTR(str); pend = p + RSTRING_LEN(str);
4554  q = RSTRING_PTR(result); qend = q + len + 1;
4555 
4556  *q++ = '"';
4557  while (p < pend) {
4558  unsigned char c = *p++;
4559 
4560  if (c == '"' || c == '\\') {
4561  *q++ = '\\';
4562  *q++ = c;
4563  }
4564  else if (c == '#') {
4565  if (IS_EVSTR(p, pend)) *q++ = '\\';
4566  *q++ = '#';
4567  }
4568  else if (c == '\n') {
4569  *q++ = '\\';
4570  *q++ = 'n';
4571  }
4572  else if (c == '\r') {
4573  *q++ = '\\';
4574  *q++ = 'r';
4575  }
4576  else if (c == '\t') {
4577  *q++ = '\\';
4578  *q++ = 't';
4579  }
4580  else if (c == '\f') {
4581  *q++ = '\\';
4582  *q++ = 'f';
4583  }
4584  else if (c == '\013') {
4585  *q++ = '\\';
4586  *q++ = 'v';
4587  }
4588  else if (c == '\010') {
4589  *q++ = '\\';
4590  *q++ = 'b';
4591  }
4592  else if (c == '\007') {
4593  *q++ = '\\';
4594  *q++ = 'a';
4595  }
4596  else if (c == '\033') {
4597  *q++ = '\\';
4598  *q++ = 'e';
4599  }
4600  else if (ISPRINT(c)) {
4601  *q++ = c;
4602  }
4603  else {
4604  *q++ = '\\';
4605  if (u8) {
4606  int n = rb_enc_precise_mbclen(p-1, pend, enc) - 1;
4607  if (MBCLEN_CHARFOUND_P(n)) {
4608  int cc = rb_enc_mbc_to_codepoint(p-1, pend, enc);
4609  p += n;
4610  snprintf(q, qend-q, "u{%x}", cc);
4611  q += strlen(q);
4612  continue;
4613  }
4614  }
4615  snprintf(q, qend-q, "x%02X", c);
4616  q += 3;
4617  }
4618  }
4619  *q++ = '"';
4620  *q = '\0';
4621  if (!rb_enc_asciicompat(enc)) {
4622  snprintf(q, qend-q, ".force_encoding(\"%s\")", enc->name);
4623  enc = rb_ascii8bit_encoding();
4624  }
4625  OBJ_INFECT(result, str);
4626  /* result from dump is ASCII */
4627  rb_enc_associate(result, enc);
4629  return result;
4630 }
4631 
4632 
4633 static void
4635 {
4636  if (rb_enc_dummy_p(enc)) {
4637  rb_raise(rb_eEncCompatError, "incompatible encoding with this operation: %s",
4638  rb_enc_name(enc));
4639  }
4640 }
4641 
4642 /*
4643  * call-seq:
4644  * str.upcase! -> str or nil
4645  *
4646  * Upcases the contents of <i>str</i>, returning <code>nil</code> if no changes
4647  * were made.
4648  * Note: case replacement is effective only in ASCII region.
4649  */
4650 
4651 static VALUE
4653 {
4654  rb_encoding *enc;
4655  char *s, *send;
4656  int modify = 0;
4657  int n;
4658 
4659  str_modify_keep_cr(str);
4660  enc = STR_ENC_GET(str);
4662  s = RSTRING_PTR(str); send = RSTRING_END(str);
4663  if (single_byte_optimizable(str)) {
4664  while (s < send) {
4665  unsigned int c = *(unsigned char*)s;
4666 
4667  if (rb_enc_isascii(c, enc) && 'a' <= c && c <= 'z') {
4668  *s = 'A' + (c - 'a');
4669  modify = 1;
4670  }
4671  s++;
4672  }
4673  }
4674  else {
4675  int ascompat = rb_enc_asciicompat(enc);
4676 
4677  while (s < send) {
4678  unsigned int c;
4679 
4680  if (ascompat && (c = *(unsigned char*)s) < 0x80) {
4681  if (rb_enc_isascii(c, enc) && 'a' <= c && c <= 'z') {
4682  *s = 'A' + (c - 'a');
4683  modify = 1;
4684  }
4685  s++;
4686  }
4687  else {
4688  c = rb_enc_codepoint_len(s, send, &n, enc);
4689  if (rb_enc_islower(c, enc)) {
4690  /* assuming toupper returns codepoint with same size */
4691  rb_enc_mbcput(rb_enc_toupper(c, enc), s, enc);
4692  modify = 1;
4693  }
4694  s += n;
4695  }
4696  }
4697  }
4698 
4699  if (modify) return str;
4700  return Qnil;
4701 }
4702 
4703 
4704 /*
4705  * call-seq:
4706  * str.upcase -> new_str
4707  *
4708  * Returns a copy of <i>str</i> with all lowercase letters replaced with their
4709  * uppercase counterparts. The operation is locale insensitive---only
4710  * characters ``a'' to ``z'' are affected.
4711  * Note: case replacement is effective only in ASCII region.
4712  *
4713  * "hEllO".upcase #=> "HELLO"
4714  */
4715 
4716 static VALUE
4718 {
4719  str = rb_str_dup(str);
4720  rb_str_upcase_bang(str);
4721  return str;
4722 }
4723 
4724 
4725 /*
4726  * call-seq:
4727  * str.downcase! -> str or nil
4728  *
4729  * Downcases the contents of <i>str</i>, returning <code>nil</code> if no
4730  * changes were made.
4731  * Note: case replacement is effective only in ASCII region.
4732  */
4733 
4734 static VALUE
4736 {
4737  rb_encoding *enc;
4738  char *s, *send;
4739  int modify = 0;
4740 
4741  str_modify_keep_cr(str);
4742  enc = STR_ENC_GET(str);
4744  s = RSTRING_PTR(str); send = RSTRING_END(str);
4745  if (single_byte_optimizable(str)) {
4746  while (s < send) {
4747  unsigned int c = *(unsigned char*)s;
4748 
4749  if (rb_enc_isascii(c, enc) && 'A' <= c && c <= 'Z') {
4750  *s = 'a' + (c - 'A');
4751  modify = 1;
4752  }
4753  s++;
4754  }
4755  }
4756  else {
4757  int ascompat = rb_enc_asciicompat(enc);
4758 
4759  while (s < send) {
4760  unsigned int c;
4761  int n;
4762 
4763  if (ascompat && (c = *(unsigned char*)s) < 0x80) {
4764  if (rb_enc_isascii(c, enc) && 'A' <= c && c <= 'Z') {
4765  *s = 'a' + (c - 'A');
4766  modify = 1;
4767  }
4768  s++;
4769  }
4770  else {
4771  c = rb_enc_codepoint_len(s, send, &n, enc);
4772  if (rb_enc_isupper(c, enc)) {
4773  /* assuming tolower returns codepoint with same size */
4774  rb_enc_mbcput(rb_enc_tolower(c, enc), s, enc);
4775  modify = 1;
4776  }
4777  s += n;
4778  }
4779  }
4780  }
4781 
4782  if (modify) return str;
4783  return Qnil;
4784 }
4785 
4786 
4787 /*
4788  * call-seq:
4789  * str.downcase -> new_str
4790  *
4791  * Returns a copy of <i>str</i> with all uppercase letters replaced with their
4792  * lowercase counterparts. The operation is locale insensitive---only
4793  * characters ``A'' to ``Z'' are affected.
4794  * Note: case replacement is effective only in ASCII region.
4795  *
4796  * "hEllO".downcase #=> "hello"
4797  */
4798 
4799 static VALUE
4801 {
4802  str = rb_str_dup(str);
4803  rb_str_downcase_bang(str);
4804  return str;
4805 }
4806 
4807 
4808 /*
4809  * call-seq:
4810  * str.capitalize! -> str or nil
4811  *
4812  * Modifies <i>str</i> by converting the first character to uppercase and the
4813  * remainder to lowercase. Returns <code>nil</code> if no changes are made.
4814  * Note: case conversion is effective only in ASCII region.
4815  *
4816  * a = "hello"
4817  * a.capitalize! #=> "Hello"
4818  * a #=> "Hello"
4819  * a.capitalize! #=> nil
4820  */
4821 
4822 static VALUE
4824 {
4825  rb_encoding *enc;
4826  char *s, *send;
4827  int modify = 0;
4828  unsigned int c;
4829  int n;
4830 
4831  str_modify_keep_cr(str);
4832  enc = STR_ENC_GET(str);
4834  if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str)) return Qnil;
4835  s = RSTRING_PTR(str); send = RSTRING_END(str);
4836 
4837  c = rb_enc_codepoint_len(s, send, &n, enc);
4838  if (rb_enc_islower(c, enc)) {
4839  rb_enc_mbcput(rb_enc_toupper(c, enc), s, enc);
4840  modify = 1;
4841  }
4842  s += n;
4843  while (s < send) {
4844  c = rb_enc_codepoint_len(s, send, &n, enc);
4845  if (rb_enc_isupper(c, enc)) {
4846  rb_enc_mbcput(rb_enc_tolower(c, enc), s, enc);
4847  modify = 1;
4848  }
4849  s += n;
4850  }
4851 
4852  if (modify) return str;
4853  return Qnil;
4854 }
4855 
4856 
4857 /*
4858  * call-seq:
4859  * str.capitalize -> new_str
4860  *
4861  * Returns a copy of <i>str</i> with the first character converted to uppercase
4862  * and the remainder to lowercase.
4863  * Note: case conversion is effective only in ASCII region.
4864  *
4865  * "hello".capitalize #=> "Hello"
4866  * "HELLO".capitalize #=> "Hello"
4867  * "123ABC".capitalize #=> "123abc"
4868  */
4869 
4870 static VALUE
4872 {
4873  str = rb_str_dup(str);
4875  return str;
4876 }
4877 
4878 
4879 /*
4880  * call-seq:
4881  * str.swapcase! -> str or nil
4882  *
4883  * Equivalent to <code>String#swapcase</code>, but modifies the receiver in
4884  * place, returning <i>str</i>, or <code>nil</code> if no changes were made.
4885  * Note: case conversion is effective only in ASCII region.
4886  */
4887 
4888 static VALUE
4890 {
4891  rb_encoding *enc;
4892  char *s, *send;
4893  int modify = 0;
4894  int n;
4895 
4896  str_modify_keep_cr(str);
4897  enc = STR_ENC_GET(str);
4899  s = RSTRING_PTR(str); send = RSTRING_END(str);
4900  while (s < send) {
4901  unsigned int c = rb_enc_codepoint_len(s, send, &n, enc);
4902 
4903  if (rb_enc_isupper(c, enc)) {
4904  /* assuming tolower returns codepoint with same size */
4905  rb_enc_mbcput(rb_enc_tolower(c, enc), s, enc);
4906  modify = 1;
4907  }
4908  else if (rb_enc_islower(c, enc)) {
4909  /* assuming toupper returns codepoint with same size */
4910  rb_enc_mbcput(rb_enc_toupper(c, enc), s, enc);
4911  modify = 1;
4912  }
4913  s += n;
4914  }
4915 
4916  if (modify) return str;
4917  return Qnil;
4918 }
4919 
4920 
4921 /*
4922  * call-seq:
4923  * str.swapcase -> new_str
4924  *
4925  * Returns a copy of <i>str</i> with uppercase alphabetic characters converted
4926  * to lowercase and lowercase characters converted to uppercase.
4927  * Note: case conversion is effective only in ASCII region.
4928  *
4929  * "Hello".swapcase #=> "hELLO"
4930  * "cYbEr_PuNk11".swapcase #=> "CyBeR_pUnK11"
4931  */
4932 
4933 static VALUE
4935 {
4936  str = rb_str_dup(str);
4937  rb_str_swapcase_bang(str);
4938  return str;
4939 }
4940 
4941 typedef unsigned char *USTR;
4942 
/* Cursor over a tr-style character specification, advanced by trnext(). */
struct tr {
    int gen;            /* non-zero while trnext() is stepping through a range */
    unsigned int now;   /* codepoint most recently produced */
    unsigned int max;   /* last codepoint of the range being stepped through */
    char *p;            /* current read position in the specification */
    char *pend;         /* end of the specification */
};
4948 
4949 static unsigned int
4950 trnext(struct tr *t, rb_encoding *enc)
4951 {
4952  int n;
4953 
4954  for (;;) {
4955  if (!t->gen) {
4956  if (t->p == t->pend) return -1;
4957  if (t->p < t->pend - 1 && *t->p == '\\') {
4958  t->p++;
4959  }
4960  t->now = rb_enc_codepoint_len(t->p, t->pend, &n, enc);
4961  t->p += n;
4962  if (t->p < t->pend - 1 && *t->p == '-') {
4963  t->p++;
4964  if (t->p < t->pend) {
4965  unsigned int c = rb_enc_codepoint_len(t->p, t->pend, &n, enc);
4966  t->p += n;
4967  if (t->now > c) {
4968  if (t->now < 0x80 && c < 0x80) {
4970  "invalid range \"%c-%c\" in string transliteration",
4971  t->now, c);
4972  }
4973  else {
4974  rb_raise(rb_eArgError, "invalid range in string transliteration");
4975  }
4976  continue; /* not reached */
4977  }
4978  t->gen = 1;
4979  t->max = c;
4980  }
4981  }
4982  return t->now;
4983  }
4984  else if (++t->now < t->max) {
4985  return t->now;
4986  }
4987  else {
4988  t->gen = 0;
4989  return t->max;
4990  }
4991  }
4992 }
4993 
4994 static VALUE rb_str_delete_bang(int,VALUE*,VALUE);
4995 
4996 static VALUE
4997 tr_trans(VALUE str, VALUE src, VALUE repl, int sflag)
4998 {
4999  const unsigned int errc = -1;
5000  unsigned int trans[256];
5001  rb_encoding *enc, *e1, *e2;
5002  struct tr trsrc, trrepl;
5003  int cflag = 0;
5004  unsigned int c, c0, last = 0;
5005  int modify = 0, i, l;
5006  char *s, *send;
5007  VALUE hash = 0;
5008  int singlebyte = single_byte_optimizable(str);
5009  int cr;
5010 
5011 #define CHECK_IF_ASCII(c) \
5012  (void)((cr == ENC_CODERANGE_7BIT && !rb_isascii(c)) ? \
5013  (cr = ENC_CODERANGE_VALID) : 0)
5014 
5015  StringValue(src);
5016  StringValue(repl);
5017  if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str)) return Qnil;
5018  if (RSTRING_LEN(repl) == 0) {
5019  return rb_str_delete_bang(1, &src, str);
5020  }
5021 
5022  cr = ENC_CODERANGE(str);
5023  e1 = rb_enc_check(str, src);
5024  e2 = rb_enc_check(str, repl);
5025  if (e1 == e2) {
5026  enc = e1;
5027  }
5028  else {
5029  enc = rb_enc_check(src, repl);
5030  }
5031  trsrc.p = RSTRING_PTR(src); trsrc.pend = trsrc.p + RSTRING_LEN(src);
5032  if (RSTRING_LEN(src) > 1 &&
5033  rb_enc_ascget(trsrc.p, trsrc.pend, &l, enc) == '^' &&
5034  trsrc.p + l < trsrc.pend) {
5035  cflag = 1;
5036  trsrc.p += l;
5037  }
5038  trrepl.p = RSTRING_PTR(repl);
5039  trrepl.pend = trrepl.p + RSTRING_LEN(repl);
5040  trsrc.gen = trrepl.gen = 0;
5041  trsrc.now = trrepl.now = 0;
5042  trsrc.max = trrepl.max = 0;
5043 
5044  if (cflag) {
5045  for (i=0; i<256; i++) {
5046  trans[i] = 1;
5047  }
5048  while ((c = trnext(&trsrc, enc)) != errc) {
5049  if (c < 256) {
5050  trans[c] = errc;
5051  }
5052  else {
5053  if (!hash) hash = rb_hash_new();
5054  rb_hash_aset(hash, UINT2NUM(c), Qtrue);
5055  }
5056  }
5057  while ((c = trnext(&trrepl, enc)) != errc)
5058  /* retrieve last replacer */;
5059  last = trrepl.now;
5060  for (i=0; i<256; i++) {
5061  if (trans[i] != errc) {
5062  trans[i] = last;
5063  }
5064  }
5065  }
5066  else {
5067  unsigned int r;
5068 
5069  for (i=0; i<256; i++) {
5070  trans[i] = errc;
5071  }
5072  while ((c = trnext(&trsrc, enc)) != errc) {
5073  r = trnext(&trrepl, enc);
5074  if (r == errc) r = trrepl.now;
5075  if (c < 256) {
5076  trans[c] = r;
5077  if (rb_enc_codelen(r, enc) != 1) singlebyte = 0;
5078  }
5079  else {
5080  if (!hash) hash = rb_hash_new();
5081  rb_hash_aset(hash, UINT2NUM(c), UINT2NUM(r));
5082  }
5083  }
5084  }
5085 
5086  if (cr == ENC_CODERANGE_VALID)
5087  cr = ENC_CODERANGE_7BIT;
5088  str_modify_keep_cr(str);
5089  s = RSTRING_PTR(str); send = RSTRING_END(str);
5090  if (sflag) {
5091  int clen, tlen;
5092  long offset, max = RSTRING_LEN(str);
5093  unsigned int save = -1;
5094  char *buf = ALLOC_N(char, max), *t = buf;
5095 
5096  while (s < send) {
5097  int may_modify = 0;
5098 
5099  c0 = c = rb_enc_codepoint_len(s, send, &clen, e1);
5100  tlen = enc == e1 ? clen : rb_enc_codelen(c, enc);
5101 
5102  s += clen;
5103  if (c < 256) {
5104  c = trans[c];
5105  }
5106  else if (hash) {
5107  VALUE tmp = rb_hash_lookup(hash, UINT2NUM(c));
5108  if (NIL_P(tmp)) {
5109  if (cflag) c = last;
5110  else c = errc;
5111  }
5112  else if (cflag) c = errc;
5113  else c = NUM2INT(tmp);
5114  }
5115  else {
5116  c = errc;
5117  }
5118  if (c != (unsigned int)-1) {
5119  if (save == c) {
5120  CHECK_IF_ASCII(c);
5121  continue;
5122  }
5123  save = c;
5124  tlen = rb_enc_codelen(c, enc);
5125  modify = 1;
5126  }
5127  else {
5128  save = -1;
5129  c = c0;
5130  if (enc != e1) may_modify = 1;
5131  }
5132  while (t - buf + tlen >= max) {
5133  offset = t - buf;
5134  max *= 2;
5135  REALLOC_N(buf, char, max);
5136  t = buf + offset;
5137  }
5138  rb_enc_mbcput(c, t, enc);
5139  if (may_modify && memcmp(s, t, tlen) != 0) {
5140  modify = 1;
5141  }
5142  CHECK_IF_ASCII(c);
5143  t += tlen;
5144  }
5145  if (!STR_EMBED_P(str)) {
5146  xfree(RSTRING(str)->as.heap.ptr);
5147  }
5148  *t = '\0';
5149  RSTRING(str)->as.heap.ptr = buf;
5150  RSTRING(str)->as.heap.len = t - buf;
5151  STR_SET_NOEMBED(str);
5152  RSTRING(str)->as.heap.aux.capa = max;
5153  }
5154  else if (rb_enc_mbmaxlen(enc) == 1 || (singlebyte && !hash)) {
5155  while (s < send) {
5156  c = (unsigned char)*s;
5157  if (trans[c] != errc) {
5158  if (!cflag) {
5159  c = trans[c];
5160  *s = c;
5161  modify = 1;
5162  }
5163  else {
5164  *s = last;
5165  modify = 1;
5166  }
5167  }
5168  CHECK_IF_ASCII(c);
5169  s++;
5170  }
5171  }
5172  else {
5173  int clen, tlen, max = (int)(RSTRING_LEN(str) * 1.2);
5174  long offset;
5175  char *buf = ALLOC_N(char, max), *t = buf;
5176 
5177  while (s < send) {
5178  int may_modify = 0;
5179  c0 = c = rb_enc_codepoint_len(s, send, &clen, e1);
5180  tlen = enc == e1 ? clen : rb_enc_codelen(c, enc);
5181 
5182  if (c < 256) {
5183  c = trans[c];
5184  }
5185  else if (hash) {
5186  VALUE tmp = rb_hash_lookup(hash, UINT2NUM(c));
5187  if (NIL_P(tmp)) {
5188  if (cflag) c = last;
5189  else c = errc;
5190  }
5191  else if (cflag) c = errc;
5192  else c = NUM2INT(tmp);
5193  }
5194  else {
5195  c = cflag ? last : errc;
5196  }
5197  if (c != errc) {
5198  tlen = rb_enc_codelen(c, enc);
5199  modify = 1;
5200  }
5201  else {
5202  c = c0;
5203  if (enc != e1) may_modify = 1;
5204  }
5205  while (t - buf + tlen >= max) {
5206  offset = t - buf;
5207  max *= 2;
5208  REALLOC_N(buf, char, max);
5209  t = buf + offset;
5210  }
5211  if (s != t) {
5212  rb_enc_mbcput(c, t, enc);
5213  if (may_modify && memcmp(s, t, tlen) != 0) {
5214  modify = 1;
5215  }
5216  }
5217  CHECK_IF_ASCII(c);
5218  s += clen;
5219  t += tlen;
5220  }
5221  if (!STR_EMBED_P(str)) {
5222  xfree(RSTRING(str)->as.heap.ptr);
5223  }
5224  *t = '\0';
5225  RSTRING(str)->as.heap.ptr = buf;
5226  RSTRING(str)->as.heap.len = t - buf;
5227  STR_SET_NOEMBED(str);
5228  RSTRING(str)->as.heap.aux.capa = max;
5229  }
5230 
5231  if (modify) {
5232  if (cr != ENC_CODERANGE_BROKEN)
5233  ENC_CODERANGE_SET(str, cr);
5234  rb_enc_associate(str, enc);
5235  return str;
5236  }
5237  return Qnil;
5238 }
5239 
5240 
5241 /*
5242  * call-seq:
5243  * str.tr!(from_str, to_str) -> str or nil
5244  *
5245  * Translates <i>str</i> in place, using the same rules as
5246  * <code>String#tr</code>. Returns <i>str</i>, or <code>nil</code> if no
5247  * changes were made.
5248  */
5249 
5250 static VALUE
5252 {
5253  return tr_trans(str, src, repl, 0);
5254 }
5255 
5256 
5257 /*
5258  * call-seq:
5259  * str.tr(from_str, to_str) => new_str
5260  *
5261  * Returns a copy of <i>str</i> with the characters in <i>from_str</i>
5262  * replaced by the corresponding characters in <i>to_str</i>. If
5263  * <i>to_str</i> is shorter than <i>from_str</i>, it is padded with its last
5264  * character in order to maintain the correspondence.
5265  *
5266  * "hello".tr('el', 'ip') #=> "hippo"
5267  * "hello".tr('aeiou', '*') #=> "h*ll*"
5268  *
5269  * Both strings may use the c1-c2 notation to denote ranges of characters,
5270  * and <i>from_str</i> may start with a <code>^</code>, which denotes all
5271  * characters except those listed.
5272  *
5273  * "hello".tr('a-y', 'b-z') #=> "ifmmp"
5274  * "hello".tr('^aeiou', '*') #=> "*e**o"
5275  */
5276 
5277 static VALUE
5278 rb_str_tr(VALUE str, VALUE src, VALUE repl)
5279 {
5280  str = rb_str_dup(str);
5281  tr_trans(str, src, repl, 0);
5282  return str;
5283 }
5284 
5285 #define TR_TABLE_SIZE 257
5286 static void
5287 tr_setup_table(VALUE str, char stable[TR_TABLE_SIZE], int first,
5288  VALUE *tablep, VALUE *ctablep, rb_encoding *enc)
5289 {
5290  const unsigned int errc = -1;
5291  char buf[256];
5292  struct tr tr;
5293  unsigned int c;
5294  VALUE table = 0, ptable = 0;
5295  int i, l, cflag = 0;
5296 
5297  tr.p = RSTRING_PTR(str); tr.pend = tr.p + RSTRING_LEN(str);
5298  tr.gen = tr.now = tr.max = 0;
5299 
5300  if (RSTRING_LEN(str) > 1 && rb_enc_ascget(tr.p, tr.pend, &l, enc) == '^') {
5301  cflag = 1;
5302  tr.p += l;
5303  }
5304  if (first) {
5305  for (i=0; i<256; i++) {
5306  stable[i] = 1;
5307  }
5308  stable[256] = cflag;
5309  }
5310  else if (stable[256] && !cflag) {
5311  stable[256] = 0;
5312  }
5313  for (i=0; i<256; i++) {
5314  buf[i] = cflag;
5315  }
5316 
5317  while ((c = trnext(&tr, enc)) != errc) {
5318  if (c < 256) {
5319  buf[c & 0xff] = !cflag;
5320  }
5321  else {
5322  VALUE key = UINT2NUM(c);
5323 
5324  if (!table) {
5325  table = rb_hash_new();
5326  if (cflag) {
5327  ptable = *ctablep;
5328  *ctablep = table;
5329  }
5330  else {
5331  ptable = *tablep;
5332  *tablep = table;
5333  }
5334  }
5335  if (!ptable || !NIL_P(rb_hash_aref(ptable, key))) {
5336  rb_hash_aset(table, key, Qtrue);
5337  }
5338  }
5339  }
5340  for (i=0; i<256; i++) {
5341  stable[i] = stable[i] && buf[i];
5342  }
5343 }
5344 
5345 
5346 static int
5347 tr_find(unsigned int c, char table[TR_TABLE_SIZE], VALUE del, VALUE nodel)
5348 {
5349  if (c < 256) {
5350  return table[c] != 0;
5351  }
5352  else {
5353  VALUE v = UINT2NUM(c);
5354 
5355  if (del) {
5356  if (!NIL_P(rb_hash_lookup(del, v)) &&
5357  (!nodel || NIL_P(rb_hash_lookup(nodel, v)))) {
5358  return TRUE;
5359  }
5360  }
5361  else if (nodel && !NIL_P(rb_hash_lookup(nodel, v))) {
5362  return FALSE;
5363  }
5364  return table[256] ? TRUE : FALSE;
5365  }
5366 }
5367 
5368 /*
5369  * call-seq:
5370  * str.delete!([other_str]+) -> str or nil
5371  *
5372  * Performs a <code>delete</code> operation in place, returning <i>str</i>, or
5373  * <code>nil</code> if <i>str</i> was not modified.
5374  */
5375 
5376 static VALUE
5378 {
5379  char squeez[TR_TABLE_SIZE];
5380  rb_encoding *enc = 0;
5381  char *s, *send, *t;
5382  VALUE del = 0, nodel = 0;
5383  int modify = 0;
5384  int i, ascompat, cr;
5385 
5386  if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str)) return Qnil;
5387  if (argc < 1) {
5388  rb_raise(rb_eArgError, "wrong number of arguments (at least 1)");
5389  }
5390  for (i=0; i<argc; i++) {
5391  VALUE s = argv[i];
5392 
5393  StringValue(s);
5394  enc = rb_enc_check(str, s);
5395  tr_setup_table(s, squeez, i==0, &del, &nodel, enc);
5396  }
5397 
5398  str_modify_keep_cr(str);
5399  ascompat = rb_enc_asciicompat(enc);
5400  s = t = RSTRING_PTR(str);
5401  send = RSTRING_END(str);
5402  cr = ascompat ? ENC_CODERANGE_7BIT : ENC_CODERANGE_VALID;
5403  while (s < send) {
5404  unsigned int c;
5405  int clen;
5406 
5407  if (ascompat && (c = *(unsigned char*)s) < 0x80) {
5408  if (squeez[c]) {
5409  modify = 1;
5410  }
5411  else {
5412  if (t != s) *t = c;
5413  t++;
5414  }
5415  s++;
5416  }
5417  else {
5418  c = rb_enc_codepoint_len(s, send, &clen, enc);
5419 
5420  if (tr_find(c, squeez, del, nodel)) {
5421  modify = 1;
5422  }
5423  else {
5424  if (t != s) rb_enc_mbcput(c, t, enc);
5425  t += clen;
5426  if (cr == ENC_CODERANGE_7BIT) cr = ENC_CODERANGE_VALID;
5427  }
5428  s += clen;
5429  }
5430  }
5431  *t = '\0';
5432  STR_SET_LEN(str, t - RSTRING_PTR(str));
5433  ENC_CODERANGE_SET(str, cr);
5434 
5435  if (modify) return str;
5436  return Qnil;
5437 }
5438 
5439 
5440 /*
5441  * call-seq:
5442  * str.delete([other_str]+) -> new_str
5443  *
5444  * Returns a copy of <i>str</i> with all characters in the intersection of its
5445  * arguments deleted. Uses the same rules for building the set of characters as
5446  * <code>String#count</code>.
5447  *
5448  * "hello".delete "l","lo" #=> "heo"
5449  * "hello".delete "lo" #=> "he"
5450  * "hello".delete "aeiou", "^e" #=> "hell"
5451  * "hello".delete "ej-m" #=> "ho"
5452  */
5453 
5454 static VALUE
5456 {
5457  str = rb_str_dup(str);
5458  rb_str_delete_bang(argc, argv, str);
5459  return str;
5460 }
5461 
5462 
5463 /*
5464  * call-seq:
5465  * str.squeeze!([other_str]*) -> str or nil
5466  *
5467  * Squeezes <i>str</i> in place, returning either <i>str</i>, or
5468  * <code>nil</code> if no changes were made.
5469  */
5470 
5471 static VALUE
5473 {
5474  char squeez[TR_TABLE_SIZE];
5475  rb_encoding *enc = 0;
5476  VALUE del = 0, nodel = 0;
5477  char *s, *send, *t;
5478  int i, modify = 0;
5479  int ascompat, singlebyte = single_byte_optimizable(str);
5480  unsigned int save;
5481 
5482  if (argc == 0) {
5483  enc = STR_ENC_GET(str);
5484  }
5485  else {
5486  for (i=0; i<argc; i++) {
5487  VALUE s = argv[i];
5488 
5489  StringValue(s);
5490  enc = rb_enc_check(str, s);
5491  if (singlebyte && !single_byte_optimizable(s))
5492  singlebyte = 0;
5493  tr_setup_table(s, squeez, i==0, &del, &nodel, enc);
5494  }
5495  }
5496 
5497  str_modify_keep_cr(str);
5498  s = t = RSTRING_PTR(str);
5499  if (!s || RSTRING_LEN(str) == 0) return Qnil;
5500  send = RSTRING_END(str);
5501  save = -1;
5502  ascompat = rb_enc_asciicompat(enc);
5503 
5504  if (singlebyte) {
5505  while (s < send) {
5506  unsigned int c = *(unsigned char*)s++;
5507  if (c != save || (argc > 0 && !squeez[c])) {
5508  *t++ = save = c;
5509  }
5510  }
5511  } else {
5512  while (s < send) {
5513  unsigned int c;
5514  int clen;
5515 
5516  if (ascompat && (c = *(unsigned char*)s) < 0x80) {
5517  if (c != save || (argc > 0 && !squeez[c])) {
5518  *t++ = save = c;
5519  }
5520  s++;
5521  }
5522  else {
5523  c = rb_enc_codepoint_len(s, send, &clen, enc);
5524 
5525  if (c != save || (argc > 0 && !tr_find(c, squeez, del, nodel))) {
5526  if (t != s) rb_enc_mbcput(c, t, enc);
5527  save = c;
5528  t += clen;
5529  }
5530  s += clen;
5531  }
5532  }
5533  }
5534 
5535  *t = '\0';
5536  if (t - RSTRING_PTR(str) != RSTRING_LEN(str)) {
5537  STR_SET_LEN(str, t - RSTRING_PTR(str));
5538  modify = 1;
5539  }
5540 
5541  if (modify) return str;
5542  return Qnil;
5543 }
5544 
5545 
5546 /*
5547  * call-seq:
5548  * str.squeeze([other_str]*) -> new_str
5549  *
5550  * Builds a set of characters from the <i>other_str</i> parameter(s) using the
5551  * procedure described for <code>String#count</code>. Returns a new string
5552  * where runs of the same character that occur in this set are replaced by a
5553  * single character. If no arguments are given, all runs of identical
5554  * characters are replaced by a single character.
5555  *
5556  * "yellow moon".squeeze #=> "yelow mon"
5557  * " now is the".squeeze(" ") #=> " now is the"
5558  * "putters shoot balls".squeeze("m-z") #=> "puters shot balls"
5559  */
5560 
5561 static VALUE
5563 {
5564  str = rb_str_dup(str);
5565  rb_str_squeeze_bang(argc, argv, str);
5566  return str;
5567 }
5568 
5569 
5570 /*
5571  * call-seq:
5572  * str.tr_s!(from_str, to_str) -> str or nil
5573  *
5574  * Performs <code>String#tr_s</code> processing on <i>str</i> in place,
5575  * returning <i>str</i>, or <code>nil</code> if no changes were made.
5576  */
5577 
5578 static VALUE
5580 {
5581  return tr_trans(str, src, repl, 1);
5582 }
5583 
5584 
5585 /*
5586  * call-seq:
5587  * str.tr_s(from_str, to_str) -> new_str
5588  *
5589  * Processes a copy of <i>str</i> as described under <code>String#tr</code>,
5590  * then removes duplicate characters in regions that were affected by the
5591  * translation.
5592  *
5593  * "hello".tr_s('l', 'r') #=> "hero"
5594  * "hello".tr_s('el', '*') #=> "h*o"
5595  * "hello".tr_s('el', 'hx') #=> "hhxo"
5596  */
5597 
5598 static VALUE
5599 rb_str_tr_s(VALUE str, VALUE src, VALUE repl)
5600 {
5601  str = rb_str_dup(str);
5602  tr_trans(str, src, repl, 1);
5603  return str;
5604 }
5605 
5606 
5607 /*
5608  * call-seq:
5609  * str.count([other_str]+) -> fixnum
5610  *
5611  * Each <i>other_str</i> parameter defines a set of characters to count. The
5612  * intersection of these sets defines the characters to count in
5613  * <i>str</i>. Any <i>other_str</i> that starts with a caret (^) is
5614  * negated. The sequence c1--c2 means all characters between c1 and c2.
5615  *
5616  * a = "hello world"
5617  * a.count "lo" #=> 5
5618  * a.count "lo", "o" #=> 2
5619  * a.count "hello", "^l" #=> 4
5620  * a.count "ej-m" #=> 4
5621  */
5622 
5623 static VALUE
5625 {
5626  char table[TR_TABLE_SIZE];
5627  rb_encoding *enc = 0;
5628  VALUE del = 0, nodel = 0;
5629  char *s, *send;
5630  int i;
5631  int ascompat;
5632 
5633  if (argc < 1) {
5634  rb_raise(rb_eArgError, "wrong number of arguments (at least 1)");
5635  }
5636  for (i=0; i<argc; i++) {
5637  VALUE tstr = argv[i];
5638  unsigned char c;
5639 
5640  StringValue(tstr);
5641  enc = rb_enc_check(str, tstr);
5642  if (argc == 1 && RSTRING_LEN(tstr) == 1 && rb_enc_asciicompat(enc) &&
5643  (c = RSTRING_PTR(tstr)[0]) < 0x80 && !is_broken_string(str)) {
5644  int n = 0;
5645 
5646  s = RSTRING_PTR(str);
5647  if (!s || RSTRING_LEN(str) == 0) return INT2FIX(0);
5648  send = RSTRING_END(str);
5649  while (s < send) {
5650  if (*(unsigned char*)s++ == c) n++;
5651  }
5652  return INT2NUM(n);
5653  }
5654  tr_setup_table(tstr, table, i==0, &del, &nodel, enc);
5655  }
5656 
5657  s = RSTRING_PTR(str);
5658  if (!s || RSTRING_LEN(str) == 0) return INT2FIX(0);
5659  send = RSTRING_END(str);
5660  ascompat = rb_enc_asciicompat(enc);
5661  i = 0;
5662  while (s < send) {
5663  unsigned int c;
5664 
5665  if (ascompat && (c = *(unsigned char*)s) < 0x80) {
5666  if (table[c]) {
5667  i++;
5668  }
5669  s++;
5670  }
5671  else {
5672  int clen;
5673  c = rb_enc_codepoint_len(s, send, &clen, enc);
5674  if (tr_find(c, table, del, nodel)) {
5675  i++;
5676  }
5677  s += clen;
5678  }
5679  }
5680 
5681  return INT2NUM(i);
5682 }
5683 
/*
 * Byte-indexed lookup: 1 for the six ASCII whitespace bytes
 * (0x09-0x0d and 0x20), 0 for every other byte value.
 */
static const char isspacetable[256] = {
    /* 0x00 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0,
    /* 0x10 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x20 */ 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x30 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x40 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x60 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x80 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x90 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xa0 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xb0 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xc0 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xd0 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xe0 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xf0 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
};

/* The cast keeps a negative plain char from indexing before the table. */
#define ascii_isspace(c) isspacetable[(unsigned char)(c)]
5704 
5705 /*
5706  * call-seq:
5707  * str.split(pattern=$;, [limit]) -> anArray
5708  *
5709  * Divides <i>str</i> into substrings based on a delimiter, returning an array
5710  * of these substrings.
5711  *
5712  * If <i>pattern</i> is a <code>String</code>, then its contents are used as
5713  * the delimiter when splitting <i>str</i>. If <i>pattern</i> is a single
5714  * space, <i>str</i> is split on whitespace, with leading whitespace and runs
5715  * of contiguous whitespace characters ignored.
5716  *
5717  * If <i>pattern</i> is a <code>Regexp</code>, <i>str</i> is divided where the
5718  * pattern matches. Whenever the pattern matches a zero-length string,
5719  * <i>str</i> is split into individual characters. If <i>pattern</i> contains
5720  * groups, the respective matches will be returned in the array as well.
5721  *
5722  * If <i>pattern</i> is omitted, the value of <code>$;</code> is used. If
5723  * <code>$;</code> is <code>nil</code> (which is the default), <i>str</i> is
5724  * split on whitespace as if ` ' were specified.
5725  *
5726  * If the <i>limit</i> parameter is omitted, trailing null fields are
5727  * suppressed. If <i>limit</i> is a positive number, at most that number of
5728  * fields will be returned (if <i>limit</i> is <code>1</code>, the entire
5729  * string is returned as the only entry in an array). If negative, there is no
5730  * limit to the number of fields returned, and trailing null fields are not
5731  * suppressed.
5732  *
5733  * " now's the time".split #=> ["now's", "the", "time"]
5734  * " now's the time".split(' ') #=> ["now's", "the", "time"]
5735  * " now's the time".split(/ /) #=> ["", "now's", "", "the", "time"]
5736  * "1, 2.34,56, 7".split(%r{,\s*}) #=> ["1", "2.34", "56", "7"]
5737  * "hello".split(//) #=> ["h", "e", "l", "l", "o"]
5738  * "hello".split(//, 3) #=> ["h", "e", "llo"]
5739  * "hi mom".split(%r{\s*}) #=> ["h", "i", "m", "o", "m"]
5740  *
5741  * "mellow yellow".split("ello") #=> ["m", "w y", "w"]
5742  * "1,2,,3,4,,".split(',') #=> ["1", "2", "", "3", "4"]
5743  * "1,2,,3,4,,".split(',', 4) #=> ["1", "2", "", "3,4,,"]
5744  * "1,2,,3,4,,".split(',', -4) #=> ["1", "2", "", "3", "4", "", ""]
5745  */
5746 
5747 static VALUE
5749 {
5750  rb_encoding *enc;
5751  VALUE spat;
5752  VALUE limit;
5753  enum {awk, string, regexp} split_type;
5754  long beg, end, i = 0;
5755  int lim = 0;
5756  VALUE result, tmp;
5757 
5758  if (rb_scan_args(argc, argv, "02", &spat, &limit) == 2) {
5759  lim = NUM2INT(limit);
5760  if (lim <= 0) limit = Qnil;
5761  else if (lim == 1) {
5762  if (RSTRING_LEN(str) == 0)
5763  return rb_ary_new2(0);
5764  return rb_ary_new3(1, str);
5765  }
5766  i = 1;
5767  }
5768 
5769  enc = STR_ENC_GET(str);
5770  if (NIL_P(spat)) {
5771  if (!NIL_P(rb_fs)) {
5772  spat = rb_fs;
5773  goto fs_set;
5774  }
5775  split_type = awk;
5776  }
5777  else {
5778  fs_set:
5779  if (TYPE(spat) == T_STRING) {
5780  rb_encoding *enc2 = STR_ENC_GET(spat);
5781 
5782  split_type = string;
5783  if (RSTRING_LEN(spat) == 0) {
5784  /* Special case - split into chars */
5785  spat = rb_reg_regcomp(spat);
5786  split_type = regexp;
5787  }
5788  else if (rb_enc_asciicompat(enc2) == 1) {
5789  if (RSTRING_LEN(spat) == 1 && RSTRING_PTR(spat)[0] == ' '){
5790  split_type = awk;
5791  }
5792  }
5793  else {
5794  int l;
5795  if (rb_enc_ascget(RSTRING_PTR(spat), RSTRING_END(spat), &l, enc2) == ' ' &&
5796  RSTRING_LEN(spat) == l) {
5797  split_type = awk;
5798  }
5799  }
5800  }
5801  else {
5802  spat = get_pat(spat, 1);
5803  split_type = regexp;
5804  }
5805  }
5806 
5807  result = rb_ary_new();
5808  beg = 0;
5809  if (split_type == awk) {
5810  char *ptr = RSTRING_PTR(str);
5811  char *eptr = RSTRING_END(str);
5812  char *bptr = ptr;
5813  int skip = 1;
5814  unsigned int c;
5815 
5816  end = beg;
5817  if (is_ascii_string(str)) {
5818  while (ptr < eptr) {
5819  c = (unsigned char)*ptr++;
5820  if (skip) {
5821  if (ascii_isspace(c)) {
5822  beg = ptr - bptr;
5823  }
5824  else {
5825  end = ptr - bptr;
5826  skip = 0;
5827  if (!NIL_P(limit) && lim <= i) break;
5828  }
5829  }
5830  else if (ascii_isspace(c)) {
5831  rb_ary_push(result, rb_str_subseq(str, beg, end-beg));
5832  skip = 1;
5833  beg = ptr - bptr;
5834  if (!NIL_P(limit)) ++i;
5835  }
5836  else {
5837  end = ptr - bptr;
5838  }
5839  }
5840  }
5841  else {
5842  while (ptr < eptr) {
5843  int n;
5844 
5845  c = rb_enc_codepoint_len(ptr, eptr, &n, enc);
5846  ptr += n;
5847  if (skip) {
5848  if (rb_isspace(c)) {
5849  beg = ptr - bptr;
5850  }
5851  else {
5852  end = ptr - bptr;
5853  skip = 0;
5854  if (!NIL_P(limit) && lim <= i) break;
5855  }
5856  }
5857  else if (rb_isspace(c)) {
5858  rb_ary_push(result, rb_str_subseq(str, beg, end-beg));
5859  skip = 1;
5860  beg = ptr - bptr;
5861  if (!NIL_P(limit)) ++i;
5862  }
5863  else {
5864  end = ptr - bptr;
5865  }
5866  }
5867  }
5868  }
5869  else if (split_type == string) {
5870  char *ptr = RSTRING_PTR(str);
5871  char *temp = ptr;
5872  char *eptr = RSTRING_END(str);
5873  char *sptr = RSTRING_PTR(spat);
5874  long slen = RSTRING_LEN(spat);
5875 
5876  if (is_broken_string(str)) {
5877  rb_raise(rb_eArgError, "invalid byte sequence in %s", rb_enc_name(STR_ENC_GET(str)));
5878  }
5879  if (is_broken_string(spat)) {
5880  rb_raise(rb_eArgError, "invalid byte sequence in %s", rb_enc_name(STR_ENC_GET(spat)));
5881  }
5882  enc = rb_enc_check(str, spat);
5883  while (ptr < eptr &&
5884  (end = rb_memsearch(sptr, slen, ptr, eptr - ptr, enc)) >= 0) {
5885  /* Check we are at the start of a char */
5886  char *t = rb_enc_right_char_head(ptr, ptr + end, eptr, enc);
5887  if (t != ptr + end) {
5888  ptr = t;
5889  continue;
5890  }
5891  rb_ary_push(result, rb_str_subseq(str, ptr - temp, end));
5892  ptr += end + slen;
5893  if (!NIL_P(limit) && lim <= ++i) break;
5894  }
5895  beg = ptr - temp;
5896  }
5897  else {
5898  char *ptr = RSTRING_PTR(str);
5899  long len = RSTRING_LEN(str);
5900  long start = beg;
5901  long idx;
5902  int last_null = 0;
5903  struct re_registers *regs;
5904 
5905  while ((end = rb_reg_search(spat, str, start, 0)) >= 0) {
5906  regs = RMATCH_REGS(rb_backref_get());
5907  if (start == end && BEG(0) == END(0)) {
5908  if (!ptr) {
5909  rb_ary_push(result, str_new_empty(str));
5910  break;
5911  }
5912  else if (last_null == 1) {
5913  rb_ary_push(result, rb_str_subseq(str, beg,
5914  rb_enc_fast_mbclen(ptr+beg,
5915  ptr+len,
5916  enc)));
5917  beg = start;
5918  }
5919  else {
5920  if (ptr+start == ptr+len)
5921  start++;
5922  else
5923  start += rb_enc_fast_mbclen(ptr+start,ptr+len,enc);
5924  last_null = 1;
5925  continue;
5926  }
5927  }
5928  else {
5929  rb_ary_push(result, rb_str_subseq(str, beg, end-beg));
5930  beg = start = END(0);
5931  }
5932  last_null = 0;
5933 
5934  for (idx=1; idx < regs->num_regs; idx++) {
5935  if (BEG(idx) == -1) continue;
5936  if (BEG(idx) == END(idx))
5937  tmp = str_new_empty(str);
5938  else
5939  tmp = rb_str_subseq(str, BEG(idx), END(idx)-BEG(idx));
5940  rb_ary_push(result, tmp);
5941  }
5942  if (!NIL_P(limit) && lim <= ++i) break;
5943  }
5944  }
5945  if (RSTRING_LEN(str) > 0 && (!NIL_P(limit) || RSTRING_LEN(str) > beg || lim < 0)) {
5946  if (RSTRING_LEN(str) == beg)
5947  tmp = str_new_empty(str);
5948  else
5949  tmp = rb_str_subseq(str, beg, RSTRING_LEN(str)-beg);
5950  rb_ary_push(result, tmp);
5951  }
5952  if (NIL_P(limit) && lim == 0) {
5953  long len;
5954  while ((len = RARRAY_LEN(result)) > 0 &&
5955  (tmp = RARRAY_PTR(result)[len-1], RSTRING_LEN(tmp) == 0))
5956  rb_ary_pop(result);
5957  }
5958 
5959  return result;
5960 }
5961 
5962 VALUE
5963 rb_str_split(VALUE str, const char *sep0)
5964 {
5965  VALUE sep;
5966 
5967  StringValue(str);
5968  sep = rb_str_new2(sep0);
5969  return rb_str_split_m(1, &sep, str);
5970 }
5971 
5972 
5973 /*
5974  * call-seq:
5975  * str.each_line(separator=$/) {|substr| block } -> str
5976  * str.each_line(separator=$/) -> an_enumerator
5977  *
5978  * str.lines(separator=$/) {|substr| block } -> str
5979  * str.lines(separator=$/) -> an_enumerator
5980  *
5981  * Splits <i>str</i> using the supplied parameter as the record separator
5982  * (<code>$/</code> by default), passing each substring in turn to the supplied
5983  * block. If a zero-length record separator is supplied, the string is split
5984  * into paragraphs delimited by multiple successive newlines.
5985  *
5986  * If no block is given, an enumerator is returned instead.
5987  *
5988  * print "Example one\n"
5989  * "hello\nworld".each_line {|s| p s}
5990  * print "Example two\n"
5991  * "hello\nworld".each_line('l') {|s| p s}
5992  * print "Example three\n"
5993  * "hello\n\n\nworld".each_line('') {|s| p s}
5994  *
5995  * <em>produces:</em>
5996  *
5997  * Example one
5998  * "hello\n"
5999  * "world"
6000  * Example two
6001  * "hel"
6002  * "l"
6003  * "o\nworl"
6004  * "d"
6005  * Example three
6006  * "hello\n\n\n"
6007  * "world"
6008  */
6009 
6010 static VALUE
6012 {
6013  rb_encoding *enc;
6014  VALUE rs;
6015  unsigned int newline;
6016  const char *p, *pend, *s, *ptr;
6017  long len, rslen;
6018  VALUE line;
6019  int n;
6020  VALUE orig = str;
6021 
6022  if (argc == 0) {
6023  rs = rb_rs;
6024  }
6025  else {
6026  rb_scan_args(argc, argv, "01", &rs);
6027  }
6028  RETURN_ENUMERATOR(str, argc, argv);
6029  if (NIL_P(rs)) {
6030  rb_yield(str);
6031  return orig;
6032  }
6033  str = rb_str_new4(str);
6034  ptr = p = s = RSTRING_PTR(str);
6035  pend = p + RSTRING_LEN(str);
6036  len = RSTRING_LEN(str);
6037  StringValue(rs);
6038  if (rs == rb_default_rs) {
6039  enc = rb_enc_get(str);
6040  while (p < pend) {
6041  char *p0;
6042 
6043  p = memchr(p, '\n', pend - p);
6044  if (!p) break;
6045  p0 = rb_enc_left_char_head(s, p, pend, enc);
6046  if (!rb_enc_is_newline(p0, pend, enc)) {
6047  p++;
6048  continue;
6049  }
6050  p = p0 + rb_enc_mbclen(p0, pend, enc);
6051  line = rb_str_new5(str, s, p - s);
6052  OBJ_INFECT(line, str);
6053  rb_enc_cr_str_copy_for_substr(line, str);
6054  rb_yield(line);
6055  str_mod_check(str, ptr, len);
6056  s = p;
6057  }
6058  goto finish;
6059  }
6060 
6061  enc = rb_enc_check(str, rs);
6062  rslen = RSTRING_LEN(rs);
6063  if (rslen == 0) {
6064  newline = '\n';
6065  }
6066  else {
6067  newline = rb_enc_codepoint(RSTRING_PTR(rs), RSTRING_END(rs), enc);
6068  }
6069 
6070  while (p < pend) {
6071  unsigned int c = rb_enc_codepoint_len(p, pend, &n, enc);
6072 
6073  again:
6074  if (rslen == 0 && c == newline) {
6075  p += n;
6076  if (p < pend && (c = rb_enc_codepoint_len(p, pend, &n, enc)) != newline) {
6077  goto again;
6078  }
6079  while (p < pend && rb_enc_codepoint(p, pend, enc) == newline) {
6080  p += n;
6081  }
6082  p -= n;
6083  }
6084  if (c == newline &&
6085  (rslen <= 1 ||
6086  (pend - p >= rslen && memcmp(RSTRING_PTR(rs), p, rslen) == 0))) {
6087  line = rb_str_new5(str, s, p - s + (rslen ? rslen : n));
6088  OBJ_INFECT(line, str);
6089  rb_enc_cr_str_copy_for_substr(line, str);
6090  rb_yield(line);
6091  str_mod_check(str, ptr, len);
6092  s = p + (rslen ? rslen : n);
6093  }
6094  p += n;
6095  }
6096 
6097  finish:
6098  if (s != pend) {
6099  line = rb_str_new5(str, s, pend - s);
6100  OBJ_INFECT(line, str);
6101  rb_enc_cr_str_copy_for_substr(line, str);
6102  rb_yield(line);
6103  }
6104 
6105  return orig;
6106 }
6107 
6108 
6109 /*
6110  * call-seq:
6111  * str.bytes {|fixnum| block } -> str
6112  * str.bytes -> an_enumerator
6113  *
6114  * str.each_byte {|fixnum| block } -> str
6115  * str.each_byte -> an_enumerator
6116  *
6117  * Passes each byte in <i>str</i> to the given block, or returns
6118  * an enumerator if no block is given.
6119  *
6120  * "hello".each_byte {|c| print c, ' ' }
6121  *
6122  * <em>produces:</em>
6123  *
6124  * 104 101 108 108 111
6125  */
6126 
6127 static VALUE
6129 {
6130  long i;
6131 
6132  RETURN_ENUMERATOR(str, 0, 0);
6133  for (i=0; i<RSTRING_LEN(str); i++) {
6134  rb_yield(INT2FIX(RSTRING_PTR(str)[i] & 0xff));
6135  }
6136  return str;
6137 }
6138 
6139 
6140 /*
6141  * call-seq:
6142  * str.chars {|cstr| block } -> str
6143  * str.chars -> an_enumerator
6144  *
6145  * str.each_char {|cstr| block } -> str
6146  * str.each_char -> an_enumerator
6147  *
6148  * Passes each character in <i>str</i> to the given block, or returns
6149  * an enumerator if no block is given.
6150  *
6151  * "hello".each_char {|c| print c, ' ' }
6152  *
6153  * <em>produces:</em>
6154  *
6155  * h e l l o
6156  */
6157 
6158 static VALUE
6160 {
6161  VALUE orig = str;
6162  long i, len, n;
6163  const char *ptr;
6164  rb_encoding *enc;
6165 
6166  RETURN_ENUMERATOR(str, 0, 0);
6167  str = rb_str_new4(str);
6168  ptr = RSTRING_PTR(str);
6169  len = RSTRING_LEN(str);
6170  enc = rb_enc_get(str);
6171  switch (ENC_CODERANGE(str)) {
6172  case ENC_CODERANGE_VALID:
6173  case ENC_CODERANGE_7BIT:
6174  for (i = 0; i < len; i += n) {
6175  n = rb_enc_fast_mbclen(ptr + i, ptr + len, enc);
6176  rb_yield(rb_str_subseq(str, i, n));
6177  }
6178  break;
6179  default:
6180  for (i = 0; i < len; i += n) {
6181  n = rb_enc_mbclen(ptr + i, ptr + len, enc);
6182  rb_yield(rb_str_subseq(str, i, n));
6183  }
6184  }
6185  return orig;
6186 }
6187 
6188 /*
6189  * call-seq:
6190  * str.codepoints {|integer| block } -> str
6191  * str.codepoints -> an_enumerator
6192  *
6193  * str.each_codepoint {|integer| block } -> str
6194  * str.each_codepoint -> an_enumerator
6195  *
6196  * Passes the <code>Integer</code> ordinal of each character in <i>str</i>,
6197  * also known as a <i>codepoint</i> when applied to Unicode strings to the
6198  * given block.
6199  *
6200  * If no block is given, an enumerator is returned instead.
6201  *
6202  * "hello\u0639".each_codepoint {|c| print c, ' ' }
6203  *
6204  * <em>produces:</em>
6205  *
6206  * 104 101 108 108 111 1593
6207  */
6208 
/*
 * Body for the codepoints / each_codepoint call-seq documented directly
 * above. Yields the ordinal of every character and returns the original
 * receiver.
 *
 * NOTE(review): listing line 6210 (the declarator) is absent from this
 * listing -- presumably rb_str_each_codepoint(VALUE str); confirm.
 */
6209 static VALUE
6211 {
6212  VALUE orig = str;
6213  int n; /* byte length of the character just decoded */
6214  unsigned int c;
6215  const char *ptr, *end;
6216  rb_encoding *enc;
6217
 /* single-byte strings are delegated wholesale to the byte iterator
  * (this happens before the no-block check below) */
6218  if (single_byte_optimizable(str)) return rb_str_each_byte(str);
6219  RETURN_ENUMERATOR(str, 0, 0);
6220  str = rb_str_new4(str); /* work on a separate string object; presumably a frozen snapshot -- TODO confirm */
6221  ptr = RSTRING_PTR(str);
6222  end = RSTRING_END(str);
6223  enc = STR_ENC_GET(str);
6224  while (ptr < end) {
6225  c = rb_enc_codepoint_len(ptr, end, &n, enc);
6226  rb_yield(UINT2NUM(c));
6227  ptr += n; /* advance by the number of bytes the decoder reported */
6228  }
6229  return orig;
6230 }
6231 
/*
 * Computes the byte length str would have after a "chop": the offset of the
 * last character, or of the '\r' when the string ends in "\r\n".
 * Returns 0 when there is no previous character to step back to.
 *
 * NOTE(review): listing line 6233 (the declarator) is absent from this
 * listing; the callers below invoke this as chopped_length(str).
 */
6232 static long
6234 {
6235  rb_encoding *enc = STR_ENC_GET(str);
6236  const char *p, *p2, *beg, *end;
6237
6238  beg = RSTRING_PTR(str);
6239  end = beg + RSTRING_LEN(str);
6240  if (beg > end) return 0;
6241  p = rb_enc_prev_char(beg, end, end, enc); /* start of the last character */
6242  if (!p) return 0;
 /* last character is '\n' and something precedes it: also drop a preceding '\r' */
6243  if (p > beg && rb_enc_ascget(p, end, 0, enc) == '\n') {
6244  p2 = rb_enc_prev_char(beg, p, end, enc);
6245  if (p2 && rb_enc_ascget(p2, end, 0, enc) == '\r') p = p2;
6246  }
6247  return p - beg;
6248 }
6249 
6250 /*
6251  * call-seq:
6252  * str.chop! -> str or nil
6253  *
6254  * Processes <i>str</i> as for <code>String#chop</code>, returning <i>str</i>,
6255  * or <code>nil</code> if <i>str</i> is the empty string. See also
6256  * <code>String#chomp!</code>.
6257  */
6258 
/*
 * Body for the chop! call-seq documented directly above: truncates str in
 * place to chopped_length(str) and returns it, or returns Qnil when str is
 * empty.
 *
 * NOTE(review): listing line 6260 (the declarator) is absent from this
 * listing -- presumably rb_str_chop_bang(VALUE str); confirm.
 */
6259 static VALUE
6261 {
6262  str_modify_keep_cr(str);
6263  if (RSTRING_LEN(str) > 0) {
6264  long len;
6265  len = chopped_length(str);
6266  STR_SET_LEN(str, len);
6267  RSTRING_PTR(str)[len] = '\0'; /* re-terminate at the new length */
 /* a 7-bit code range stays as is after removing a suffix; anything else is cleared */
6268  if (ENC_CODERANGE(str) != ENC_CODERANGE_7BIT) {
6269  ENC_CODERANGE_CLEAR(str);
6270  }
6271  return str;
6272  }
6273  return Qnil;
6274 }
6275 
6276 
6277 /*
6278  * call-seq:
6279  * str.chop -> new_str
6280  *
6281  * Returns a new <code>String</code> with the last character removed. If the
6282  * string ends with <code>\r\n</code>, both characters are removed. Applying
6283  * <code>chop</code> to an empty string returns an empty
6284  * string. <code>String#chomp</code> is often a safer alternative, as it leaves
6285  * the string unchanged if it doesn't end in a record separator.
6286  *
6287  * "string\r\n".chop #=> "string"
6288  * "string\n\r".chop #=> "string\n"
6289  * "string\n".chop #=> "string"
6290  * "string".chop #=> "strin"
6291  * "x".chop.chop #=> ""
6292  */
6293 
/*
 * Body for the chop call-seq documented directly above: builds a new string
 * from the first chopped_length(str) bytes of str.
 *
 * NOTE(review): listing line 6295 (the declarator) is absent from this
 * listing -- presumably rb_str_chop(VALUE str); confirm.
 */
6294 static VALUE
6296 {
6297  VALUE str2 = rb_str_new5(str, RSTRING_PTR(str), chopped_length(str));
6298  rb_enc_cr_str_copy_for_substr(str2, str); /* carry encoding / code-range info over from the source */
6299  OBJ_INFECT(str2, str); /* presumably propagates taint/trust flags -- TODO confirm */
6300  return str2;
6301 }
6302 
6303 
6304 /*
6305  * call-seq:
6306  * str.chomp!(separator=$/) -> str or nil
6307  *
6308  * Modifies <i>str</i> in place as described for <code>String#chomp</code>,
6309  * returning <i>str</i>, or <code>nil</code> if no modifications were made.
6310  */
6311 
/*
 * Body for the chomp! call-seq documented directly above. Removes a trailing
 * record separator from str in place; returns str when something was removed
 * and Qnil otherwise.
 *
 * Paths, in order:
 *   - empty str                      -> Qnil
 *   - no argument and rb_rs is the default separator, or separator "\n"
 *                                    -> "smart chomp" ("\n", "\r\n" or "\r")
 *   - nil separator                  -> Qnil
 *   - empty separator                -> strip every trailing "\n" / "\r\n"
 *   - any other separator            -> remove it if str ends with it
 *
 * NOTE(review): listing line 6313 (the declarator) is absent from this
 * listing; rb_str_chomp below calls this as rb_str_chomp_bang(argc, argv, str).
 */
6312 static VALUE
6314 {
6315  rb_encoding *enc;
6316  VALUE rs;
6317  int newline;
6318  char *p, *pp, *e;
6319  long len, rslen;
6320
6321  str_modify_keep_cr(str);
6322  len = RSTRING_LEN(str);
6323  if (len == 0) return Qnil;
6324  p = RSTRING_PTR(str);
6325  e = p + len;
6326  if (argc == 0) {
6327  rs = rb_rs;
6328  if (rs == rb_default_rs) {
6329  smart_chomp:
 /* also reached via goto when the separator is exactly "\n" */
6330  enc = rb_enc_get(str);
6331  if (rb_enc_mbminlen(enc) > 1) {
 /* encodings whose minimum character width exceeds one byte:
  * step back character-wise, first over a newline, then over '\r' */
6332  pp = rb_enc_left_char_head(p, e-rb_enc_mbminlen(enc), e, enc);
6333  if (rb_enc_is_newline(pp, e, enc)) {
6334  e = pp;
6335  }
6336  pp = e - rb_enc_mbminlen(enc);
6337  if (pp >= p) {
6338  pp = rb_enc_left_char_head(p, pp, e, enc);
6339  if (rb_enc_ascget(pp, e, 0, enc) == '\r') {
6340  e = pp;
6341  }
6342  }
6343  if (e == RSTRING_END(str)) {
6344  return Qnil; /* end pointer never moved: nothing to remove */
6345  }
6346  len = e - RSTRING_PTR(str);
6347  STR_SET_LEN(str, len);
6348  }
6349  else {
 /* byte-wise case: drop "\n" (and a '\r' before it), or a lone "\r" */
6350  if (RSTRING_PTR(str)[len-1] == '\n') {
6351  STR_DEC_LEN(str);
6352  if (RSTRING_LEN(str) > 0 &&
6353  RSTRING_PTR(str)[RSTRING_LEN(str)-1] == '\r') {
6354  STR_DEC_LEN(str);
6355  }
6356  }
6357  else if (RSTRING_PTR(str)[len-1] == '\r') {
6358  STR_DEC_LEN(str);
6359  }
6360  else {
6361  return Qnil;
6362  }
6363  }
6364  RSTRING_PTR(str)[RSTRING_LEN(str)] = '\0';
6365  return str;
6366  }
 /* argc == 0 with a non-default rb_rs falls through to the generic path below */
6367  }
6368  else {
6369  rb_scan_args(argc, argv, "01", &rs);
6370  }
6371  if (NIL_P(rs)) return Qnil;
6372  StringValue(rs);
6373  rslen = RSTRING_LEN(rs);
6374  if (rslen == 0) {
 /* empty separator: remove all trailing "\n" and "\r\n" sequences */
6375  while (len>0 && p[len-1] == '\n') {
6376  len--;
6377  if (len>0 && p[len-1] == '\r')
6378  len--;
6379  }
6380  if (len < RSTRING_LEN(str)) {
6381  STR_SET_LEN(str, len);
6382  RSTRING_PTR(str)[len] = '\0';
6383  return str;
6384  }
6385  return Qnil;
6386  }
6387  if (rslen > len) return Qnil; /* separator longer than the string cannot match */
6388  newline = RSTRING_PTR(rs)[rslen-1]; /* last byte of the separator, used as a cheap pre-check */
6389  if (rslen == 1 && newline == '\n')
6390  goto smart_chomp;
6391
6392  enc = rb_enc_check(str, rs);
6393  if (is_broken_string(rs)) {
6394  return Qnil;
6395  }
6396  pp = e - rslen; /* where the separator would have to start */
6397  if (p[len-1] == newline &&
6398  (rslen <= 1 ||
6399  memcmp(RSTRING_PTR(rs), pp, rslen) == 0)) {
 /* the byte match must begin on a character boundary of str */
6400  if (rb_enc_left_char_head(p, pp, e, enc) != pp)
6401  return Qnil;
6402  if (ENC_CODERANGE(str) != ENC_CODERANGE_7BIT) {
6403  ENC_CODERANGE_CLEAR(str);
6404  }
6405  STR_SET_LEN(str, RSTRING_LEN(str) - rslen);
6406  RSTRING_PTR(str)[RSTRING_LEN(str)] = '\0';
6407  return str;
6408  }
6409  return Qnil;
6410 }
6411 
6412 
6413 /*
6414  * call-seq:
6415  * str.chomp(separator=$/) -> new_str
6416  *
6417  * Returns a new <code>String</code> with the given record separator removed
6418  * from the end of <i>str</i> (if present). If <code>$/</code> has not been
6419  * changed from the default Ruby record separator, then <code>chomp</code> also
6420  * removes carriage return characters (that is it will remove <code>\n</code>,
6421  * <code>\r</code>, and <code>\r\n</code>).
6422  *
6423  * "hello".chomp #=> "hello"
6424  * "hello\n".chomp #=> "hello"
6425  * "hello\r\n".chomp #=> "hello"
6426  * "hello\n\r".chomp #=> "hello\n"
6427  * "hello\r".chomp #=> "hello"
6428  * "hello \n there".chomp #=> "hello \n there"
6429  * "hello".chomp("llo") #=> "he"
6430  */
6431 
/*
 * Body for the chomp call-seq documented directly above: duplicates str,
 * applies rb_str_chomp_bang to the copy and returns the copy whether or not
 * anything was removed (the bang result is ignored).
 *
 * NOTE(review): listing line 6433 (the declarator) is absent from this
 * listing -- presumably rb_str_chomp(int argc, VALUE *argv, VALUE str); confirm.
 */
6432 static VALUE
6434 {
6435  str = rb_str_dup(str);
6436  rb_str_chomp_bang(argc, argv, str);
6437  return str;
6438 }
6439 
6440 /*
6441  * call-seq:
6442  * str.lstrip! -> self or nil
6443  *
6444  * Removes leading whitespace from <i>str</i>, returning <code>nil</code> if no
6445  * change was made. See also <code>String#rstrip!</code> and
6446  * <code>String#strip!</code>.
6447  *
6448  * " hello ".lstrip! #=> "hello "
6449  * "hello".lstrip! #=> nil
6450  */
6451 
/*
 * Body for the lstrip! call-seq documented directly above: removes leading
 * whitespace in place. Returns str when bytes were removed, Qnil otherwise
 * (including for an empty string).
 *
 * NOTE(review): listing line 6453 (the declarator) is absent from this
 * listing; rb_str_lstrip below calls this as rb_str_lstrip_bang(str).
 */
6452 static VALUE
6454 {
6455  rb_encoding *enc;
6456  char *s, *t, *e;
6457
6458  str_modify_keep_cr(str);
6459  enc = STR_ENC_GET(str);
6460  s = RSTRING_PTR(str);
6461  if (!s || RSTRING_LEN(str) == 0) return Qnil;
6462  e = t = RSTRING_END(str);
6463  /* remove spaces at head */
6464  while (s < e) {
6465  int n;
6466  unsigned int cc = rb_enc_codepoint_len(s, e, &n, enc);
6467
6468  if (!rb_isspace(cc)) break;
6469  s += n;
6470  }
6471
 /* s now points at the first non-space character; shift the remainder to the front */
6472  if (s > RSTRING_PTR(str)) {
6473  STR_SET_LEN(str, t-s);
6474  memmove(RSTRING_PTR(str), s, RSTRING_LEN(str));
6475  RSTRING_PTR(str)[RSTRING_LEN(str)] = '\0';
6476  return str;
6477  }
6478  return Qnil;
6479 }
6480 
6481 
6482 /*
6483  * call-seq:
6484  * str.lstrip -> new_str
6485  *
6486  * Returns a copy of <i>str</i> with leading whitespace removed. See also
6487  * <code>String#rstrip</code> and <code>String#strip</code>.
6488  *
6489  * " hello ".lstrip #=> "hello "
6490  * "hello".lstrip #=> "hello"
6491  */
6492 
/*
 * Body for the lstrip call-seq documented directly above: duplicates str,
 * strips the copy with rb_str_lstrip_bang and returns the copy.
 *
 * NOTE(review): listing line 6494 (the declarator) is absent from this
 * listing -- presumably rb_str_lstrip(VALUE str); confirm.
 */
6493 static VALUE
6495 {
6496  str = rb_str_dup(str);
6497  rb_str_lstrip_bang(str);
6498  return str;
6499 }
6500 
6501 
6502 /*
6503  * call-seq:
6504  * str.rstrip! -> self or nil
6505  *
6506  * Removes trailing whitespace from <i>str</i>, returning <code>nil</code> if
6507  * no change was made. See also <code>String#lstrip!</code> and
6508  * <code>String#strip!</code>.
6509  *
6510  * " hello ".rstrip! #=> " hello"
6511  * "hello".rstrip! #=> nil
6512  */
6513 
/*
 * Body for the rstrip! call-seq documented directly above: removes trailing
 * whitespace and NUL characters in place. Returns str when bytes were
 * removed, Qnil otherwise (including for an empty string).
 *
 * NOTE(review): listing lines 6515 (the declarator) and 6522 (a statement
 * between the STR_ENC_GET and RSTRING_PTR lines) are absent from this
 * listing; rb_str_rstrip below calls this as rb_str_rstrip_bang(str).
 */
6514 static VALUE
6516 {
6517  rb_encoding *enc;
6518  char *s, *t, *e;
6519
6520  str_modify_keep_cr(str);
6521  enc = STR_ENC_GET(str);
6523  s = RSTRING_PTR(str);
6524  if (!s || RSTRING_LEN(str) == 0) return Qnil;
6525  t = e = RSTRING_END(str);
6526
6527  /* remove trailing spaces or '\0's */
6528  if (single_byte_optimizable(str)) {
 /* byte-wise scan backwards from the end */
6529  unsigned char c;
6530  while (s < t && ((c = *(t-1)) == '\0' || ascii_isspace(c))) t--;
6531  }
6532  else {
 /* character-wise scan backwards; stops at the first character that is
  * neither NUL (c == 0) nor whitespace */
6533  char *tp;
6534
6535  while ((tp = rb_enc_prev_char(s, t, e, enc)) != NULL) {
6536  unsigned int c = rb_enc_codepoint(tp, e, enc);
6537  if (c && !rb_isspace(c)) break;
6538  t = tp;
6539  }
6540  }
 /* t is the new end; only report a change when it moved */
6541  if (t < e) {
6542  long len = t-RSTRING_PTR(str);
6543
6544  STR_SET_LEN(str, len);
6545  RSTRING_PTR(str)[len] = '\0';
6546  return str;
6547  }
6548  return Qnil;
6549 }
6550 
6551 
6552 /*
6553  * call-seq:
6554  * str.rstrip -> new_str
6555  *
6556  * Returns a copy of <i>str</i> with trailing whitespace removed. See also
6557  * <code>String#lstrip</code> and <code>String#strip</code>.
6558  *
6559  * " hello ".rstrip #=> " hello"
6560  * "hello".rstrip #=> "hello"
6561  */
6562 
/*
 * Body for the rstrip call-seq documented directly above: duplicates str,
 * strips the copy with rb_str_rstrip_bang and returns the copy.
 *
 * NOTE(review): listing line 6564 (the declarator) is absent from this
 * listing -- presumably rb_str_rstrip(VALUE str); confirm.
 */
6563 static VALUE
6565 {
6566  str = rb_str_dup(str);
6567  rb_str_rstrip_bang(str);
6568  return str;
6569 }
6570 
6571 
6572 /*
6573  * call-seq:
6574  * str.strip! -> str or nil
6575  *
6576  * Removes leading and trailing whitespace from <i>str</i>. Returns
6577  * <code>nil</code> if <i>str</i> was not altered.
6578  */
6579 
/*
 * Body for the strip! call-seq documented directly above: applies the
 * left and right in-place strips (both are always run) and returns Qnil
 * only when neither of them changed str.
 *
 * NOTE(review): listing line 6581 (the declarator) is absent from this
 * listing; rb_str_strip below calls this as rb_str_strip_bang(str).
 */
6580 static VALUE
6582 {
6583  VALUE l = rb_str_lstrip_bang(str);
6584  VALUE r = rb_str_rstrip_bang(str);
6585
6586  if (NIL_P(l) && NIL_P(r)) return Qnil;
6587  return str;
6588 }
6589 
6590 
6591 /*
6592  * call-seq:
6593  * str.strip -> new_str
6594  *
6595  * Returns a copy of <i>str</i> with leading and trailing whitespace removed.
6596  *
6597  * " hello ".strip #=> "hello"
6598  * "\tgoodbye\r\n".strip #=> "goodbye"
6599  */
6600 
/*
 * Body for the strip call-seq documented directly above: duplicates str,
 * strips the copy with rb_str_strip_bang and returns the copy.
 *
 * NOTE(review): listing line 6602 (the declarator) is absent from this
 * listing -- presumably rb_str_strip(VALUE str); confirm.
 */
6601 static VALUE
6603 {
6604  str = rb_str_dup(str);
6605  rb_str_strip_bang(str);
6606  return str;
6607 }
6608 
/*
 * Performs one step of a scan: searches pat in str beginning at byte offset
 * *start.
 *
 * On a match, *start is advanced to the end of the match (or one character
 * beyond it for an empty match) and the result is returned: the whole match
 * when the pattern has no groups (num_regs == 1), otherwise an array with
 * one entry per group (registers 1..num_regs-1).
 * Returns Qnil when the search finds nothing.
 */
6609 static VALUE
6610 scan_once(VALUE str, VALUE pat, long *start)
6611 {
6612  VALUE result, match;
6613  struct re_registers *regs;
6614  int i;
6615
6616  if (rb_reg_search(pat, str, *start, 0) >= 0) {
6617  match = rb_backref_get();
6618  regs = RMATCH_REGS(match); /* BEG()/END() below index into these registers */
6619  if (BEG(0) == END(0)) {
6620  rb_encoding *enc = STR_ENC_GET(str);
6621  /*
6622  * Always consume at least one character of the input string
6623  */
6624  if (RSTRING_LEN(str) > END(0))
6625  *start = END(0)+rb_enc_fast_mbclen(RSTRING_PTR(str)+END(0),
6626  RSTRING_END(str), enc);
6627  else
6628  *start = END(0)+1; /* empty match at end of string: step past the end */
6629  }
6630  else {
6631  *start = END(0);
6632  }
6633  if (regs->num_regs == 1) {
6634  return rb_reg_nth_match(0, match);
6635  }
6636  result = rb_ary_new2(regs->num_regs);
6637  for (i=1; i < regs->num_regs; i++) {
6638  rb_ary_push(result, rb_reg_nth_match(i, match));
6639  }
6640
6641  return result;
6642  }
6643  return Qnil;
6644 }
6645 
6646 
6647 /*
6648  * call-seq:
6649  * str.scan(pattern) -> array
6650  * str.scan(pattern) {|match, ...| block } -> str
6651  *
6652  * Both forms iterate through <i>str</i>, matching the pattern (which may be a
6653  * <code>Regexp</code> or a <code>String</code>). For each match, a result is
6654  * generated and either added to the result array or passed to the block. If
6655  * the pattern contains no groups, each individual result consists of the
6656  * matched string, <code>$&</code>. If the pattern contains groups, each
6657  * individual result is itself an array containing one entry per group.
6658  *
6659  * a = "cruel world"
6660  * a.scan(/\w+/) #=> ["cruel", "world"]
6661  * a.scan(/.../) #=> ["cru", "el ", "wor"]
6662  * a.scan(/(...)/) #=> [["cru"], ["el "], ["wor"]]
6663  * a.scan(/(..)(..)/) #=> [["cr", "ue"], ["l ", "wo"]]
6664  *
6665  * And the block form:
6666  *
6667  * a.scan(/\w+/) {|w| print "<<#{w}>> " }
6668  * print "\n"
6669  * a.scan(/(.)(.)/) {|x,y| print y, x }
6670  * print "\n"
6671  *
6672  * <em>produces:</em>
6673  *
6674  * <<cruel>> <<world>>
6675  * rceu lowlr
6676  */
6677 
/*
 * Body for the scan call-seq documented directly above. Repeatedly calls
 * scan_once; without a block the results are collected into an array which
 * is returned, with a block each result is yielded and str is returned.
 *
 * NOTE(review): listing line 6679 (the declarator) is absent from this
 * listing -- presumably rb_str_scan(VALUE str, VALUE pat); confirm.
 */
6678 static VALUE
6680 {
6681  VALUE result;
6682  long start = 0;
 /* after the loop, `last` holds the offset at which the final successful
  * search began (or -1 if nothing matched) */
6683  long last = -1, prev = 0;
6684  char *p = RSTRING_PTR(str); long len = RSTRING_LEN(str);
6685
6686  pat = get_pat(pat, 1);
6687  if (!rb_block_given_p()) {
6688  VALUE ary = rb_ary_new();
6689
6690  while (!NIL_P(result = scan_once(str, pat, &start))) {
6691  last = prev;
6692  prev = start;
6693  rb_ary_push(ary, result);
6694  }
 /* redo the last successful search; presumably to leave the match data
  * pointing at the last match rather than the failed one -- TODO confirm */
6695  if (last >= 0) rb_reg_search(pat, str, last, 0);
6696  return ary;
6697  }
6698
6699  while (!NIL_P(result = scan_once(str, pat, &start))) {
6700  last = prev;
6701  prev = start;
6702  rb_yield(result);
6703  str_mod_check(str, p, len); /* checked against the pointer/length captured on entry; presumably raises if the block modified str -- TODO confirm */
6704  }
6705  if (last >= 0) rb_reg_search(pat, str, last, 0);
6706  return str;
6707 }
6708 
6709 
6710 /*
6711  * call-seq:
6712  * str.hex -> integer
6713  *
6714  * Treats leading characters from <i>str</i> as a string of hexadecimal digits
6715  * (with an optional sign and an optional <code>0x</code>) and returns the
6716  * corresponding number. Zero is returned on error.
6717  *
6718  * "0x0a".hex #=> 10
6719  * "-1234".hex #=> -4660
6720  * "0".hex #=> 0
6721  * "wombat".hex #=> 0
6722  */
6723 
/*
 * Body for the hex call-seq documented directly above: raises
 * EncCompatError for an ASCII-incompatible encoding, otherwise converts via
 * rb_str_to_inum with base 16.
 *
 * NOTE(review): listing line 6725 (the declarator) is absent from this
 * listing -- presumably rb_str_hex(VALUE str); confirm.
 */
6724 static VALUE
6726 {
6727  rb_encoding *enc = rb_enc_get(str);
6728
6729  if (!rb_enc_asciicompat(enc)) {
6730  rb_raise(rb_eEncCompatError, "ASCII incompatible encoding: %s", rb_enc_name(enc));
6731  }
6732  return rb_str_to_inum(str, 16, FALSE); /* FALSE: presumably the non-strict "bad check" mode, matching "Zero is returned on error" -- TODO confirm */
6733 }
6734 
6735 
6736 /*
6737  * call-seq:
6738  * str.oct -> integer
6739  *
6740  * Treats leading characters of <i>str</i> as a string of octal digits (with an
6741  * optional sign) and returns the corresponding number. Returns 0 if the
6742  * conversion fails.
6743  *
6744  * "123".oct #=> 83
6745  * "-377".oct #=> -255
6746  * "bad".oct #=> 0
6747  * "0377bad".oct #=> 255
6748  */
6749 
/*
 * Body for the oct call-seq documented directly above: raises
 * EncCompatError for an ASCII-incompatible encoding, otherwise converts via
 * rb_str_to_inum with base argument -8.
 *
 * NOTE(review): listing line 6751 (the declarator) is absent from this
 * listing -- presumably rb_str_oct(VALUE str); confirm.
 */
6750 static VALUE
6752 {
6753  rb_encoding *enc = rb_enc_get(str);
6754
6755  if (!rb_enc_asciicompat(enc)) {
6756  rb_raise(rb_eEncCompatError, "ASCII incompatible encoding: %s", rb_enc_name(enc));
6757  }
6758  return rb_str_to_inum(str, -8, FALSE); /* NOTE(review): meaning of the negative base is defined by rb_str_to_inum, not visible here */
6759 }
6760 
6761 
6762 /*
6763  * call-seq:
6764  * str.crypt(other_str) -> new_str
6765  *
6766  * Applies a one-way cryptographic hash to <i>str</i> by invoking the standard
6767  * library function <code>crypt</code>. The argument is the salt string, which
6768  * should be two characters long, each character drawn from
6769  * <code>[a-zA-Z0-9./]</code>.
6770  */
6771 
/*
 * Body for the crypt call-seq documented directly above: hashes str with the
 * C library crypt() using the given salt and returns the result as a new
 * string. Raises ArgumentError when the salt is shorter than two bytes and
 * a system error (rb_sys_fail) when crypt() returns NULL.
 *
 * NOTE(review): listing line 6773 (the declarator) is absent from this
 * listing -- presumably rb_str_crypt(VALUE str, VALUE salt); confirm.
 */
6772 static VALUE
6774 {
6775  extern char *crypt(const char *, const char *);
6776  VALUE result;
6777  const char *s, *saltp;
6778  char *res;
6779 #ifdef BROKEN_CRYPT
6780  char salt_8bit_clean[3];
6781 #endif
6782
6783  StringValue(salt);
6784  if (RSTRING_LEN(salt) < 2)
6785  rb_raise(rb_eArgError, "salt too short (need >=2 bytes)");
6786
6787  s = RSTRING_PTR(str);
6788  if (!s) s = ""; /* never hand crypt() a NULL pointer */
6789  saltp = RSTRING_PTR(salt);
6790 #ifdef BROKEN_CRYPT
 /* on platforms flagged BROKEN_CRYPT, strip the high bit from the first two salt bytes */
6791  if (!ISASCII((unsigned char)saltp[0]) || !ISASCII((unsigned char)saltp[1])) {
6792  salt_8bit_clean[0] = saltp[0] & 0x7f;
6793  salt_8bit_clean[1] = saltp[1] & 0x7f;
6794  salt_8bit_clean[2] = '\0';
6795  saltp = salt_8bit_clean;
6796  }
6797 #endif
6798  res = crypt(s, saltp);
6799  if (!res) {
6800  rb_sys_fail("crypt");
6801  }
6802  result = rb_str_new2(res);
6803  OBJ_INFECT(result, str); /* presumably propagates taint/trust flags from both inputs -- TODO confirm */
6804  OBJ_INFECT(result, salt);
6805  return result;
6806 }
6807 
6808 
6809 /*
6810  * call-seq:
6811  * str.intern -> symbol
6812  * str.to_sym -> symbol
6813  *
6814  * Returns the <code>Symbol</code> corresponding to <i>str</i>, creating the
6815  * symbol if it did not previously exist. See <code>Symbol#id2name</code>.
6816  *
6817  * "Koala".intern #=> :Koala
6818  * s = 'cat'.to_sym #=> :cat
6819  * s == :cat #=> true
6820  * s = '@cat'.to_sym #=> :@cat
6821  * s == :@cat #=> true
6822  *
6823  * This can also be used to create symbols that cannot be represented using the
6824  * <code>:xxx</code> notation.
6825  *
6826  * 'cat and dog'.to_sym #=> :"cat and dog"
6827  */
6828 
/*
 * Body for the intern / to_sym call-seq documented directly above: interns
 * the string with rb_intern_str and returns the corresponding Symbol.
 *
 * NOTE(review): listing line 6830 (the declarator) is absent from this
 * listing -- presumably rb_str_intern(VALUE s); confirm.
 */
6829 VALUE
6831 {
6832  VALUE str = RB_GC_GUARD(s); /* presumably keeps s reachable for the GC during the call -- TODO confirm */
6833  ID id;
6834
6835  id = rb_intern_str(str);
6836  return ID2SYM(id);
6837 }
6838 
6839 
6840 /*
6841  * call-seq:
6842  * str.ord -> integer
6843  *
6844  * Returns the <code>Integer</code> ordinal of a one-character string.
6845  *
6846  * "a".ord #=> 97
6847  */
6848 
/*
 * Body for the ord call-seq documented directly above: returns the ordinal
 * held in c as an Integer.
 *
 * NOTE(review): listing lines 6850 (the declarator) and 6854 (the statement
 * that assigns c) are absent from this listing, so as shown c is returned
 * without a visible assignment. Restore both from the original string.c
 * before relying on this text.
 */
6849 VALUE
6851 {
6852  unsigned int c;
6853
6855  return UINT2NUM(c);
6856 }
6857 /*
6858  * call-seq:
6859  * str.sum(n=16) -> integer
6860  *
6861  * Returns a basic <em>n</em>-bit checksum of the characters in <i>str</i>,
6862  * where <em>n</em> is the optional <code>Fixnum</code> parameter, defaulting
6863  * to 16. The result is simply the sum of the binary value of each byte in
6864  * <i>str</i> modulo <code>2**n</code> (i.e. masked with <code>2**n - 1</code>).
6865  * This is not a particularly good checksum.
6866  */
6867 
/*
 * Body for the sum call-seq documented directly above: adds up the bytes of
 * str and reduces the total to `bits` bits (default 16). With bits == 0 the
 * unreduced total is returned.
 *
 * The running total is kept in a native unsigned long (sum0) and only
 * spilled into the Ruby-level accumulator (sum) when it is about to exceed
 * the Fixnum range.
 *
 * NOTE(review): listing line 6869 (the declarator) is absent from this
 * listing -- presumably rb_str_sum(int argc, VALUE *argv, VALUE str); confirm.
 */
6868 static VALUE
6870 {
6871  VALUE vbits;
6872  int bits;
6873  char *ptr, *p, *pend;
6874  long len;
6875  VALUE sum = INT2FIX(0);
6876  unsigned long sum0 = 0;
6877
6878  if (argc == 0) {
6879  bits = 16;
6880  }
6881  else {
6882  rb_scan_args(argc, argv, "01", &vbits);
6883  bits = NUM2INT(vbits);
6884  }
6885  ptr = p = RSTRING_PTR(str);
6886  len = RSTRING_LEN(str);
6887  pend = p + len;
6888
6889  while (p < pend) {
 /* adding one more byte could leave the Fixnum range: flush sum0 into sum */
6890  if (FIXNUM_MAX - UCHAR_MAX < sum0) {
6891  sum = rb_funcall(sum, '+', 1, LONG2FIX(sum0));
6892  str_mod_check(str, ptr, len); /* a method call just ran; re-check str against the captured pointer/length */
6893  sum0 = 0;
6894  }
6895  sum0 += (unsigned char)*p;
6896  p++;
6897  }
6898
6899  if (bits == 0) {
 /* no reduction requested: just fold in the remainder */
6900  if (sum0) {
6901  sum = rb_funcall(sum, '+', 1, LONG2FIX(sum0));
6902  }
6903  }
6904  else {
6905  if (sum == INT2FIX(0)) {
 /* nothing was ever flushed: the whole total is in sum0, mask it natively
  * (skipped when bits covers the full width of long) */
6906  if (bits < (int)sizeof(long)*CHAR_BIT) {
6907  sum0 &= (((unsigned long)1)<<bits)-1;
6908  }
6909  sum = LONG2FIX(sum0);
6910  }
6911  else {
6912  VALUE mod;
6913
6914  if (sum0) {
6915  sum = rb_funcall(sum, '+', 1, LONG2FIX(sum0));
6916  }
6917
 /* Ruby-level equivalent of: sum & ((1 << bits) - 1) */
6918  mod = rb_funcall(INT2FIX(1), rb_intern("<<"), 1, INT2FIX(bits));
6919  mod = rb_funcall(mod, '-', 1, INT2FIX(1));
6920  sum = rb_funcall(sum, '&', 1, mod);
6921  }
6922  }
6923  return sum;
6924 }
6925 
/*
 * Shared worker for the ljust / rjust / center wrappers below.
 *
 * argv holds the target width (in characters) and an optional padding
 * string (default " "). jflag selects where the padding goes: 'l' puts it
 * all on the right, 'r' all on the left, any other value splits it with the
 * left side getting n/2.
 *
 * Returns a duplicate of str when width is negative or not larger than the
 * character length of str; otherwise a new padded string.
 * Raises ArgumentError for an empty padding string ("zero width padding")
 * or when the result size would overflow ("argument too big").
 */
6926 static VALUE
6927 rb_str_justify(int argc, VALUE *argv, VALUE str, char jflag)
6928 {
6929  rb_encoding *enc;
6930  VALUE w;
6931  long width, len, flen = 1, fclen = 1; /* flen: pad length in bytes, fclen: in characters */
6932  VALUE res;
6933  char *p;
6934  const char *f = " ";
6935  long n, size, llen, rlen, llen2 = 0, rlen2 = 0;
6936  volatile VALUE pad;
6937  int singlebyte = 1, cr;
6938
6939  rb_scan_args(argc, argv, "11", &w, &pad);
6940  enc = STR_ENC_GET(str);
6941  width = NUM2LONG(w);
6942  if (argc == 2) {
6943  StringValue(pad);
6944  enc = rb_enc_check(str, pad);
6945  f = RSTRING_PTR(pad);
6946  flen = RSTRING_LEN(pad);
6947  fclen = str_strlen(pad, enc);
6948  singlebyte = single_byte_optimizable(pad);
6949  if (flen == 0 || fclen == 0) {
6950  rb_raise(rb_eArgError, "zero width padding");
6951  }
6952  }
6953  len = str_strlen(str, enc);
6954  if (width < 0 || len >= width) return rb_str_dup(str);
6955  n = width - len; /* total number of padding characters needed */
6956  llen = (jflag == 'l') ? 0 : ((jflag == 'r') ? n : n/2);
6957  rlen = n - llen;
6958  cr = ENC_CODERANGE(str);
6959  if (flen > 1) {
 /* byte lengths of the partial pad copies that finish each side */
6960  llen2 = str_offset(f, f + flen, llen % fclen, enc, singlebyte);
6961  rlen2 = str_offset(f, f + flen, rlen % fclen, enc, singlebyte);
6962  }
6963  size = RSTRING_LEN(str);
 /* compute the padding byte count step by step, rejecting anything that
  * would overflow long */
6964  if ((len = llen / fclen + rlen / fclen) >= LONG_MAX / flen ||
6965  (len *= flen) >= LONG_MAX - llen2 - rlen2 ||
6966  (len += llen2 + rlen2) >= LONG_MAX - size) {
6967  rb_raise(rb_eArgError, "argument too big");
6968  }
6969  len += size;
6970  res = rb_str_new5(str, 0, len);
6971  p = RSTRING_PTR(res);
 /* left padding */
6972  if (flen <= 1) {
6973  memset(p, *f, llen);
6974  p += llen;
6975  }
6976  else {
6977  while (llen >= fclen) {
6978  memcpy(p,f,flen);
6979  p += flen;
6980  llen -= fclen;
6981  }
6982  if (llen > 0) {
6983  memcpy(p, f, llen2);
6984  p += llen2;
6985  }
6986  }
 /* original contents */
6987  memcpy(p, RSTRING_PTR(str), size);
6988  p += size;
 /* right padding */
6989  if (flen <= 1) {
6990  memset(p, *f, rlen);
6991  p += rlen;
6992  }
6993  else {
6994  while (rlen >= fclen) {
6995  memcpy(p,f,flen);
6996  p += flen;
6997  rlen -= fclen;
6998  }
6999  if (rlen > 0) {
7000  memcpy(p, f, rlen2);
7001  p += rlen2;
7002  }
7003  }
7004  *p = '\0';
7005  STR_SET_LEN(res, p-RSTRING_PTR(res));
7006  OBJ_INFECT(res, str);
7007  if (!NIL_P(pad)) OBJ_INFECT(res, pad);
7008  rb_enc_associate(res, enc);
 /* combine the code ranges of str and pad; a BROKEN result is left unset */
7009  if (argc == 2)
7010  cr = ENC_CODERANGE_AND(cr, ENC_CODERANGE(pad));
7011  if (cr != ENC_CODERANGE_BROKEN)
7012  ENC_CODERANGE_SET(res, cr);
7013  return res;
7014 }
7015 
7016 
7017 /*
7018  * call-seq:
7019  * str.ljust(integer, padstr=' ') -> new_str
7020  *
7021  * If <i>integer</i> is greater than the length of <i>str</i>, returns a new
7022  * <code>String</code> of length <i>integer</i> with <i>str</i> left justified
7023  * and padded with <i>padstr</i>; otherwise, returns <i>str</i>.
7024  *
7025  * "hello".ljust(4) #=> "hello"
7026  * "hello".ljust(20) #=> "hello "
7027  * "hello".ljust(20, '1234') #=> "hello123412341234123"
7028  */
7029 
/*
 * Body for the ljust call-seq documented directly above: delegates to
 * rb_str_justify with jflag 'l'.
 *
 * NOTE(review): listing line 7031 (the declarator) is absent from this
 * listing -- presumably rb_str_ljust(int argc, VALUE *argv, VALUE str); confirm.
 */
7030 static VALUE
7032 {
7033  return rb_str_justify(argc, argv, str, 'l');
7034 }
7035 
7036 
7037 /*
7038  * call-seq:
7039  * str.rjust(integer, padstr=' ') -> new_str
7040  *
7041  * If <i>integer</i> is greater than the length of <i>str</i>, returns a new
7042  * <code>String</code> of length <i>integer</i> with <i>str</i> right justified
7043  * and padded with <i>padstr</i>; otherwise, returns <i>str</i>.
7044  *
7045  * "hello".rjust(4) #=> "hello"
7046  * "hello".rjust(20) #=> " hello"
7047  * "hello".rjust(20, '1234') #=> "123412341234123hello"
7048  */
7049 
/*
 * Body for the rjust call-seq documented directly above: delegates to
 * rb_str_justify with jflag 'r'.
 *
 * NOTE(review): listing line 7051 (the declarator) is absent from this
 * listing -- presumably rb_str_rjust(int argc, VALUE *argv, VALUE str); confirm.
 */
7050 static VALUE
7052 {
7053  return rb_str_justify(argc, argv, str, 'r');
7054 }
7055 
7056 
7057 /*
7058  * call-seq:
7059  * str.center(integer, padstr=' ') -> new_str
7060  *
7061  * If <i>integer</i> is greater than the length of <i>str</i>, returns a new
7062  * <code>String</code> of length <i>integer</i> with <i>str</i> centered and
7063  * padded with <i>padstr</i>; otherwise, returns <i>str</i>.
7064  *
7065  * "hello".center(4) #=> "hello"
7066  * "hello".center(20) #=> " hello "
7067  * "hello".center(20, '123') #=> "1231231hello12312312"
7068  */
7069 
/*
 * Body for the center call-seq documented directly above: delegates to
 * rb_str_justify with jflag 'c' (padding split between both sides).
 *
 * NOTE(review): listing line 7071 (the declarator) is absent from this
 * listing -- presumably rb_str_center(int argc, VALUE *argv, VALUE str); confirm.
 */
7070 static VALUE
7072 {
7073  return rb_str_justify(argc, argv, str, 'c');
7074 }
7075 
7076 /*
7077  * call-seq:
7078  * str.partition(sep) -> [head, sep, tail]
7079  * str.partition(regexp) -> [head, match, tail]
7080  *
7081  * Searches <i>sep</i> or pattern (<i>regexp</i>) in the string
7082  * and returns the part before it, the match, and the part
7083  * after it.
7084  * If it is not found, returns <i>str</i> and two empty strings.
7085  *
7086  * "hello".partition("l") #=> ["he", "l", "lo"]
7087  * "hello".partition("x") #=> ["hello", "", ""]
7088  * "hello".partition(/.l/) #=> ["h", "el", "lo"]
7089  */
7090 
/*
 * Body for the partition call-seq documented directly above. Finds the
 * first occurrence of sep (Regexp or string-like) and returns the
 * three-element array [head, match, tail]; when nothing is found returns
 * [str, "", ""]. Raises TypeError when sep is neither a Regexp nor
 * convertible to a String.
 *
 * NOTE(review): listing line 7092 (the declarator) is absent from this
 * listing -- presumably rb_str_partition(VALUE str, VALUE sep); confirm.
 */
7091 static VALUE
7093 {
7094  long pos;
7095  int regex = FALSE;
7096
7097  if (TYPE(sep) == T_REGEXP) {
7098  pos = rb_reg_search(sep, str, 0, 0);
7099  regex = TRUE;
7100  }
7101  else {
7102  VALUE tmp;
7103
7104  tmp = rb_check_string_type(sep);
7105  if (NIL_P(tmp)) {
7106  rb_raise(rb_eTypeError, "type mismatch: %s given",
7107  rb_obj_classname(sep));
7108  }
7109  sep = tmp;
7110  pos = rb_str_index(str, sep, 0);
7111  }
7112  if (pos < 0) {
7113  failed:
7114  return rb_ary_new3(3, str, str_new_empty(str), str_new_empty(str));
7115  }
7116  if (regex) {
 /* from here on sep is the matched text, not the pattern */
7117  sep = rb_str_subpat(str, sep, INT2FIX(0));
 /* an empty match at the very start is treated like "not found" */
7118  if (pos == 0 && RSTRING_LEN(sep) == 0) goto failed;
7119  }
7120  return rb_ary_new3(3, rb_str_subseq(str, 0, pos),
7121  sep,
7122  rb_str_subseq(str, pos+RSTRING_LEN(sep),
7123  RSTRING_LEN(str)-pos-RSTRING_LEN(sep)));
7124 }
7125 
7126 /*
7127  * call-seq:
7128  * str.rpartition(sep) -> [head, sep, tail]
7129  * str.rpartition(regexp) -> [head, match, tail]
7130  *
7131  * Searches <i>sep</i> or pattern (<i>regexp</i>) in the string from the end
7132  * of the string, and returns the part before it, the match, and the part
7133  * after it.
7134  * If it is not found, returns two empty strings and <i>str</i>.
7135  *
7136  * "hello".rpartition("l") #=> ["hel", "l", "o"]
7137  * "hello".rpartition("x") #=> ["", "", "hello"]
7138  * "hello".rpartition(/.l/) #=> ["he", "ll", "o"]
7139  */
7140 
/*
 * Body for the rpartition call-seq documented directly above. Searches for
 * sep (Regexp or string-like) starting from the end of str and returns
 * [head, match, tail]; when nothing is found returns ["", "", str].
 * Raises TypeError when sep is neither a Regexp nor convertible to a String.
 *
 * NOTE(review): listing line 7142 (the declarator) is absent from this
 * listing -- presumably rb_str_rpartition(VALUE str, VALUE sep); confirm.
 */
7141 static VALUE
7143 {
7144  long pos = RSTRING_LEN(str);
7145  int regex = FALSE;
7146
7147  if (TYPE(sep) == T_REGEXP) {
7148  pos = rb_reg_search(sep, str, pos, 1); /* last argument 1: presumably selects a backward search -- TODO confirm */
7149  regex = TRUE;
7150  }
7151  else {
7152  VALUE tmp;
7153
7154  tmp = rb_check_string_type(sep);
7155  if (NIL_P(tmp)) {
7156  rb_raise(rb_eTypeError, "type mismatch: %s given",
7157  rb_obj_classname(sep));
7158  }
7159  sep = tmp;
7160  pos = rb_str_sublen(str, pos); /* presumably converts the byte length into a character count -- TODO confirm */
7161  pos = rb_str_rindex(str, sep, pos);
7162  }
7163  if (pos < 0) {
7164  return rb_ary_new3(3, str_new_empty(str), str_new_empty(str), str);
7165  }
7166  if (regex) {
 /* from here on sep is the matched text taken from the current match data */
7167  sep = rb_reg_nth_match(0, rb_backref_get());
7168  }
 /* NOTE(review): unlike the partition body, the pieces are cut with
  * rb_str_substr and the tail offset uses the character length of sep */
7169  return rb_ary_new3(3, rb_str_substr(str, 0, pos),
7170  sep,
7171  rb_str_substr(str,pos+str_strlen(sep,STR_ENC_GET(sep)),RSTRING_LEN(str)));
7172 }
7173 
7174 /*
7175  * call-seq:
7176  * str.start_with?([prefix]+) -> true or false
7177  *
7178  * Returns true if <i>str</i> starts with one of the prefixes given.
7179  *
7180  * p "hello".start_with?("hell") #=> true
7181  *
7182  * # returns true if one of the prefixes matches.
7183  * p "hello".start_with?("heaven", "hell") #=> true
7184  * p "hello".start_with?("heaven", "paradise") #=> false
7185  *
7186  *
7187  *
7188  */
7189 
/*
 * Body for the start_with? call-seq documented directly above: returns
 * Qtrue as soon as one argument is a byte-wise prefix of str, Qfalse when
 * none is. Arguments that are not string-like are skipped.
 *
 * NOTE(review): listing line 7191 (the declarator) is absent from this
 * listing -- presumably rb_str_start_with(int argc, VALUE *argv, VALUE str);
 * confirm.
 */
7190 static VALUE
7192 {
7193  int i;
7194
7195  for (i=0; i<argc; i++) {
7196  VALUE tmp = rb_check_string_type(argv[i]);
7197  if (NIL_P(tmp)) continue;
7198  rb_enc_check(str, tmp); /* result unused; presumably called for its incompatible-encoding error -- TODO confirm */
7199  if (RSTRING_LEN(str) < RSTRING_LEN(tmp)) continue;
7200  if (memcmp(RSTRING_PTR(str), RSTRING_PTR(tmp), RSTRING_LEN(tmp)) == 0)
7201  return Qtrue;
7202  }
7203  return Qfalse;
7204 }
7205 
7206 /*
7207  * call-seq:
7208  * str.end_with?([suffix]+) -> true or false
7209  *
7210  * Returns true if <i>str</i> ends with one of the suffixes given.
7211  */
7212 
/*
 * Body for the end_with? call-seq documented directly above: returns Qtrue
 * as soon as one argument matches the tail of str, Qfalse when none does.
 * Arguments that are not string-like are skipped.
 *
 * NOTE(review): listing line 7214 (the declarator) is absent from this
 * listing -- presumably rb_str_end_with(int argc, VALUE *argv, VALUE str);
 * confirm.
 */
7213 static VALUE
7215 {
7216  int i;
7217  char *p, *s, *e;
7218  rb_encoding *enc;
7219
7220  for (i=0; i<argc; i++) {
7221  VALUE tmp = rb_check_string_type(argv[i]);
7222  if (NIL_P(tmp)) continue;
7223  enc = rb_enc_check(str, tmp);
7224  if (RSTRING_LEN(str) < RSTRING_LEN(tmp)) continue;
7225  p = RSTRING_PTR(str);
7226  e = p + RSTRING_LEN(str);
7227  s = e - RSTRING_LEN(tmp); /* where the suffix would have to start */
 /* reject candidates that would begin in the middle of a character */
7228  if (rb_enc_left_char_head(p, s, e, enc) != s)
7229  continue;
7230  if (memcmp(s, RSTRING_PTR(tmp), RSTRING_LEN(tmp)) == 0)
7231  return Qtrue;
7232  }
7233  return Qfalse;
7234 }
7235 
/*
 * Setter that stores val into *var only when val is nil or a String;
 * any other value raises TypeError naming the variable identified by id.
 */
7236 void
7237 rb_str_setter(VALUE val, ID id, VALUE *var)
7238 {
7239  if (!NIL_P(val) && TYPE(val) != T_STRING) {
7240  rb_raise(rb_eTypeError, "value of %s must be String", rb_id2name(id));
7241  }
7242  *var = val;
7243 }
7244 
7245 
7246 /*
7247  * call-seq:
7248  * str.force_encoding(encoding) -> str
7249  *
7250  * Changes the encoding to +encoding+ and returns self.
7251  */
7252 
/*
 * Body for the force_encoding call-seq documented directly above:
 * re-associates str with the given encoding and returns str.
 *
 * NOTE(review): listing line 7254 (the declarator) is absent from this
 * listing -- presumably rb_str_force_encoding(VALUE str, VALUE enc); confirm.
 */
7253 static VALUE
7255 {
7256  str_modifiable(str); /* presumably raises when str may not be modified -- TODO confirm */
7257  rb_enc_associate(str, rb_to_encoding(enc));
7258  ENC_CODERANGE_CLEAR(str); /* the cached code range belongs to the old encoding */
7259  return str;
7260 }
7261 
7262 /*
7263  * call-seq:
7264  * str.valid_encoding? -> true or false
7265  *
7266  * Returns true for a string which is encoded correctly.
7267  *
7268  * "\xc2\xa1".force_encoding("UTF-8").valid_encoding? #=> true
7269  * "\xc2".force_encoding("UTF-8").valid_encoding? #=> false
7270  * "\x80".force_encoding("UTF-8").valid_encoding? #=> false
7271  */
7272 
/*
 * Body for the valid_encoding? call-seq documented directly above: Qfalse
 * when the code range of str is BROKEN, Qtrue otherwise.
 *
 * NOTE(review): listing line 7274 (the declarator) is absent from this
 * listing -- presumably rb_str_valid_encoding_p(VALUE str); confirm.
 */
7273 static VALUE
7275 {
7276  int cr = rb_enc_str_coderange(str);
7277
7278  return cr == ENC_CODERANGE_BROKEN ? Qfalse : Qtrue;
7279 }
7280 
7281 /*
7282  * call-seq:
7283  * str.ascii_only? -> true or false
7284  *
7285  * Returns true for a string which has only ASCII characters.
7286  *
7287  * "abc".force_encoding("UTF-8").ascii_only? #=> true
7288  * "abc\u{6666}".force_encoding("UTF-8").ascii_only? #=> false
7289  */
7290 
/*
 * Body for the ascii_only? call-seq documented directly above: Qtrue when
 * the code range of str is 7BIT, Qfalse otherwise.
 *
 * NOTE(review): listing line 7292 (the declarator) is absent from this
 * listing -- presumably rb_str_is_ascii_only_p(VALUE str); confirm.
 */
7291 static VALUE
7293 {
7294  int cr = rb_enc_str_coderange(str);
7295
7296  return cr == ENC_CODERANGE_7BIT ? Qtrue : Qfalse;
7297 }
7298 
/*
 * Shortens str to at most len characters, ending the result in "..." when
 * something had to be cut.
 *
 *   - len < 0                          -> raises IndexError
 *   - str already fits in len chars    -> returns str itself
 *   - len <= 3, or stepping back three characters fails
 *                                      -> returns (a prefix of) "..." alone
 *   - otherwise                        -> head of str followed by "..."
 *
 * For ASCII-incompatible encodings the ellipsis is transcoded to the
 * encoding of str before use.
 *
 * NOTE(review): listing lines 7299-7312, which precede this definition,
 * are absent from this listing (likely its original documentation comment).
 */
7313 VALUE
7314 rb_str_ellipsize(VALUE str, long len)
7315 {
7316  static const char ellipsis[] = "...";
7317  const long ellipsislen = sizeof(ellipsis) - 1;
7318  rb_encoding *const enc = rb_enc_get(str);
7319  const long blen = RSTRING_LEN(str);
7320  const char *const p = RSTRING_PTR(str), *e = p + blen;
7321  VALUE estr, ret = 0;
7322
7323  if (len < 0) rb_raise(rb_eIndexError, "negative length %ld", len);
 /* cheap byte-count test first, then locate the len-th character; if that
  * lands on the end of the string nothing needs cutting */
7324  if (len * rb_enc_mbminlen(enc) >= blen ||
7325  (e = rb_enc_nth(p, e, len, enc)) - p == blen) {
7326  ret = str;
7327  }
 /* note the assignment inside the condition: len becomes ellipsislen only
  * when len > ellipsislen, so in this branch len is never above 3 */
7328  else if (len <= ellipsislen ||
7329  !(e = rb_enc_step_back(p, e, e, len = ellipsislen, enc))) {
7330  if (rb_enc_asciicompat(enc)) {
7331  ret = rb_str_new_with_class(str, ellipsis, len);
7332  rb_enc_associate(ret, enc);
7333  }
7334  else {
7335  estr = rb_usascii_str_new(ellipsis, len);
7336  ret = rb_str_encode(estr, rb_enc_from_encoding(enc), 0, Qnil);
7337  }
7338  }
 /* comma expression: ret is always set to the head first, then the
  * encoding decides how the ellipsis is appended */
7339  else if (ret = rb_str_subseq(str, 0, e - p), rb_enc_asciicompat(enc)) {
7340  rb_str_cat(ret, ellipsis, ellipsislen);
7341  }
7342  else {
7343  estr = rb_str_encode(rb_usascii_str_new(ellipsis, ellipsislen),
7344  rb_enc_from_encoding(enc), 0, Qnil);
7345  rb_str_append(ret, estr);
7346  }
7347  return ret;
7348 }
7349 
7350 /**********************************************************************
7351  * Document-class: Symbol
7352  *
7353  * <code>Symbol</code> objects represent names and some strings
7354  * inside the Ruby
7355  * interpreter. They are generated using the <code>:name</code> and
7356  * <code>:"string"</code> literals
7357  * syntax, and by the various <code>to_sym</code> methods. The same
7358  * <code>Symbol</code> object will be created for a given name or string
7359  * for the duration of a program's execution, regardless of the context
7360  * or meaning of that name. Thus if <code>Fred</code> is a constant in
7361  * one context, a method in another, and a class in a third, the
7362  * <code>Symbol</code> <code>:Fred</code> will be the same object in
7363  * all three contexts.
7364  *
7365  * module One
7366  * class Fred
7367  * end
7368  * $f1 = :Fred
7369  * end
7370  * module Two
7371  * Fred = 1
7372  * $f2 = :Fred
7373  * end
7374  * def Fred()
7375  * end
7376  * $f3 = :Fred
7377  * $f1.object_id #=> 2514190
7378  * $f2.object_id #=> 2514190
7379  * $f3.object_id #=> 2514190
7380  *
7381  */
7382 
7383 
7384 /*
7385  * call-seq:
7386  * sym == obj -> true or false
7387  *
7388  * Equality---If <i>sym</i> and <i>obj</i> are exactly the same
7389  * symbol, returns <code>true</code>.
7390  */
7391 
/*
 * Symbol equality: Qtrue only when both VALUEs are identical, Qfalse
 * otherwise.
 */
7392 static VALUE
7393 sym_equal(VALUE sym1, VALUE sym2)
7394 {
7395  if (sym1 == sym2) return Qtrue;
7396  return Qfalse;
7397 }
7398 
7399 
/*
 * Returns TRUE when every character in [s, send) is printable according to
 * rb_enc_isprint for enc, FALSE at the first one that is not. An empty
 * range yields TRUE.
 */
7400 static int
7401 sym_printable(const char *s, const char *send, rb_encoding *enc)
7402 {
7403  while (s < send) {
7404  int n; /* byte length of the character just decoded */
7405  int c = rb_enc_codepoint_len(s, send, &n, enc);
7406
7407  if (!rb_enc_isprint(c, enc)) return FALSE;
7408  s += n;
7409  }
7410  return TRUE;
7411 }
7412 
7413 /*
7414  * call-seq:
7415  * sym.inspect -> string
7416  *
7417  * Returns the representation of <i>sym</i> as a symbol literal.
7418  *
7419  * :fred.inspect #=> ":fred"
7420  */
7421 
/*
 * Body for the Symbol#inspect call-seq documented directly above: returns
 * ":" followed by the symbol's name, using the quoted/escaped form produced
 * by rb_str_inspect when the plain name cannot be shown as is.
 *
 * NOTE(review): listing lines 7423 (the declarator) and 7431 (a declaration
 * after "char *dest;", presumably the one introducing resenc, which is used
 * below without a visible declaration) are absent from this listing.
 */
7422 static VALUE
7424 {
7425  VALUE str;
7426  ID id = SYM2ID(sym);
7427  rb_encoding *enc;
7428  const char *ptr;
7429  long len;
7430  char *dest;
7432
7433  if (resenc == NULL) resenc = rb_default_external_encoding();
7434  sym = rb_id2str(id); /* from here on sym is the name string, not the Symbol */
7435  enc = STR_ENC_GET(sym);
7436  ptr = RSTRING_PTR(sym);
7437  len = RSTRING_LEN(sym);
 /* use the inspected (quoted) form when the name is non-ASCII in a different
  * encoding than resenc, contains an embedded NUL (strlen stops short),
  * is not a valid symbol name, or has unprintable characters */
7438  if ((resenc != enc && !rb_str_is_ascii_only_p(sym)) || len != (long)strlen(ptr) ||
7439  !rb_enc_symname_p(ptr, enc) || !sym_printable(ptr, ptr + len, enc)) {
7440  str = rb_str_inspect(sym);
7441  len = RSTRING_LEN(str);
 /* grow by one byte and shift the text right to make room for ':' */
7442  rb_str_resize(str, len + 1);
7443  dest = RSTRING_PTR(str);
7444  memmove(dest + 1, dest, len);
7445  dest[0] = ':';
7446  }
7447  else {
7448  char *dest; /* shadows the outer dest */
7449  str = rb_enc_str_new(0, len + 1, enc);
7450  dest = RSTRING_PTR(str);
7451  dest[0] = ':';
7452  memcpy(dest + 1, ptr, len);
7453  }
7454  return str;
7455 }
7456 
7457 
7458 /*
7459  * call-seq:
7460  * sym.id2name -> string
7461  * sym.to_s -> string
7462  *
7463  * Returns the name or string corresponding to <i>sym</i>.
7464  *
7465  * :fred.id2name #=> "fred"
7466  */
7467 
7468 
7469 VALUE
/* NOTE(review): the declarator line (7470) is missing from this listing;
 * presumably rb_sym_to_s, which is registered for Symbol#id2name. */
7471 {
7472  ID id = SYM2ID(sym);
7473 
 /* Build the result with str_new3() from the string that rb_id2str() returns for this ID. */
7474  return str_new3(rb_cString, rb_id2str(id));
7475 }
7476 
7477 
7478 /*
7479  * call-seq:
7480  * sym.to_sym -> sym
7481  * sym.intern -> sym
7482  *
7483  * In general, <code>to_sym</code> returns the <code>Symbol</code> corresponding
7484  * to an object. As <i>sym</i> is already a symbol, <code>self</code> is returned
7485  * in this case.
7486  */
7487 
7488 static VALUE
/* NOTE(review): the declarator line (7489) is missing from this listing;
 * presumably sym_to_sym, registered for Symbol#intern and Symbol#to_sym. */
7490 {
7491  return sym; /* the receiver is returned unchanged */
7492 }
7493 
7494 static VALUE
/* NOTE(review): the declarator line (7495) is missing from this listing;
 * presumably sym_call, the callback passed to rb_proc_new() elsewhere in
 * this file together with the symbol's ID cast to VALUE. */
7496 {
7497  VALUE obj;
7498 
 /* The first argument is the receiver, so at least one is required. */
7499  if (argc < 1) {
7500  rb_raise(rb_eArgError, "no receiver given");
7501  }
7502  obj = argv[0];
 /* `sym` is cast to ID and used as the method id; the remaining arguments are forwarded. */
7503  return rb_funcall_passing_block(obj, (ID)sym, argc - 1, argv + 1);
7504 }
7505 
7506 /*
7507  * call-seq:
7508  * sym.to_proc
7509  *
7510  * Returns a _Proc_ object which responds to the method named by _sym_.
7511  *
7512  * (1..3).collect(&:to_s) #=> ["1", "2", "3"]
7513  */
7514 
7515 static VALUE
/* NOTE(review): the declarator line (7516) is missing from this listing;
 * presumably sym_to_proc, registered for Symbol#to_proc. */
7517 {
 /* Function-static cache array holding (symbol, proc) pairs in adjacent slots. */
7518  static VALUE sym_proc_cache = Qfalse;
7519  enum {SYM_PROC_CACHE_SIZE = 67};
7520  VALUE proc;
7521  long id, index;
7522  VALUE *aryp;
7523 
 /* Create the cache on first use and register it as a GC mark object. */
7524  if (!sym_proc_cache) {
7525  sym_proc_cache = rb_ary_tmp_new(SYM_PROC_CACHE_SIZE * 2);
7526  rb_gc_register_mark_object(sym_proc_cache);
 /* Storing at the last index presumably extends the array to its full length -- confirm. */
7527  rb_ary_store(sym_proc_cache, SYM_PROC_CACHE_SIZE*2 - 1, Qnil);
7528  }
7529 
7530  id = SYM2ID(sym);
7531  index = (id % SYM_PROC_CACHE_SIZE) << 1; /* even slot: symbol; following odd slot: its proc */
7532 
7533  aryp = RARRAY_PTR(sym_proc_cache);
7534  if (aryp[index] == sym) {
7535  return aryp[index + 1]; /* hit: reuse the stored proc */
7536  }
7537  else {
 /* Miss: create a proc and overwrite whatever pair occupied this slot. */
7538  proc = rb_proc_new(sym_call, (VALUE)id);
7539  aryp[index] = sym;
7540  aryp[index + 1] = proc;
7541  return proc;
7542  }
7543 }
7544 
7545 /*
7546  * call-seq:
7547  *
7548  * sym.succ
7549  *
7550  * Same as <code>sym.to_s.succ.intern</code>.
7551  */
7552 
7553 static VALUE
/* NOTE(review): the declarator line (7554) is missing from this listing;
 * presumably sym_succ, registered for Symbol#succ and Symbol#next. */
7555 {
 /* Symbol -> string -> rb_str_succ() -> interned back into a symbol. */
7556  return rb_str_intern(rb_str_succ(rb_sym_to_s(sym)));
7557 }
7558 
7559 /*
7560  * call-seq:
7561  *
7562  * sym <=> other -> -1, 0, +1 or nil
7563  *
7564  * Compares _sym_ with _other_ in string form.
7565  */
7566 
7567 static VALUE
/* NOTE(review): the declarator line (7568) is missing from this listing;
 * presumably sym_cmp, registered for Symbol#<=>. */
7569 {
 /* A non-symbol operand yields nil without any comparison. */
7570  if (!SYMBOL_P(other)) {
7571  return Qnil;
7572  }
 /* Both operands are converted with rb_sym_to_s() and compared by rb_str_cmp_m(). */
7573  return rb_str_cmp_m(rb_sym_to_s(sym), rb_sym_to_s(other));
7574 }
7575 
7576 /*
7577  * call-seq:
7578  *
7579  * sym.casecmp(other) -> -1, 0, +1 or nil
7580  *
7581  * Case-insensitive version of <code>Symbol#<=></code>.
7582  */
7583 
7584 static VALUE
/* NOTE(review): the declarator line (7585) is missing from this listing;
 * presumably sym_casecmp, registered for Symbol#casecmp. */
7586 {
 /* A non-symbol operand yields nil without any comparison. */
7587  if (!SYMBOL_P(other)) {
7588  return Qnil;
7589  }
 /* Both operands are converted with rb_sym_to_s() and handed to rb_str_casecmp(). */
7590  return rb_str_casecmp(rb_sym_to_s(sym), rb_sym_to_s(other));
7591 }
7592 
7593 /*
7594  * call-seq:
7595  * sym =~ obj -> fixnum or nil
7596  *
7597  * Returns <code>sym.to_s =~ obj</code>.
7598  */
7599 
7600 static VALUE
/* NOTE(review): the declarator line (7601) is missing from this listing;
 * presumably sym_match, registered for Symbol#match. */
7602 {
 /* Delegates to rb_str_match() on the symbol's string form; `other` is passed through unchecked. */
7603  return rb_str_match(rb_sym_to_s(sym), other);
7604 }
7605 
7606 /*
7607  * call-seq:
7608  * sym[idx] -> char
7609  * sym[b, n] -> string
7610  *
7611  * Returns <code>sym.to_s[]</code>.
7612  */
7613 
7614 static VALUE
/* NOTE(review): the declarator line (7615) is missing from this listing;
 * presumably sym_aref, registered for Symbol#[] and Symbol#slice. */
7616 {
 /* Forwards argc/argv untouched to rb_str_aref_m() on the symbol's string form. */
7617  return rb_str_aref_m(argc, argv, rb_sym_to_s(sym));
7618 }
7619 
7620 /*
7621  * call-seq:
7622  * sym.length -> integer
7623  *
7624  * Same as <code>sym.to_s.length</code>.
7625  */
7626 
7627 static VALUE
/* NOTE(review): the declarator line (7628) is missing from this listing;
 * presumably sym_length, registered for Symbol#length and Symbol#size. */
7629 {
 /* Applies rb_str_length() directly to the string from rb_id2str(), not via rb_sym_to_s(). */
7630  return rb_str_length(rb_id2str(SYM2ID(sym)));
7631 }
7632 
7633 /*
7634  * call-seq:
7635  * sym.empty? -> true or false
7636  *
7637  * Returns whether _sym_ is :"" or not.
7638  */
7639 
7640 static VALUE
/* NOTE(review): the declarator line (7641) is missing from this listing;
 * presumably sym_empty, registered for Symbol#empty?. */
7642 {
 /* Applies rb_str_empty() directly to the string from rb_id2str(), not via rb_sym_to_s(). */
7643  return rb_str_empty(rb_id2str(SYM2ID(sym)));
7644 }
7645 
7646 /*
7647  * call-seq:
7648  * sym.upcase -> symbol
7649  *
7650  * Same as <code>sym.to_s.upcase.intern</code>.
7651  */
7652 
7653 static VALUE
7655 {
 /* NOTE(review): the declarator (7654) and the only body statement (7656)
  * are missing from this listing.  Presumably sym_upcase, registered for
  * Symbol#upcase; restore both lines from the real source. */
7657 }
7658 
7659 /*
7660  * call-seq:
7661  * sym.downcase -> symbol
7662  *
7663  * Same as <code>sym.to_s.downcase.intern</code>.
7664  */
7665 
7666 static VALUE
7668 {
 /* NOTE(review): the declarator (7667) and the only body statement (7669)
  * are missing from this listing.  Presumably sym_downcase, registered for
  * Symbol#downcase; restore both lines from the real source. */
7670 }
7671 
7672 /*
7673  * call-seq:
7674  * sym.capitalize -> symbol
7675  *
7676  * Same as <code>sym.to_s.capitalize.intern</code>.
7677  */
7678 
7679 static VALUE
7681 {
 /* NOTE(review): the declarator (7680) and the only body statement (7682)
  * are missing from this listing.  Presumably sym_capitalize, registered
  * for Symbol#capitalize; restore both lines from the real source. */
7683 }
7684 
7685 /*
7686  * call-seq:
7687  * sym.swapcase -> symbol
7688  *
7689  * Same as <code>sym.to_s.swapcase.intern</code>.
7690  */
7691 
7692 static VALUE
7694 {
 /* NOTE(review): the declarator (7693) and the only body statement (7695)
  * are missing from this listing.  Presumably sym_swapcase, registered for
  * Symbol#swapcase; restore both lines from the real source. */
7696 }
7697 
7698 /*
7699  * call-seq:
7700  * sym.encoding -> encoding
7701  *
7702  * Returns the Encoding object that represents the encoding of _sym_.
7703  */
7704 
7705 static VALUE
/* NOTE(review): the declarator line (7706) is missing from this listing;
 * presumably sym_encoding, registered for Symbol#encoding. */
7707 {
 /* Hands the string from rb_id2str() to rb_obj_encoding() and returns its result. */
7708  return rb_obj_encoding(rb_id2str(SYM2ID(sym)));
7709 }
7710 
7711 ID
/* NOTE(review): the declarator line (7712) is missing from this listing, so
 * the function name and the declaration of `name` are not visible here.
 * The body converts `name` to an ID, raising TypeError when it is neither
 * a symbol nor convertible to a string. */
7713 {
7714  VALUE tmp;
7715 
 /* The cases cascade downward: `default` comes first and falls through
  * T_STRING into T_SYMBOL, so each step normalises `name` for the next. */
7716  switch (TYPE(name)) {
7717  default:
 /* Neither string nor symbol: attempt a string conversion. */
7718  tmp = rb_check_string_type(name);
7719  if (NIL_P(tmp)) {
 /* Not convertible: report the object's inspect text in the error. */
7720  tmp = rb_inspect(name);
7721  rb_raise(rb_eTypeError, "%s is not a symbol",
7722  RSTRING_PTR(tmp));
7723  }
7724  name = tmp;
7725  /* fall through */
7726  case T_STRING:
7727  name = rb_str_intern(name); /* string -> symbol */
7728  /* fall through */
7729  case T_SYMBOL:
7730  return SYM2ID(name);
7731  }
 /* Every path through the switch returns or raises before reaching this line. */
7732  return Qnil; /* not reached */
7733 }
7734 
7735 /*
7736  * A <code>String</code> object holds and manipulates an arbitrary sequence of
7737  * bytes, typically representing characters. String objects may be created
7738  * using <code>String::new</code> or as literals.
7739  *
7740  * Because of aliasing issues, users of strings should be aware of the methods
7741  * that modify the contents of a <code>String</code> object. Typically,
7742  * methods with names ending in ``!'' modify their receiver, while those
7743  * without a ``!'' return a new <code>String</code>. However, there are
7744  * exceptions, such as <code>String#[]=</code>.
7745  *
7746  */
7747 
7748 void
/* NOTE(review): the declarator line (7749) is missing from this listing, and
 * the gaps in the embedded numbering below show that many registration
 * lines are absent as well.  What remains defines the String and Symbol
 * classes and binds Ruby method names to the C functions of this file.
 * The last argument of each rb_define_method() call is the arity given to
 * Ruby; -1 is used for the functions taking (argc, argv). */
7750 {
 /* Within this function rb_intern(str) expands to rb_intern_const(str). */
7751 #undef rb_intern
7752 #define rb_intern(str) rb_intern_const(str)
7753 
7754  rb_cString = rb_define_class("String", rb_cObject);
7758  rb_define_method(rb_cString, "initialize", rb_str_init, -1);
7759  rb_define_method(rb_cString, "initialize_copy", rb_str_replace, 1);
7763  rb_define_method(rb_cString, "eql?", rb_str_eql, 1);
7765  rb_define_method(rb_cString, "casecmp", rb_str_casecmp, 1);
7771  rb_define_method(rb_cString, "insert", rb_str_insert, 2);
7772  rb_define_method(rb_cString, "length", rb_str_length, 0);
7774  rb_define_method(rb_cString, "bytesize", rb_str_bytesize, 0);
7775  rb_define_method(rb_cString, "empty?", rb_str_empty, 0);
7782  rb_define_method(rb_cString, "upto", rb_str_upto, -1);
7785  rb_define_method(rb_cString, "replace", rb_str_replace, 1);
7788  rb_define_method(rb_cString, "getbyte", rb_str_getbyte, 1);
7789  rb_define_method(rb_cString, "setbyte", rb_str_setbyte, 2);
7790  rb_define_method(rb_cString, "byteslice", rb_str_byteslice, -1);
7791 
7792  rb_define_method(rb_cString, "to_i", rb_str_to_i, -1);
7795  rb_define_method(rb_cString, "to_str", rb_str_to_s, 0);
7796  rb_define_method(rb_cString, "inspect", rb_str_inspect, 0);
7798 
7799  rb_define_method(rb_cString, "upcase", rb_str_upcase, 0);
7800  rb_define_method(rb_cString, "downcase", rb_str_downcase, 0);
7801  rb_define_method(rb_cString, "capitalize", rb_str_capitalize, 0);
7802  rb_define_method(rb_cString, "swapcase", rb_str_swapcase, 0);
7803 
7808 
7816  rb_define_method(rb_cString, "reverse", rb_str_reverse, 0);
7818  rb_define_method(rb_cString, "concat", rb_str_concat, 1);
7820  rb_define_method(rb_cString, "prepend", rb_str_prepend, 1);
 /* "intern" and "to_sym" are bound to the same C function. */
7822  rb_define_method(rb_cString, "intern", rb_str_intern, 0);
7823  rb_define_method(rb_cString, "to_sym", rb_str_intern, 0);
7825 
7826  rb_define_method(rb_cString, "include?", rb_str_include, 1);
7827  rb_define_method(rb_cString, "start_with?", rb_str_start_with, -1);
7828  rb_define_method(rb_cString, "end_with?", rb_str_end_with, -1);
7829 
7831 
7832  rb_define_method(rb_cString, "ljust", rb_str_ljust, -1);
7833  rb_define_method(rb_cString, "rjust", rb_str_rjust, -1);
7834  rb_define_method(rb_cString, "center", rb_str_center, -1);
7835 
7836  rb_define_method(rb_cString, "sub", rb_str_sub, -1);
7837  rb_define_method(rb_cString, "gsub", rb_str_gsub, -1);
7839  rb_define_method(rb_cString, "chomp", rb_str_chomp, -1);
7841  rb_define_method(rb_cString, "lstrip", rb_str_lstrip, 0);
7842  rb_define_method(rb_cString, "rstrip", rb_str_rstrip, 0);
7843 
7851 
7854  rb_define_method(rb_cString, "delete", rb_str_delete, -1);
7855  rb_define_method(rb_cString, "squeeze", rb_str_squeeze, -1);
7856  rb_define_method(rb_cString, "count", rb_str_count, -1);
7857 
7862 
7863  rb_define_method(rb_cString, "each_line", rb_str_each_line, -1);
7864  rb_define_method(rb_cString, "each_byte", rb_str_each_byte, 0);
7865  rb_define_method(rb_cString, "each_char", rb_str_each_char, 0);
7866  rb_define_method(rb_cString, "each_codepoint", rb_str_each_codepoint, 0);
7867 
7868  rb_define_method(rb_cString, "sum", rb_str_sum, -1);
7869 
7870  rb_define_method(rb_cString, "slice", rb_str_aref_m, -1);
7872 
7873  rb_define_method(rb_cString, "partition", rb_str_partition, 1);
7874  rb_define_method(rb_cString, "rpartition", rb_str_rpartition, 1);
7875 
7876  rb_define_method(rb_cString, "encoding", rb_obj_encoding, 0); /* in encoding.c */
7877  rb_define_method(rb_cString, "force_encoding", rb_str_force_encoding, 1);
7878  rb_define_method(rb_cString, "valid_encoding?", rb_str_valid_encoding_p, 0);
7880 
7881  id_to_s = rb_intern("to_s");
7882 
 /* rb_fs starts out as nil and is exposed under two global names, $; and $-F. */
7883  rb_fs = Qnil;
7884  rb_define_variable("$;", &rb_fs);
7885  rb_define_variable("$-F", &rb_fs);
7886 
7887  rb_cSymbol = rb_define_class("Symbol", rb_cObject);
7891  rb_define_singleton_method(rb_cSymbol, "all_symbols", rb_sym_all_symbols, 0); /* in parse.y */
7892 
7895  rb_define_method(rb_cSymbol, "inspect", sym_inspect, 0);
7897  rb_define_method(rb_cSymbol, "id2name", rb_sym_to_s, 0);
7898  rb_define_method(rb_cSymbol, "intern", sym_to_sym, 0);
7899  rb_define_method(rb_cSymbol, "to_sym", sym_to_sym, 0);
7900  rb_define_method(rb_cSymbol, "to_proc", sym_to_proc, 0);
 /* "succ" and "next" are bound to the same C function. */
7901  rb_define_method(rb_cSymbol, "succ", sym_succ, 0);
7902  rb_define_method(rb_cSymbol, "next", sym_succ, 0);
7903 
7904  rb_define_method(rb_cSymbol, "<=>", sym_cmp, 1);
7905  rb_define_method(rb_cSymbol, "casecmp", sym_casecmp, 1);
7907 
 /* "[]"/"slice" and "length"/"size" are likewise pairs bound to one function each. */
7908  rb_define_method(rb_cSymbol, "[]", sym_aref, -1);
7909  rb_define_method(rb_cSymbol, "slice", sym_aref, -1);
7910  rb_define_method(rb_cSymbol, "length", sym_length, 0);
7911  rb_define_method(rb_cSymbol, "size", sym_length, 0);
7912  rb_define_method(rb_cSymbol, "empty?", sym_empty, 0);
7913  rb_define_method(rb_cSymbol, "match", sym_match, 1);
7914 
7915  rb_define_method(rb_cSymbol, "upcase", sym_upcase, 0);
7916  rb_define_method(rb_cSymbol, "downcase", sym_downcase, 0);
7917  rb_define_method(rb_cSymbol, "capitalize", sym_capitalize, 0);
7918  rb_define_method(rb_cSymbol, "swapcase", sym_swapcase, 0);
7919 
7920  rb_define_method(rb_cSymbol, "encoding", sym_encoding, 0);
7921 }
7922