The Ruby Cross Reference

Implementation: mri jruby rubinius
Version: 1.8.7-p370 1.9.1-p431 1.9.2-p381 1.9.3-p362 2.0.0-p0 HEAD
001 /**********************************************************************
002 
003   string.c -
004 
005   $Author$
006   created at: Mon Aug  9 17:12:58 JST 1993
007 
008   Copyright (C) 1993-2007 Yukihiro Matsumoto
009   Copyright (C) 2000  Network Applied Communication Laboratory, Inc.
010   Copyright (C) 2000  Information-technology Promotion Agency, Japan
011 
012 **********************************************************************/
013 
014 #include "ruby/ruby.h"
015 #include "ruby/re.h"
016 #include "ruby/encoding.h"
017 #include "vm_core.h"
018 #include "internal.h"
019 #include "probes.h"
020 #include <assert.h>
021 
022 #define BEG(no) (regs->beg[(no)])
023 #define END(no) (regs->end[(no)])
024 
025 #include <math.h>
026 #include <ctype.h>
027 
028 #ifdef HAVE_UNISTD_H
029 #include <unistd.h>
030 #endif
031 
032 #define STRING_ENUMERATORS_WANTARRAY 0 /* next major */
033 
034 #define numberof(array) (int)(sizeof(array) / sizeof((array)[0]))
035 
036 #undef rb_str_new_cstr
037 #undef rb_tainted_str_new_cstr
038 #undef rb_usascii_str_new_cstr
039 #undef rb_external_str_new_cstr
040 #undef rb_locale_str_new_cstr
041 #undef rb_str_new2
042 #undef rb_str_new3
043 #undef rb_str_new4
044 #undef rb_str_new5
045 #undef rb_tainted_str_new2
046 #undef rb_usascii_str_new2
047 #undef rb_str_dup_frozen
048 #undef rb_str_buf_new_cstr
049 #undef rb_str_buf_new2
050 #undef rb_str_buf_cat2
051 #undef rb_str_cat2
052 
053 static VALUE rb_str_clear(VALUE str);
054 
055 VALUE rb_cString;
056 VALUE rb_cSymbol;
057 
/*
 * Flag bits and helper macros describing how a String stores its bytes.
 * A string is either "embedded" (STR_NOEMBED clear; its length lives in
 * the flag bits selected by RSTRING_EMBED_LEN_MASK) or "heap" (STR_NOEMBED
 * set; as.heap.ptr / as.heap.len are used).
 */
058 #define RUBY_MAX_CHAR_LEN 16
059 #define STR_TMPLOCK FL_USER7
060 #define STR_NOEMBED FL_USER1
061 #define STR_SHARED  FL_USER2 /* = ELTS_SHARED */
062 #define STR_ASSOC   FL_USER3
/* True when both STR_NOEMBED and ELTS_SHARED are set on s. */
063 #define STR_SHARED_P(s) FL_ALL((s), STR_NOEMBED|ELTS_SHARED)
/* True when both STR_NOEMBED and STR_ASSOC are set on s. */
064 #define STR_ASSOC_P(s)  FL_ALL((s), STR_NOEMBED|STR_ASSOC)
/*
 * "No capacity" state: a heap string flagged ELTS_SHARED or STR_ASSOC.
 * rb_str_capacity() reports as.heap.len for such strings rather than
 * as.heap.aux.capa, and the sharing code stores as.heap.aux.shared there.
 */
065 #define STR_NOCAPA  (STR_NOEMBED|ELTS_SHARED|STR_ASSOC)
066 #define STR_NOCAPA_P(s) (FL_TEST((s),STR_NOEMBED) && FL_ANY((s),ELTS_SHARED|STR_ASSOC))
/* Clears ELTS_SHARED and STR_ASSOC, but only when s is a heap string. */
067 #define STR_UNSET_NOCAPA(s) do {\
068     if (FL_TEST((s),STR_NOEMBED)) FL_UNSET((s),(ELTS_SHARED|STR_ASSOC));\
069 } while (0)
070 
071 
/* Marks str as a heap string and zeroes the embedded-length flag bits. */
072 #define STR_SET_NOEMBED(str) do {\
073     FL_SET((str), STR_NOEMBED);\
074     STR_SET_EMBED_LEN((str), 0);\
075 } while (0)
/* Marks str as embedded; the embedded length bits are left untouched. */
076 #define STR_SET_EMBED(str) FL_UNSET((str), STR_NOEMBED)
077 #define STR_EMBED_P(str) (!FL_TEST((str), STR_NOEMBED))
/*
 * Stores n into the embedded-length bits of str's flags.  n is evaluated
 * once (into tmp_n); str is expanded more than once.
 */
078 #define STR_SET_EMBED_LEN(str, n) do { \
079     long tmp_n = (n);\
080     RBASIC(str)->flags &= ~RSTRING_EMBED_LEN_MASK;\
081     RBASIC(str)->flags |= (tmp_n) << RSTRING_EMBED_LEN_SHIFT;\
082 } while (0)
083 
/* Sets the length of str to n in whichever representation it uses. */
084 #define STR_SET_LEN(str, n) do { \
085     if (STR_EMBED_P(str)) {\
086         STR_SET_EMBED_LEN((str), (n));\
087     }\
088     else {\
089         RSTRING(str)->as.heap.len = (n);\
090     }\
091 } while (0)
092 
/*
 * Decrements the length of str by one.  Declares a local named `n`, so
 * the str argument must not be an expression that refers to a caller
 * variable of that name.
 */
093 #define STR_DEC_LEN(str) do {\
094     if (STR_EMBED_P(str)) {\
095         long n = RSTRING_LEN(str);\
096         n--;\
097         STR_SET_EMBED_LEN((str), n);\
098     }\
099     else {\
100         RSTRING(str)->as.heap.len--;\
101     }\
102 } while (0)
103 
/*
 * Changes the buffer capacity of str to `capacity` bytes (plus one extra
 * byte that is always allocated).
 *  - Embedded string: nothing happens unless capacity exceeds
 *    RSTRING_EMBED_LEN_MAX; then a heap buffer is allocated, the current
 *    bytes are copied into it and str is switched to the heap layout.
 *    The copy happens before as.heap.ptr/len are written, and the length
 *    is read from the flag bits while str still counts as embedded.
 *  - Heap string: the buffer is reallocated; aux.capa is updated only
 *    when the string is not in the STR_NOCAPA_P state.
 * Both arguments are expanded several times; pass side-effect-free
 * expressions.
 */
104 #define RESIZE_CAPA(str,capacity) do {\
105     if (STR_EMBED_P(str)) {\
106         if ((capacity) > RSTRING_EMBED_LEN_MAX) {\
107             char *tmp = ALLOC_N(char, (capacity)+1);\
108             memcpy(tmp, RSTRING_PTR(str), RSTRING_LEN(str));\
109             RSTRING(str)->as.heap.ptr = tmp;\
110             RSTRING(str)->as.heap.len = RSTRING_LEN(str);\
111             STR_SET_NOEMBED(str);\
112             RSTRING(str)->as.heap.aux.capa = (capacity);\
113         }\
114     }\
115     else {\
116         REALLOC_N(RSTRING(str)->as.heap.ptr, char, (capacity)+1);\
117         if (!STR_NOCAPA_P(str))\
118             RSTRING(str)->as.heap.aux.capa = (capacity);\
119     }\
120 } while (0)
121 
/* Coderange predicates; both may trigger a scan via rb_enc_str_coderange(). */
122 #define is_ascii_string(str) (rb_enc_str_coderange(str) == ENC_CODERANGE_7BIT)
123 #define is_broken_string(str) (rb_enc_str_coderange(str) == ENC_CODERANGE_BROKEN)
124 
/* Encoding object of str, looked up from the index stored on the string. */
125 #define STR_ENC_GET(str) rb_enc_from_index(ENCODING_GET(str))
126 
127 static inline int
128 single_byte_optimizable(VALUE str)
129 {
130     rb_encoding *enc;
131 
132     /* Conservative.  It may be ENC_CODERANGE_UNKNOWN. */
133     if (ENC_CODERANGE(str) == ENC_CODERANGE_7BIT)
134         return 1;
135 
136     enc = STR_ENC_GET(str);
137     if (rb_enc_mbmaxlen(enc) == 1)
138         return 1;
139 
140     /* Conservative.  Possibly single byte.
141      * "\xa1" in Shift_JIS for example. */
142     return 0;
143 }
144 
145 VALUE rb_fs;
146 
/*
 * Returns a pointer to the first non-ASCII byte in [p, e), or NULL when
 * every byte in the range is ASCII.
 *
 * When NONASCII_MASK is available and the range is longer than two
 * VALUE-sized words, the aligned middle part is tested one word at a
 * time; the word loop only locates the word containing a hit, and the
 * final byte loop pinpoints the exact byte.
 */
static inline const char *
search_nonascii(const char *p, const char *e)
{
#if SIZEOF_VALUE == 8
# define NONASCII_MASK 0x8080808080808080ULL
#elif SIZEOF_VALUE == 4
# define NONASCII_MASK 0x80808080UL
#endif
#ifdef NONASCII_MASK
    if ((int)sizeof(VALUE) * 2 < e - p) {
        const VALUE align_mask = sizeof(VALUE) - 1;
        /* first word boundary at or after p, last word boundary at or before e */
        const VALUE *word = (const VALUE*)(~align_mask & ((VALUE)p + align_mask));
        const VALUE *word_end = (const VALUE*)(~align_mask & (VALUE)e);

        /* unaligned head, byte by byte */
        for (; p < (const char *)word; p++) {
            if (!ISASCII(*p))
                return p;
        }
        /* aligned body: stop at the first word with any high bit set */
        for (; word < word_end; word++) {
            if (*word & NONASCII_MASK)
                break;
        }
        p = (const char *)word;
    }
#endif
    /* remaining bytes (or the whole range when the word path is unused) */
    for (; p < e; p++) {
        if (!ISASCII(*p))
            return p;
    }
    return NULL;
}
183 
184 static int
185 coderange_scan(const char *p, long len, rb_encoding *enc)
186 {
187     const char *e = p + len;
188 
189     if (rb_enc_to_index(enc) == 0) {
190         /* enc is ASCII-8BIT.  ASCII-8BIT string never be broken. */
191         p = search_nonascii(p, e);
192         return p ? ENC_CODERANGE_VALID : ENC_CODERANGE_7BIT;
193     }
194 
195     if (rb_enc_asciicompat(enc)) {
196         p = search_nonascii(p, e);
197         if (!p) {
198             return ENC_CODERANGE_7BIT;
199         }
200         while (p < e) {
201             int ret = rb_enc_precise_mbclen(p, e, enc);
202             if (!MBCLEN_CHARFOUND_P(ret)) {
203                 return ENC_CODERANGE_BROKEN;
204             }
205             p += MBCLEN_CHARFOUND_LEN(ret);
206             if (p < e) {
207                 p = search_nonascii(p, e);
208                 if (!p) {
209                     return ENC_CODERANGE_VALID;
210                 }
211             }
212         }
213         if (e < p) {
214             return ENC_CODERANGE_BROKEN;
215         }
216         return ENC_CODERANGE_VALID;
217     }
218 
219     while (p < e) {
220         int ret = rb_enc_precise_mbclen(p, e, enc);
221 
222         if (!MBCLEN_CHARFOUND_P(ret)) {
223             return ENC_CODERANGE_BROKEN;
224         }
225         p += MBCLEN_CHARFOUND_LEN(ret);
226     }
227     if (e < p) {
228         return ENC_CODERANGE_BROKEN;
229     }
230     return ENC_CODERANGE_VALID;
231 }
232 
233 long
234 rb_str_coderange_scan_restartable(const char *s, const char *e, rb_encoding *enc, int *cr)
235 {
236     const char *p = s;
237 
238     if (*cr == ENC_CODERANGE_BROKEN)
239         return e - s;
240 
241     if (rb_enc_to_index(enc) == 0) {
242         /* enc is ASCII-8BIT.  ASCII-8BIT string never be broken. */
243         p = search_nonascii(p, e);
244         *cr = (!p && *cr != ENC_CODERANGE_VALID) ? ENC_CODERANGE_7BIT : ENC_CODERANGE_VALID;
245         return e - s;
246     }
247     else if (rb_enc_asciicompat(enc)) {
248         p = search_nonascii(p, e);
249         if (!p) {
250             if (*cr != ENC_CODERANGE_VALID) *cr = ENC_CODERANGE_7BIT;
251             return e - s;
252         }
253         while (p < e) {
254             int ret = rb_enc_precise_mbclen(p, e, enc);
255             if (!MBCLEN_CHARFOUND_P(ret)) {
256                 *cr = MBCLEN_INVALID_P(ret) ? ENC_CODERANGE_BROKEN: ENC_CODERANGE_UNKNOWN;
257                 return p - s;
258             }
259             p += MBCLEN_CHARFOUND_LEN(ret);
260             if (p < e) {
261                 p = search_nonascii(p, e);
262                 if (!p) {
263                     *cr = ENC_CODERANGE_VALID;
264                     return e - s;
265                 }
266             }
267         }
268         *cr = e < p ? ENC_CODERANGE_BROKEN: ENC_CODERANGE_VALID;
269         return p - s;
270     }
271     else {
272         while (p < e) {
273             int ret = rb_enc_precise_mbclen(p, e, enc);
274             if (!MBCLEN_CHARFOUND_P(ret)) {
275                 *cr = MBCLEN_INVALID_P(ret) ? ENC_CODERANGE_BROKEN: ENC_CODERANGE_UNKNOWN;
276                 return p - s;
277             }
278             p += MBCLEN_CHARFOUND_LEN(ret);
279         }
280         *cr = e < p ? ENC_CODERANGE_BROKEN: ENC_CODERANGE_VALID;
281         return p - s;
282     }
283 }
284 
285 static inline void
286 str_enc_copy(VALUE str1, VALUE str2)
287 {
288     rb_enc_set_index(str1, ENCODING_GET(str2));
289 }
290 
291 static void
292 rb_enc_cr_str_copy_for_substr(VALUE dest, VALUE src)
293 {
294     /* this function is designed for copying encoding and coderange
295      * from src to new string "dest" which is made from the part of src.
296      */
297     str_enc_copy(dest, src);
298     if (RSTRING_LEN(dest) == 0) {
299         if (!rb_enc_asciicompat(STR_ENC_GET(src)))
300             ENC_CODERANGE_SET(dest, ENC_CODERANGE_VALID);
301         else
302             ENC_CODERANGE_SET(dest, ENC_CODERANGE_7BIT);
303         return;
304     }
305     switch (ENC_CODERANGE(src)) {
306       case ENC_CODERANGE_7BIT:
307         ENC_CODERANGE_SET(dest, ENC_CODERANGE_7BIT);
308         break;
309       case ENC_CODERANGE_VALID:
310         if (!rb_enc_asciicompat(STR_ENC_GET(src)) ||
311             search_nonascii(RSTRING_PTR(dest), RSTRING_END(dest)))
312             ENC_CODERANGE_SET(dest, ENC_CODERANGE_VALID);
313         else
314             ENC_CODERANGE_SET(dest, ENC_CODERANGE_7BIT);
315         break;
316       default:
317         break;
318     }
319 }
320 
321 static void
322 rb_enc_cr_str_exact_copy(VALUE dest, VALUE src)
323 {
324     str_enc_copy(dest, src);
325     ENC_CODERANGE_SET(dest, ENC_CODERANGE(src));
326 }
327 
328 int
329 rb_enc_str_coderange(VALUE str)
330 {
331     int cr = ENC_CODERANGE(str);
332 
333     if (cr == ENC_CODERANGE_UNKNOWN) {
334         rb_encoding *enc = STR_ENC_GET(str);
335         cr = coderange_scan(RSTRING_PTR(str), RSTRING_LEN(str), enc);
336         ENC_CODERANGE_SET(str, cr);
337     }
338     return cr;
339 }
340 
341 int
342 rb_enc_str_asciionly_p(VALUE str)
343 {
344     rb_encoding *enc = STR_ENC_GET(str);
345 
346     if (!rb_enc_asciicompat(enc))
347         return FALSE;
348     else if (rb_enc_str_coderange(str) == ENC_CODERANGE_7BIT)
349         return TRUE;
350     return FALSE;
351 }
352 
353 static inline void
354 str_mod_check(VALUE s, const char *p, long len)
355 {
356     if (RSTRING_PTR(s) != p || RSTRING_LEN(s) != len){
357         rb_raise(rb_eRuntimeError, "string modified");
358     }
359 }
360 
361 size_t
362 rb_str_capacity(VALUE str)
363 {
364     if (STR_EMBED_P(str)) {
365         return RSTRING_EMBED_LEN_MAX;
366     }
367     else if (STR_NOCAPA_P(str)) {
368         return RSTRING(str)->as.heap.len;
369     }
370     else {
371         return RSTRING(str)->as.heap.aux.capa;
372     }
373 }
374 
375 static inline VALUE
376 str_alloc(VALUE klass)
377 {
378     NEWOBJ_OF(str, struct RString, klass, T_STRING);
379 
380     str->as.heap.ptr = 0;
381     str->as.heap.len = 0;
382     str->as.heap.aux.capa = 0;
383 
384     return (VALUE)str;
385 }
386 
387 static inline VALUE
388 empty_str_alloc(VALUE klass)
389 {
390     if (RUBY_DTRACE_STRING_CREATE_ENABLED()) {
391         RUBY_DTRACE_STRING_CREATE(0, rb_sourcefile(), rb_sourceline());
392     }
393     return str_alloc(klass);
394 }
395 
396 static VALUE
397 str_new(VALUE klass, const char *ptr, long len)
398 {
399     VALUE str;
400 
401     if (len < 0) {
402         rb_raise(rb_eArgError, "negative string size (or size too big)");
403     }
404 
405     if (RUBY_DTRACE_STRING_CREATE_ENABLED()) {
406         RUBY_DTRACE_STRING_CREATE(len, rb_sourcefile(), rb_sourceline());
407     }
408 
409     str = str_alloc(klass);
410     if (len > RSTRING_EMBED_LEN_MAX) {
411         RSTRING(str)->as.heap.aux.capa = len;
412         RSTRING(str)->as.heap.ptr = ALLOC_N(char,len+1);
413         STR_SET_NOEMBED(str);
414     }
415     else if (len == 0) {
416         ENC_CODERANGE_SET(str, ENC_CODERANGE_7BIT);
417     }
418     if (ptr) {
419         memcpy(RSTRING_PTR(str), ptr, len);
420     }
421     STR_SET_LEN(str, len);
422     RSTRING_PTR(str)[len] = '\0';
423     return str;
424 }
425 
426 VALUE
427 rb_str_new(const char *ptr, long len)
428 {
429     return str_new(rb_cString, ptr, len);
430 }
431 
432 VALUE
433 rb_usascii_str_new(const char *ptr, long len)
434 {
435     VALUE str = rb_str_new(ptr, len);
436     ENCODING_CODERANGE_SET(str, rb_usascii_encindex(), ENC_CODERANGE_7BIT);
437     return str;
438 }
439 
440 VALUE
441 rb_enc_str_new(const char *ptr, long len, rb_encoding *enc)
442 {
443     VALUE str = rb_str_new(ptr, len);
444     rb_enc_associate(str, enc);
445     return str;
446 }
447 
448 VALUE
449 rb_str_new_cstr(const char *ptr)
450 {
451     if (!ptr) {
452         rb_raise(rb_eArgError, "NULL pointer given");
453     }
454     return rb_str_new(ptr, strlen(ptr));
455 }
456 
457 RUBY_ALIAS_FUNCTION(rb_str_new2(const char *ptr), rb_str_new_cstr, (ptr))
458 #define rb_str_new2 rb_str_new_cstr
459 
460 VALUE
461 rb_usascii_str_new_cstr(const char *ptr)
462 {
463     VALUE str = rb_str_new2(ptr);
464     ENCODING_CODERANGE_SET(str, rb_usascii_encindex(), ENC_CODERANGE_7BIT);
465     return str;
466 }
467 
468 RUBY_ALIAS_FUNCTION(rb_usascii_str_new2(const char *ptr), rb_usascii_str_new_cstr, (ptr))
469 #define rb_usascii_str_new2 rb_usascii_str_new_cstr
470 
471 VALUE
472 rb_tainted_str_new(const char *ptr, long len)
473 {
474     VALUE str = rb_str_new(ptr, len);
475 
476     OBJ_TAINT(str);
477     return str;
478 }
479 
480 VALUE
481 rb_tainted_str_new_cstr(const char *ptr)
482 {
483     VALUE str = rb_str_new2(ptr);
484 
485     OBJ_TAINT(str);
486     return str;
487 }
488 
489 RUBY_ALIAS_FUNCTION(rb_tainted_str_new2(const char *ptr), rb_tainted_str_new_cstr, (ptr))
490 #define rb_tainted_str_new2 rb_tainted_str_new_cstr
491 
492 VALUE
493 rb_str_conv_enc_opts(VALUE str, rb_encoding *from, rb_encoding *to, int ecflags, VALUE ecopts)
494 {
495     extern VALUE rb_cEncodingConverter;
496     rb_econv_t *ec;
497     rb_econv_result_t ret;
498     long len, olen;
499     VALUE econv_wrapper;
500     VALUE newstr;
501     const unsigned char *start, *sp;
502     unsigned char *dest, *dp;
503     size_t converted_output = 0;
504 
505     if (!to) return str;
506     if (!from) from = rb_enc_get(str);
507     if (from == to) return str;
508     if ((rb_enc_asciicompat(to) && ENC_CODERANGE(str) == ENC_CODERANGE_7BIT) ||
509         to == rb_ascii8bit_encoding()) {
510         if (STR_ENC_GET(str) != to) {
511             str = rb_str_dup(str);
512             rb_enc_associate(str, to);
513         }
514         return str;
515     }
516 
517     len = RSTRING_LEN(str);
518     newstr = rb_str_new(0, len);
519     olen = len;
520 
521     econv_wrapper = rb_obj_alloc(rb_cEncodingConverter);
522     RBASIC(econv_wrapper)->klass = 0;
523     ec = rb_econv_open_opts(from->name, to->name, ecflags, ecopts);
524     if (!ec) return str;
525     DATA_PTR(econv_wrapper) = ec;
526 
527     sp = (unsigned char*)RSTRING_PTR(str);
528     start = sp;
529     while ((dest = (unsigned char*)RSTRING_PTR(newstr)),
530            (dp = dest + converted_output),
531            (ret = rb_econv_convert(ec, &sp, start + len, &dp, dest + olen, 0)),
532            ret == econv_destination_buffer_full) {
533         /* destination buffer short */
534         size_t converted_input = sp - start;
535         size_t rest = len - converted_input;
536         converted_output = dp - dest;
537         rb_str_set_len(newstr, converted_output);
538         if (converted_input && converted_output &&
539             rest < (LONG_MAX / converted_output)) {
540             rest = (rest * converted_output) / converted_input;
541         }
542         else {
543             rest = olen;
544         }
545         olen += rest < 2 ? 2 : rest;
546         rb_str_resize(newstr, olen);
547     }
548     DATA_PTR(econv_wrapper) = 0;
549     rb_econv_close(ec);
550     rb_gc_force_recycle(econv_wrapper);
551     switch (ret) {
552       case econv_finished:
553         len = dp - (unsigned char*)RSTRING_PTR(newstr);
554         rb_str_set_len(newstr, len);
555         rb_enc_associate(newstr, to);
556         return newstr;
557 
558       default:
559         /* some error, return original */
560         return str;
561     }
562 }
563 
564 VALUE
565 rb_str_conv_enc(VALUE str, rb_encoding *from, rb_encoding *to)
566 {
567     return rb_str_conv_enc_opts(str, from, to, 0, Qnil);
568 }
569 
570 VALUE
571 rb_external_str_new_with_enc(const char *ptr, long len, rb_encoding *eenc)
572 {
573     VALUE str;
574 
575     str = rb_tainted_str_new(ptr, len);
576     if (eenc == rb_usascii_encoding() &&
577         rb_enc_str_coderange(str) != ENC_CODERANGE_7BIT) {
578         rb_enc_associate(str, rb_ascii8bit_encoding());
579         return str;
580     }
581     rb_enc_associate(str, eenc);
582     return rb_str_conv_enc(str, eenc, rb_default_internal_encoding());
583 }
584 
585 VALUE
586 rb_external_str_new(const char *ptr, long len)
587 {
588     return rb_external_str_new_with_enc(ptr, len, rb_default_external_encoding());
589 }
590 
591 VALUE
592 rb_external_str_new_cstr(const char *ptr)
593 {
594     return rb_external_str_new_with_enc(ptr, strlen(ptr), rb_default_external_encoding());
595 }
596 
597 VALUE
598 rb_locale_str_new(const char *ptr, long len)
599 {
600     return rb_external_str_new_with_enc(ptr, len, rb_locale_encoding());
601 }
602 
603 VALUE
604 rb_locale_str_new_cstr(const char *ptr)
605 {
606     return rb_external_str_new_with_enc(ptr, strlen(ptr), rb_locale_encoding());
607 }
608 
609 VALUE
610 rb_filesystem_str_new(const char *ptr, long len)
611 {
612     return rb_external_str_new_with_enc(ptr, len, rb_filesystem_encoding());
613 }
614 
615 VALUE
616 rb_filesystem_str_new_cstr(const char *ptr)
617 {
618     return rb_external_str_new_with_enc(ptr, strlen(ptr), rb_filesystem_encoding());
619 }
620 
621 VALUE
622 rb_str_export(VALUE str)
623 {
624     return rb_str_conv_enc(str, STR_ENC_GET(str), rb_default_external_encoding());
625 }
626 
627 VALUE
628 rb_str_export_locale(VALUE str)
629 {
630     return rb_str_conv_enc(str, STR_ENC_GET(str), rb_locale_encoding());
631 }
632 
633 VALUE
634 rb_str_export_to_enc(VALUE str, rb_encoding *enc)
635 {
636     return rb_str_conv_enc(str, STR_ENC_GET(str), enc);
637 }
638 
639 static VALUE
640 str_replace_shared_without_enc(VALUE str2, VALUE str)
641 {
642     if (RSTRING_LEN(str) <= RSTRING_EMBED_LEN_MAX) {
643         STR_SET_EMBED(str2);
644         memcpy(RSTRING_PTR(str2), RSTRING_PTR(str), RSTRING_LEN(str)+1);
645         STR_SET_EMBED_LEN(str2, RSTRING_LEN(str));
646     }
647     else {
648         str = rb_str_new_frozen(str);
649         FL_SET(str2, STR_NOEMBED);
650         RSTRING(str2)->as.heap.len = RSTRING_LEN(str);
651         RSTRING(str2)->as.heap.ptr = RSTRING_PTR(str);
652         RSTRING(str2)->as.heap.aux.shared = str;
653         FL_SET(str2, ELTS_SHARED);
654     }
655     return str2;
656 }
657 
658 static VALUE
659 str_replace_shared(VALUE str2, VALUE str)
660 {
661     str_replace_shared_without_enc(str2, str);
662     rb_enc_cr_str_exact_copy(str2, str);
663     return str2;
664 }
665 
666 static VALUE
667 str_new_shared(VALUE klass, VALUE str)
668 {
669     return str_replace_shared(str_alloc(klass), str);
670 }
671 
672 static VALUE
673 str_new3(VALUE klass, VALUE str)
674 {
675     return str_new_shared(klass, str);
676 }
677 
678 VALUE
679 rb_str_new_shared(VALUE str)
680 {
681     VALUE str2 = str_new3(rb_obj_class(str), str);
682 
683     OBJ_INFECT(str2, str);
684     return str2;
685 }
686 
687 RUBY_ALIAS_FUNCTION(rb_str_new3(VALUE str), rb_str_new_shared, (str))
688 #define rb_str_new3 rb_str_new_shared
689 
690 static VALUE
691 str_new4(VALUE klass, VALUE str)
692 {
693     VALUE str2;
694 
695     str2 = str_alloc(klass);
696     STR_SET_NOEMBED(str2);
697     RSTRING(str2)->as.heap.len = RSTRING_LEN(str);
698     RSTRING(str2)->as.heap.ptr = RSTRING_PTR(str);
699     if (STR_SHARED_P(str)) {
700         VALUE shared = RSTRING(str)->as.heap.aux.shared;
701         assert(OBJ_FROZEN(shared));
702         FL_SET(str2, ELTS_SHARED);
703         RSTRING(str2)->as.heap.aux.shared = shared;
704     }
705     else {
706         FL_SET(str, ELTS_SHARED);
707         RSTRING(str)->as.heap.aux.shared = str2;
708     }
709     rb_enc_cr_str_exact_copy(str2, str);
710     OBJ_INFECT(str2, str);
711     return str2;
712 }
713 
714 VALUE
715 rb_str_new_frozen(VALUE orig)
716 {
717     VALUE klass, str;
718 
719     if (OBJ_FROZEN(orig)) return orig;
720     klass = rb_obj_class(orig);
721     if (STR_SHARED_P(orig) && (str = RSTRING(orig)->as.heap.aux.shared)) {
722         long ofs;
723         assert(OBJ_FROZEN(str));
724         ofs = RSTRING_LEN(str) - RSTRING_LEN(orig);
725         if ((ofs > 0) || (klass != RBASIC(str)->klass) ||
726             ((RBASIC(str)->flags ^ RBASIC(orig)->flags) & (FL_TAINT|FL_UNTRUSTED)) ||
727             ENCODING_GET(str) != ENCODING_GET(orig)) {
728             str = str_new3(klass, str);
729             RSTRING(str)->as.heap.ptr += ofs;
730             RSTRING(str)->as.heap.len -= ofs;
731             rb_enc_cr_str_exact_copy(str, orig);
732             OBJ_INFECT(str, orig);
733         }
734     }
735     else if (STR_EMBED_P(orig)) {
736         str = str_new(klass, RSTRING_PTR(orig), RSTRING_LEN(orig));
737         rb_enc_cr_str_exact_copy(str, orig);
738         OBJ_INFECT(str, orig);
739     }
740     else if (STR_ASSOC_P(orig)) {
741         VALUE assoc = RSTRING(orig)->as.heap.aux.shared;
742         FL_UNSET(orig, STR_ASSOC);
743         str = str_new4(klass, orig);
744         FL_SET(str, STR_ASSOC);
745         RSTRING(str)->as.heap.aux.shared = assoc;
746     }
747     else {
748         str = str_new4(klass, orig);
749     }
750     OBJ_FREEZE(str);
751     return str;
752 }
753 
754 RUBY_ALIAS_FUNCTION(rb_str_new4(VALUE orig), rb_str_new_frozen, (orig))
755 #define rb_str_new4 rb_str_new_frozen
756 
757 VALUE
758 rb_str_new_with_class(VALUE obj, const char *ptr, long len)
759 {
760     return str_new(rb_obj_class(obj), ptr, len);
761 }
762 
763 RUBY_ALIAS_FUNCTION(rb_str_new5(VALUE obj, const char *ptr, long len),
764            rb_str_new_with_class, (obj, ptr, len))
765 #define rb_str_new5 rb_str_new_with_class
766 
767 static VALUE
768 str_new_empty(VALUE str)
769 {
770     VALUE v = rb_str_new5(str, 0, 0);
771     rb_enc_copy(v, str);
772     OBJ_INFECT(v, str);
773     return v;
774 }
775 
776 #define STR_BUF_MIN_SIZE 128
777 
778 VALUE
779 rb_str_buf_new(long capa)
780 {
781     VALUE str = str_alloc(rb_cString);
782 
783     if (capa < STR_BUF_MIN_SIZE) {
784         capa = STR_BUF_MIN_SIZE;
785     }
786     FL_SET(str, STR_NOEMBED);
787     RSTRING(str)->as.heap.aux.capa = capa;
788     RSTRING(str)->as.heap.ptr = ALLOC_N(char, capa+1);
789     RSTRING(str)->as.heap.ptr[0] = '\0';
790 
791     return str;
792 }
793 
794 VALUE
795 rb_str_buf_new_cstr(const char *ptr)
796 {
797     VALUE str;
798     long len = strlen(ptr);
799 
800     str = rb_str_buf_new(len);
801     rb_str_buf_cat(str, ptr, len);
802 
803     return str;
804 }
805 
806 RUBY_ALIAS_FUNCTION(rb_str_buf_new2(const char *ptr), rb_str_buf_new_cstr, (ptr))
807 #define rb_str_buf_new2 rb_str_buf_new_cstr
808 
809 VALUE
810 rb_str_tmp_new(long len)
811 {
812     return str_new(0, 0, len);
813 }
814 
815 void *
816 rb_alloc_tmp_buffer(volatile VALUE *store, long len)
817 {
818     VALUE s = rb_str_tmp_new(len);
819     *store = s;
820     return RSTRING_PTR(s);
821 }
822 
823 void
824 rb_free_tmp_buffer(volatile VALUE *store)
825 {
826     VALUE s = *store;
827     *store = 0;
828     if (s) rb_str_clear(s);
829 }
830 
831 void
832 rb_str_free(VALUE str)
833 {
834     if (!STR_EMBED_P(str) && !STR_SHARED_P(str)) {
835         xfree(RSTRING(str)->as.heap.ptr);
836     }
837 }
838 
839 RUBY_FUNC_EXPORTED size_t
840 rb_str_memsize(VALUE str)
841 {
842     if (!STR_EMBED_P(str) && !STR_SHARED_P(str)) {
843         return RSTRING(str)->as.heap.aux.capa;
844     }
845     else {
846         return 0;
847     }
848 }
849 
850 VALUE
851 rb_str_to_str(VALUE str)
852 {
853     return rb_convert_type(str, T_STRING, "String", "to_str");
854 }
855 
856 static inline void str_discard(VALUE str);
857 
858 void
859 rb_str_shared_replace(VALUE str, VALUE str2)
860 {
861     rb_encoding *enc;
862     int cr;
863     if (str == str2) return;
864     enc = STR_ENC_GET(str2);
865     cr = ENC_CODERANGE(str2);
866     str_discard(str);
867     OBJ_INFECT(str, str2);
868     if (RSTRING_LEN(str2) <= RSTRING_EMBED_LEN_MAX) {
869         STR_SET_EMBED(str);
870         memcpy(RSTRING_PTR(str), RSTRING_PTR(str2), RSTRING_LEN(str2)+1);
871         STR_SET_EMBED_LEN(str, RSTRING_LEN(str2));
872         rb_enc_associate(str, enc);
873         ENC_CODERANGE_SET(str, cr);
874         return;
875     }
876     STR_SET_NOEMBED(str);
877     STR_UNSET_NOCAPA(str);
878     RSTRING(str)->as.heap.ptr = RSTRING_PTR(str2);
879     RSTRING(str)->as.heap.len = RSTRING_LEN(str2);
880     if (STR_NOCAPA_P(str2)) {
881         FL_SET(str, RBASIC(str2)->flags & STR_NOCAPA);
882         RSTRING(str)->as.heap.aux.shared = RSTRING(str2)->as.heap.aux.shared;
883     }
884     else {
885         RSTRING(str)->as.heap.aux.capa = RSTRING(str2)->as.heap.aux.capa;
886     }
887     STR_SET_EMBED(str2);        /* abandon str2 */
888     RSTRING_PTR(str2)[0] = 0;
889     STR_SET_EMBED_LEN(str2, 0);
890     rb_enc_associate(str, enc);
891     ENC_CODERANGE_SET(str, cr);
892 }
893 
894 static ID id_to_s;
895 
896 VALUE
897 rb_obj_as_string(VALUE obj)
898 {
899     VALUE str;
900 
901     if (RB_TYPE_P(obj, T_STRING)) {
902         return obj;
903     }
904     str = rb_funcall(obj, id_to_s, 0);
905     if (!RB_TYPE_P(str, T_STRING))
906         return rb_any_to_s(obj);
907     if (OBJ_TAINTED(obj)) OBJ_TAINT(str);
908     return str;
909 }
910 
911 static VALUE
912 str_replace(VALUE str, VALUE str2)
913 {
914     long len;
915 
916     len = RSTRING_LEN(str2);
917     if (STR_ASSOC_P(str2)) {
918         str2 = rb_str_new4(str2);
919     }
920     if (STR_SHARED_P(str2)) {
921         VALUE shared = RSTRING(str2)->as.heap.aux.shared;
922         assert(OBJ_FROZEN(shared));
923         STR_SET_NOEMBED(str);
924         RSTRING(str)->as.heap.len = len;
925         RSTRING(str)->as.heap.ptr = RSTRING_PTR(str2);
926         FL_SET(str, ELTS_SHARED);
927         FL_UNSET(str, STR_ASSOC);
928         RSTRING(str)->as.heap.aux.shared = shared;
929     }
930     else {
931         str_replace_shared(str, str2);
932     }
933 
934     OBJ_INFECT(str, str2);
935     rb_enc_cr_str_exact_copy(str, str2);
936     return str;
937 }
938 
939 static VALUE
940 str_duplicate(VALUE klass, VALUE str)
941 {
942     VALUE dup = str_alloc(klass);
943     str_replace(dup, str);
944     return dup;
945 }
946 
947 VALUE
948 rb_str_dup(VALUE str)
949 {
950     return str_duplicate(rb_obj_class(str), str);
951 }
952 
953 VALUE
954 rb_str_resurrect(VALUE str)
955 {
956     if (RUBY_DTRACE_STRING_CREATE_ENABLED()) {
957         RUBY_DTRACE_STRING_CREATE(RSTRING_LEN(str),
958                                   rb_sourcefile(), rb_sourceline());
959     }
960     return str_replace(str_alloc(rb_cString), str);
961 }
962 
963 /*
964  *  call-seq:
965  *     String.new(str="")   -> new_str
966  *
967  *  Returns a new string object containing a copy of <i>str</i>.
968  */
969 
970 static VALUE
971 rb_str_init(int argc, VALUE *argv, VALUE str)
972 {
973     VALUE orig;
974 
975     if (argc > 0 && rb_scan_args(argc, argv, "01", &orig) == 1)
976         rb_str_replace(str, orig);
977     return str;
978 }
979 
980 static inline long
981 enc_strlen(const char *p, const char *e, rb_encoding *enc, int cr)
982 {
983     long c;
984     const char *q;
985 
986     if (rb_enc_mbmaxlen(enc) == rb_enc_mbminlen(enc)) {
987         return (e - p + rb_enc_mbminlen(enc) - 1) / rb_enc_mbminlen(enc);
988     }
989     else if (rb_enc_asciicompat(enc)) {
990         c = 0;
991         if (cr == ENC_CODERANGE_7BIT || cr == ENC_CODERANGE_VALID) {
992             while (p < e) {
993                 if (ISASCII(*p)) {
994                     q = search_nonascii(p, e);
995                     if (!q)
996                         return c + (e - p);
997                     c += q - p;
998                     p = q;
999                 }
1000                 p += rb_enc_fast_mbclen(p, e, enc);
1001                 c++;
1002             }
1003         }
1004         else {
1005             while (p < e) {
1006                 if (ISASCII(*p)) {
1007                     q = search_nonascii(p, e);
1008                     if (!q)
1009                         return c + (e - p);
1010                     c += q - p;
1011                     p = q;
1012                 }
1013                 p += rb_enc_mbclen(p, e, enc);
1014                 c++;
1015             }
1016         }
1017         return c;
1018     }
1019 
1020     for (c=0; p<e; c++) {
1021         p += rb_enc_mbclen(p, e, enc);
1022     }
1023     return c;
1024 }
1025 
1026 long
1027 rb_enc_strlen(const char *p, const char *e, rb_encoding *enc)
1028 {
1029     return enc_strlen(p, e, enc, ENC_CODERANGE_UNKNOWN);
1030 }
1031 
1032 long
1033 rb_enc_strlen_cr(const char *p, const char *e, rb_encoding *enc, int *cr)
1034 {
1035     long c;
1036     const char *q;
1037     int ret;
1038 
1039     *cr = 0;
1040     if (rb_enc_mbmaxlen(enc) == rb_enc_mbminlen(enc)) {
1041         return (e - p + rb_enc_mbminlen(enc) - 1) / rb_enc_mbminlen(enc);
1042     }
1043     else if (rb_enc_asciicompat(enc)) {
1044         c = 0;
1045         while (p < e) {
1046             if (ISASCII(*p)) {
1047                 q = search_nonascii(p, e);
1048                 if (!q) {
1049                     if (!*cr) *cr = ENC_CODERANGE_7BIT;
1050                     return c + (e - p);
1051                 }
1052                 c += q - p;
1053                 p = q;
1054             }
1055             ret = rb_enc_precise_mbclen(p, e, enc);
1056             if (MBCLEN_CHARFOUND_P(ret)) {
1057                 *cr |= ENC_CODERANGE_VALID;
1058                 p += MBCLEN_CHARFOUND_LEN(ret);
1059             }
1060             else {
1061                 *cr = ENC_CODERANGE_BROKEN;
1062                 p++;
1063             }
1064             c++;
1065         }
1066         if (!*cr) *cr = ENC_CODERANGE_7BIT;
1067         return c;
1068     }
1069 
1070     for (c=0; p<e; c++) {
1071         ret = rb_enc_precise_mbclen(p, e, enc);
1072         if (MBCLEN_CHARFOUND_P(ret)) {
1073             *cr |= ENC_CODERANGE_VALID;
1074             p += MBCLEN_CHARFOUND_LEN(ret);
1075         }
1076         else {
1077             *cr = ENC_CODERANGE_BROKEN;
1078             if (p + rb_enc_mbminlen(enc) <= e)
1079                 p += rb_enc_mbminlen(enc);
1080             else
1081                 p = e;
1082         }
1083     }
1084     if (!*cr) *cr = ENC_CODERANGE_7BIT;
1085     return c;
1086 }
1087 
#ifdef NONASCII_MASK
#define is_utf8_lead_byte(c) (((c)&0xC0) != 0x80)

/*
 * A UTF-8 leading byte has the bit pattern 0xxxxxxx or 11xxxxxx
 * (see http://en.wikipedia.org/wiki/UTF-8), so the following pseudo code
 * detects a leading byte:
 *
 * if (!(byte & 0x80))
 *   byte |= 0x40;          // turn on bit6
 * return ((byte>>6) & 1);  // bit6 tells whether it is a leading byte.
 *
 * This function applies that logic to every byte of the word `*s' at
 * once and then adds the per-byte results together.
 */
static inline VALUE
count_utf8_lead_bytes_with_word(const VALUE *s)
{
    VALUE bits = *s;

    /* Reduce every byte to a single flag in its bit0: 1 if leading. */
    bits = (bits | ~(bits >> 1)) >> 6;
    bits &= NONASCII_MASK >> 7;

    /* Sum the per-byte flags into the low bits. */
    bits += (bits >> 8);
    bits += (bits >> 16);
#if SIZEOF_VALUE == 8
    bits += (bits >> 32);
#endif
    return (bits & 0xF);
}
#endif
1122 
1123 static long
1124 str_strlen(VALUE str, rb_encoding *enc)
1125 {
1126     const char *p, *e;
1127     long n;
1128     int cr;
1129 
1130     if (single_byte_optimizable(str)) return RSTRING_LEN(str);
1131     if (!enc) enc = STR_ENC_GET(str);
1132     p = RSTRING_PTR(str);
1133     e = RSTRING_END(str);
1134     cr = ENC_CODERANGE(str);
1135 #ifdef NONASCII_MASK
1136     if (ENC_CODERANGE(str) == ENC_CODERANGE_VALID &&
1137         enc == rb_utf8_encoding()) {
1138 
1139         VALUE len = 0;
1140         if ((int)sizeof(VALUE) * 2 < e - p) {
1141             const VALUE *s, *t;
1142             const VALUE lowbits = sizeof(VALUE) - 1;
1143             s = (const VALUE*)(~lowbits & ((VALUE)p + lowbits));
1144             t = (const VALUE*)(~lowbits & (VALUE)e);
1145             while (p < (const char *)s) {
1146                 if (is_utf8_lead_byte(*p)) len++;
1147                 p++;
1148             }
1149             while (s < t) {
1150                 len += count_utf8_lead_bytes_with_word(s);
1151                 s++;
1152             }
1153             p = (const char *)s;
1154         }
1155         while (p < e) {
1156             if (is_utf8_lead_byte(*p)) len++;
1157             p++;
1158         }
1159         return (long)len;
1160     }
1161 #endif
1162     n = rb_enc_strlen_cr(p, e, enc, &cr);
1163     if (cr) {
1164         ENC_CODERANGE_SET(str, cr);
1165     }
1166     return n;
1167 }
1168 
1169 long
1170 rb_str_strlen(VALUE str)
1171 {
1172     return str_strlen(str, STR_ENC_GET(str));
1173 }
1174 
1175 /*
1176  *  call-seq:
1177  *     str.length   -> integer
1178  *     str.size     -> integer
1179  *
1180  *  Returns the character length of <i>str</i>.
1181  */
1182 
1183 VALUE
1184 rb_str_length(VALUE str)
1185 {
1186     long len;
1187 
1188     len = str_strlen(str, STR_ENC_GET(str));
1189     return LONG2NUM(len);
1190 }
1191 
1192 /*
1193  *  call-seq:
1194  *     str.bytesize  -> integer
1195  *
1196  *  Returns the length of +str+ in bytes.
1197  *
1198  *    "\x80\u3042".bytesize  #=> 4
1199  *    "hello".bytesize       #=> 5
1200  */
1201 
1202 static VALUE
1203 rb_str_bytesize(VALUE str)
1204 {
1205     return LONG2NUM(RSTRING_LEN(str));
1206 }
1207 
1208 /*
1209  *  call-seq:
1210  *     str.empty?   -> true or false
1211  *
1212  *  Returns <code>true</code> if <i>str</i> has a length of zero.
1213  *
1214  *     "hello".empty?   #=> false
1215  *     " ".empty?       #=> false
1216  *     "".empty?        #=> true
1217  */
1218 
1219 static VALUE
1220 rb_str_empty(VALUE str)
1221 {
1222     if (RSTRING_LEN(str) == 0)
1223         return Qtrue;
1224     return Qfalse;
1225 }
1226 
1227 /*
1228  *  call-seq:
1229  *     str + other_str   -> new_str
1230  *
1231  *  Concatenation---Returns a new <code>String</code> containing
1232  *  <i>other_str</i> concatenated to <i>str</i>.
1233  *
1234  *     "Hello from " + self.to_s   #=> "Hello from main"
1235  */
1236 
1237 VALUE
1238 rb_str_plus(VALUE str1, VALUE str2)
1239 {
1240     VALUE str3;
1241     rb_encoding *enc;
1242 
1243     StringValue(str2);
1244     enc = rb_enc_check(str1, str2);
1245     str3 = rb_str_new(0, RSTRING_LEN(str1)+RSTRING_LEN(str2));
1246     memcpy(RSTRING_PTR(str3), RSTRING_PTR(str1), RSTRING_LEN(str1));
1247     memcpy(RSTRING_PTR(str3) + RSTRING_LEN(str1),
1248            RSTRING_PTR(str2), RSTRING_LEN(str2));
1249     RSTRING_PTR(str3)[RSTRING_LEN(str3)] = '\0';
1250 
1251     if (OBJ_TAINTED(str1) || OBJ_TAINTED(str2))
1252         OBJ_TAINT(str3);
1253     ENCODING_CODERANGE_SET(str3, rb_enc_to_index(enc),
1254                            ENC_CODERANGE_AND(ENC_CODERANGE(str1), ENC_CODERANGE(str2)));
1255     return str3;
1256 }
1257 
1258 /*
1259  *  call-seq:
1260  *     str * integer   -> new_str
1261  *
1262  *  Copy --- Returns a new String containing +integer+ copies of the receiver.
1263  *  +integer+ must be greater than or equal to 0.
1264  *
1265  *     "Ho! " * 3   #=> "Ho! Ho! Ho! "
1266  *     "Ho! " * 0   #=> ""
1267  */
1268 
1269 VALUE
1270 rb_str_times(VALUE str, VALUE times)
1271 {
1272     VALUE str2;
1273     long n, len;
1274     char *ptr2;
1275 
1276     len = NUM2LONG(times);
1277     if (len < 0) {
1278         rb_raise(rb_eArgError, "negative argument");
1279     }
1280     if (len && LONG_MAX/len <  RSTRING_LEN(str)) {
1281         rb_raise(rb_eArgError, "argument too big");
1282     }
1283 
1284     str2 = rb_str_new5(str, 0, len *= RSTRING_LEN(str));
1285     ptr2 = RSTRING_PTR(str2);
1286     if (len) {
1287         n = RSTRING_LEN(str);
1288         memcpy(ptr2, RSTRING_PTR(str), n);
1289         while (n <= len/2) {
1290             memcpy(ptr2 + n, ptr2, n);
1291             n *= 2;
1292         }
1293         memcpy(ptr2 + n, ptr2, len-n);
1294     }
1295     ptr2[RSTRING_LEN(str2)] = '\0';
1296     OBJ_INFECT(str2, str);
1297     rb_enc_cr_str_copy_for_substr(str2, str);
1298 
1299     return str2;
1300 }
1301 
1302 /*
1303  *  call-seq:
1304  *     str % arg   -> new_str
1305  *
1306  *  Format---Uses <i>str</i> as a format specification, and returns the result
1307  *  of applying it to <i>arg</i>. If the format specification contains more than
1308  *  one substitution, then <i>arg</i> must be an <code>Array</code> or <code>Hash</code>
1309  *  containing the values to be substituted. See <code>Kernel::sprintf</code> for
1310  *  details of the format string.
1311  *
1312  *     "%05d" % 123                              #=> "00123"
1313  *     "%-5s: %08x" % [ "ID", self.object_id ]   #=> "ID   : 200e14d6"
1314  *     "foo = %{foo}" % { :foo => 'bar' }        #=> "foo = bar"
1315  */
1316 
1317 static VALUE
1318 rb_str_format_m(VALUE str, VALUE arg)
1319 {
1320     volatile VALUE tmp = rb_check_array_type(arg);
1321 
1322     if (!NIL_P(tmp)) {
1323         return rb_str_format(RARRAY_LENINT(tmp), RARRAY_PTR(tmp), str);
1324     }
1325     return rb_str_format(1, &arg, str);
1326 }
1327 
1328 static inline void
1329 str_modifiable(VALUE str)
1330 {
1331     if (FL_TEST(str, STR_TMPLOCK)) {
1332         rb_raise(rb_eRuntimeError, "can't modify string; temporarily locked");
1333     }
1334     rb_check_frozen(str);
1335     if (!OBJ_UNTRUSTED(str) && rb_safe_level() >= 4)
1336         rb_raise(rb_eSecurityError, "Insecure: can't modify string");
1337 }
1338 
1339 static inline int
1340 str_independent(VALUE str)
1341 {
1342     str_modifiable(str);
1343     if (!STR_SHARED_P(str)) return 1;
1344     if (STR_EMBED_P(str)) return 1;
1345     return 0;
1346 }
1347 
1348 static void
1349 str_make_independent_expand(VALUE str, long expand)
1350 {
1351     char *ptr;
1352     long len = RSTRING_LEN(str);
1353     long capa = len + expand;
1354 
1355     if (len > capa) len = capa;
1356     ptr = ALLOC_N(char, capa + 1);
1357     if (RSTRING_PTR(str)) {
1358         memcpy(ptr, RSTRING_PTR(str), len);
1359     }
1360     STR_SET_NOEMBED(str);
1361     STR_UNSET_NOCAPA(str);
1362     ptr[len] = 0;
1363     RSTRING(str)->as.heap.ptr = ptr;
1364     RSTRING(str)->as.heap.len = len;
1365     RSTRING(str)->as.heap.aux.capa = capa;
1366 }
1367 
1368 #define str_make_independent(str) str_make_independent_expand((str), 0L)
1369 
1370 void
1371 rb_str_modify(VALUE str)
1372 {
1373     if (!str_independent(str))
1374         str_make_independent(str);
1375     ENC_CODERANGE_CLEAR(str);
1376 }
1377 
1378 void
1379 rb_str_modify_expand(VALUE str, long expand)
1380 {
1381     if (expand < 0) {
1382         rb_raise(rb_eArgError, "negative expanding string size");
1383     }
1384     if (!str_independent(str)) {
1385         str_make_independent_expand(str, expand);
1386     }
1387     else if (expand > 0) {
1388         long len = RSTRING_LEN(str);
1389         long capa = len + expand;
1390         if (!STR_EMBED_P(str)) {
1391             REALLOC_N(RSTRING(str)->as.heap.ptr, char, capa+1);
1392             RSTRING(str)->as.heap.aux.capa = capa;
1393         }
1394         else if (capa > RSTRING_EMBED_LEN_MAX) {
1395             str_make_independent_expand(str, expand);
1396         }
1397     }
1398     ENC_CODERANGE_CLEAR(str);
1399 }
1400 
1401 /* As rb_str_modify(), but don't clear coderange */
1402 static void
1403 str_modify_keep_cr(VALUE str)
1404 {
1405     if (!str_independent(str))
1406         str_make_independent(str);
1407     if (ENC_CODERANGE(str) == ENC_CODERANGE_BROKEN)
1408         /* Force re-scan later */
1409         ENC_CODERANGE_CLEAR(str);
1410 }
1411 
1412 static inline void
1413 str_discard(VALUE str)
1414 {
1415     str_modifiable(str);
1416     if (!STR_SHARED_P(str) && !STR_EMBED_P(str)) {
1417         xfree(RSTRING_PTR(str));
1418         RSTRING(str)->as.heap.ptr = 0;
1419         RSTRING(str)->as.heap.len = 0;
1420     }
1421 }
1422 
1423 void
1424 rb_str_associate(VALUE str, VALUE add)
1425 {
1426     /* sanity check */
1427     rb_check_frozen(str);
1428     if (STR_ASSOC_P(str)) {
1429         /* already associated */
1430         rb_ary_concat(RSTRING(str)->as.heap.aux.shared, add);
1431     }
1432     else {
1433         if (STR_SHARED_P(str)) {
1434             VALUE assoc = RSTRING(str)->as.heap.aux.shared;
1435             str_make_independent(str);
1436             if (STR_ASSOC_P(assoc)) {
1437                 assoc = RSTRING(assoc)->as.heap.aux.shared;
1438                 rb_ary_concat(assoc, add);
1439                 add = assoc;
1440             }
1441         }
1442         else if (STR_EMBED_P(str)) {
1443             str_make_independent(str);
1444         }
1445         else if (RSTRING(str)->as.heap.aux.capa != RSTRING_LEN(str)) {
1446             RESIZE_CAPA(str, RSTRING_LEN(str));
1447         }
1448         FL_SET(str, STR_ASSOC);
1449         RBASIC(add)->klass = 0;
1450         RSTRING(str)->as.heap.aux.shared = add;
1451     }
1452 }
1453 
1454 VALUE
1455 rb_str_associated(VALUE str)
1456 {
1457     if (STR_SHARED_P(str)) str = RSTRING(str)->as.heap.aux.shared;
1458     if (STR_ASSOC_P(str)) {
1459         return RSTRING(str)->as.heap.aux.shared;
1460     }
1461     return Qfalse;
1462 }
1463 
1464 void
1465 rb_must_asciicompat(VALUE str)
1466 {
1467     rb_encoding *enc = rb_enc_get(str);
1468     if (!rb_enc_asciicompat(enc)) {
1469         rb_raise(rb_eEncCompatError, "ASCII incompatible encoding: %s", rb_enc_name(enc));
1470     }
1471 }
1472 
1473 VALUE
1474 rb_string_value(volatile VALUE *ptr)
1475 {
1476     VALUE s = *ptr;
1477     if (!RB_TYPE_P(s, T_STRING)) {
1478         s = rb_str_to_str(s);
1479         *ptr = s;
1480     }
1481     return s;
1482 }
1483 
1484 char *
1485 rb_string_value_ptr(volatile VALUE *ptr)
1486 {
1487     VALUE str = rb_string_value(ptr);
1488     return RSTRING_PTR(str);
1489 }
1490 
1491 char *
1492 rb_string_value_cstr(volatile VALUE *ptr)
1493 {
1494     VALUE str = rb_string_value(ptr);
1495     char *s = RSTRING_PTR(str);
1496     long len = RSTRING_LEN(str);
1497 
1498     if (!s || memchr(s, 0, len)) {
1499         rb_raise(rb_eArgError, "string contains null byte");
1500     }
1501     if (s[len]) {
1502         rb_str_modify(str);
1503         s = RSTRING_PTR(str);
1504         s[RSTRING_LEN(str)] = 0;
1505     }
1506     return s;
1507 }
1508 
1509 VALUE
1510 rb_check_string_type(VALUE str)
1511 {
1512     str = rb_check_convert_type(str, T_STRING, "String", "to_str");
1513     return str;
1514 }
1515 
1516 /*
1517  *  call-seq:
1518  *     String.try_convert(obj) -> string or nil
1519  *
1520  *  Try to convert <i>obj</i> into a String, using to_str method.
1521  *  Returns converted string or nil if <i>obj</i> cannot be converted
1522  *  for any reason.
1523  *
1524  *     String.try_convert("str")     #=> "str"
1525  *     String.try_convert(/re/)      #=> nil
1526  */
1527 static VALUE
1528 rb_str_s_try_convert(VALUE dummy, VALUE str)
1529 {
1530     return rb_check_string_type(str);
1531 }
1532 
1533 static char*
1534 str_nth_len(const char *p, const char *e, long *nthp, rb_encoding *enc)
1535 {
1536     long nth = *nthp;
1537     if (rb_enc_mbmaxlen(enc) == 1) {
1538         p += nth;
1539     }
1540     else if (rb_enc_mbmaxlen(enc) == rb_enc_mbminlen(enc)) {
1541         p += nth * rb_enc_mbmaxlen(enc);
1542     }
1543     else if (rb_enc_asciicompat(enc)) {
1544         const char *p2, *e2;
1545         int n;
1546 
1547         while (p < e && 0 < nth) {
1548             e2 = p + nth;
1549             if (e < e2) {
1550                 *nthp = nth;
1551                 return (char *)e;
1552             }
1553             if (ISASCII(*p)) {
1554                 p2 = search_nonascii(p, e2);
1555                 if (!p2) {
1556                     nth -= e2 - p;
1557                     *nthp = nth;
1558                     return (char *)e2;
1559                 }
1560                 nth -= p2 - p;
1561                 p = p2;
1562             }
1563             n = rb_enc_mbclen(p, e, enc);
1564             p += n;
1565             nth--;
1566         }
1567         *nthp = nth;
1568         if (nth != 0) {
1569             return (char *)e;
1570         }
1571         return (char *)p;
1572     }
1573     else {
1574         while (p < e && nth--) {
1575             p += rb_enc_mbclen(p, e, enc);
1576         }
1577     }
1578     if (p > e) p = e;
1579     *nthp = nth;
1580     return (char*)p;
1581 }
1582 
1583 char*
1584 rb_enc_nth(const char *p, const char *e, long nth, rb_encoding *enc)
1585 {
1586     return str_nth_len(p, e, &nth, enc);
1587 }
1588 
1589 static char*
1590 str_nth(const char *p, const char *e, long nth, rb_encoding *enc, int singlebyte)
1591 {
1592     if (singlebyte)
1593         p += nth;
1594     else {
1595         p = str_nth_len(p, e, &nth, enc);
1596     }
1597     if (!p) return 0;
1598     if (p > e) p = e;
1599     return (char *)p;
1600 }
1601 
1602 /* char offset to byte offset */
1603 static long
1604 str_offset(const char *p, const char *e, long nth, rb_encoding *enc, int singlebyte)
1605 {
1606     const char *pp = str_nth(p, e, nth, enc, singlebyte);
1607     if (!pp) return e - p;
1608     return pp - p;
1609 }
1610 
1611 long
1612 rb_str_offset(VALUE str, long pos)
1613 {
1614     return str_offset(RSTRING_PTR(str), RSTRING_END(str), pos,
1615                       STR_ENC_GET(str), single_byte_optimizable(str));
1616 }
1617 
1618 #ifdef NONASCII_MASK
1619 static char *
1620 str_utf8_nth(const char *p, const char *e, long *nthp)
1621 {
1622     long nth = *nthp;
1623     if ((int)SIZEOF_VALUE * 2 < e - p && (int)SIZEOF_VALUE * 2 < nth) {
1624         const VALUE *s, *t;
1625         const VALUE lowbits = sizeof(VALUE) - 1;
1626         s = (const VALUE*)(~lowbits & ((VALUE)p + lowbits));
1627         t = (const VALUE*)(~lowbits & (VALUE)e);
1628         while (p < (const char *)s) {
1629             if (is_utf8_lead_byte(*p)) nth--;
1630             p++;
1631         }
1632         do {
1633             nth -= count_utf8_lead_bytes_with_word(s);
1634             s++;
1635         } while (s < t && (int)sizeof(VALUE) <= nth);
1636         p = (char *)s;
1637     }
1638     while (p < e) {
1639         if (is_utf8_lead_byte(*p)) {
1640             if (nth == 0) break;
1641             nth--;
1642         }
1643         p++;
1644     }
1645     *nthp = nth;
1646     return (char *)p;
1647 }
1648 
1649 static long
1650 str_utf8_offset(const char *p, const char *e, long nth)
1651 {
1652     const char *pp = str_utf8_nth(p, e, &nth);
1653     return pp - p;
1654 }
1655 #endif
1656 
1657 /* byte offset to char offset */
1658 long
1659 rb_str_sublen(VALUE str, long pos)
1660 {
1661     if (single_byte_optimizable(str) || pos < 0)
1662         return pos;
1663     else {
1664         char *p = RSTRING_PTR(str);
1665         return enc_strlen(p, p + pos, STR_ENC_GET(str), ENC_CODERANGE(str));
1666     }
1667 }
1668 
1669 VALUE
1670 rb_str_subseq(VALUE str, long beg, long len)
1671 {
1672     VALUE str2;
1673 
1674     if (RSTRING_LEN(str) == beg + len &&
1675         RSTRING_EMBED_LEN_MAX < len) {
1676         str2 = rb_str_new_shared(rb_str_new_frozen(str));
1677         rb_str_drop_bytes(str2, beg);
1678     }
1679     else {
1680         str2 = rb_str_new5(str, RSTRING_PTR(str)+beg, len);
1681         RB_GC_GUARD(str);
1682     }
1683 
1684     rb_enc_cr_str_copy_for_substr(str2, str);
1685     OBJ_INFECT(str2, str);
1686 
1687     return str2;
1688 }
1689 
/*
 * Translate a character-based substring request on str (start index beg,
 * character count *lenp) into bytes.
 *
 * Returns a pointer to the first byte of the substring and stores the
 * substring's byte length in *lenp.  Returns 0, leaving *lenp untouched,
 * when the requested length is negative or beg is rejected as out of
 * range.  A negative beg counts back from the end of the string.
 */
1690 static char *
1691 rb_str_subpos(VALUE str, long beg, long *lenp)
1692 {
1693     long len = *lenp;
1694     long slen = -1L;
1695     long blen = RSTRING_LEN(str);
1696     rb_encoding *enc = STR_ENC_GET(str);
1697     char *p, *s = RSTRING_PTR(str), *e = s + blen;
1698 
1699     if (len < 0) return 0;
    /* An empty string can only yield an empty substring. */
1700     if (!blen) {
1701         len = 0;
1702     }
    /* One byte per character: indexes and lengths are byte counts. */
1703     if (single_byte_optimizable(str)) {
1704         if (beg > blen) return 0;
1705         if (beg < 0) {
1706             beg += blen;
1707             if (beg < 0) return 0;
1708         }
        /* Clip the length to what remains after beg. */
1709         if (beg + len > blen)
1710             len = blen - beg;
1711         if (len < 0) return 0;
1712         p = s + beg;
1713         goto end;
1714     }
1715     if (beg < 0) {
        /* Starting -beg characters from the end leaves at most -beg to take. */
1716         if (len > -beg) len = -beg;
        /*
         * If the start is within the last eighth of the string (using the
         * worst-case character width), step backwards from the end with
         * rb_enc_prev_char() instead of measuring the whole string.
         */
1717         if (-beg * rb_enc_mbmaxlen(enc) < RSTRING_LEN(str) / 8) {
1718             beg = -beg;
            /* Move e back until only len characters separate it from the start. */
1719             while (beg-- > len && (e = rb_enc_prev_char(s, e, e, enc)) != 0);
1720             p = e;
1721             if (!p) return 0;
            /* Then move p back len characters from e. */
1722             while (len-- > 0 && (p = rb_enc_prev_char(s, p, e, enc)) != 0);
1723             if (!p) return 0;
1724             len = e - p;
1725             goto end;
1726         }
1727         else {
            /* Otherwise turn beg into a non-negative character index. */
1728             slen = str_strlen(str, enc);
1729             beg += slen;
1730             if (beg < 0) return 0;
1731             p = s + beg;
1732             if (len == 0) goto end;
1733         }
1734     }
    /* A character index larger than the byte length is rejected outright. */
1735     else if (beg > 0 && beg > RSTRING_LEN(str)) {
1736         return 0;
1737     }
    /* Zero-length request: only beg is validated, against the character length. */
1738     if (len == 0) {
1739         if (beg > str_strlen(str, enc)) return 0;
1740         p = s + beg;
1741     }
1742 #ifdef NONASCII_MASK
    /* Valid UTF-8: locate start and byte length by counting lead bytes. */
1743     else if (ENC_CODERANGE(str) == ENC_CODERANGE_VALID &&
1744         enc == rb_utf8_encoding()) {
1745         p = str_utf8_nth(s, e, &beg);
        /* Characters left unskipped mean beg lies past the end. */
1746         if (beg > 0) return 0;
1747         len = str_utf8_offset(p, e, len);
1748     }
1749 #endif
    /* Fixed-width encoding: scale by the character size, clipping at e. */
1750     else if (rb_enc_mbmaxlen(enc) == rb_enc_mbminlen(enc)) {
1751         int char_sz = rb_enc_mbmaxlen(enc);
1752 
1753         p = s + beg * char_sz;
1754         if (p > e) {
1755             return 0;
1756         }
1757         else if (len * char_sz > e - p)
1758             len = e - p;
1759         else
1760             len *= char_sz;
1761     }
    /* General case: walk beg characters; landing on e yields an empty result. */
1762     else if ((p = str_nth_len(s, e, &beg, enc)) == e) {
1763         if (beg > 0) return 0;
1764         len = 0;
1765     }
1766     else {
        /* Convert the character count into a byte count starting at p. */
1767         len = str_offset(p, e, len, enc, 0);
1768     }
1769   end:
1770     *lenp = len;
1771     RB_GC_GUARD(str);
1772     return p;
1773 }
1774 
1775 VALUE
1776 rb_str_substr(VALUE str, long beg, long len)
1777 {
1778     VALUE str2;
1779     char *p = rb_str_subpos(str, beg, &len);
1780 
1781     if (!p) return Qnil;
1782     if (len > RSTRING_EMBED_LEN_MAX && p + len == RSTRING_END(str)) {
1783         str2 = rb_str_new4(str);
1784         str2 = str_new3(rb_obj_class(str2), str2);
1785         RSTRING(str2)->as.heap.ptr += RSTRING(str2)->as.heap.len - len;
1786         RSTRING(str2)->as.heap.len = len;
1787     }
1788     else {
1789         str2 = rb_str_new5(str, p, len);
1790         rb_enc_cr_str_copy_for_substr(str2, str);
1791         OBJ_INFECT(str2, str);
1792         RB_GC_GUARD(str);
1793     }
1794 
1795     return str2;
1796 }
1797 
1798 VALUE
1799 rb_str_freeze(VALUE str)
1800 {
1801     if (STR_ASSOC_P(str)) {
1802         VALUE ary = RSTRING(str)->as.heap.aux.shared;
1803         OBJ_FREEZE(ary);
1804     }
1805     return rb_obj_freeze(str);
1806 }
1807 
/*
 * NOTE(review): RUBY_ALIAS_FUNCTION is defined outside this file section;
 * presumably it provides rb_str_dup_frozen(str) as an alias that forwards
 * to rb_str_new_frozen(str) -- confirm against the macro definition.
 */
1808 RUBY_ALIAS_FUNCTION(rb_str_dup_frozen(VALUE str), rb_str_new_frozen, (str))
/* From here on, rb_str_dup_frozen in this file expands to rb_str_new_frozen. */
1809 #define rb_str_dup_frozen rb_str_new_frozen
1810 
1811 VALUE
1812 rb_str_locktmp(VALUE str)
1813 {
1814     if (FL_TEST(str, STR_TMPLOCK)) {
1815         rb_raise(rb_eRuntimeError, "temporal locking already locked string");
1816     }
1817     FL_SET(str, STR_TMPLOCK);
1818     return str;
1819 }
1820 
1821 VALUE
1822 rb_str_unlocktmp(VALUE str)
1823 {
1824     if (!FL_TEST(str, STR_TMPLOCK)) {
1825         rb_raise(rb_eRuntimeError, "temporal unlocking already unlocked string");
1826     }
1827     FL_UNSET(str, STR_TMPLOCK);
1828     return str;
1829 }
1830 
1831 void
1832 rb_str_set_len(VALUE str, long len)
1833 {
1834     long capa;
1835 
1836     str_modifiable(str);
1837     if (STR_SHARED_P(str)) {
1838         rb_raise(rb_eRuntimeError, "can't set length of shared string");
1839     }
1840     if (len > (capa = (long)rb_str_capacity(str))) {
1841         rb_bug("probable buffer overflow: %ld for %ld", len, capa);
1842     }
1843     STR_SET_LEN(str, len);
1844     RSTRING_PTR(str)[len] = '\0';
1845 }
1846 
/*
 * Set the byte length of str to len, switching between the embedded and
 * the heap representation as needed, and return str.
 *
 * Raises ArgumentError for a negative len, and whatever str_independent()
 * raises when str may not be modified.  The cached code range is cleared
 * even when the length does not change.
 */
1847 VALUE
1848 rb_str_resize(VALUE str, long len)
1849 {
1850     long slen;
1851     int independent;
1852 
1853     if (len < 0) {
1854         rb_raise(rb_eArgError, "negative string size (or size too big)");
1855     }
1856 
    /* Also raises if str may not be modified. */
1857     independent = str_independent(str);
1858     ENC_CODERANGE_CLEAR(str);
1859     slen = RSTRING_LEN(str);
1860     if (len != slen) {
1861         if (STR_EMBED_P(str)) {
            /* Embedded and still fits: adjust the length in place. */
1862             if (len <= RSTRING_EMBED_LEN_MAX) {
1863                 STR_SET_EMBED_LEN(str, len);
1864                 RSTRING(str)->as.ary[len] = '\0';
1865                 return str;
1866             }
            /* Embedded but too long now: move to a heap buffer of capacity len. */
1867             str_make_independent_expand(str, len - slen);
1868             STR_SET_NOEMBED(str);
1869         }
        /* Heap string that now fits in the embedded buffer. */
1870         else if (len <= RSTRING_EMBED_LEN_MAX) {
            /* Save the heap pointer: the embedded bytes overwrite that field. */
1871             char *ptr = RSTRING(str)->as.heap.ptr;
1872             STR_SET_EMBED(str);
            /* Copy at most len of the old bytes. */
1873             if (slen > len) slen = len;
1874             if (slen > 0) MEMCPY(RSTRING(str)->as.ary, ptr, char, slen);
1875             RSTRING(str)->as.ary[len] = '\0';
1876             STR_SET_EMBED_LEN(str, len);
            /* Free the old buffer only when this string was independent. */
1877             if (independent) xfree(ptr);
1878             return str;
1879         }
        /* Not independent: take a private copy sized for len. */
1880         else if (!independent) {
1881             str_make_independent_expand(str, len - slen);
1882         }
        /* Own buffer: reallocate when growing, or when shrinking by more than 1024 bytes. */
1883         else if (slen < len || slen - len > 1024) {
1884             REALLOC_N(RSTRING(str)->as.heap.ptr, char, len+1);
1885         }
        /* Record the capacity unless the aux slot is used for something else. */
1886         if (!STR_NOCAPA_P(str)) {
1887             RSTRING(str)->as.heap.aux.capa = len;
1888         }
1889         RSTRING(str)->as.heap.len = len;
1890         RSTRING(str)->as.heap.ptr[len] = '\0';  /* sentinel */
1891     }
1892     return str;
1893 }
1894 
1895 static VALUE
1896 str_buf_cat(VALUE str, const char *ptr, long len)
1897 {
1898     long capa, total, off = -1;
1899 
1900     if (ptr >= RSTRING_PTR(str) && ptr <= RSTRING_END(str)) {
1901         off = ptr - RSTRING_PTR(str);
1902     }
1903     rb_str_modify(str);
1904     if (len == 0) return 0;
1905     if (STR_ASSOC_P(str)) {
1906         FL_UNSET(str, STR_ASSOC);
1907         capa = RSTRING(str)->as.heap.aux.capa = RSTRING_LEN(str);
1908     }
1909     else if (STR_EMBED_P(str)) {
1910         capa = RSTRING_EMBED_LEN_MAX;
1911     }
1912     else {
1913         capa = RSTRING(str)->as.heap.aux.capa;
1914     }
1915     if (RSTRING_LEN(str) >= LONG_MAX - len) {
1916         rb_raise(rb_eArgError, "string sizes too big");
1917     }
1918     total = RSTRING_LEN(str)+len;
1919     if (capa <= total) {
1920         while (total > capa) {
1921             if (capa + 1 >= LONG_MAX / 2) {
1922                 capa = (total + 4095) / 4096;
1923                 break;
1924             }
1925             capa = (capa + 1) * 2;
1926         }
1927         RESIZE_CAPA(str, capa);
1928     }
1929     if (off != -1) {
1930         ptr = RSTRING_PTR(str) + off;
1931     }
1932     memcpy(RSTRING_PTR(str) + RSTRING_LEN(str), ptr, len);
1933     STR_SET_LEN(str, total);
1934     RSTRING_PTR(str)[total] = '\0'; /* sentinel */
1935 
1936     return str;
1937 }
1938 
1939 #define str_buf_cat2(str, ptr) str_buf_cat((str), (ptr), strlen(ptr))
1940 
1941 VALUE
1942 rb_str_buf_cat(VALUE str, const char *ptr, long len)
1943 {
1944     if (len == 0) return str;
1945     if (len < 0) {
1946         rb_raise(rb_eArgError, "negative string size (or size too big)");
1947     }
1948     return str_buf_cat(str, ptr, len);
1949 }
1950 
1951 VALUE
1952 rb_str_buf_cat2(VALUE str, const char *ptr)
1953 {
1954     return rb_str_buf_cat(str, ptr, strlen(ptr));
1955 }
1956 
1957 VALUE
1958 rb_str_cat(VALUE str, const char *ptr, long len)
1959 {
1960     if (len < 0) {
1961         rb_raise(rb_eArgError, "negative string size (or size too big)");
1962     }
1963     if (STR_ASSOC_P(str)) {
1964         char *p;
1965         rb_str_modify_expand(str, len);
1966         p = RSTRING(str)->as.heap.ptr;
1967         memcpy(p + RSTRING(str)->as.heap.len, ptr, len);
1968         len = RSTRING(str)->as.heap.len += len;
1969         p[len] = '\0'; /* sentinel */
1970         return str;
1971     }
1972 
1973     return rb_str_buf_cat(str, ptr, len);
1974 }
1975 
1976 VALUE
1977 rb_str_cat2(VALUE str, const char *ptr)
1978 {
1979     return rb_str_cat(str, ptr, strlen(ptr));
1980 }
1981 
/*
 * Append len bytes at ptr to str, where the bytes are in the encoding with
 * index ptr_encindex and have code range ptr_cr, then set str's encoding
 * index and code range for the combined contents.  Returns str.
 *
 * When ptr_cr_ret is non-NULL it receives the code range of the appended
 * bytes as determined here (scanned if it was passed in as unknown).
 * Raises EncCompatError when the two encodings cannot be combined, and
 * ArgumentError for a negative len.
 */
1982 static VALUE
1983 rb_enc_cr_str_buf_cat(VALUE str, const char *ptr, long len,
1984     int ptr_encindex, int ptr_cr, int *ptr_cr_ret)
1985 {
1986     int str_encindex = ENCODING_GET(str);
1987     int res_encindex;
1988     int str_cr, res_cr;
1989 
1990     str_cr = ENC_CODERANGE(str);
1991 
1992     if (str_encindex == ptr_encindex) {
        /* Same encoding: scan ptr only when str's own range is known. */
1993         if (str_cr == ENC_CODERANGE_UNKNOWN)
1994             ptr_cr = ENC_CODERANGE_UNKNOWN;
1995         else if (ptr_cr == ENC_CODERANGE_UNKNOWN) {
1996             ptr_cr = coderange_scan(ptr, len, rb_enc_from_index(ptr_encindex));
1997         }
1998     }
1999     else {
2000         rb_encoding *str_enc = rb_enc_from_index(str_encindex);
2001         rb_encoding *ptr_enc = rb_enc_from_index(ptr_encindex);
        /*
         * Different encodings, at least one not ASCII compatible: only
         * appending nothing, or appending to an empty str (which then
         * takes over ptr's encoding), is accepted.
         */
2002         if (!rb_enc_asciicompat(str_enc) || !rb_enc_asciicompat(ptr_enc)) {
2003             if (len == 0)
2004                 return str;
2005             if (RSTRING_LEN(str) == 0) {
2006                 rb_str_buf_cat(str, ptr, len);
2007                 ENCODING_CODERANGE_SET(str, ptr_encindex, ptr_cr);
2008                 return str;
2009             }
2010             goto incompatible;
2011         }
2012         if (ptr_cr == ENC_CODERANGE_UNKNOWN) {
2013             ptr_cr = coderange_scan(ptr, len, ptr_enc);
2014         }
        /* str's range is computed only when it can affect the check below. */
2015         if (str_cr == ENC_CODERANGE_UNKNOWN) {
2016             if (ENCODING_IS_ASCII8BIT(str) || ptr_cr != ENC_CODERANGE_7BIT) {
2017                 str_cr = rb_enc_str_coderange(str);
2018             }
2019         }
2020     }
2021     if (ptr_cr_ret)
2022         *ptr_cr_ret = ptr_cr;
2023 
    /* Two different encodings combine only when at least one side is 7bit. */
2024     if (str_encindex != ptr_encindex &&
2025         str_cr != ENC_CODERANGE_7BIT &&
2026         ptr_cr != ENC_CODERANGE_7BIT) {
2027       incompatible:
2028         rb_raise(rb_eEncCompatError, "incompatible character encodings: %s and %s",
2029             rb_enc_name(rb_enc_from_index(str_encindex)),
2030             rb_enc_name(rb_enc_from_index(ptr_encindex)));
2031     }
2032 
    /* Choose the encoding index and code range of the result. */
2033     if (str_cr == ENC_CODERANGE_UNKNOWN) {
2034         res_encindex = str_encindex;
2035         res_cr = ENC_CODERANGE_UNKNOWN;
2036     }
2037     else if (str_cr == ENC_CODERANGE_7BIT) {
2038         if (ptr_cr == ENC_CODERANGE_7BIT) {
2039             res_encindex = str_encindex;
2040             res_cr = ENC_CODERANGE_7BIT;
2041         }
2042         else {
            /* 7bit str plus non-7bit data: the result follows ptr. */
2043             res_encindex = ptr_encindex;
2044             res_cr = ptr_cr;
2045         }
2046     }
2047     else if (str_cr == ENC_CODERANGE_VALID) {
2048         res_encindex = str_encindex;
2049         if (ptr_cr == ENC_CODERANGE_7BIT || ptr_cr == ENC_CODERANGE_VALID)
2050             res_cr = str_cr;
2051         else
2052             res_cr = ptr_cr;
2053     }
2054     else { /* str_cr == ENC_CODERANGE_BROKEN */
2055         res_encindex = str_encindex;
2056         res_cr = str_cr;
        /* Appending bytes to a broken string resets the range to unknown. */
2057         if (0 < len) res_cr = ENC_CODERANGE_UNKNOWN;
2058     }
2059 
2060     if (len < 0) {
2061         rb_raise(rb_eArgError, "negative string size (or size too big)");
2062     }
2063     str_buf_cat(str, ptr, len);
2064     ENCODING_CODERANGE_SET(str, res_encindex, res_cr);
2065     return str;
2066 }
2067 
2068 VALUE
2069 rb_enc_str_buf_cat(VALUE str, const char *ptr, long len, rb_encoding *ptr_enc)
2070 {
2071     return rb_enc_cr_str_buf_cat(str, ptr, len,
2072         rb_enc_to_index(ptr_enc), ENC_CODERANGE_UNKNOWN, NULL);
2073 }
2074 
2075 VALUE
2076 rb_str_buf_cat_ascii(VALUE str, const char *ptr)
2077 {
2078     /* ptr must reference NUL terminated ASCII string. */
2079     int encindex = ENCODING_GET(str);
2080     rb_encoding *enc = rb_enc_from_index(encindex);
2081     if (rb_enc_asciicompat(enc)) {
2082         return rb_enc_cr_str_buf_cat(str, ptr, strlen(ptr),
2083             encindex, ENC_CODERANGE_7BIT, 0);
2084     }
2085     else {
2086         char *buf = ALLOCA_N(char, rb_enc_mbmaxlen(enc));
2087         while (*ptr) {
2088             unsigned int c = (unsigned char)*ptr;
2089             int len = rb_enc_codelen(c, enc);
2090             rb_enc_mbcput(c, buf, enc);
2091             rb_enc_cr_str_buf_cat(str, buf, len,
2092                 encindex, ENC_CODERANGE_VALID, 0);
2093             ptr++;
2094         }
2095         return str;
2096     }
2097 }
2098 
2099 VALUE
2100 rb_str_buf_append(VALUE str, VALUE str2)
2101 {
2102     int str2_cr;
2103 
2104     str2_cr = ENC_CODERANGE(str2);
2105 
2106     rb_enc_cr_str_buf_cat(str, RSTRING_PTR(str2), RSTRING_LEN(str2),
2107         ENCODING_GET(str2), str2_cr, &str2_cr);
2108 
2109     OBJ_INFECT(str, str2);
2110     ENC_CODERANGE_SET(str2, str2_cr);
2111 
2112     return str;
2113 }
2114 
2115 VALUE
2116 rb_str_append(VALUE str, VALUE str2)
2117 {
2118     rb_encoding *enc;
2119     int cr, cr2;
2120     long len2;
2121 
2122     StringValue(str2);
2123     if ((len2 = RSTRING_LEN(str2)) > 0 && STR_ASSOC_P(str)) {
2124         long len = RSTRING_LEN(str) + len2;
2125         enc = rb_enc_check(str, str2);
2126         cr = ENC_CODERANGE(str);
2127         if ((cr2 = ENC_CODERANGE(str2)) > cr) cr = cr2;
2128         rb_str_modify_expand(str, len2);
2129         memcpy(RSTRING(str)->as.heap.ptr + RSTRING(str)->as.heap.len,
2130                RSTRING_PTR(str2), len2+1);
2131         RSTRING(str)->as.heap.len = len;
2132         rb_enc_associate(str, enc);
2133         ENC_CODERANGE_SET(str, cr);
2134         OBJ_INFECT(str, str2);
2135         return str;
2136     }
2137     return rb_str_buf_append(str, str2);
2138 }
2139 
2140 /*
2141  *  call-seq:
2142  *     str << integer       -> str
2143  *     str.concat(integer)  -> str
2144  *     str << obj           -> str
2145  *     str.concat(obj)      -> str
2146  *
2147  *  Append---Concatenates the given object to <i>str</i>. If the object is a
2148  *  <code>Integer</code>, it is considered as a codepoint, and is converted
2149  *  to a character before concatenation.
2150  *
2151  *     a = "hello "
2152  *     a << "world"   #=> "hello world"
2153  *     a.concat(33)   #=> "hello world!"
2154  */
2155 
2156 VALUE
2157 rb_str_concat(VALUE str1, VALUE str2)
2158 {
2159     unsigned int code;
2160     rb_encoding *enc = STR_ENC_GET(str1);
2161 
2162     if (FIXNUM_P(str2) || RB_TYPE_P(str2, T_BIGNUM)) {
2163         if (rb_num_to_uint(str2, &code) == 0) {
2164         }
2165         else if (FIXNUM_P(str2)) {
2166             rb_raise(rb_eRangeError, "%ld out of char range", FIX2LONG(str2));
2167         }
2168         else {
2169             rb_raise(rb_eRangeError, "bignum out of char range");
2170         }
2171     }
2172     else {
2173         return rb_str_append(str1, str2);
2174     }
2175 
2176     if (enc == rb_usascii_encoding()) {
2177         /* US-ASCII automatically extended to ASCII-8BIT */
2178         char buf[1];
2179         buf[0] = (char)code;
2180         if (code > 0xFF) {
2181             rb_raise(rb_eRangeError, "%u out of char range", code);
2182         }
2183         rb_str_cat(str1, buf, 1);
2184         if (code > 127) {
2185             rb_enc_associate(str1, rb_ascii8bit_encoding());
2186             ENC_CODERANGE_SET(str1, ENC_CODERANGE_VALID);
2187         }
2188     }
2189     else {
2190         long pos = RSTRING_LEN(str1);
2191         int cr = ENC_CODERANGE(str1);
2192         int len;
2193         char *buf;
2194 
2195         switch (len = rb_enc_codelen(code, enc)) {
2196           case ONIGERR_INVALID_CODE_POINT_VALUE:
2197             rb_raise(rb_eRangeError, "invalid codepoint 0x%X in %s", code, rb_enc_name(enc));
2198             break;
2199           case ONIGERR_TOO_BIG_WIDE_CHAR_VALUE:
2200           case 0:
2201             rb_raise(rb_eRangeError, "%u out of char range", code);
2202             break;
2203         }
2204         buf = ALLOCA_N(char, len + 1);
2205         rb_enc_mbcput(code, buf, enc);
2206         if (rb_enc_precise_mbclen(buf, buf + len + 1, enc) != len) {
2207             rb_raise(rb_eRangeError, "invalid codepoint 0x%X in %s", code, rb_enc_name(enc));
2208         }
2209         rb_str_resize(str1, pos+len);
2210         memcpy(RSTRING_PTR(str1) + pos, buf, len);
2211         if (cr == ENC_CODERANGE_7BIT && code > 127)
2212             cr = ENC_CODERANGE_VALID;
2213         ENC_CODERANGE_SET(str1, cr);
2214     }
2215     return str1;
2216 }
2217 
2218 /*
2219  *  call-seq:
2220  *     str.prepend(other_str)  -> str
2221  *
2222  *  Prepend---Prepend the given string to <i>str</i>.
2223  *
2224  *     a = "world"
2225  *     a.prepend("hello ") #=> "hello world"
2226  *     a                   #=> "hello world"
2227  */
2228 
2229 static VALUE
2230 rb_str_prepend(VALUE str, VALUE str2)
2231 {
2232     StringValue(str2);
2233     StringValue(str);
2234     rb_str_update(str, 0L, 0L, str2);
2235     return str;
2236 }
2237 
2238 st_index_t
2239 rb_str_hash(VALUE str)
2240 {
2241     int e = ENCODING_GET(str);
2242     if (e && rb_enc_str_coderange(str) == ENC_CODERANGE_7BIT) {
2243         e = 0;
2244     }
2245     return rb_memhash((const void *)RSTRING_PTR(str), RSTRING_LEN(str)) ^ e;
2246 }
2247 
2248 int
2249 rb_str_hash_cmp(VALUE str1, VALUE str2)
2250 {
2251     long len;
2252 
2253     if (!rb_str_comparable(str1, str2)) return 1;
2254     if (RSTRING_LEN(str1) == (len = RSTRING_LEN(str2)) &&
2255         memcmp(RSTRING_PTR(str1), RSTRING_PTR(str2), len) == 0) {
2256         return 0;
2257     }
2258     return 1;
2259 }
2260 
2261 /*
2262  * call-seq:
2263  *    str.hash   -> fixnum
2264  *
2265  * Return a hash based on the string's length and content.
2266  */
2267 
2268 static VALUE
2269 rb_str_hash_m(VALUE str)
2270 {
2271     st_index_t hval = rb_str_hash(str);
2272     return INT2FIX(hval);
2273 }
2274 
/* Yields the smaller of a and b; each argument may be evaluated twice. */
#define lesser(a,b) (((a)>(b))?(b):(a))
2276 
2277 int
2278 rb_str_comparable(VALUE str1, VALUE str2)
2279 {
2280     int idx1, idx2;
2281     int rc1, rc2;
2282 
2283     if (RSTRING_LEN(str1) == 0) return TRUE;
2284     if (RSTRING_LEN(str2) == 0) return TRUE;
2285     idx1 = ENCODING_GET(str1);
2286     idx2 = ENCODING_GET(str2);
2287     if (idx1 == idx2) return TRUE;
2288     rc1 = rb_enc_str_coderange(str1);
2289     rc2 = rb_enc_str_coderange(str2);
2290     if (rc1 == ENC_CODERANGE_7BIT) {
2291         if (rc2 == ENC_CODERANGE_7BIT) return TRUE;
2292         if (rb_enc_asciicompat(rb_enc_from_index(idx2)))
2293             return TRUE;
2294     }
2295     if (rc2 == ENC_CODERANGE_7BIT) {
2296         if (rb_enc_asciicompat(rb_enc_from_index(idx1)))
2297             return TRUE;
2298     }
2299     return FALSE;
2300 }
2301 
2302 int
2303 rb_str_cmp(VALUE str1, VALUE str2)
2304 {
2305     long len1, len2;
2306     const char *ptr1, *ptr2;
2307     int retval;
2308 
2309     if (str1 == str2) return 0;
2310     RSTRING_GETMEM(str1, ptr1, len1);
2311     RSTRING_GETMEM(str2, ptr2, len2);
2312     if (ptr1 == ptr2 || (retval = memcmp(ptr1, ptr2, lesser(len1, len2))) == 0) {
2313         if (len1 == len2) {
2314             if (!rb_str_comparable(str1, str2)) {
2315                 if (ENCODING_GET(str1) > ENCODING_GET(str2))
2316                     return 1;
2317                 return -1;
2318             }
2319             return 0;
2320         }
2321         if (len1 > len2) return 1;
2322         return -1;
2323     }
2324     if (retval > 0) return 1;
2325     return -1;
2326 }
2327 
2328 /* expect tail call optimization */
2329 static VALUE
2330 str_eql(const VALUE str1, const VALUE str2)
2331 {
2332     const long len = RSTRING_LEN(str1);
2333     const char *ptr1, *ptr2;
2334 
2335     if (len != RSTRING_LEN(str2)) return Qfalse;
2336     if (!rb_str_comparable(str1, str2)) return Qfalse;
2337     if ((ptr1 = RSTRING_PTR(str1)) == (ptr2 = RSTRING_PTR(str2)))
2338         return Qtrue;
2339     if (memcmp(ptr1, ptr2, len) == 0)
2340         return Qtrue;
2341     return Qfalse;
2342 }
2343 /*
2344  *  call-seq:
2345  *     str == obj   -> true or false
2346  *
2347  *  Equality---If <i>obj</i> is not a <code>String</code>, returns
2348  *  <code>false</code>. Otherwise, returns <code>true</code> if <i>str</i>
2349  *  <code><=></code> <i>obj</i> returns zero.
2350  */
2351 
2352 VALUE
2353 rb_str_equal(VALUE str1, VALUE str2)
2354 {
2355     if (str1 == str2) return Qtrue;
2356     if (!RB_TYPE_P(str2, T_STRING)) {
2357         if (!rb_respond_to(str2, rb_intern("to_str"))) {
2358             return Qfalse;
2359         }
2360         return rb_equal(str2, str1);
2361     }
2362     return str_eql(str1, str2);
2363 }
2364 
2365 /*
2366  * call-seq:
2367  *   str.eql?(other)   -> true or false
2368  *
2369  * Two strings are equal if they have the same length and content.
2370  */
2371 
2372 static VALUE
2373 rb_str_eql(VALUE str1, VALUE str2)
2374 {
2375     if (str1 == str2) return Qtrue;
2376     if (!RB_TYPE_P(str2, T_STRING)) return Qfalse;
2377     return str_eql(str1, str2);
2378 }
2379 
2380 /*
2381  *  call-seq:
2382  *     string <=> other_string   -> -1, 0, +1 or nil
2383  *
2384  *
2385  *  Comparison---Returns -1, 0, +1 or nil depending on whether +string+ is less
2386  *  than, equal to, or greater than +other_string+.
2387  *
2388  *  +nil+ is returned if the two values are incomparable.
2389  *
2390  *  If the strings are of different lengths, and the strings are equal when
2391  *  compared up to the shortest length, then the longer string is considered
2392  *  greater than the shorter one.
2393  *
2394  *  <code><=></code> is the basis for the methods <code><</code>,
2395  *  <code><=</code>, <code>></code>, <code>>=</code>, and
2396  *  <code>between?</code>, included from module Comparable. The method
2397  *  String#== does not use Comparable#==.
2398  *
2399  *     "abcdef" <=> "abcde"     #=> 1
2400  *     "abcdef" <=> "abcdef"    #=> 0
2401  *     "abcdef" <=> "abcdefg"   #=> -1
2402  *     "abcdef" <=> "ABCDEF"    #=> 1
2403  */
2404 
2405 static VALUE
2406 rb_str_cmp_m(VALUE str1, VALUE str2)
2407 {
2408     int result;
2409 
2410     if (!RB_TYPE_P(str2, T_STRING)) {
2411         VALUE tmp = rb_check_funcall(str2, rb_intern("to_str"), 0, 0);
2412         if (RB_TYPE_P(tmp, T_STRING)) {
2413             result = rb_str_cmp(str1, tmp);
2414         }
2415         else {
2416             return rb_invcmp(str1, str2);
2417         }
2418     }
2419     else {
2420         result = rb_str_cmp(str1, str2);
2421     }
2422     return INT2FIX(result);
2423 }
2424 
2425 /*
2426  *  call-seq:
2427  *     str.casecmp(other_str)   -> -1, 0, +1 or nil
2428  *
2429  *  Case-insensitive version of <code>String#<=></code>.
2430  *
2431  *     "abcdef".casecmp("abcde")     #=> 1
2432  *     "aBcDeF".casecmp("abcdef")    #=> 0
2433  *     "abcdef".casecmp("abcdefg")   #=> -1
2434  *     "abcdef".casecmp("ABCDEF")    #=> 0
2435  */
2436 
2437 static VALUE
2438 rb_str_casecmp(VALUE str1, VALUE str2)
2439 {
2440     long len;
2441     rb_encoding *enc;
2442     char *p1, *p1end, *p2, *p2end;
2443 
2444     StringValue(str2);
2445     enc = rb_enc_compatible(str1, str2);
2446     if (!enc) {
2447         return Qnil;
2448     }
2449 
2450     p1 = RSTRING_PTR(str1); p1end = RSTRING_END(str1);
2451     p2 = RSTRING_PTR(str2); p2end = RSTRING_END(str2);
2452     if (single_byte_optimizable(str1) && single_byte_optimizable(str2)) {
2453         while (p1 < p1end && p2 < p2end) {
2454             if (*p1 != *p2) {
2455                 unsigned int c1 = TOUPPER(*p1 & 0xff);
2456                 unsigned int c2 = TOUPPER(*p2 & 0xff);
2457                 if (c1 != c2)
2458                     return INT2FIX(c1 < c2 ? -1 : 1);
2459             }
2460             p1++;
2461             p2++;
2462         }
2463     }
2464     else {
2465         while (p1 < p1end && p2 < p2end) {
2466             int l1, c1 = rb_enc_ascget(p1, p1end, &l1, enc);
2467             int l2, c2 = rb_enc_ascget(p2, p2end, &l2, enc);
2468 
2469             if (0 <= c1 && 0 <= c2) {
2470                 c1 = TOUPPER(c1);
2471                 c2 = TOUPPER(c2);
2472                 if (c1 != c2)
2473                     return INT2FIX(c1 < c2 ? -1 : 1);
2474             }
2475             else {
2476                 int r;
2477                 l1 = rb_enc_mbclen(p1, p1end, enc);
2478                 l2 = rb_enc_mbclen(p2, p2end, enc);
2479                 len = l1 < l2 ? l1 : l2;
2480                 r = memcmp(p1, p2, len);
2481                 if (r != 0)
2482                     return INT2FIX(r < 0 ? -1 : 1);
2483                 if (l1 != l2)
2484                     return INT2FIX(l1 < l2 ? -1 : 1);
2485             }
2486             p1 += l1;
2487             p2 += l2;
2488         }
2489     }
2490     if (RSTRING_LEN(str1) == RSTRING_LEN(str2)) return INT2FIX(0);
2491     if (RSTRING_LEN(str1) > RSTRING_LEN(str2)) return INT2FIX(1);
2492     return INT2FIX(-1);
2493 }
2494 
2495 static long
2496 rb_str_index(VALUE str, VALUE sub, long offset)
2497 {
2498     long pos;
2499     char *s, *sptr, *e;
2500     long len, slen;
2501     rb_encoding *enc;
2502 
2503     enc = rb_enc_check(str, sub);
2504     if (is_broken_string(sub)) {
2505         return -1;
2506     }
2507     len = str_strlen(str, enc);
2508     slen = str_strlen(sub, enc);
2509     if (offset < 0) {
2510         offset += len;
2511         if (offset < 0) return -1;
2512     }
2513     if (len - offset < slen) return -1;
2514     s = RSTRING_PTR(str);
2515     e = s + RSTRING_LEN(str);
2516     if (offset) {
2517         offset = str_offset(s, RSTRING_END(str), offset, enc, single_byte_optimizable(str));
2518         s += offset;
2519     }
2520     if (slen == 0) return offset;
2521     /* need proceed one character at a time */
2522     sptr = RSTRING_PTR(sub);
2523     slen = RSTRING_LEN(sub);
2524     len = RSTRING_LEN(str) - offset;
2525     for (;;) {
2526         char *t;
2527         pos = rb_memsearch(sptr, slen, s, len, enc);
2528         if (pos < 0) return pos;
2529         t = rb_enc_right_char_head(s, s+pos, e, enc);
2530         if (t == s + pos) break;
2531         if ((len -= t - s) <= 0) return -1;
2532         offset += t - s;
2533         s = t;
2534     }
2535     return pos + offset;
2536 }
2537 
2538 
2539 /*
2540  *  call-seq:
2541  *     str.index(substring [, offset])   -> fixnum or nil
2542  *     str.index(regexp [, offset])      -> fixnum or nil
2543  *
2544  *  Returns the index of the first occurrence of the given <i>substring</i> or
2545  *  pattern (<i>regexp</i>) in <i>str</i>. Returns <code>nil</code> if not
2546  *  found. If the second parameter is present, it specifies the position in the
2547  *  string to begin the search.
2548  *
2549  *     "hello".index('e')             #=> 1
2550  *     "hello".index('lo')            #=> 3
2551  *     "hello".index('a')             #=> nil
2552  *     "hello".index(?e)              #=> 1
2553  *     "hello".index(/[aeiou]/, -3)   #=> 4
2554  */
2555 
2556 static VALUE
2557 rb_str_index_m(int argc, VALUE *argv, VALUE str)
2558 {
2559     VALUE sub;
2560     VALUE initpos;
2561     long pos;
2562 
2563     if (rb_scan_args(argc, argv, "11", &sub, &initpos) == 2) {
2564         pos = NUM2LONG(initpos);
2565     }
2566     else {
2567         pos = 0;
2568     }
2569     if (pos < 0) {
2570         pos += str_strlen(str, STR_ENC_GET(str));
2571         if (pos < 0) {
2572             if (RB_TYPE_P(sub, T_REGEXP)) {
2573                 rb_backref_set(Qnil);
2574             }
2575             return Qnil;
2576         }
2577     }
2578 
2579     if (SPECIAL_CONST_P(sub)) goto generic;
2580     switch (BUILTIN_TYPE(sub)) {
2581       case T_REGEXP:
2582         if (pos > str_strlen(str, STR_ENC_GET(str)))
2583             return Qnil;
2584         pos = str_offset(RSTRING_PTR(str), RSTRING_END(str), pos,
2585                          rb_enc_check(str, sub), single_byte_optimizable(str));
2586 
2587         pos = rb_reg_search(sub, str, pos, 0);
2588         pos = rb_str_sublen(str, pos);
2589         break;
2590 
2591       generic:
2592       default: {
2593         VALUE tmp;
2594 
2595         tmp = rb_check_string_type(sub);
2596         if (NIL_P(tmp)) {
2597             rb_raise(rb_eTypeError, "type mismatch: %s given",
2598                      rb_obj_classname(sub));
2599         }
2600         sub = tmp;
2601       }
2602         /* fall through */
2603       case T_STRING:
2604         pos = rb_str_index(str, sub, pos);
2605         pos = rb_str_sublen(str, pos);
2606         break;
2607     }
2608 
2609     if (pos == -1) return Qnil;
2610     return LONG2NUM(pos);
2611 }
2612 
2613 static long
2614 rb_str_rindex(VALUE str, VALUE sub, long pos)
2615 {
2616     long len, slen;
2617     char *s, *sbeg, *e, *t;
2618     rb_encoding *enc;
2619     int singlebyte = single_byte_optimizable(str);
2620 
2621     enc = rb_enc_check(str, sub);
2622     if (is_broken_string(sub)) {
2623         return -1;
2624     }
2625     len = str_strlen(str, enc);
2626     slen = str_strlen(sub, enc);
2627     /* substring longer than string */
2628     if (len < slen) return -1;
2629     if (len - pos < slen) {
2630         pos = len - slen;
2631     }
2632     if (len == 0) {
2633         return pos;
2634     }
2635     sbeg = RSTRING_PTR(str);
2636     e = RSTRING_END(str);
2637     t = RSTRING_PTR(sub);
2638     slen = RSTRING_LEN(sub);
2639     s = str_nth(sbeg, e, pos, enc, singlebyte);
2640     while (s) {
2641         if (memcmp(s, t, slen) == 0) {
2642             return pos;
2643         }
2644         if (pos == 0) break;
2645         pos--;
2646         s = rb_enc_prev_char(sbeg, s, e, enc);
2647     }
2648     return -1;
2649 }
2650 
2651 
2652 /*
2653  *  call-seq:
2654  *     str.rindex(substring [, fixnum])   -> fixnum or nil
2655  *     str.rindex(regexp [, fixnum])   -> fixnum or nil
2656  *
2657  *  Returns the index of the last occurrence of the given <i>substring</i> or
2658  *  pattern (<i>regexp</i>) in <i>str</i>. Returns <code>nil</code> if not
2659  *  found. If the second parameter is present, it specifies the position in the
2660  *  string to end the search---characters beyond this point will not be
2661  *  considered.
2662  *
2663  *     "hello".rindex('e')             #=> 1
2664  *     "hello".rindex('l')             #=> 3
2665  *     "hello".rindex('a')             #=> nil
2666  *     "hello".rindex(?e)              #=> 1
2667  *     "hello".rindex(/[aeiou]/, -2)   #=> 1
2668  */
2669 
2670 static VALUE
2671 rb_str_rindex_m(int argc, VALUE *argv, VALUE str)
2672 {
2673     VALUE sub;
2674     VALUE vpos;
2675     rb_encoding *enc = STR_ENC_GET(str);
2676     long pos, len = str_strlen(str, enc);
2677 
2678     if (rb_scan_args(argc, argv, "11", &sub, &vpos) == 2) {
2679         pos = NUM2LONG(vpos);
2680         if (pos < 0) {
2681             pos += len;
2682             if (pos < 0) {
2683                 if (RB_TYPE_P(sub, T_REGEXP)) {
2684                     rb_backref_set(Qnil);
2685                 }
2686                 return Qnil;
2687             }
2688         }
2689         if (pos > len) pos = len;
2690     }
2691     else {
2692         pos = len;
2693     }
2694 
2695     if (SPECIAL_CONST_P(sub)) goto generic;
2696     switch (BUILTIN_TYPE(sub)) {
2697       case T_REGEXP:
2698         /* enc = rb_get_check(str, sub); */
2699         pos = str_offset(RSTRING_PTR(str), RSTRING_END(str), pos,
2700                          STR_ENC_GET(str), single_byte_optimizable(str));
2701 
2702         if (!RREGEXP(sub)->ptr || RREGEXP_SRC_LEN(sub)) {
2703             pos = rb_reg_search(sub, str, pos, 1);
2704             pos = rb_str_sublen(str, pos);
2705         }
2706         if (pos >= 0) return LONG2NUM(pos);
2707         break;
2708 
2709       generic:
2710       default: {
2711         VALUE tmp;
2712 
2713         tmp = rb_check_string_type(sub);
2714         if (NIL_P(tmp)) {
2715             rb_raise(rb_eTypeError, "type mismatch: %s given",
2716                      rb_obj_classname(sub));
2717         }
2718         sub = tmp;
2719       }
2720         /* fall through */
2721       case T_STRING:
2722         pos = rb_str_rindex(str, sub, pos);
2723         if (pos >= 0) return LONG2NUM(pos);
2724         break;
2725     }
2726     return Qnil;
2727 }
2728 
2729 /*
2730  *  call-seq:
2731  *     str =~ obj   -> fixnum or nil
2732  *
2733  *  Match---If <i>obj</i> is a <code>Regexp</code>, use it as a pattern to match
2734  *  against <i>str</i>,and returns the position the match starts, or
2735  *  <code>nil</code> if there is no match. Otherwise, invokes
2736  *  <i>obj.=~</i>, passing <i>str</i> as an argument. The default
2737  *  <code>=~</code> in <code>Object</code> returns <code>nil</code>.
2738  *
2739  *  Note: <code>str =~ regexp</code> is not the same as
2740  *  <code>regexp =~ str</code>. Strings captured from named capture groups
2741  *  are assigned to local variables only in the second case.
2742  *
2743  *     "cat o' 9 tails" =~ /\d/   #=> 7
2744  *     "cat o' 9 tails" =~ 9      #=> nil
2745  */
2746 
2747 static VALUE
2748 rb_str_match(VALUE x, VALUE y)
2749 {
2750     if (SPECIAL_CONST_P(y)) goto generic;
2751     switch (BUILTIN_TYPE(y)) {
2752       case T_STRING:
2753         rb_raise(rb_eTypeError, "type mismatch: String given");
2754 
2755       case T_REGEXP:
2756         return rb_reg_match(y, x);
2757 
2758       generic:
2759       default:
2760         return rb_funcall(y, rb_intern("=~"), 1, x);
2761     }
2762 }
2763 
2764 
2765 static VALUE get_pat(VALUE, int);
2766 
2767 
2768 /*
2769  *  call-seq:
2770  *     str.match(pattern)        -> matchdata or nil
2771  *     str.match(pattern, pos)   -> matchdata or nil
2772  *
2773  *  Converts <i>pattern</i> to a <code>Regexp</code> (if it isn't already one),
2774  *  then invokes its <code>match</code> method on <i>str</i>.  If the second
2775  *  parameter is present, it specifies the position in the string to begin the
2776  *  search.
2777  *
2778  *     'hello'.match('(.)\1')      #=> #<MatchData "ll" 1:"l">
2779  *     'hello'.match('(.)\1')[0]   #=> "ll"
2780  *     'hello'.match(/(.)\1/)[0]   #=> "ll"
2781  *     'hello'.match('xx')         #=> nil
2782  *
2783  *  If a block is given, invoke the block with MatchData if match succeed, so
2784  *  that you can write
2785  *
2786  *     str.match(pat) {|m| ...}
2787  *
2788  *  instead of
2789  *
2790  *     if m = str.match(pat)
2791  *       ...
2792  *     end
2793  *
2794  *  The return value is a value from block execution in this case.
2795  */
2796 
2797 static VALUE
2798 rb_str_match_m(int argc, VALUE *argv, VALUE str)
2799 {
2800     VALUE re, result;
2801     if (argc < 1)
2802         rb_check_arity(argc, 1, 2);
2803     re = argv[0];
2804     argv[0] = str;
2805     result = rb_funcall2(get_pat(re, 0), rb_intern("match"), argc, argv);
2806     if (!NIL_P(result) && rb_block_given_p()) {
2807         return rb_yield(result);
2808     }
2809     return result;
2810 }
2811 
/* Result of stepping a character to its successor or predecessor. */
enum neighbor_char {
    NEIGHBOR_NOT_CHAR,  /* no usable neighbor (see enc_succ_alnum_char) */
    NEIGHBOR_FOUND,     /* the character was replaced by its neighbor */
    NEIGHBOR_WRAPPED    /* stepping ran off the end of the range */
};
2817 
2818 static enum neighbor_char
2819 enc_succ_char(char *p, long len, rb_encoding *enc)
2820 {
2821     long i;
2822     int l;
2823     while (1) {
2824         for (i = len-1; 0 <= i && (unsigned char)p[i] == 0xff; i--)
2825             p[i] = '\0';
2826         if (i < 0)
2827             return NEIGHBOR_WRAPPED;
2828         ++((unsigned char*)p)[i];
2829         l = rb_enc_precise_mbclen(p, p+len, enc);
2830         if (MBCLEN_CHARFOUND_P(l)) {
2831             l = MBCLEN_CHARFOUND_LEN(l);
2832             if (l == len) {
2833                 return NEIGHBOR_FOUND;
2834             }
2835             else {
2836                 memset(p+l, 0xff, len-l);
2837             }
2838         }
2839         if (MBCLEN_INVALID_P(l) && i < len-1) {
2840             long len2;
2841             int l2;
2842             for (len2 = len-1; 0 < len2; len2--) {
2843                 l2 = rb_enc_precise_mbclen(p, p+len2, enc);
2844                 if (!MBCLEN_INVALID_P(l2))
2845                     break;
2846             }
2847             memset(p+len2+1, 0xff, len-(len2+1));
2848         }
2849     }
2850 }
2851 
2852 static enum neighbor_char
2853 enc_pred_char(char *p, long len, rb_encoding *enc)
2854 {
2855     long i;
2856     int l;
2857     while (1) {
2858         for (i = len-1; 0 <= i && (unsigned char)p[i] == 0; i--)
2859             p[i] = '\xff';
2860         if (i < 0)
2861             return NEIGHBOR_WRAPPED;
2862         --((unsigned char*)p)[i];
2863         l = rb_enc_precise_mbclen(p, p+len, enc);
2864         if (MBCLEN_CHARFOUND_P(l)) {
2865             l = MBCLEN_CHARFOUND_LEN(l);
2866             if (l == len) {
2867                 return NEIGHBOR_FOUND;
2868             }
2869             else {
2870                 memset(p+l, 0, len-l);
2871             }
2872         }
2873         if (MBCLEN_INVALID_P(l) && i < len-1) {
2874             long len2;
2875             int l2;
2876             for (len2 = len-1; 0 < len2; len2--) {
2877                 l2 = rb_enc_precise_mbclen(p, p+len2, enc);
2878                 if (!MBCLEN_INVALID_P(l2))
2879                     break;
2880             }
2881             memset(p+len2+1, 0, len-(len2+1));
2882         }
2883     }
2884 }
2885 
2886 /*
2887   overwrite +p+ by succeeding letter in +enc+ and returns
2888   NEIGHBOR_FOUND or NEIGHBOR_WRAPPED.
2889   When NEIGHBOR_WRAPPED, carried-out letter is stored into carry.
2890   assuming each ranges are successive, and mbclen
2891   never change in each ranges.
2892   NEIGHBOR_NOT_CHAR is returned if invalid character or the range has only one
2893   character.
2894  */
2895 static enum neighbor_char
2896 enc_succ_alnum_char(char *p, long len, rb_encoding *enc, char *carry)
2897 {
2898     enum neighbor_char ret;
2899     unsigned int c;
2900     int ctype;
2901     int range;
2902     char save[ONIGENC_CODE_TO_MBC_MAXLEN];
2903 
2904     c = rb_enc_mbc_to_codepoint(p, p+len, enc);
2905     if (rb_enc_isctype(c, ONIGENC_CTYPE_DIGIT, enc))
2906         ctype = ONIGENC_CTYPE_DIGIT;
2907     else if (rb_enc_isctype(c, ONIGENC_CTYPE_ALPHA, enc))
2908         ctype = ONIGENC_CTYPE_ALPHA;
2909     else
2910         return NEIGHBOR_NOT_CHAR;
2911 
2912     MEMCPY(save, p, char, len);
2913     ret = enc_succ_char(p, len, enc);
2914     if (ret == NEIGHBOR_FOUND) {
2915         c = rb_enc_mbc_to_codepoint(p, p+len, enc);
2916         if (rb_enc_isctype(c, ctype, enc))
2917             return NEIGHBOR_FOUND;
2918     }
2919     MEMCPY(p, save, char, len);
2920     range = 1;
2921     while (1) {
2922         MEMCPY(save, p, char, len);
2923         ret = enc_pred_char(p, len, enc);
2924         if (ret == NEIGHBOR_FOUND) {
2925             c = rb_enc_mbc_to_codepoint(p, p+len, enc);
2926             if (!rb_enc_isctype(c, ctype, enc)) {
2927                 MEMCPY(p, save, char, len);
2928                 break;
2929             }
2930         }
2931         else {
2932             MEMCPY(p, save, char, len);
2933             break;
2934         }
2935         range++;
2936     }
2937     if (range == 1) {
2938         return NEIGHBOR_NOT_CHAR;
2939     }
2940 
2941     if (ctype != ONIGENC_CTYPE_DIGIT) {
2942         MEMCPY(carry, p, char, len);
2943         return NEIGHBOR_WRAPPED;
2944     }
2945 
2946     MEMCPY(carry, p, char, len);
2947     enc_succ_char(carry, len, enc);
2948     return NEIGHBOR_WRAPPED;
2949 }
2950 
2951 
2952 /*
2953  *  call-seq:
2954  *     str.succ   -> new_str
2955  *     str.next   -> new_str
2956  *
2957  *  Returns the successor to <i>str</i>. The successor is calculated by
2958  *  incrementing characters starting from the rightmost alphanumeric (or
2959  *  the rightmost character if there are no alphanumerics) in the
2960  *  string. Incrementing a digit always results in another digit, and
2961  *  incrementing a letter results in another letter of the same case.
2962  *  Incrementing nonalphanumerics uses the underlying character set's
2963  *  collating sequence.
2964  *
2965  *  If the increment generates a ``carry,'' the character to the left of
2966  *  it is incremented. This process repeats until there is no carry,
2967  *  adding an additional character if necessary.
2968  *
2969  *     "abcd".succ        #=> "abce"
2970  *     "THX1138".succ     #=> "THX1139"
2971  *     "<<koala>>".succ   #=> "<<koalb>>"
2972  *     "1999zzz".succ     #=> "2000aaa"
2973  *     "ZZZ9999".succ     #=> "AAAA0000"
2974  *     "***".succ         #=> "**+"
2975  */
2976 
2977 VALUE
2978 rb_str_succ(VALUE orig)
2979 {
2980     rb_encoding *enc;
2981     VALUE str;
2982     char *sbeg, *s, *e, *last_alnum = 0;
2983     int c = -1;
2984     long l;
2985     char carry[ONIGENC_CODE_TO_MBC_MAXLEN] = "\1";
2986     long carry_pos = 0, carry_len = 1;
2987     enum neighbor_char neighbor = NEIGHBOR_FOUND;
2988 
2989     str = rb_str_new5(orig, RSTRING_PTR(orig), RSTRING_LEN(orig));
2990     rb_enc_cr_str_copy_for_substr(str, orig);
2991     OBJ_INFECT(str, orig);
2992     if (RSTRING_LEN(str) == 0) return str;
2993 
2994     enc = STR_ENC_GET(orig);
2995     sbeg = RSTRING_PTR(str);
2996     s = e = sbeg + RSTRING_LEN(str);
2997 
2998     while ((s = rb_enc_prev_char(sbeg, s, e, enc)) != 0) {
2999         if (neighbor == NEIGHBOR_NOT_CHAR && last_alnum) {
3000             if (ISALPHA(*last_alnum) ? ISDIGIT(*s) :
3001                 ISDIGIT(*last_alnum) ? ISALPHA(*s) : 0) {
3002                 s = last_alnum;
3003                 break;
3004             }
3005         }
3006         if ((l = rb_enc_precise_mbclen(s, e, enc)) <= 0) continue;
3007         neighbor = enc_succ_alnum_char(s, l, enc, carry);
3008         switch (neighbor) {
3009           case NEIGHBOR_NOT_CHAR:
3010             continue;
3011           case NEIGHBOR_FOUND:
3012             return str;
3013           case NEIGHBOR_WRAPPED:
3014             last_alnum = s;
3015             break;
3016         }
3017         c = 1;
3018         carry_pos = s - sbeg;
3019         carry_len = l;
3020     }
3021     if (c == -1) {              /* str contains no alnum */
3022         s = e;
3023         while ((s = rb_enc_prev_char(sbeg, s, e, enc)) != 0) {
3024             enum neighbor_char neighbor;
3025             if ((l = rb_enc_precise_mbclen(s, e, enc)) <= 0) continue;
3026             neighbor = enc_succ_char(s, l, enc);
3027             if (neighbor == NEIGHBOR_FOUND)
3028                 return str;
3029             if (rb_enc_precise_mbclen(s, s+l, enc) != l) {
3030                 /* wrapped to \0...\0.  search next valid char. */
3031                 enc_succ_char(s, l, enc);
3032             }
3033             if (!rb_enc_asciicompat(enc)) {
3034                 MEMCPY(carry, s, char, l);
3035                 carry_len = l;
3036             }
3037             carry_pos = s - sbeg;
3038         }
3039     }
3040     RESIZE_CAPA(str, RSTRING_LEN(str) + carry_len);
3041     s = RSTRING_PTR(str) + carry_pos;
3042     memmove(s + carry_len, s, RSTRING_LEN(str) - carry_pos);
3043     memmove(s, carry, carry_len);
3044     STR_SET_LEN(str, RSTRING_LEN(str) + carry_len);
3045     RSTRING_PTR(str)[RSTRING_LEN(str)] = '\0';
3046     rb_enc_str_coderange(str);
3047     return str;
3048 }
3049 
3050 
3051 /*
3052  *  call-seq:
3053  *     str.succ!   -> str
3054  *     str.next!   -> str
3055  *
3056  *  Equivalent to <code>String#succ</code>, but modifies the receiver in
3057  *  place.
3058  */
3059 
3060 static VALUE
3061 rb_str_succ_bang(VALUE str)
3062 {
3063     rb_str_shared_replace(str, rb_str_succ(str));
3064 
3065     return str;
3066 }
3067 
3068 
3069 /*
3070  *  call-seq:
3071  *     str.upto(other_str, exclusive=false) {|s| block }   -> str
3072  *     str.upto(other_str, exclusive=false)                -> an_enumerator
3073  *
3074  *  Iterates through successive values, starting at <i>str</i> and
3075  *  ending at <i>other_str</i> inclusive, passing each value in turn to
3076  *  the block. The <code>String#succ</code> method is used to generate
3077  *  each value.  If optional second argument exclusive is omitted or is false,
3078  *  the last value will be included; otherwise it will be excluded.
3079  *
3080  *  If no block is given, an enumerator is returned instead.
3081  *
3082  *     "a8".upto("b6") {|s| print s, ' ' }
3083  *     for s in "a8".."b6"
3084  *       print s, ' '
3085  *     end
3086  *
3087  *  <em>produces:</em>
3088  *
3089  *     a8 a9 b0 b1 b2 b3 b4 b5 b6
3090  *     a8 a9 b0 b1 b2 b3 b4 b5 b6
3091  *
3092  *  If <i>str</i> and <i>other_str</i> contains only ascii numeric characters,
3093  *  both are recognized as decimal numbers. In addition, the width of
3094  *  string (e.g. leading zeros) is handled appropriately.
3095  *
3096  *     "9".upto("11").to_a   #=> ["9", "10", "11"]
3097  *     "25".upto("5").to_a   #=> []
3098  *     "07".upto("11").to_a  #=> ["07", "08", "09", "10", "11"]
3099  */
3100 
3101 static VALUE
3102 rb_str_upto(int argc, VALUE *argv, VALUE beg)
3103 {
3104     VALUE end, exclusive;
3105     VALUE current, after_end;
3106     ID succ;
3107     int n, excl, ascii;
3108     rb_encoding *enc;
3109 
3110     rb_scan_args(argc, argv, "11", &end, &exclusive);
3111     RETURN_ENUMERATOR(beg, argc, argv);
3112     excl = RTEST(exclusive);
3113     CONST_ID(succ, "succ");
3114     StringValue(end);
3115     enc = rb_enc_check(beg, end);
3116     ascii = (is_ascii_string(beg) && is_ascii_string(end));
3117     /* single character */
3118     if (RSTRING_LEN(beg) == 1 && RSTRING_LEN(end) == 1 && ascii) {
3119         char c = RSTRING_PTR(beg)[0];
3120         char e = RSTRING_PTR(end)[0];
3121 
3122         if (c > e || (excl && c == e)) return beg;
3123         for (;;) {
3124             rb_yield(rb_enc_str_new(&c, 1, enc));
3125             if (!excl && c == e) break;
3126             c++;
3127             if (excl && c == e) break;
3128         }
3129         return beg;
3130     }
3131     /* both edges are all digits */
3132     if (ascii && ISDIGIT(RSTRING_PTR(beg)[0]) && ISDIGIT(RSTRING_PTR(end)[0])) {
3133         char *s, *send;
3134         VALUE b, e;
3135         int width;
3136 
3137         s = RSTRING_PTR(beg); send = RSTRING_END(beg);
3138         width = rb_long2int(send - s);
3139         while (s < send) {
3140             if (!ISDIGIT(*s)) goto no_digits;
3141             s++;
3142         }
3143         s = RSTRING_PTR(end); send = RSTRING_END(end);
3144         while (s < send) {
3145             if (!ISDIGIT(*s)) goto no_digits;
3146             s++;
3147         }
3148         b = rb_str_to_inum(beg, 10, FALSE);
3149         e = rb_str_to_inum(end, 10, FALSE);
3150         if (FIXNUM_P(b) && FIXNUM_P(e)) {
3151             long bi = FIX2LONG(b);
3152             long ei = FIX2LONG(e);
3153             rb_encoding *usascii = rb_usascii_encoding();
3154 
3155             while (bi <= ei) {
3156                 if (excl && bi == ei) break;
3157                 rb_yield(rb_enc_sprintf(usascii, "%.*ld", width, bi));
3158                 bi++;
3159             }
3160         }
3161         else {
3162             ID op = excl ? '<' : rb_intern("<=");
3163             VALUE args[2], fmt = rb_obj_freeze(rb_usascii_str_new_cstr("%.*d"));
3164 
3165             args[0] = INT2FIX(width);
3166             while (rb_funcall(b, op, 1, e)) {
3167                 args[1] = b;
3168                 rb_yield(rb_str_format(numberof(args), args, fmt));
3169                 b = rb_funcall(b, succ, 0, 0);
3170             }
3171         }
3172         return beg;
3173     }
3174     /* normal case */
3175   no_digits:
3176     n = rb_str_cmp(beg, end);
3177     if (n > 0 || (excl && n == 0)) return beg;
3178 
3179     after_end = rb_funcall(end, succ, 0, 0);
3180     current = rb_str_dup(beg);
3181     while (!rb_str_equal(current, after_end)) {
3182         VALUE next = Qnil;
3183         if (excl || !rb_str_equal(current, end))
3184             next = rb_funcall(current, succ, 0, 0);
3185         rb_yield(current);
3186         if (NIL_P(next)) break;
3187         current = next;
3188         StringValue(current);
3189         if (excl && rb_str_equal(current, end)) break;
3190         if (RSTRING_LEN(current) > RSTRING_LEN(end) || RSTRING_LEN(current) == 0)
3191             break;
3192     }
3193 
3194     return beg;
3195 }
3196 
3197 static VALUE
3198 rb_str_subpat(VALUE str, VALUE re, VALUE backref)
3199 {
3200     if (rb_reg_search(re, str, 0, 0) >= 0) {
3201         VALUE match = rb_backref_get();
3202         int nth = rb_reg_backref_number(match, backref);
3203         return rb_reg_nth_match(nth, match);
3204     }
3205     return Qnil;
3206 }
3207 
3208 static VALUE
3209 rb_str_aref(VALUE str, VALUE indx)
3210 {
3211     long idx;
3212 
3213     if (FIXNUM_P(indx)) {
3214         idx = FIX2LONG(indx);
3215 
3216       num_index:
3217         str = rb_str_substr(str, idx, 1);
3218         if (!NIL_P(str) && RSTRING_LEN(str) == 0) return Qnil;
3219         return str;
3220     }
3221 
3222     if (SPECIAL_CONST_P(indx)) goto generic;
3223     switch (BUILTIN_TYPE(indx)) {
3224       case T_REGEXP:
3225         return rb_str_subpat(str, indx, INT2FIX(0));
3226 
3227       case T_STRING:
3228         if (rb_str_index(str, indx, 0) != -1)
3229             return rb_str_dup(indx);
3230         return Qnil;
3231 
3232       generic:
3233       default:
3234         /* check if indx is Range */
3235         {
3236             long beg, len;
3237             VALUE tmp;
3238 
3239             len = str_strlen(str, STR_ENC_GET(str));
3240             switch (rb_range_beg_len(indx, &beg, &len, len, 0)) {
3241               case Qfalse:
3242                 break;
3243               case Qnil:
3244                 return Qnil;
3245               default:
3246                 tmp = rb_str_substr(str, beg, len);
3247                 return tmp;
3248             }
3249         }
3250         idx = NUM2LONG(indx);
3251         goto num_index;
3252     }
3253 
3254     UNREACHABLE;
3255 }
3256 
3257 
3258 /*
3259  *  call-seq:
3260  *     str[index]                 -> new_str or nil
3261  *     str[start, length]         -> new_str or nil
3262  *     str[range]                 -> new_str or nil
3263  *     str[regexp]                -> new_str or nil
3264  *     str[regexp, capture]       -> new_str or nil
3265  *     str[match_str]             -> new_str or nil
3266  *     str.slice(index)           -> new_str or nil
3267  *     str.slice(start, length)   -> new_str or nil
3268  *     str.slice(range)           -> new_str or nil
3269  *     str.slice(regexp)          -> new_str or nil
3270  *     str.slice(regexp, capture) -> new_str or nil
3271  *     str.slice(match_str)       -> new_str or nil
3272  *
3273  *  Element Reference --- If passed a single +index+, returns a substring of
3274  *  one character at that index. If passed a +start+ index and a +length+,
3275  *  returns a substring containing +length+ characters starting at the
3276  *  +index+. If passed a +range+, its beginning and end are interpreted as
3277  *  offsets delimiting the substring to be returned.
3278  *
3279  *  In these three cases, if an index is negative, it is counted from the end
3280  *  of the string.  For the +start+ and +range+ cases the starting index is
3281  *  just before a character, so an index equal to the string's size is valid.
3282  *  Additionally, an empty string is returned when the starting index for a
3283  *  character range is at the end of the string.
3284  *
3285  *  Returns +nil+ if the initial index falls outside the string or the length
3286  *  is negative.
3287  *
3288  *  If a +Regexp+ is supplied, the matching portion of the string is
3289  *  returned.  If a +capture+, which may be a capture group index or name,
3290  *  follows the regular expression, that component of the MatchData is
3291  *  returned instead.
3292  *
3293  *  If a +match_str+ is given, that string is returned if it occurs in
3294  *  the string.
3295  *
3296  *  Returns +nil+ if the regular expression does not match or the match string
3297  *  cannot be found.
3298  *
3299  *     a = "hello there"
3300  *
3301  *     a[1]                   #=> "e"
3302  *     a[2, 3]                #=> "llo"
3303  *     a[2..3]                #=> "ll"
3304  *
3305  *     a[-3, 2]               #=> "er"
3306  *     a[7..-2]               #=> "her"
3307  *     a[-4..-2]              #=> "her"
3308  *     a[-2..-4]              #=> ""
3309  *
3310  *     a[11, 0]               #=> ""
3311  *     a[11]                  #=> nil
3312  *     a[12, 0]               #=> nil
3313  *     a[12..-1]              #=> nil
3314  *
3315  *     a[/[aeiou](.)\1/]      #=> "ell"
3316  *     a[/[aeiou](.)\1/, 0]   #=> "ell"
3317  *     a[/[aeiou](.)\1/, 1]   #=> "l"
3318  *     a[/[aeiou](.)\1/, 2]   #=> nil
3319  *
3320  *     a[/(?<vowel>[aeiou])(?<non_vowel>[^aeiou])/, "non_vowel"] #=> "l"
3321  *     a[/(?<vowel>[aeiou])(?<non_vowel>[^aeiou])/, "vowel"]     #=> "e"
3322  *
3323  *     a["lo"]                #=> "lo"
3324  *     a["bye"]               #=> nil
3325  */
3326 
3327 static VALUE
3328 rb_str_aref_m(int argc, VALUE *argv, VALUE str)
3329 {
3330     if (argc == 2) {
3331         if (RB_TYPE_P(argv[0], T_REGEXP)) {
3332             return rb_str_subpat(str, argv[0], argv[1]);
3333         }
3334         return rb_str_substr(str, NUM2LONG(argv[0]), NUM2LONG(argv[1]));
3335     }
3336     rb_check_arity(argc, 1, 2);
3337     return rb_str_aref(str, argv[0]);
3338 }
3339 
3340 VALUE
3341 rb_str_drop_bytes(VALUE str, long len)
3342 {
3343     char *ptr = RSTRING_PTR(str);
3344     long olen = RSTRING_LEN(str), nlen;
3345 
3346     str_modifiable(str);
3347     if (len > olen) len = olen;
3348     nlen = olen - len;
3349     if (nlen <= RSTRING_EMBED_LEN_MAX) {
3350         char *oldptr = ptr;
3351         int fl = (int)(RBASIC(str)->flags & (STR_NOEMBED|ELTS_SHARED));
3352         STR_SET_EMBED(str);
3353         STR_SET_EMBED_LEN(str, nlen);
3354         ptr = RSTRING(str)->as.ary;
3355         memmove(ptr, oldptr + len, nlen);
3356         if (fl == STR_NOEMBED) xfree(oldptr);
3357     }
3358     else {
3359         if (!STR_SHARED_P(str)) rb_str_new4(str);
3360         ptr = RSTRING(str)->as.heap.ptr += len;
3361         RSTRING(str)->as.heap.len = nlen;
3362     }
3363     ptr[nlen] = 0;
3364     ENC_CODERANGE_CLEAR(str);
3365     return str;
3366 }
3367 
3368 static void
3369 rb_str_splice_0(VALUE str, long beg, long len, VALUE val)
3370 {
3371     if (beg == 0 && RSTRING_LEN(val) == 0) {
3372         rb_str_drop_bytes(str, len);
3373         OBJ_INFECT(str, val);
3374         return;
3375     }
3376 
3377     rb_str_modify(str);
3378     if (len < RSTRING_LEN(val)) {
3379         /* expand string */
3380         RESIZE_CAPA(str, RSTRING_LEN(str) + RSTRING_LEN(val) - len + 1);
3381     }
3382 
3383     if (RSTRING_LEN(val) != len) {
3384         memmove(RSTRING_PTR(str) + beg + RSTRING_LEN(val),
3385                 RSTRING_PTR(str) + beg + len,
3386                 RSTRING_LEN(str) - (beg + len));
3387     }
3388     if (RSTRING_LEN(val) < beg && len < 0) {
3389         MEMZERO(RSTRING_PTR(str) + RSTRING_LEN(str), char, -len);
3390     }
3391     if (RSTRING_LEN(val) > 0) {
3392         memmove(RSTRING_PTR(str)+beg, RSTRING_PTR(val), RSTRING_LEN(val));
3393     }
3394     STR_SET_LEN(str, RSTRING_LEN(str) + RSTRING_LEN(val) - len);
3395     if (RSTRING_PTR(str)) {
3396         RSTRING_PTR(str)[RSTRING_LEN(str)] = '\0';
3397     }
3398     OBJ_INFECT(str, val);
3399 }
3400 
3401 static void
3402 rb_str_splice(VALUE str, long beg, long len, VALUE val)
3403 {
3404     long slen;
3405     char *p, *e;
3406     rb_encoding *enc;
3407     int singlebyte = single_byte_optimizable(str);
3408     int cr;
3409 
3410     if (len < 0) rb_raise(rb_eIndexError, "negative length %ld", len);
3411 
3412     StringValue(val);
3413     enc = rb_enc_check(str, val);
3414     slen = str_strlen(str, enc);
3415 
3416     if (slen < beg) {
3417       out_of_range:
3418         rb_raise(rb_eIndexError, "index %ld out of string", beg);
3419     }
3420     if (beg < 0) {
3421         if (-beg > slen) {
3422             goto out_of_range;
3423         }
3424         beg += slen;
3425     }
3426     if (slen < len || slen < beg + len) {
3427         len = slen - beg;
3428     }
3429     str_modify_keep_cr(str);
3430     p = str_nth(RSTRING_PTR(str), RSTRING_END(str), beg, enc, singlebyte);
3431     if (!p) p = RSTRING_END(str);
3432     e = str_nth(p, RSTRING_END(str), len, enc, singlebyte);
3433     if (!e) e = RSTRING_END(str);
3434     /* error check */
3435     beg = p - RSTRING_PTR(str); /* physical position */
3436     len = e - p;                /* physical length */
3437     rb_str_splice_0(str, beg, len, val);
3438     rb_enc_associate(str, enc);
3439     cr = ENC_CODERANGE_AND(ENC_CODERANGE(str), ENC_CODERANGE(val));
3440     if (cr != ENC_CODERANGE_BROKEN)
3441         ENC_CODERANGE_SET(str, cr);
3442 }
3443 
3444 void
3445 rb_str_update(VALUE str, long beg, long len, VALUE val)
3446 {
3447     rb_str_splice(str, beg, len, val);
3448 }
3449 
3450 static void
3451 rb_str_subpat_set(VALUE str, VALUE re, VALUE backref, VALUE val)
3452 {
3453     int nth;
3454     VALUE match;
3455     long start, end, len;
3456     rb_encoding *enc;
3457     struct re_registers *regs;
3458 
3459     if (rb_reg_search(re, str, 0, 0) < 0) {
3460         rb_raise(rb_eIndexError, "regexp not matched");
3461     }
3462     match = rb_backref_get();
3463     nth = rb_reg_backref_number(match, backref);
3464     regs = RMATCH_REGS(match);
3465     if (nth >= regs->num_regs) {
3466       out_of_range:
3467         rb_raise(rb_eIndexError, "index %d out of regexp", nth);
3468     }
3469     if (nth < 0) {
3470         if (-nth >= regs->num_regs) {
3471             goto out_of_range;
3472         }
3473         nth += regs->num_regs;
3474     }
3475 
3476     start = BEG(nth);
3477     if (start == -1) {
3478         rb_raise(rb_eIndexError, "regexp group %d not matched", nth);
3479     }
3480     end = END(nth);
3481     len = end - start;
3482     StringValue(val);
3483     enc = rb_enc_check(str, val);
3484     rb_str_splice_0(str, start, len, val);
3485     rb_enc_associate(str, enc);
3486 }
3487 
3488 static VALUE
3489 rb_str_aset(VALUE str, VALUE indx, VALUE val)
3490 {
3491     long idx, beg;
3492 
3493     if (FIXNUM_P(indx)) {
3494         idx = FIX2LONG(indx);
3495       num_index:
3496         rb_str_splice(str, idx, 1, val);
3497         return val;
3498     }
3499 
3500     if (SPECIAL_CONST_P(indx)) goto generic;
3501     switch (TYPE(indx)) {
3502       case T_REGEXP:
3503         rb_str_subpat_set(str, indx, INT2FIX(0), val);
3504         return val;
3505 
3506       case T_STRING:
3507         beg = rb_str_index(str, indx, 0);
3508         if (beg < 0) {
3509             rb_raise(rb_eIndexError, "string not matched");
3510         }
3511         beg = rb_str_sublen(str, beg);
3512         rb_str_splice(str, beg, str_strlen(indx, 0), val);
3513         return val;
3514 
3515       generic:
3516       default:
3517         /* check if indx is Range */
3518         {
3519             long beg, len;
3520             if (rb_range_beg_len(indx, &beg, &len, str_strlen(str, 0), 2)) {
3521                 rb_str_splice(str, beg, len, val);
3522                 return val;
3523             }
3524         }
3525         idx = NUM2LONG(indx);
3526         goto num_index;
3527     }
3528 }
3529 
3530 /*
3531  *  call-seq:
3532  *     str[fixnum] = new_str
3533  *     str[fixnum, fixnum] = new_str
3534  *     str[range] = aString
3535  *     str[regexp] = new_str
3536  *     str[regexp, fixnum] = new_str
3537  *     str[regexp, name] = new_str
3538  *     str[other_str] = new_str
3539  *
3540  *  Element Assignment---Replaces some or all of the content of <i>str</i>. The
3541  *  portion of the string affected is determined using the same criteria as
3542  *  <code>String#[]</code>. If the replacement string is not the same length as
3543  *  the text it is replacing, the string will be adjusted accordingly. If the
3544  *  regular expression or string used as the index doesn't match a position
3545  *  in the string, <code>IndexError</code> is raised. If the regular expression
3546  *  form is used, the optional second <code>Fixnum</code> allows you to specify
3547  *  which portion of the match to replace (effectively using the
3548  *  <code>MatchData</code> indexing rules). The forms that take a
3549  *  <code>Fixnum</code> will raise an <code>IndexError</code> if the value is
3550  *  out of range; the <code>Range</code> form will raise a
3551  *  <code>RangeError</code>, and the <code>Regexp</code> and <code>String</code>
3552  *  will raise an <code>IndexError</code> on negative match.
3553  */
3554 
3555 static VALUE
3556 rb_str_aset_m(int argc, VALUE *argv, VALUE str)
3557 {
3558     if (argc == 3) {
3559         if (RB_TYPE_P(argv[0], T_REGEXP)) {
3560             rb_str_subpat_set(str, argv[0], argv[1], argv[2]);
3561         }
3562         else {
3563             rb_str_splice(str, NUM2LONG(argv[0]), NUM2LONG(argv[1]), argv[2]);
3564         }
3565         return argv[2];
3566     }
3567     rb_check_arity(argc, 2, 3);
3568     return rb_str_aset(str, argv[0], argv[1]);
3569 }
3570 
3571 /*
3572  *  call-seq:
3573  *     str.insert(index, other_str)   -> str
3574  *
3575  *  Inserts <i>other_str</i> before the character at the given
3576  *  <i>index</i>, modifying <i>str</i>. Negative indices count from the
3577  *  end of the string, and insert <em>after</em> the given character.
3578  *  The intent is to insert <i>other_str</i> so that it starts at the given
3579  *  <i>index</i>.
3580  *
3581  *     "abcd".insert(0, 'X')    #=> "Xabcd"
3582  *     "abcd".insert(3, 'X')    #=> "abcXd"
3583  *     "abcd".insert(4, 'X')    #=> "abcdX"
3584  *     "abcd".insert(-3, 'X')   #=> "abXcd"
3585  *     "abcd".insert(-1, 'X')   #=> "abcdX"
3586  */
3587 
3588 static VALUE
3589 rb_str_insert(VALUE str, VALUE idx, VALUE str2)
3590 {
3591     long pos = NUM2LONG(idx);
3592 
3593     if (pos == -1) {
3594         return rb_str_append(str, str2);
3595     }
3596     else if (pos < 0) {
3597         pos++;
3598     }
3599     rb_str_splice(str, pos, 0, str2);
3600     return str;
3601 }
3602 
3603 
3604 /*
3605  *  call-seq:
3606  *     str.slice!(fixnum)           -> new_str or nil
3607  *     str.slice!(fixnum, fixnum)   -> new_str or nil
3608  *     str.slice!(range)            -> new_str or nil
3609  *     str.slice!(regexp)           -> new_str or nil
3610  *     str.slice!(other_str)        -> new_str or nil
3611  *
3612  *  Deletes the specified portion from <i>str</i>, and returns the portion
3613  *  deleted.
3614  *
3615  *     string = "this is a string"
3616  *     string.slice!(2)        #=> "i"
3617  *     string.slice!(3..6)     #=> " is "
3618  *     string.slice!(/s.*t/)   #=> "sa st"
3619  *     string.slice!("r")      #=> "r"
3620  *     string                  #=> "thing"
3621  */
3622 
3623 static VALUE
3624 rb_str_slice_bang(int argc, VALUE *argv, VALUE str)
3625 {
3626     VALUE result;
3627     VALUE buf[3];
3628     int i;
3629 
3630     rb_check_arity(argc, 1, 2);
3631     for (i=0; i<argc; i++) {
3632         buf[i] = argv[i];
3633     }
3634     str_modify_keep_cr(str);
3635     result = rb_str_aref_m(argc, buf, str);
3636     if (!NIL_P(result)) {
3637         buf[i] = rb_str_new(0,0);
3638         rb_str_aset_m(argc+1, buf, str);
3639     }
3640     return result;
3641 }
3642 
3643 static VALUE
3644 get_pat(VALUE pat, int quote)
3645 {
3646     VALUE val;
3647 
3648     switch (TYPE(pat)) {
3649       case T_REGEXP:
3650         return pat;
3651 
3652       case T_STRING:
3653         break;
3654 
3655       default:
3656         val = rb_check_string_type(pat);
3657         if (NIL_P(val)) {
3658             Check_Type(pat, T_REGEXP);
3659         }
3660         pat = val;
3661     }
3662 
3663     if (quote) {
3664         pat = rb_reg_quote(pat);
3665     }
3666 
3667     return rb_reg_regcomp(pat);
3668 }
3669 
3670 
3671 /*
3672  *  call-seq:
3673  *     str.sub!(pattern, replacement)          -> str or nil
3674  *     str.sub!(pattern) {|match| block }      -> str or nil
3675  *
3676  *  Performs the same substitution as String#sub in-place.
3677  *
3678  *  Returns +str+ if a substitution was performed or +nil+ if no substitution
3679  *  was performed.
3680  */
3681 
3682 static VALUE
3683 rb_str_sub_bang(int argc, VALUE *argv, VALUE str)
3684 {
3685     VALUE pat, repl, hash = Qnil;
3686     int iter = 0;
3687     int tainted = 0;
3688     int untrusted = 0;
3689     long plen;
3690     int min_arity = rb_block_given_p() ? 1 : 2;
3691 
3692     rb_check_arity(argc, min_arity, 2);
3693     if (argc == 1) {
3694         iter = 1;
3695     }
3696     else {
3697         repl = argv[1];
3698         hash = rb_check_hash_type(argv[1]);
3699         if (NIL_P(hash)) {
3700             StringValue(repl);
3701         }
3702         if (OBJ_TAINTED(repl)) tainted = 1;
3703         if (OBJ_UNTRUSTED(repl)) untrusted = 1;
3704     }
3705 
3706     pat = get_pat(argv[0], 1);
3707     str_modifiable(str);
3708     if (rb_reg_search(pat, str, 0, 0) >= 0) {
3709         rb_encoding *enc;
3710         int cr = ENC_CODERANGE(str);
3711         VALUE match = rb_backref_get();
3712         struct re_registers *regs = RMATCH_REGS(match);
3713         long beg0 = BEG(0);
3714         long end0 = END(0);
3715         char *p, *rp;
3716         long len, rlen;
3717 
3718         if (iter || !NIL_P(hash)) {
3719             p = RSTRING_PTR(str); len = RSTRING_LEN(str);
3720 
3721             if (iter) {
3722                 repl = rb_obj_as_string(rb_yield(rb_reg_nth_match(0, match)));
3723             }
3724             else {
3725                 repl = rb_hash_aref(hash, rb_str_subseq(str, beg0, end0 - beg0));
3726                 repl = rb_obj_as_string(repl);
3727             }
3728             str_mod_check(str, p, len);
3729             rb_check_frozen(str);
3730         }
3731         else {
3732             repl = rb_reg_regsub(repl, str, regs, pat);
3733         }
3734         enc = rb_enc_compatible(str, repl);
3735         if (!enc) {
3736             rb_encoding *str_enc = STR_ENC_GET(str);
3737             p = RSTRING_PTR(str); len = RSTRING_LEN(str);
3738             if (coderange_scan(p, beg0, str_enc) != ENC_CODERANGE_7BIT ||
3739                 coderange_scan(p+end0, len-end0, str_enc) != ENC_CODERANGE_7BIT) {
3740                 rb_raise(rb_eEncCompatError, "incompatible character encodings: %s and %s",
3741                          rb_enc_name(str_enc),
3742                          rb_enc_name(STR_ENC_GET(repl)));
3743             }
3744             enc = STR_ENC_GET(repl);
3745         }
3746         rb_str_modify(str);
3747         rb_enc_associate(str, enc);
3748         if (OBJ_TAINTED(repl)) tainted = 1;
3749         if (OBJ_UNTRUSTED(repl)) untrusted = 1;
3750         if (ENC_CODERANGE_UNKNOWN < cr && cr < ENC_CODERANGE_BROKEN) {
3751             int cr2 = ENC_CODERANGE(repl);
3752             if (cr2 == ENC_CODERANGE_BROKEN ||
3753                 (cr == ENC_CODERANGE_VALID && cr2 == ENC_CODERANGE_7BIT))
3754                 cr = ENC_CODERANGE_UNKNOWN;
3755             else
3756                 cr = cr2;
3757         }
3758         plen = end0 - beg0;
3759         rp = RSTRING_PTR(repl); rlen = RSTRING_LEN(repl);
3760         len = RSTRING_LEN(str);
3761         if (rlen > plen) {
3762             RESIZE_CAPA(str, len + rlen - plen);
3763         }
3764         p = RSTRING_PTR(str);
3765         if (rlen != plen) {
3766             memmove(p + beg0 + rlen, p + beg0 + plen, len - beg0 - plen);
3767         }
3768         memcpy(p + beg0, rp, rlen);
3769         len += rlen - plen;
3770         STR_SET_LEN(str, len);
3771         RSTRING_PTR(str)[len] = '\0';
3772         ENC_CODERANGE_SET(str, cr);
3773         if (tainted) OBJ_TAINT(str);
3774         if (untrusted) OBJ_UNTRUST(str);
3775 
3776         return str;
3777     }
3778     return Qnil;
3779 }
3780 
3781 
3782 /*
3783  *  call-seq:
3784  *     str.sub(pattern, replacement)         -> new_str
3785  *     str.sub(pattern, hash)                -> new_str
3786  *     str.sub(pattern) {|match| block }     -> new_str
3787  *
3788  *  Returns a copy of +str+ with the _first_ occurrence of +pattern+
3789  *  replaced by the second argument. The +pattern+ is typically a Regexp; if
3790  *  given as a String, any regular expression metacharacters it contains will
3791  *  be interpreted literally, e.g. <code>'\\\d'</code> will match a backslash
3792  *  followed by 'd', instead of a digit.
3793  *
3794  *  If +replacement+ is a String it will be substituted for the matched text.
3795  *  It may contain back-references to the pattern's capture groups of the form
3796  *  <code>"\\d"</code>, where <i>d</i> is a group number, or
3797  *  <code>"\\k<n>"</code>, where <i>n</i> is a group name. If it is a
3798  *  double-quoted string, both back-references must be preceded by an
3799  *  additional backslash. However, within +replacement+ the special match
3800  *  variables, such as <code>$&</code>, will not refer to the current match.
3801  *
3802  *  If the second argument is a Hash, and the matched text is one of its keys,
3803  *  the corresponding value is the replacement string.
3804  *
3805  *  In the block form, the current match string is passed in as a parameter,
3806  *  and variables such as <code>$1</code>, <code>$2</code>, <code>$`</code>,
3807  *  <code>$&</code>, and <code>$'</code> will be set appropriately. The value
3808  *  returned by the block will be substituted for the match on each call.
3809  *
3810  *  The result inherits any tainting in the original string or any supplied
3811  *  replacement string.
3812  *
3813  *     "hello".sub(/[aeiou]/, '*')                  #=> "h*llo"
3814  *     "hello".sub(/([aeiou])/, '<\1>')             #=> "h<e>llo"
3815  *     "hello".sub(/./) {|s| s.ord.to_s + ' ' }     #=> "104 ello"
3816  *     "hello".sub(/(?<foo>[aeiou])/, '*\k<foo>*')  #=> "h*e*llo"
3817  *     'Is SHELL your preferred shell?'.sub(/[[:upper:]]{2,}/, ENV)
3818  *      #=> "Is /bin/bash your preferred shell?"
3819  */
3820 
3821 static VALUE
3822 rb_str_sub(int argc, VALUE *argv, VALUE str)
3823 {
3824     str = rb_str_dup(str);
3825     rb_str_sub_bang(argc, argv, str);
3826     return str;
3827 }
3828 
3829 static VALUE
3830 str_gsub(int argc, VALUE *argv, VALUE str, int bang)
3831 {
3832     VALUE pat, val, repl, match, dest, hash = Qnil;
3833     struct re_registers *regs;
3834     long beg, n;
3835     long beg0, end0;
3836     long offset, blen, slen, len, last;
3837     int iter = 0;
3838     char *sp, *cp;
3839     int tainted = 0;
3840     rb_encoding *str_enc;
3841 
3842     switch (argc) {
3843       case 1:
3844         RETURN_ENUMERATOR(str, argc, argv);
3845         iter = 1;
3846         break;
3847       case 2:
3848         repl = argv[1];
3849         hash = rb_check_hash_type(argv[1]);
3850         if (NIL_P(hash)) {
3851             StringValue(repl);
3852         }
3853         if (OBJ_TAINTED(repl)) tainted = 1;
3854         break;
3855       default:
3856         rb_check_arity(argc, 1, 2);
3857     }
3858 
3859     pat = get_pat(argv[0], 1);
3860     beg = rb_reg_search(pat, str, 0, 0);
3861     if (beg < 0) {
3862         if (bang) return Qnil;  /* no match, no substitution */
3863         return rb_str_dup(str);
3864     }
3865 
3866     offset = 0;
3867     n = 0;
3868     blen = RSTRING_LEN(str) + 30; /* len + margin */
3869     dest = rb_str_buf_new(blen);
3870     sp = RSTRING_PTR(str);
3871     slen = RSTRING_LEN(str);
3872     cp = sp;
3873     str_enc = STR_ENC_GET(str);
3874     rb_enc_associate(dest, str_enc);
3875     ENC_CODERANGE_SET(dest, rb_enc_asciicompat(str_enc) ? ENC_CODERANGE_7BIT : ENC_CODERANGE_VALID);
3876 
3877     do {
3878         n++;
3879         match = rb_backref_get();
3880         regs = RMATCH_REGS(match);
3881         beg0 = BEG(0);
3882         end0 = END(0);
3883         if (iter || !NIL_P(hash)) {
3884             if (iter) {
3885                 val = rb_obj_as_string(rb_yield(rb_reg_nth_match(0, match)));
3886             }
3887             else {
3888                 val = rb_hash_aref(hash, rb_str_subseq(str, BEG(0), END(0) - BEG(0)));
3889                 val = rb_obj_as_string(val);
3890             }
3891             str_mod_check(str, sp, slen);
3892             if (val == dest) {  /* paranoid check [ruby-dev:24827] */
3893                 rb_raise(rb_eRuntimeError, "block should not cheat");
3894             }
3895         }
3896         else {
3897             val = rb_reg_regsub(repl, str, regs, pat);
3898         }
3899 
3900         if (OBJ_TAINTED(val)) tainted = 1;
3901 
3902         len = beg - offset;     /* copy pre-match substr */
3903         if (len) {
3904             rb_enc_str_buf_cat(dest, cp, len, str_enc);
3905         }
3906 
3907         rb_str_buf_append(dest, val);
3908 
3909         last = offset;
3910         offset = end0;
3911         if (beg0 == end0) {
3912             /*
3913              * Always consume at least one character of the input string
3914              * in order to prevent infinite loops.
3915              */
3916             if (RSTRING_LEN(str) <= end0) break;
3917             len = rb_enc_fast_mbclen(RSTRING_PTR(str)+end0, RSTRING_END(str), str_enc);
3918             rb_enc_str_buf_cat(dest, RSTRING_PTR(str)+end0, len, str_enc);
3919             offset = end0 + len;
3920         }
3921         cp = RSTRING_PTR(str) + offset;
3922         if (offset > RSTRING_LEN(str)) break;
3923         beg = rb_reg_search(pat, str, offset, 0);
3924     } while (beg >= 0);
3925     if (RSTRING_LEN(str) > offset) {
3926         rb_enc_str_buf_cat(dest, cp, RSTRING_LEN(str) - offset, str_enc);
3927     }
3928     rb_reg_search(pat, str, last, 0);
3929     if (bang) {
3930         rb_str_shared_replace(str, dest);
3931     }
3932     else {
3933         RBASIC(dest)->klass = rb_obj_class(str);
3934         OBJ_INFECT(dest, str);
3935         str = dest;
3936     }
3937 
3938     if (tainted) OBJ_TAINT(str);
3939     return str;
3940 }
3941 
3942 
3943 /*
3944  *  call-seq:
3945  *     str.gsub!(pattern, replacement)        -> str or nil
3946  *     str.gsub!(pattern) {|match| block }    -> str or nil
3947  *     str.gsub!(pattern)                     -> an_enumerator
3948  *
3949  *  Performs the substitutions of <code>String#gsub</code> in place, returning
3950  *  <i>str</i>, or <code>nil</code> if no substitutions were performed.
3951  *  If no block and no <i>replacement</i> is given, an enumerator is returned instead.
3952  */
3953 
3954 static VALUE
3955 rb_str_gsub_bang(int argc, VALUE *argv, VALUE str)
3956 {
3957     str_modify_keep_cr(str);
3958     return str_gsub(argc, argv, str, 1);
3959 }
3960 
3961 
3962 /*
3963  *  call-seq:
3964  *     str.gsub(pattern, replacement)       -> new_str
3965  *     str.gsub(pattern, hash)              -> new_str
3966  *     str.gsub(pattern) {|match| block }   -> new_str
3967  *     str.gsub(pattern)                    -> enumerator
3968  *
3969  *  Returns a copy of <i>str</i> with <em>all</em> occurrences of
3970  *  <i>pattern</i> replaced by the second argument. The <i>pattern</i> is
3971  *  typically a <code>Regexp</code>; if given as a <code>String</code>, any
3972  *  regular expression metacharacters it contains will be interpreted
3973  *  literally, e.g. <code>'\\\d'</code> will match a backslash followed by 'd',
3974  *  instead of a digit.
3975  *
3976  *  If <i>replacement</i> is a <code>String</code> it will be substituted for
3977  *  the matched text. It may contain back-references to the pattern's capture
3978  *  groups of the form <code>\\\d</code>, where <i>d</i> is a group number, or
3979  *  <code>\\\k<n></code>, where <i>n</i> is a group name. If it is a
3980  *  double-quoted string, both back-references must be preceded by an
3981  *  additional backslash. However, within <i>replacement</i> the special match
3982  *  variables, such as <code>$&</code>, will not refer to the current match.
3983  *
3984  *  If the second argument is a <code>Hash</code>, and the matched text is one
3985  *  of its keys, the corresponding value is the replacement string.
3986  *
3987  *  In the block form, the current match string is passed in as a parameter,
3988  *  and variables such as <code>$1</code>, <code>$2</code>, <code>$`</code>,
3989  *  <code>$&</code>, and <code>$'</code> will be set appropriately. The value
3990  *  returned by the block will be substituted for the match on each call.
3991  *
3992  *  The result inherits any tainting in the original string or any supplied
3993  *  replacement string.
3994  *
3995  *  When neither a block nor a second argument is supplied, an
3996  *  <code>Enumerator</code> is returned.
3997  *
3998  *     "hello".gsub(/[aeiou]/, '*')                  #=> "h*ll*"
3999  *     "hello".gsub(/([aeiou])/, '<\1>')             #=> "h<e>ll<o>"
4000  *     "hello".gsub(/./) {|s| s.ord.to_s + ' '}      #=> "104 101 108 108 111 "
4001  *     "hello".gsub(/(?<foo>[aeiou])/, '{\k<foo>}')  #=> "h{e}ll{o}"
4002  *     'hello'.gsub(/[eo]/, 'e' => 3, 'o' => '*')    #=> "h3ll*"
4003  */
4004 
4005 static VALUE
4006 rb_str_gsub(int argc, VALUE *argv, VALUE str)
4007 {
4008     return str_gsub(argc, argv, str, 0);
4009 }
4010 
4011 
4012 /*
4013  *  call-seq:
4014  *     str.replace(other_str)   -> str
4015  *
4016  *  Replaces the contents and taintedness of <i>str</i> with the corresponding
4017  *  values in <i>other_str</i>.
4018  *
4019  *     s = "hello"         #=> "hello"
4020  *     s.replace "world"   #=> "world"
4021  */
4022 
4023 VALUE
4024 rb_str_replace(VALUE str, VALUE str2)
4025 {
4026     str_modifiable(str);
4027     if (str == str2) return str;
4028 
4029     StringValue(str2);
4030     str_discard(str);
4031     return str_replace(str, str2);
4032 }
4033 
4034 /*
4035  *  call-seq:
4036  *     string.clear    ->  string
4037  *
4038  *  Makes string empty.
4039  *
4040  *     a = "abcde"
4041  *     a.clear    #=> ""
4042  */
4043 
4044 static VALUE
4045 rb_str_clear(VALUE str)
4046 {
4047     str_discard(str);
4048     STR_SET_EMBED(str);
4049     STR_SET_EMBED_LEN(str, 0);
4050     RSTRING_PTR(str)[0] = 0;
4051     if (rb_enc_asciicompat(STR_ENC_GET(str)))
4052         ENC_CODERANGE_SET(str, ENC_CODERANGE_7BIT);
4053     else
4054         ENC_CODERANGE_SET(str, ENC_CODERANGE_VALID);
4055     return str;
4056 }
4057 
4058 /*
4059  *  call-seq:
4060  *     string.chr    ->  string
4061  *
4062  *  Returns a one-character string at the beginning of the string.
4063  *
4064  *     a = "abcde"
4065  *     a.chr    #=> "a"
4066  */
4067 
4068 static VALUE
4069 rb_str_chr(VALUE str)
4070 {
4071     return rb_str_substr(str, 0, 1);
4072 }
4073 
4074 /*
4075  *  call-seq:
4076  *     str.getbyte(index)          -> 0 .. 255
4077  *
4078  *  returns the <i>index</i>th byte as an integer.
4079  */
4080 static VALUE
4081 rb_str_getbyte(VALUE str, VALUE index)
4082 {
4083     long pos = NUM2LONG(index);
4084 
4085     if (pos < 0)
4086         pos += RSTRING_LEN(str);
4087     if (pos < 0 ||  RSTRING_LEN(str) <= pos)
4088         return Qnil;
4089 
4090     return INT2FIX((unsigned char)RSTRING_PTR(str)[pos]);
4091 }
4092 
4093 /*
4094  *  call-seq:
4095  *     str.setbyte(index, int) -> int
4096  *
4097  *  modifies the <i>index</i>th byte as <i>int</i>.
4098  */
4099 static VALUE
4100 rb_str_setbyte(VALUE str, VALUE index, VALUE value)
4101 {
4102     long pos = NUM2LONG(index);
4103     int byte = NUM2INT(value);
4104 
4105     rb_str_modify(str);
4106 
4107     if (pos < -RSTRING_LEN(str) || RSTRING_LEN(str) <= pos)
4108         rb_raise(rb_eIndexError, "index %ld out of string", pos);
4109     if (pos < 0)
4110         pos += RSTRING_LEN(str);
4111 
4112     RSTRING_PTR(str)[pos] = byte;
4113 
4114     return value;
4115 }
4116 
4117 static VALUE
4118 str_byte_substr(VALUE str, long beg, long len)
4119 {
4120     char *p, *s = RSTRING_PTR(str);
4121     long n = RSTRING_LEN(str);
4122     VALUE str2;
4123 
4124     if (beg > n || len < 0) return Qnil;
4125     if (beg < 0) {
4126         beg += n;
4127         if (beg < 0) return Qnil;
4128     }
4129     if (beg + len > n)
4130         len = n - beg;
4131     if (len <= 0) {
4132         len = 0;
4133         p = 0;
4134     }
4135     else
4136         p = s + beg;
4137 
4138     if (len > RSTRING_EMBED_LEN_MAX && beg + len == n) {
4139         str2 = rb_str_new4(str);
4140         str2 = str_new3(rb_obj_class(str2), str2);
4141         RSTRING(str2)->as.heap.ptr += RSTRING(str2)->as.heap.len - len;
4142         RSTRING(str2)->as.heap.len = len;
4143     }
4144     else {
4145         str2 = rb_str_new5(str, p, len);
4146     }
4147 
4148     str_enc_copy(str2, str);
4149 
4150     if (RSTRING_LEN(str2) == 0) {
4151         if (!rb_enc_asciicompat(STR_ENC_GET(str)))
4152             ENC_CODERANGE_SET(str2, ENC_CODERANGE_VALID);
4153         else
4154             ENC_CODERANGE_SET(str2, ENC_CODERANGE_7BIT);
4155     }
4156     else {
4157         switch (ENC_CODERANGE(str)) {
4158           case ENC_CODERANGE_7BIT:
4159             ENC_CODERANGE_SET(str2, ENC_CODERANGE_7BIT);
4160             break;
4161           default:
4162             ENC_CODERANGE_SET(str2, ENC_CODERANGE_UNKNOWN);
4163             break;
4164         }
4165     }
4166 
4167     OBJ_INFECT(str2, str);
4168 
4169     return str2;
4170 }
4171 
4172 static VALUE
4173 str_byte_aref(VALUE str, VALUE indx)
4174 {
4175     long idx;
4176     switch (TYPE(indx)) {
4177       case T_FIXNUM:
4178         idx = FIX2LONG(indx);
4179 
4180       num_index:
4181         str = str_byte_substr(str, idx, 1);
4182         if (NIL_P(str) || RSTRING_LEN(str) == 0) return Qnil;
4183         return str;
4184 
4185       default:
4186         /* check if indx is Range */
4187         {
4188             long beg, len = RSTRING_LEN(str);
4189 
4190             switch (rb_range_beg_len(indx, &beg, &len, len, 0)) {
4191               case Qfalse:
4192                 break;
4193               case Qnil:
4194                 return Qnil;
4195               default:
4196                 return str_byte_substr(str, beg, len);
4197             }
4198         }
4199         idx = NUM2LONG(indx);
4200         goto num_index;
4201     }
4202 
4203     UNREACHABLE;
4204 }
4205 
4206 /*
4207  *  call-seq:
4208  *     str.byteslice(fixnum)           -> new_str or nil
4209  *     str.byteslice(fixnum, fixnum)   -> new_str or nil
4210  *     str.byteslice(range)            -> new_str or nil
4211  *
4212  *  Byte Reference---If passed a single <code>Fixnum</code>, returns a
4213  *  substring of one byte at that position. If passed two <code>Fixnum</code>
4214  *  objects, returns a substring starting at the offset given by the first, and
4215  *  a length given by the second. If given a <code>Range</code>, a substring containing
4216  *  bytes at offsets given by the range is returned. In all three cases, if
4217  *  an offset is negative, it is counted from the end of <i>str</i>. Returns
4218  *  <code>nil</code> if the initial offset falls outside the string, the length
4219  *  is negative, or the beginning of the range is greater than the end.
4220  *  The encoding of the resulted string keeps original encoding.
4221  *
4222  *     "hello".byteslice(1)     #=> "e"
4223  *     "hello".byteslice(-1)    #=> "o"
4224  *     "hello".byteslice(1, 2)  #=> "el"
4225  *     "\x80\u3042".byteslice(1, 3) #=> "\u3042"
4226  *     "\x03\u3042\xff".byteslice(1..3) #=> "\u3042"
4227  */
4228 
4229 static VALUE
4230 rb_str_byteslice(int argc, VALUE *argv, VALUE str)
4231 {
4232     if (argc == 2) {
4233         return str_byte_substr(str, NUM2LONG(argv[0]), NUM2LONG(argv[1]));
4234     }
4235     rb_check_arity(argc, 1, 2);
4236     return str_byte_aref(str, argv[0]);
4237 }
4238 
4239 /*
4240  *  call-seq:
4241  *     str.reverse   -> new_str
4242  *
4243  *  Returns a new string with the characters from <i>str</i> in reverse order.
4244  *
4245  *     "stressed".reverse   #=> "desserts"
4246  */
4247 
4248 static VALUE
4249 rb_str_reverse(VALUE str)
4250 {
4251     rb_encoding *enc;
4252     VALUE rev;
4253     char *s, *e, *p;
4254     int single = 1;
4255 
4256     if (RSTRING_LEN(str) <= 1) return rb_str_dup(str);
4257     enc = STR_ENC_GET(str);
4258     rev = rb_str_new5(str, 0, RSTRING_LEN(str));
4259     s = RSTRING_PTR(str); e = RSTRING_END(str);
4260     p = RSTRING_END(rev);
4261 
4262     if (RSTRING_LEN(str) > 1) {
4263         if (single_byte_optimizable(str)) {
4264             while (s < e) {
4265                 *--p = *s++;
4266             }
4267         }
4268         else if (ENC_CODERANGE(str) == ENC_CODERANGE_VALID) {
4269             while (s < e) {
4270                 int clen = rb_enc_fast_mbclen(s, e, enc);
4271 
4272                 if (clen > 1 || (*s & 0x80)) single = 0;
4273                 p -= clen;
4274                 memcpy(p, s, clen);
4275                 s += clen;
4276             }
4277         }
4278         else {
4279             while (s < e) {
4280                 int clen = rb_enc_mbclen(s, e, enc);
4281 
4282                 if (clen > 1 || (*s & 0x80)) single = 0;
4283                 p -= clen;
4284                 memcpy(p, s, clen);
4285                 s += clen;
4286             }
4287         }
4288     }
4289     STR_SET_LEN(rev, RSTRING_LEN(str));
4290     OBJ_INFECT(rev, str);
4291     if (ENC_CODERANGE(str) == ENC_CODERANGE_UNKNOWN) {
4292         if (single) {
4293             ENC_CODERANGE_SET(str, ENC_CODERANGE_7BIT);
4294         }
4295         else {
4296             ENC_CODERANGE_SET(str, ENC_CODERANGE_VALID);
4297         }
4298     }
4299     rb_enc_cr_str_copy_for_substr(rev, str);
4300 
4301     return rev;
4302 }
4303 
4304 
4305 /*
4306  *  call-seq:
4307  *     str.reverse!   -> str
4308  *
4309  *  Reverses <i>str</i> in place.
4310  */
4311 
4312 static VALUE
4313 rb_str_reverse_bang(VALUE str)
4314 {
4315     if (RSTRING_LEN(str) > 1) {
4316         if (single_byte_optimizable(str)) {
4317             char *s, *e, c;
4318 
4319             str_modify_keep_cr(str);
4320             s = RSTRING_PTR(str);
4321             e = RSTRING_END(str) - 1;
4322             while (s < e) {
4323                 c = *s;
4324                 *s++ = *e;
4325                 *e-- = c;
4326             }
4327         }
4328         else {
4329             rb_str_shared_replace(str, rb_str_reverse(str));
4330         }
4331     }
4332     else {
4333         str_modify_keep_cr(str);
4334     }
4335     return str;
4336 }
4337 
4338 
4339 /*
4340  *  call-seq:
4341  *     str.include? other_str   -> true or false
4342  *
4343  *  Returns <code>true</code> if <i>str</i> contains the given string or
4344  *  character.
4345  *
4346  *     "hello".include? "lo"   #=> true
4347  *     "hello".include? "ol"   #=> false
4348  *     "hello".include? ?h     #=> true
4349  */
4350 
4351 static VALUE
4352 rb_str_include(VALUE str, VALUE arg)
4353 {
4354     long i;
4355 
4356     StringValue(arg);
4357     i = rb_str_index(str, arg, 0);
4358 
4359     if (i == -1) return Qfalse;
4360     return Qtrue;
4361 }
4362 
4363 
4364 /*
4365  *  call-seq:
4366  *     str.to_i(base=10)   -> integer
4367  *
4368  *  Returns the result of interpreting leading characters in <i>str</i> as an
4369  *  integer base <i>base</i> (between 2 and 36). Extraneous characters past the
4370  *  end of a valid number are ignored. If there is not a valid number at the
4371  *  start of <i>str</i>, <code>0</code> is returned. This method never raises an
4372  *  exception when <i>base</i> is valid.
4373  *
4374  *     "12345".to_i             #=> 12345
4375  *     "99 red balloons".to_i   #=> 99
4376  *     "0a".to_i                #=> 0
4377  *     "0a".to_i(16)            #=> 10
4378  *     "hello".to_i             #=> 0
4379  *     "1100101".to_i(2)        #=> 101
4380  *     "1100101".to_i(8)        #=> 294977
4381  *     "1100101".to_i(10)       #=> 1100101
4382  *     "1100101".to_i(16)       #=> 17826049
4383  */
4384 
4385 static VALUE
4386 rb_str_to_i(int argc, VALUE *argv, VALUE str)
4387 {
4388     int base;
4389 
4390     if (argc == 0) base = 10;
4391     else {
4392         VALUE b;
4393 
4394         rb_scan_args(argc, argv, "01", &b);
4395         base = NUM2INT(b);
4396     }
4397     if (base < 0) {
4398         rb_raise(rb_eArgError, "invalid radix %d", base);
4399     }
4400     return rb_str_to_inum(str, base, FALSE);
4401 }
4402 
4403 
4404 /*
4405  *  call-seq:
4406  *     str.to_f   -> float
4407  *
4408  *  Returns the result of interpreting leading characters in <i>str</i> as a
4409  *  floating point number. Extraneous characters past the end of a valid number
4410  *  are ignored. If there is not a valid number at the start of <i>str</i>,
4411  *  <code>0.0</code> is returned. This method never raises an exception.
4412  *
4413  *     "123.45e1".to_f        #=> 1234.5
4414  *     "45.67 degrees".to_f   #=> 45.67
4415  *     "thx1138".to_f         #=> 0.0
4416  */
4417 
4418 static VALUE
4419 rb_str_to_f(VALUE str)
4420 {
4421     return DBL2NUM(rb_str_to_dbl(str, FALSE));
4422 }
4423 
4424 
4425 /*
4426  *  call-seq:
4427  *     str.to_s     -> str
4428  *     str.to_str   -> str
4429  *
4430  *  Returns the receiver.
4431  */
4432 
4433 static VALUE
4434 rb_str_to_s(VALUE str)
4435 {
4436     if (rb_obj_class(str) != rb_cString) {
4437         return str_duplicate(rb_cString, str);
4438     }
4439     return str;
4440 }
4441 
#if 0
/* Compiled out.  Encodes code point c in enc and appends the bytes to str. */
static void
str_cat_char(VALUE str, unsigned int c, rb_encoding *enc)
{
    char encoded[RUBY_MAX_CHAR_LEN];
    int width = rb_enc_codelen(c, enc);

    rb_enc_mbcput(c, encoded, enc);
    rb_enc_str_buf_cat(str, encoded, width, enc);
}
#endif
4453 
4454 #define CHAR_ESC_LEN 13 /* sizeof(\x{ hex of 32bit unsigned int } \0) */
4455 
4456 int
4457 rb_str_buf_cat_escaped_char(VALUE result, unsigned int c, int unicode_p)
4458 {
4459     char buf[CHAR_ESC_LEN + 1];
4460     int l;
4461 
4462 #if SIZEOF_INT > 4
4463     c &= 0xffffffff;
4464 #endif
4465     if (unicode_p) {
4466         if (c < 0x7F && ISPRINT(c)) {
4467             snprintf(buf, CHAR_ESC_LEN, "%c", c);
4468         }
4469         else if (c < 0x10000) {
4470             snprintf(buf, CHAR_ESC_LEN, "\\u%04X", c);
4471         }
4472         else {
4473             snprintf(buf, CHAR_ESC_LEN, "\\u{%X}", c);
4474         }
4475     }
4476     else {
4477         if (c < 0x100) {
4478             snprintf(buf, CHAR_ESC_LEN, "\\x%02X", c);
4479         }
4480         else {
4481             snprintf(buf, CHAR_ESC_LEN, "\\x{%X}", c);
4482         }
4483     }
4484     l = (int)strlen(buf);       /* CHAR_ESC_LEN cannot exceed INT_MAX */
4485     rb_str_buf_cat(result, buf, l);
4486     return l;
4487 }
4488 
4489 /*
4490  * call-seq:
4491  *   str.inspect   -> string
4492  *
4493  * Returns a printable version of _str_, surrounded by quote marks,
4494  * with special characters escaped.
4495  *
4496  *    str = "hello"
4497  *    str[3] = "\b"
4498  *    str.inspect       #=> "\"hel\\bo\""
4499  */
4500 
4501 VALUE
4502 rb_str_inspect(VALUE str)
4503 {
4504     rb_encoding *enc = STR_ENC_GET(str);
4505     const char *p, *pend, *prev;
4506     char buf[CHAR_ESC_LEN + 1];
4507     VALUE result = rb_str_buf_new(0);
4508     rb_encoding *resenc = rb_default_internal_encoding();
4509     int unicode_p = rb_enc_unicode_p(enc);
4510     int asciicompat = rb_enc_asciicompat(enc);
4511     static rb_encoding *utf16, *utf32;
4512 
4513     if (!utf16) utf16 = rb_enc_find("UTF-16");
4514     if (!utf32) utf32 = rb_enc_find("UTF-32");
4515     if (resenc == NULL) resenc = rb_default_external_encoding();
4516     if (!rb_enc_asciicompat(resenc)) resenc = rb_usascii_encoding();
4517     rb_enc_associate(result, resenc);
4518     str_buf_cat2(result, "\"");
4519 
4520     p = RSTRING_PTR(str); pend = RSTRING_END(str);
4521     prev = p;
4522     if (enc == utf16) {
4523         const unsigned char *q = (const unsigned char *)p;
4524         if (q[0] == 0xFE && q[1] == 0xFF)
4525             enc = rb_enc_find("UTF-16BE");
4526         else if (q[0] == 0xFF && q[1] == 0xFE)
4527             enc = rb_enc_find("UTF-16LE");
4528         else
4529             unicode_p = 0;
4530     }
4531     else if (enc == utf32) {
4532         const unsigned char *q = (const unsigned char *)p;
4533         if (q[0] == 0 && q[1] == 0 && q[2] == 0xFE && q[3] == 0xFF)
4534             enc = rb_enc_find("UTF-32BE");
4535         else if (q[3] == 0 && q[2] == 0 && q[1] == 0xFE && q[0] == 0xFF)
4536             enc = rb_enc_find("UTF-32LE");
4537         else
4538             unicode_p = 0;
4539     }
4540     while (p < pend) {
4541         unsigned int c, cc;
4542         int n;
4543 
4544         n = rb_enc_precise_mbclen(p, pend, enc);
4545         if (!MBCLEN_CHARFOUND_P(n)) {
4546             if (p > prev) str_buf_cat(result, prev, p - prev);
4547             n = rb_enc_mbminlen(enc);
4548             if (pend < p + n)
4549                 n = (int)(pend - p);
4550             while (n--) {
4551                 snprintf(buf, CHAR_ESC_LEN, "\\x%02X", *p & 0377);
4552                 str_buf_cat(result, buf, strlen(buf));
4553                 prev = ++p;
4554             }
4555             continue;
4556         }
4557         n = MBCLEN_CHARFOUND_LEN(n);
4558         c = rb_enc_mbc_to_codepoint(p, pend, enc);
4559         p += n;
4560         if ((asciicompat || unicode_p) &&
4561           (c == '"'|| c == '\\' ||
4562             (c == '#' &&
4563              p < pend &&
4564              MBCLEN_CHARFOUND_P(rb_enc_precise_mbclen(p,pend,enc)) &&
4565              (cc = rb_enc_codepoint(p,pend,enc),
4566               (cc == '$' || cc == '@' || cc == '{'))))) {
4567             if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
4568             str_buf_cat2(result, "\\");
4569             if (asciicompat || enc == resenc) {
4570                 prev = p - n;
4571                 continue;
4572             }
4573         }
4574         switch (c) {
4575           case '\0': cc = '0'; break;
4576           case '\n': cc = 'n'; break;
4577           case '\r': cc = 'r'; break;
4578           case '\t': cc = 't'; break;
4579           case '\f': cc = 'f'; break;
4580           case '\013': cc = 'v'; break;
4581           case '\010': cc = 'b'; break;
4582           case '\007': cc = 'a'; break;
4583           case 033: cc = 'e'; break;
4584           default: cc = 0; break;
4585         }
4586         if (cc) {
4587             if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
4588             buf[0] = '\\';
4589             buf[1] = (char)cc;
4590             str_buf_cat(result, buf, 2);
4591             prev = p;
4592             continue;
4593         }
4594         if ((enc == resenc && rb_enc_isprint(c, enc)) ||
4595             (asciicompat && rb_enc_isascii(c, enc) && ISPRINT(c))) {
4596             continue;
4597         }
4598         else {
4599             if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
4600             rb_str_buf_cat_escaped_char(result, c, unicode_p);
4601             prev = p;
4602             continue;
4603         }
4604     }
4605     if (p > prev) str_buf_cat(result, prev, p - prev);
4606     str_buf_cat2(result, "\"");
4607 
4608     OBJ_INFECT(result, str);
4609     return result;
4610 }
4611 
4612 #define IS_EVSTR(p,e) ((p) < (e) && (*(p) == '$' || *(p) == '@' || *(p) == '{'))
4613 
4614 /*
4615  *  call-seq:
4616  *     str.dump   -> new_str
4617  *
4618  *  Produces a version of +str+ with all non-printing characters replaced by
4619  *  <code>\nnn</code> notation and all special characters escaped.
4620  *
4621  *    "hello \n ''".dump  #=> "\"hello \\n ''\"
4622  */
4623 
4624 VALUE
4625 rb_str_dump(VALUE str)
4626 {
4627     rb_encoding *enc = rb_enc_get(str);
4628     long len;
4629     const char *p, *pend;
4630     char *q, *qend;
4631     VALUE result;
4632     int u8 = (enc == rb_utf8_encoding());
4633 
4634     len = 2;                    /* "" */
4635     p = RSTRING_PTR(str); pend = p + RSTRING_LEN(str);
4636     while (p < pend) {
4637         unsigned char c = *p++;
4638         switch (c) {
4639           case '"':  case '\\':
4640           case '\n': case '\r':
4641           case '\t': case '\f':
4642           case '\013': case '\010': case '\007': case '\033':
4643             len += 2;
4644             break;
4645 
4646           case '#':
4647             len += IS_EVSTR(p, pend) ? 2 : 1;
4648             break;
4649 
4650           default:
4651             if (ISPRINT(c)) {
4652                 len++;
4653             }
4654             else {
4655                 if (u8) {       /* \u{NN} */
4656                     int n = rb_enc_precise_mbclen(p-1, pend, enc);
4657                     if (MBCLEN_CHARFOUND_P(n-1)) {
4658                         unsigned int cc = rb_enc_mbc_to_codepoint(p-1, pend, enc);
4659                         while (cc >>= 4) len++;
4660                         len += 5;
4661                         p += MBCLEN_CHARFOUND_LEN(n)-1;
4662                         break;
4663                     }
4664                 }
4665                 len += 4;       /* \xNN */
4666             }
4667             break;
4668         }
4669     }
4670     if (!rb_enc_asciicompat(enc)) {
4671         len += 19;              /* ".force_encoding('')" */
4672         len += strlen(enc->name);
4673     }
4674 
4675     result = rb_str_new5(str, 0, len);
4676     p = RSTRING_PTR(str); pend = p + RSTRING_LEN(str);
4677     q = RSTRING_PTR(result); qend = q + len + 1;
4678 
4679     *q++ = '"';
4680     while (p < pend) {
4681         unsigned char c = *p++;
4682 
4683         if (c == '"' || c == '\\') {
4684             *q++ = '\\';
4685             *q++ = c;
4686         }
4687         else if (c == '#') {
4688             if (IS_EVSTR(p, pend)) *q++ = '\\';
4689             *q++ = '#';
4690         }
4691         else if (c == '\n') {
4692             *q++ = '\\';
4693             *q++ = 'n';
4694         }
4695         else if (c == '\r') {
4696             *q++ = '\\';
4697             *q++ = 'r';
4698         }
4699         else if (c == '\t') {
4700             *q++ = '\\';
4701             *q++ = 't';
4702         }
4703         else if (c == '\f') {
4704             *q++ = '\\';
4705             *q++ = 'f';
4706         }
4707         else if (c == '\013') {
4708             *q++ = '\\';
4709             *q++ = 'v';
4710         }
4711         else if (c == '\010') {
4712             *q++ = '\\';
4713             *q++ = 'b';
4714         }
4715         else if (c == '\007') {
4716             *q++ = '\\';
4717             *q++ = 'a';
4718         }
4719         else if (c == '\033') {
4720             *q++ = '\\';
4721             *q++ = 'e';
4722         }
4723         else if (ISPRINT(c)) {
4724             *q++ = c;
4725         }
4726         else {
4727             *q++ = '\\';
4728             if (u8) {
4729                 int n = rb_enc_precise_mbclen(p-1, pend, enc) - 1;
4730                 if (MBCLEN_CHARFOUND_P(n)) {
4731                     int cc = rb_enc_mbc_to_codepoint(p-1, pend, enc);
4732                     p += n;
4733                     snprintf(q, qend-q, "u{%x}", cc);
4734                     q += strlen(q);
4735                     continue;
4736                 }
4737             }
4738             snprintf(q, qend-q, "x%02X", c);
4739             q += 3;
4740         }
4741     }
4742     *q++ = '"';
4743     *q = '\0';
4744     if (!rb_enc_asciicompat(enc)) {
4745         snprintf(q, qend-q, ".force_encoding(\"%s\")", enc->name);
4746         enc = rb_ascii8bit_encoding();
4747     }
4748     OBJ_INFECT(result, str);
4749     /* result from dump is ASCII */
4750     rb_enc_associate(result, enc);
4751     ENC_CODERANGE_SET(result, ENC_CODERANGE_7BIT);
4752     return result;
4753 }
4754 
4755 
4756 static void
4757 rb_str_check_dummy_enc(rb_encoding *enc)
4758 {
4759     if (rb_enc_dummy_p(enc)) {
4760         rb_raise(rb_eEncCompatError, "incompatible encoding with this operation: %s",
4761                  rb_enc_name(enc));
4762     }
4763 }
4764 
4765 /*
4766  *  call-seq:
4767  *     str.upcase!   -> str or nil
4768  *
4769  *  Upcases the contents of <i>str</i>, returning <code>nil</code> if no changes
4770  *  were made.
4771  *  Note: case replacement is effective only in ASCII region.
4772  */
4773 
4774 static VALUE
4775 rb_str_upcase_bang(VALUE str)
4776 {
4777     rb_encoding *enc;
4778     char *s, *send;
4779     int modify = 0;
4780     int n;
4781 
4782     str_modify_keep_cr(str);
4783     enc = STR_ENC_GET(str);
4784     rb_str_check_dummy_enc(enc);
4785     s = RSTRING_PTR(str); send = RSTRING_END(str);
4786     if (single_byte_optimizable(str)) {
4787         while (s < send) {
4788             unsigned int c = *(unsigned char*)s;
4789 
4790             if (rb_enc_isascii(c, enc) && 'a' <= c && c <= 'z') {
4791                 *s = 'A' + (c - 'a');
4792                 modify = 1;
4793             }
4794             s++;
4795         }
4796     }
4797     else {
4798         int ascompat = rb_enc_asciicompat(enc);
4799 
4800         while (s < send) {
4801             unsigned int c;
4802 
4803             if (ascompat && (c = *(unsigned char*)s) < 0x80) {
4804                 if (rb_enc_isascii(c, enc) && 'a' <= c && c <= 'z') {
4805                     *s = 'A' + (c - 'a');
4806                     modify = 1;
4807                 }
4808                 s++;
4809             }
4810             else {
4811                 c = rb_enc_codepoint_len(s, send, &n, enc);
4812                 if (rb_enc_islower(c, enc)) {
4813                     /* assuming toupper returns codepoint with same size */
4814                     rb_enc_mbcput(rb_enc_toupper(c, enc), s, enc);
4815                     modify = 1;
4816                 }
4817                 s += n;
4818             }
4819         }
4820     }
4821 
4822     if (modify) return str;
4823     return Qnil;
4824 }
4825 
4826 
4827 /*
4828  *  call-seq:
4829  *     str.upcase   -> new_str
4830  *
4831  *  Returns a copy of <i>str</i> with all lowercase letters replaced with their
4832  *  uppercase counterparts. The operation is locale insensitive---only
4833  *  characters ``a'' to ``z'' are affected.
4834  *  Note: case replacement is effective only in ASCII region.
4835  *
4836  *     "hEllO".upcase   #=> "HELLO"
4837  */
4838 
4839 static VALUE
4840 rb_str_upcase(VALUE str)
4841 {
4842     str = rb_str_dup(str);
4843     rb_str_upcase_bang(str);
4844     return str;
4845 }
4846 
4847 
4848 /*
4849  *  call-seq:
4850  *     str.downcase!   -> str or nil
4851  *
4852  *  Downcases the contents of <i>str</i>, returning <code>nil</code> if no
4853  *  changes were made.
4854  *  Note: case replacement is effective only in ASCII region.
4855  */
4856 
4857 static VALUE
4858 rb_str_downcase_bang(VALUE str)
4859 {
4860     rb_encoding *enc;
4861     char *s, *send;
4862     int modify = 0;
4863 
4864     str_modify_keep_cr(str);
4865     enc = STR_ENC_GET(str);
4866     rb_str_check_dummy_enc(enc);
4867     s = RSTRING_PTR(str); send = RSTRING_END(str);
4868     if (single_byte_optimizable(str)) {
4869         while (s < send) {
4870             unsigned int c = *(unsigned char*)s;
4871 
4872             if (rb_enc_isascii(c, enc) && 'A' <= c && c <= 'Z') {
4873                 *s = 'a' + (c - 'A');
4874                 modify = 1;
4875             }
4876             s++;
4877         }
4878     }
4879     else {
4880         int ascompat = rb_enc_asciicompat(enc);
4881 
4882         while (s < send) {
4883             unsigned int c;
4884             int n;
4885 
4886             if (ascompat && (c = *(unsigned char*)s) < 0x80) {
4887                 if (rb_enc_isascii(c, enc) && 'A' <= c && c <= 'Z') {
4888                     *s = 'a' + (c - 'A');
4889                     modify = 1;
4890                 }
4891                 s++;
4892             }
4893             else {
4894                 c = rb_enc_codepoint_len(s, send, &n, enc);
4895                 if (rb_enc_isupper(c, enc)) {
4896                     /* assuming toupper returns codepoint with same size */
4897                     rb_enc_mbcput(rb_enc_tolower(c, enc), s, enc);
4898                     modify = 1;
4899                 }
4900                 s += n;
4901             }
4902         }
4903     }
4904 
4905     if (modify) return str;
4906     return Qnil;
4907 }
4908 
4909 
4910 /*
4911  *  call-seq:
4912  *     str.downcase   -> new_str
4913  *
4914  *  Returns a copy of <i>str</i> with all uppercase letters replaced with their
4915  *  lowercase counterparts. The operation is locale insensitive---only
4916  *  characters ``A'' to ``Z'' are affected.
4917  *  Note: case replacement is effective only in ASCII region.
4918  *
4919  *     "hEllO".downcase   #=> "hello"
4920  */
4921 
4922 static VALUE
4923 rb_str_downcase(VALUE str)
4924 {
4925     str = rb_str_dup(str);
4926     rb_str_downcase_bang(str);
4927     return str;
4928 }
4929 
4930 
4931 /*
4932  *  call-seq:
4933  *     str.capitalize!   -> str or nil
4934  *
4935  *  Modifies <i>str</i> by converting the first character to uppercase and the
4936  *  remainder to lowercase. Returns <code>nil</code> if no changes are made.
4937  *  Note: case conversion is effective only in ASCII region.
4938  *
4939  *     a = "hello"
4940  *     a.capitalize!   #=> "Hello"
4941  *     a               #=> "Hello"
4942  *     a.capitalize!   #=> nil
4943  */
4944 
4945 static VALUE
4946 rb_str_capitalize_bang(VALUE str)
4947 {
4948     rb_encoding *enc;
4949     char *s, *send;
4950     int modify = 0;
4951     unsigned int c;
4952     int n;
4953 
4954     str_modify_keep_cr(str);
4955     enc = STR_ENC_GET(str);
4956     rb_str_check_dummy_enc(enc);
4957     if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str)) return Qnil;
4958     s = RSTRING_PTR(str); send = RSTRING_END(str);
4959 
4960     c = rb_enc_codepoint_len(s, send, &n, enc);
4961     if (rb_enc_islower(c, enc)) {
4962         rb_enc_mbcput(rb_enc_toupper(c, enc), s, enc);
4963         modify = 1;
4964     }
4965     s += n;
4966     while (s < send) {
4967         c = rb_enc_codepoint_len(s, send, &n, enc);
4968         if (rb_enc_isupper(c, enc)) {
4969             rb_enc_mbcput(rb_enc_tolower(c, enc), s, enc);
4970             modify = 1;
4971         }
4972         s += n;
4973     }
4974 
4975     if (modify) return str;
4976     return Qnil;
4977 }
4978 
4979 
4980 /*
4981  *  call-seq:
4982  *     str.capitalize   -> new_str
4983  *
4984  *  Returns a copy of <i>str</i> with the first character converted to uppercase
4985  *  and the remainder to lowercase.
4986  *  Note: case conversion is effective only in ASCII region.
4987  *
4988  *     "hello".capitalize    #=> "Hello"
4989  *     "HELLO".capitalize    #=> "Hello"
4990  *     "123ABC".capitalize   #=> "123abc"
4991  */
4992 
4993 static VALUE
4994 rb_str_capitalize(VALUE str)
4995 {
4996     str = rb_str_dup(str);
4997     rb_str_capitalize_bang(str);
4998     return str;
4999 }
5000 
5001 
5002 /*
5003  *  call-seq:
5004  *     str.swapcase!   -> str or nil
5005  *
5006  *  Equivalent to <code>String#swapcase</code>, but modifies the receiver in
5007  *  place, returning <i>str</i>, or <code>nil</code> if no changes were made.
5008  *  Note: case conversion is effective only in ASCII region.
5009  */
5010 
5011 static VALUE
5012 rb_str_swapcase_bang(VALUE str)
5013 {
5014     rb_encoding *enc;
5015     char *s, *send;
5016     int modify = 0;
5017     int n;
5018 
5019     str_modify_keep_cr(str);
5020     enc = STR_ENC_GET(str);
5021     rb_str_check_dummy_enc(enc);
5022     s = RSTRING_PTR(str); send = RSTRING_END(str);
5023     while (s < send) {
5024         unsigned int c = rb_enc_codepoint_len(s, send, &n, enc);
5025 
5026         if (rb_enc_isupper(c, enc)) {
5027             /* assuming toupper returns codepoint with same size */
5028             rb_enc_mbcput(rb_enc_tolower(c, enc), s, enc);
5029             modify = 1;
5030         }
5031         else if (rb_enc_islower(c, enc)) {
5032             /* assuming tolower returns codepoint with same size */
5033             rb_enc_mbcput(rb_enc_toupper(c, enc), s, enc);
5034             modify = 1;
5035         }
5036         s += n;
5037     }
5038 
5039     if (modify) return str;
5040     return Qnil;
5041 }
5042 
5043 
5044 /*
5045  *  call-seq:
5046  *     str.swapcase   -> new_str
5047  *
5048  *  Returns a copy of <i>str</i> with uppercase alphabetic characters converted
5049  *  to lowercase and lowercase characters converted to uppercase.
5050  *  Note: case conversion is effective only in ASCII region.
5051  *
5052  *     "Hello".swapcase          #=> "hELLO"
5053  *     "cYbEr_PuNk11".swapcase   #=> "CyBeR_pUnK11"
5054  */
5055 
5056 static VALUE
5057 rb_str_swapcase(VALUE str)
5058 {
5059     str = rb_str_dup(str);
5060     rb_str_swapcase_bang(str);
5061     return str;
5062 }
5063 
typedef unsigned char *USTR;

/* Cursor over a transliteration specification, advanced by trnext(). */
struct tr {
    int gen;            /* non-zero while a range such as a-z is being expanded */
    unsigned int now;   /* code point most recently produced */
    unsigned int max;   /* last code point of the range being expanded */
    char *p;            /* next unread byte of the specification */
    char *pend;         /* end of the specification */
};
5071 
5072 static unsigned int
5073 trnext(struct tr *t, rb_encoding *enc)
5074 {
5075     int n;
5076 
5077     for (;;) {
5078         if (!t->gen) {
5079 nextpart:
5080             if (t->p == t->pend) return -1;
5081             if (rb_enc_ascget(t->p, t->pend, &n, enc) == '\\' && t->p + n < t->pend) {
5082                 t->p += n;
5083             }
5084             t->now = rb_enc_codepoint_len(t->p, t->pend, &n, enc);
5085             t->p += n;
5086             if (rb_enc_ascget(t->p, t->pend, &n, enc) == '-' && t->p + n < t->pend) {
5087                 t->p += n;
5088                 if (t->p < t->pend) {
5089                     unsigned int c = rb_enc_codepoint_len(t->p, t->pend, &n, enc);
5090                     t->p += n;
5091                     if (t->now > c) {
5092                         if (t->now < 0x80 && c < 0x80) {
5093                             rb_raise(rb_eArgError,
5094                                      "invalid range \"%c-%c\" in string transliteration",
5095                                      t->now, c);
5096                         }
5097                         else {
5098                             rb_raise(rb_eArgError, "invalid range in string transliteration");
5099                         }
5100                         continue; /* not reached */
5101                     }
5102                     t->gen = 1;
5103                     t->max = c;
5104                 }
5105             }
5106             return t->now;
5107         }
5108         else {
5109             while (ONIGENC_CODE_TO_MBCLEN(enc, ++t->now) <= 0) {
5110                 if (t->now == t->max) {
5111                     t->gen = 0;
5112                     goto nextpart;
5113                 }
5114             }
5115             if (t->now < t->max) {
5116                 return t->now;
5117             }
5118             else {
5119                 t->gen = 0;
5120                 return t->max;
5121             }
5122         }
5123     }
5124 }
5125 
5126 static VALUE rb_str_delete_bang(int,VALUE*,VALUE);
5127 
5128 static VALUE
5129 tr_trans(VALUE str, VALUE src, VALUE repl, int sflag)
5130 {
5131     const unsigned int errc = -1;
5132     unsigned int trans[256];
5133     rb_encoding *enc, *e1, *e2;
5134     struct tr trsrc, trrepl;
5135     int cflag = 0;
5136     unsigned int c, c0, last = 0;
5137     int modify = 0, i, l;
5138     char *s, *send;
5139     VALUE hash = 0;
5140     int singlebyte = single_byte_optimizable(str);
5141     int cr;
5142 
5143 #define CHECK_IF_ASCII(c) \
5144     (void)((cr == ENC_CODERANGE_7BIT && !rb_isascii(c)) ? \
5145            (cr = ENC_CODERANGE_VALID) : 0)
5146 
5147     StringValue(src);
5148     StringValue(repl);
5149     if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str)) return Qnil;
5150     if (RSTRING_LEN(repl) == 0) {
5151         return rb_str_delete_bang(1, &src, str);
5152     }
5153 
5154     cr = ENC_CODERANGE(str);
5155     e1 = rb_enc_check(str, src);
5156     e2 = rb_enc_check(str, repl);
5157     if (e1 == e2) {
5158         enc = e1;
5159     }
5160     else {
5161         enc = rb_enc_check(src, repl);
5162     }
5163     trsrc.p = RSTRING_PTR(src); trsrc.pend = trsrc.p + RSTRING_LEN(src);
5164     if (RSTRING_LEN(src) > 1 &&
5165         rb_enc_ascget(trsrc.p, trsrc.pend, &l, enc) == '^' &&
5166         trsrc.p + l < trsrc.pend) {
5167         cflag = 1;
5168         trsrc.p += l;
5169     }
5170     trrepl.p = RSTRING_PTR(repl);
5171     trrepl.pend = trrepl.p + RSTRING_LEN(repl);
5172     trsrc.gen = trrepl.gen = 0;
5173     trsrc.now = trrepl.now = 0;
5174     trsrc.max = trrepl.max = 0;
5175 
5176     if (cflag) {
5177         for (i=0; i<256; i++) {
5178             trans[i] = 1;
5179         }
5180         while ((c = trnext(&trsrc, enc)) != errc) {
5181             if (c < 256) {
5182                 trans[c] = errc;
5183             }
5184             else {
5185                 if (!hash) hash = rb_hash_new();
5186                 rb_hash_aset(hash, UINT2NUM(c), Qtrue);
5187             }
5188         }
5189         while ((c = trnext(&trrepl, enc)) != errc)
5190             /* retrieve last replacer */;
5191         last = trrepl.now;
5192         for (i=0; i<256; i++) {
5193             if (trans[i] != errc) {
5194                 trans[i] = last;
5195             }
5196         }
5197     }
5198     else {
5199         unsigned int r;
5200 
5201         for (i=0; i<256; i++) {
5202             trans[i] = errc;
5203         }
5204         while ((c = trnext(&trsrc, enc)) != errc) {
5205             r = trnext(&trrepl, enc);
5206             if (r == errc) r = trrepl.now;
5207             if (c < 256) {
5208                 trans[c] = r;
5209                 if (rb_enc_codelen(r, enc) != 1) singlebyte = 0;
5210             }
5211             else {
5212                 if (!hash) hash = rb_hash_new();
5213                 rb_hash_aset(hash, UINT2NUM(c), UINT2NUM(r));
5214             }
5215         }
5216     }
5217 
5218     if (cr == ENC_CODERANGE_VALID)
5219         cr = ENC_CODERANGE_7BIT;
5220     str_modify_keep_cr(str);
5221     s = RSTRING_PTR(str); send = RSTRING_END(str);
5222     if (sflag) {
5223         int clen, tlen;
5224         long offset, max = RSTRING_LEN(str);
5225         unsigned int save = -1;
5226         char *buf = ALLOC_N(char, max), *t = buf;
5227 
5228         while (s < send) {
5229             int may_modify = 0;
5230 
5231             c0 = c = rb_enc_codepoint_len(s, send, &clen, e1);
5232             tlen = enc == e1 ? clen : rb_enc_codelen(c, enc);
5233 
5234             s += clen;
5235             if (c < 256) {
5236                 c = trans[c];
5237             }
5238             else if (hash) {
5239                 VALUE tmp = rb_hash_lookup(hash, UINT2NUM(c));
5240                 if (NIL_P(tmp)) {
5241                     if (cflag) c = last;
5242                     else c = errc;
5243                 }
5244                 else if (cflag) c = errc;
5245                 else c = NUM2INT(tmp);
5246             }
5247             else {
5248                 c = errc;
5249             }
5250             if (c != (unsigned int)-1) {
5251                 if (save == c) {
5252                     CHECK_IF_ASCII(c);
5253                     continue;
5254                 }
5255                 save = c;
5256                 tlen = rb_enc_codelen(c, enc);
5257                 modify = 1;
5258             }
5259             else {
5260                 save = -1;
5261                 c = c0;
5262                 if (enc != e1) may_modify = 1;
5263             }
5264             while (t - buf + tlen >= max) {
5265                 offset = t - buf;
5266                 max *= 2;
5267                 REALLOC_N(buf, char, max);
5268                 t = buf + offset;
5269             }
5270             rb_enc_mbcput(c, t, enc);
5271             if (may_modify && memcmp(s, t, tlen) != 0) {
5272                 modify = 1;
5273             }
5274             CHECK_IF_ASCII(c);
5275             t += tlen;
5276         }
5277         if (!STR_EMBED_P(str)) {
5278             xfree(RSTRING(str)->as.heap.ptr);
5279         }
5280         *t = '\0';
5281         RSTRING(str)->as.heap.ptr = buf;
5282         RSTRING(str)->as.heap.len = t - buf;
5283         STR_SET_NOEMBED(str);
5284         RSTRING(str)->as.heap.aux.capa = max;
5285     }
5286     else if (rb_enc_mbmaxlen(enc) == 1 || (singlebyte && !hash)) {
5287         while (s < send) {
5288             c = (unsigned char)*s;
5289             if (trans[c] != errc) {
5290                 if (!cflag) {
5291                     c = trans[c];
5292                     *s = c;
5293                     modify = 1;
5294                 }
5295                 else {
5296                     *s = last;
5297                     modify = 1;
5298                 }
5299             }
5300             CHECK_IF_ASCII(c);
5301             s++;
5302         }
5303     }
5304     else {
5305         int clen, tlen, max = (int)(RSTRING_LEN(str) * 1.2);
5306         long offset;
5307         char *buf = ALLOC_N(char, max), *t = buf;
5308 
5309         while (s < send) {
5310             int may_modify = 0;
5311             c0 = c = rb_enc_codepoint_len(s, send, &clen, e1);
5312             tlen = enc == e1 ? clen : rb_enc_codelen(c, enc);
5313 
5314             if (c < 256) {
5315                 c = trans[c];
5316             }
5317             else if (hash) {
5318                 VALUE tmp = rb_hash_lookup(hash, UINT2NUM(c));
5319                 if (NIL_P(tmp)) {
5320                     if (cflag) c = last;
5321                     else c = errc;
5322                 }
5323                 else if (cflag) c = errc;
5324                 else c = NUM2INT(tmp);
5325             }
5326             else {
5327                 c = cflag ? last : errc;
5328             }
5329             if (c != errc) {
5330                 tlen = rb_enc_codelen(c, enc);
5331                 modify = 1;
5332             }
5333             else {
5334                 c = c0;
5335                 if (enc != e1) may_modify = 1;
5336             }
5337             while (t - buf + tlen >= max) {
5338                 offset = t - buf;
5339                 max *= 2;
5340                 REALLOC_N(buf, char, max);
5341                 t = buf + offset;
5342             }
5343             if (s != t) {
5344                 rb_enc_mbcput(c, t, enc);
5345                 if (may_modify && memcmp(s, t, tlen) != 0) {
5346                     modify = 1;
5347                 }
5348             }
5349             CHECK_IF_ASCII(c);
5350             s += clen;
5351             t += tlen;
5352         }
5353         if (!STR_EMBED_P(str)) {
5354             xfree(RSTRING(str)->as.heap.ptr);
5355         }
5356         *t = '\0';
5357         RSTRING(str)->as.heap.ptr = buf;
5358         RSTRING(str)->as.heap.len = t - buf;
5359         STR_SET_NOEMBED(str);
5360         RSTRING(str)->as.heap.aux.capa = max;
5361     }
5362 
5363     if (modify) {
5364         if (cr != ENC_CODERANGE_BROKEN)
5365             ENC_CODERANGE_SET(str, cr);
5366         rb_enc_associate(str, enc);
5367         return str;
5368     }
5369     return Qnil;
5370 }
5371 
5372 
5373 /*
5374  *  call-seq:
5375  *     str.tr!(from_str, to_str)   -> str or nil
5376  *
5377  *  Translates <i>str</i> in place, using the same rules as
5378  *  <code>String#tr</code>. Returns <i>str</i>, or <code>nil</code> if no
5379  *  changes were made.
5380  */
5381 
5382 static VALUE
5383 rb_str_tr_bang(VALUE str, VALUE src, VALUE repl)
5384 {
5385     return tr_trans(str, src, repl, 0);
5386 }
5387 
5388 
5389 /*
5390  *  call-seq:
5391  *     str.tr(from_str, to_str)   -> new_str
5392  *
5393  *  Returns a copy of +str+ with the characters in +from_str+ replaced by the
5394  *  corresponding characters in +to_str+.  If +to_str+ is shorter than
5395  *  +from_str+, it is padded with its last character in order to maintain the
5396  *  correspondence.
5397  *
5398  *     "hello".tr('el', 'ip')      #=> "hippo"
5399  *     "hello".tr('aeiou', '*')    #=> "h*ll*"
5400  *     "hello".tr('aeiou', 'AA*')  #=> "hAll*"
5401  *
5402  *  Both strings may use the <code>c1-c2</code> notation to denote ranges of
5403  *  characters, and +from_str+ may start with a <code>^</code>, which denotes
5404  *  all characters except those listed.
5405  *
5406  *     "hello".tr('a-y', 'b-z')    #=> "ifmmp"
5407  *     "hello".tr('^aeiou', '*')   #=> "*e**o"
5408  *
5409  *  The backslash character <code>\</code> can be used to escape
5410  *  <code>^</code> or <code>-</code> and is otherwise ignored unless it
5411  *  appears at the end of a range or the end of the +from_str+ or +to_str+:
5412  *
5413  *     "hello^world".tr("\\^aeiou", "*") #=> "h*ll**w*rld"
5414  *     "hello-world".tr("a\\-eo", "*")   #=> "h*ll**w*rld"
5415  *
5416  *     "hello\r\nworld".tr("\r", "")   #=> "hello\nworld"
5417  *     "hello\r\nworld".tr("\\r", "")  #=> "hello\r\nwold"
5418  *     "hello\r\nworld".tr("\\\r", "") #=> "hello\nworld"
5419  *
5420  *     "X['\\b']".tr("X\\", "")   #=> "['b']"
5421  *     "X['\\b']".tr("X-\\]", "") #=> "'b'"
5422  */
5423 
5424 static VALUE
5425 rb_str_tr(VALUE str, VALUE src, VALUE repl)
5426 {
5427     str = rb_str_dup(str);
5428     tr_trans(str, src, repl, 0);
5429     return str;
5430 }
5431 
5432 #define TR_TABLE_SIZE 257
5433 static void
5434 tr_setup_table(VALUE str, char stable[TR_TABLE_SIZE], int first,
5435                VALUE *tablep, VALUE *ctablep, rb_encoding *enc)
5436 {
5437     const unsigned int errc = -1;
5438     char buf[256];
5439     struct tr tr;
5440     unsigned int c;
5441     VALUE table = 0, ptable = 0;
5442     int i, l, cflag = 0;
5443 
5444     tr.p = RSTRING_PTR(str); tr.pend = tr.p + RSTRING_LEN(str);
5445     tr.gen = tr.now = tr.max = 0;
5446 
5447     if (RSTRING_LEN(str) > 1 && rb_enc_ascget(tr.p, tr.pend, &l, enc) == '^') {
5448         cflag = 1;
5449         tr.p += l;
5450     }
5451     if (first) {
5452         for (i=0; i<256; i++) {
5453             stable[i] = 1;
5454         }
5455         stable[256] = cflag;
5456     }
5457     else if (stable[256] && !cflag) {
5458         stable[256] = 0;
5459     }
5460     for (i=0; i<256; i++) {
5461         buf[i] = cflag;
5462     }
5463 
5464     while ((c = trnext(&tr, enc)) != errc) {
5465         if (c < 256) {
5466             buf[c & 0xff] = !cflag;
5467         }
5468         else {
5469             VALUE key = UINT2NUM(c);
5470 
5471             if (!table && (first || *tablep || stable[256])) {
5472                 if (cflag) {
5473                     ptable = *ctablep;
5474                     table = ptable ? ptable : rb_hash_new();
5475                     *ctablep = table;
5476                 }
5477                 else {
5478                     table = rb_hash_new();
5479                     ptable = *tablep;
5480                     *tablep = table;
5481                 }
5482             }
5483             if (table && (!ptable || (cflag ^ !NIL_P(rb_hash_aref(ptable, key))))) {
5484                 rb_hash_aset(table, key, Qtrue);
5485             }
5486         }
5487     }
5488     for (i=0; i<256; i++) {
5489         stable[i] = stable[i] && buf[i];
5490     }
5491     if (!table && !cflag) {
5492         *tablep = 0;
5493     }
5494 }
5495 
5496 
5497 static int
5498 tr_find(unsigned int c, char table[TR_TABLE_SIZE], VALUE del, VALUE nodel)
5499 {
5500     if (c < 256) {
5501         return table[c] != 0;
5502     }
5503     else {
5504         VALUE v = UINT2NUM(c);
5505 
5506         if (del) {
5507             if (!NIL_P(rb_hash_lookup(del, v)) &&
5508                     (!nodel || NIL_P(rb_hash_lookup(nodel, v)))) {
5509                 return TRUE;
5510             }
5511         }
5512         else if (nodel && !NIL_P(rb_hash_lookup(nodel, v))) {
5513             return FALSE;
5514         }
5515         return table[256] ? TRUE : FALSE;
5516     }
5517 }
5518 
5519 /*
5520  *  call-seq:
5521  *     str.delete!([other_str]+)   -> str or nil
5522  *
5523  *  Performs a <code>delete</code> operation in place, returning <i>str</i>, or
5524  *  <code>nil</code> if <i>str</i> was not modified.
5525  */
5526 
5527 static VALUE
5528 rb_str_delete_bang(int argc, VALUE *argv, VALUE str)
5529 {
5530     char squeez[TR_TABLE_SIZE];
5531     rb_encoding *enc = 0;
5532     char *s, *send, *t;
5533     VALUE del = 0, nodel = 0;
5534     int modify = 0;
5535     int i, ascompat, cr;
5536 
5537     if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str)) return Qnil;
5538     rb_check_arity(argc, 1, UNLIMITED_ARGUMENTS);
5539     for (i=0; i<argc; i++) {
5540         VALUE s = argv[i];
5541 
5542         StringValue(s);
5543         enc = rb_enc_check(str, s);
5544         tr_setup_table(s, squeez, i==0, &del, &nodel, enc);
5545     }
5546 
5547     str_modify_keep_cr(str);
5548     ascompat = rb_enc_asciicompat(enc);
5549     s = t = RSTRING_PTR(str);
5550     send = RSTRING_END(str);
5551     cr = ascompat ? ENC_CODERANGE_7BIT : ENC_CODERANGE_VALID;
5552     while (s < send) {
5553         unsigned int c;
5554         int clen;
5555 
5556         if (ascompat && (c = *(unsigned char*)s) < 0x80) {
5557             if (squeez[c]) {
5558                 modify = 1;
5559             }
5560             else {
5561                 if (t != s) *t = c;
5562                 t++;
5563             }
5564             s++;
5565         }
5566         else {
5567             c = rb_enc_codepoint_len(s, send, &clen, enc);
5568 
5569             if (tr_find(c, squeez, del, nodel)) {
5570                 modify = 1;
5571             }
5572             else {
5573                 if (t != s) rb_enc_mbcput(c, t, enc);
5574                 t += clen;
5575                 if (cr == ENC_CODERANGE_7BIT) cr = ENC_CODERANGE_VALID;
5576             }
5577             s += clen;
5578         }
5579     }
5580     *t = '\0';
5581     STR_SET_LEN(str, t - RSTRING_PTR(str));
5582     ENC_CODERANGE_SET(str, cr);
5583 
5584     if (modify) return str;
5585     return Qnil;
5586 }
5587 
5588 
5589 /*
5590  *  call-seq:
5591  *     str.delete([other_str]+)   -> new_str
5592  *
5593  *  Returns a copy of <i>str</i> with all characters in the intersection of its
5594  *  arguments deleted. Uses the same rules for building the set of characters as
5595  *  <code>String#count</code>.
5596  *
5597  *     "hello".delete "l","lo"        #=> "heo"
5598  *     "hello".delete "lo"            #=> "he"
5599  *     "hello".delete "aeiou", "^e"   #=> "hell"
5600  *     "hello".delete "ej-m"          #=> "ho"
5601  */
5602 
5603 static VALUE
5604 rb_str_delete(int argc, VALUE *argv, VALUE str)
5605 {
5606     str = rb_str_dup(str);
5607     rb_str_delete_bang(argc, argv, str);
5608     return str;
5609 }
5610 
5611 
5612 /*
5613  *  call-seq:
5614  *     str.squeeze!([other_str]*)   -> str or nil
5615  *
5616  *  Squeezes <i>str</i> in place, returning either <i>str</i>, or
5617  *  <code>nil</code> if no changes were made.
5618  */
5619 
5620 static VALUE
5621 rb_str_squeeze_bang(int argc, VALUE *argv, VALUE str)
5622 {
5623     char squeez[TR_TABLE_SIZE];
5624     rb_encoding *enc = 0;
5625     VALUE del = 0, nodel = 0;
5626     char *s, *send, *t;
5627     int i, modify = 0;
5628     int ascompat, singlebyte = single_byte_optimizable(str);
5629     unsigned int save;
5630 
5631     if (argc == 0) {
5632         enc = STR_ENC_GET(str);
5633     }
5634     else {
5635         for (i=0; i<argc; i++) {
5636             VALUE s = argv[i];
5637 
5638             StringValue(s);
5639             enc = rb_enc_check(str, s);
5640             if (singlebyte && !single_byte_optimizable(s))
5641                 singlebyte = 0;
5642             tr_setup_table(s, squeez, i==0, &del, &nodel, enc);
5643         }
5644     }
5645 
5646     str_modify_keep_cr(str);
5647     s = t = RSTRING_PTR(str);
5648     if (!s || RSTRING_LEN(str) == 0) return Qnil;
5649     send = RSTRING_END(str);
5650     save = -1;
5651     ascompat = rb_enc_asciicompat(enc);
5652 
5653     if (singlebyte) {
5654         while (s < send) {
5655             unsigned int c = *(unsigned char*)s++;
5656             if (c != save || (argc > 0 && !squeez[c])) {
5657                 *t++ = save = c;
5658             }
5659         }
5660     } else {
5661         while (s < send) {
5662             unsigned int c;
5663             int clen;
5664 
5665             if (ascompat && (c = *(unsigned char*)s) < 0x80) {
5666                 if (c != save || (argc > 0 && !squeez[c])) {
5667                     *t++ = save = c;
5668                 }
5669                 s++;
5670             }
5671             else {
5672                 c = rb_enc_codepoint_len(s, send, &clen, enc);
5673 
5674                 if (c != save || (argc > 0 && !tr_find(c, squeez, del, nodel))) {
5675                     if (t != s) rb_enc_mbcput(c, t, enc);
5676                     save = c;
5677                     t += clen;
5678                 }
5679                 s += clen;
5680             }
5681         }
5682     }
5683 
5684     *t = '\0';
5685     if (t - RSTRING_PTR(str) != RSTRING_LEN(str)) {
5686         STR_SET_LEN(str, t - RSTRING_PTR(str));
5687         modify = 1;
5688     }
5689 
5690     if (modify) return str;
5691     return Qnil;
5692 }
5693 
5694 
5695 /*
5696  *  call-seq:
5697  *     str.squeeze([other_str]*)    -> new_str
5698  *
5699  *  Builds a set of characters from the <i>other_str</i> parameter(s) using the
5700  *  procedure described for <code>String#count</code>. Returns a new string
5701  *  where runs of the same character that occur in this set are replaced by a
5702  *  single character. If no arguments are given, all runs of identical
5703  *  characters are replaced by a single character.
5704  *
5705  *     "yellow moon".squeeze                  #=> "yelow mon"
5706  *     "  now   is  the".squeeze(" ")         #=> " now is the"
5707  *     "putters shoot balls".squeeze("m-z")   #=> "puters shot balls"
5708  */
5709 
5710 static VALUE
5711 rb_str_squeeze(int argc, VALUE *argv, VALUE str)
5712 {
5713     str = rb_str_dup(str);
5714     rb_str_squeeze_bang(argc, argv, str);
5715     return str;
5716 }
5717 
5718 
5719 /*
5720  *  call-seq:
5721  *     str.tr_s!(from_str, to_str)   -> str or nil
5722  *
5723  *  Performs <code>String#tr_s</code> processing on <i>str</i> in place,
5724  *  returning <i>str</i>, or <code>nil</code> if no changes were made.
5725  */
5726 
5727 static VALUE
5728 rb_str_tr_s_bang(VALUE str, VALUE src, VALUE repl)
5729 {
5730     return tr_trans(str, src, repl, 1);
5731 }
5732 
5733 
5734 /*
5735  *  call-seq:
5736  *     str.tr_s(from_str, to_str)   -> new_str
5737  *
5738  *  Processes a copy of <i>str</i> as described under <code>String#tr</code>,
5739  *  then removes duplicate characters in regions that were affected by the
5740  *  translation.
5741  *
5742  *     "hello".tr_s('l', 'r')     #=> "hero"
5743  *     "hello".tr_s('el', '*')    #=> "h*o"
5744  *     "hello".tr_s('el', 'hx')   #=> "hhxo"
5745  */
5746 
5747 static VALUE
5748 rb_str_tr_s(VALUE str, VALUE src, VALUE repl)
5749 {
5750     str = rb_str_dup(str);
5751     tr_trans(str, src, repl, 1);
5752     return str;
5753 }
5754 
5755 
5756 /*
5757  *  call-seq:
5758  *     str.count([other_str]+)   -> fixnum
5759  *
5760  *  Each +other_str+ parameter defines a set of characters to count.  The
5761  *  intersection of these sets defines the characters to count in +str+.  Any
5762  *  +other_str+ that starts with a caret <code>^</code> is negated.  The
5763  *  sequence <code>c1-c2</code> means all characters between c1 and c2.  The
5764  *  backslash character <code>\</code> can be used to escape <code>^</code> or
5765  *  <code>-</code> and is otherwise ignored unless it appears at the end of a
5766  *  sequence or the end of a +other_str+.
5767  *
5768  *     a = "hello world"
5769  *     a.count "lo"                   #=> 5
5770  *     a.count "lo", "o"              #=> 2
5771  *     a.count "hello", "^l"          #=> 4
5772  *     a.count "ej-m"                 #=> 4
5773  *
5774  *     "hello^world".count "\\^aeiou" #=> 4
5775  *     "hello-world".count "a\\-eo"   #=> 4
5776  *
5777  *     c = "hello world\\r\\n"
5778  *     c.count "\\"                   #=> 2
5779  *     c.count "\\A"                  #=> 0
5780  *     c.count "X-\\w"                #=> 3
5781  */
5782 
5783 static VALUE
5784 rb_str_count(int argc, VALUE *argv, VALUE str)
5785 {
5786     char table[TR_TABLE_SIZE];
5787     rb_encoding *enc = 0;
5788     VALUE del = 0, nodel = 0;
5789     char *s, *send;
5790     int i;
5791     int ascompat;
5792 
5793     rb_check_arity(argc, 1, UNLIMITED_ARGUMENTS);
5794     for (i=0; i<argc; i++) {
5795         VALUE tstr = argv[i];
5796         unsigned char c;
5797 
5798         StringValue(tstr);
5799         enc = rb_enc_check(str, tstr);
5800         if (argc == 1 && RSTRING_LEN(tstr) == 1 && rb_enc_asciicompat(enc) &&
5801             (c = RSTRING_PTR(tstr)[0]) < 0x80 && !is_broken_string(str)) {
5802             int n = 0;
5803 
5804             s = RSTRING_PTR(str);
5805             if (!s || RSTRING_LEN(str) == 0) return INT2FIX(0);
5806             send = RSTRING_END(str);
5807             while (s < send) {
5808                 if (*(unsigned char*)s++ == c) n++;
5809             }
5810             return INT2NUM(n);
5811         }
5812         tr_setup_table(tstr, table, i==0, &del, &nodel, enc);
5813     }
5814 
5815     s = RSTRING_PTR(str);
5816     if (!s || RSTRING_LEN(str) == 0) return INT2FIX(0);
5817     send = RSTRING_END(str);
5818     ascompat = rb_enc_asciicompat(enc);
5819     i = 0;
5820     while (s < send) {
5821         unsigned int c;
5822 
5823         if (ascompat && (c = *(unsigned char*)s) < 0x80) {
5824             if (table[c]) {
5825                 i++;
5826             }
5827             s++;
5828         }
5829         else {
5830             int clen;
5831             c = rb_enc_codepoint_len(s, send, &clen, enc);
5832             if (tr_find(c, table, del, nodel)) {
5833                 i++;
5834             }
5835             s += clen;
5836         }
5837     }
5838 
5839     return INT2NUM(i);
5840 }
5841 
/*
 * Byte-indexed lookup table: 1 for the bytes 0x09-0x0d and 0x20,
 * 0 for every other byte value.
 */
static const char isspacetable[256] = {
    /* 0x00 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0,
    /* 0x10 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x20 */ 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x30 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x40 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x60 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x80 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x90 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xa0 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xb0 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xc0 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xd0 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xe0 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xf0 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
};

/* The cast to unsigned char keeps the index inside 0..255. */
#define ascii_isspace(c) isspacetable[(unsigned char)(c)]
5862 
5863 /*
5864  *  call-seq:
5865  *     str.split(pattern=$;, [limit])   -> anArray
5866  *
5867  *  Divides <i>str</i> into substrings based on a delimiter, returning an array
5868  *  of these substrings.
5869  *
5870  *  If <i>pattern</i> is a <code>String</code>, then its contents are used as
5871  *  the delimiter when splitting <i>str</i>. If <i>pattern</i> is a single
5872  *  space, <i>str</i> is split on whitespace, with leading whitespace and runs
5873  *  of contiguous whitespace characters ignored.
5874  *
5875  *  If <i>pattern</i> is a <code>Regexp</code>, <i>str</i> is divided where the
5876  *  pattern matches. Whenever the pattern matches a zero-length string,
5877  *  <i>str</i> is split into individual characters. If <i>pattern</i> contains
5878  *  groups, the respective matches will be returned in the array as well.
5879  *
5880  *  If <i>pattern</i> is omitted, the value of <code>$;</code> is used.  If
5881  *  <code>$;</code> is <code>nil</code> (which is the default), <i>str</i> is
5882  *  split on whitespace as if ` ' were specified.
5883  *
5884  *  If the <i>limit</i> parameter is omitted, trailing null fields are
5885  *  suppressed. If <i>limit</i> is a positive number, at most that number of
5886  *  fields will be returned (if <i>limit</i> is <code>1</code>, the entire
5887  *  string is returned as the only entry in an array). If negative, there is no
5888  *  limit to the number of fields returned, and trailing null fields are not
5889  *  suppressed.
5890  *
5891  *  When the input +str+ is empty an empty Array is returned as the string is
5892  *  considered to have no fields to split.
5893  *
5894  *     " now's  the time".split        #=> ["now's", "the", "time"]
5895  *     " now's  the time".split(' ')   #=> ["now's", "the", "time"]
5896  *     " now's  the time".split(/ /)   #=> ["", "now's", "", "the", "time"]
5897  *     "1, 2.34,56, 7".split(%r{,\s*}) #=> ["1", "2.34", "56", "7"]
5898  *     "hello".split(//)               #=> ["h", "e", "l", "l", "o"]
5899  *     "hello".split(//, 3)            #=> ["h", "e", "llo"]
5900  *     "hi mom".split(%r{\s*})         #=> ["h", "i", "m", "o", "m"]
5901  *
5902  *     "mellow yellow".split("ello")   #=> ["m", "w y", "w"]
5903  *     "1,2,,3,4,,".split(',')         #=> ["1", "2", "", "3", "4"]
5904  *     "1,2,,3,4,,".split(',', 4)      #=> ["1", "2", "", "3,4,,"]
5905  *     "1,2,,3,4,,".split(',', -4)     #=> ["1", "2", "", "3", "4", "", ""]
5906  *
5907  *     "".split(',', -1)               #=> []
5908  */
5909 
5910 static VALUE
5911 rb_str_split_m(int argc, VALUE *argv, VALUE str)
5912 {
5913     rb_encoding *enc;
5914     VALUE spat;
5915     VALUE limit;
5916     enum {awk, string, regexp} split_type;
5917     long beg, end, i = 0;
5918     int lim = 0;
5919     VALUE result, tmp;
5920 
5921     if (rb_scan_args(argc, argv, "02", &spat, &limit) == 2) {
5922         lim = NUM2INT(limit);
5923         if (lim <= 0) limit = Qnil;
5924         else if (lim == 1) {
5925             if (RSTRING_LEN(str) == 0)
5926                 return rb_ary_new2(0);
5927             return rb_ary_new3(1, str);
5928         }
5929         i = 1;
5930     }
5931 
5932     enc = STR_ENC_GET(str);
5933     if (NIL_P(spat)) {
5934         if (!NIL_P(rb_fs)) {
5935             spat = rb_fs;
5936             goto fs_set;
5937         }
5938         split_type = awk;
5939     }
5940     else {
5941       fs_set:
5942         if (RB_TYPE_P(spat, T_STRING)) {
5943             rb_encoding *enc2 = STR_ENC_GET(spat);
5944 
5945             split_type = string;
5946             if (RSTRING_LEN(spat) == 0) {
5947                 /* Special case - split into chars */
5948                 spat = rb_reg_regcomp(spat);
5949                 split_type = regexp;
5950             }
5951             else if (rb_enc_asciicompat(enc2) == 1) {
5952                 if (RSTRING_LEN(spat) == 1 && RSTRING_PTR(spat)[0] == ' '){
5953                     split_type = awk;
5954                 }
5955             }
5956             else {
5957                 int l;
5958                 if (rb_enc_ascget(RSTRING_PTR(spat), RSTRING_END(spat), &l, enc2) == ' ' &&
5959                     RSTRING_LEN(spat) == l) {
5960                     split_type = awk;
5961                 }
5962             }
5963         }
5964         else {
5965             spat = get_pat(spat, 1);
5966             split_type = regexp;
5967         }
5968     }
5969 
5970     result = rb_ary_new();
5971     beg = 0;
5972     if (split_type == awk) {
5973         char *ptr = RSTRING_PTR(str);
5974         char *eptr = RSTRING_END(str);
5975         char *bptr = ptr;
5976         int skip = 1;
5977         unsigned int c;
5978 
5979         end = beg;
5980         if (is_ascii_string(str)) {
5981             while (ptr < eptr) {
5982                 c = (unsigned char)*ptr++;
5983                 if (skip) {
5984                     if (ascii_isspace(c)) {
5985                         beg = ptr - bptr;
5986                     }
5987                     else {
5988                         end = ptr - bptr;
5989                         skip = 0;
5990                         if (!NIL_P(limit) && lim <= i) break;
5991                     }
5992                 }
5993                 else if (ascii_isspace(c)) {
5994                     rb_ary_push(result, rb_str_subseq(str, beg, end-beg));
5995                     skip = 1;
5996                     beg = ptr - bptr;
5997                     if (!NIL_P(limit)) ++i;
5998                 }
5999                 else {
6000                     end = ptr - bptr;
6001                 }
6002             }
6003         }
6004         else {
6005             while (ptr < eptr) {
6006                 int n;
6007 
6008                 c = rb_enc_codepoint_len(ptr, eptr, &n, enc);
6009                 ptr += n;
6010                 if (skip) {
6011                     if (rb_isspace(c)) {
6012                         beg = ptr - bptr;
6013                     }
6014                     else {
6015                         end = ptr - bptr;
6016                         skip = 0;
6017                         if (!NIL_P(limit) && lim <= i) break;
6018                     }
6019                 }
6020                 else if (rb_isspace(c)) {
6021                     rb_ary_push(result, rb_str_subseq(str, beg, end-beg));
6022                     skip = 1;
6023                     beg = ptr - bptr;
6024                     if (!NIL_P(limit)) ++i;
6025                 }
6026                 else {
6027                     end = ptr - bptr;
6028                 }
6029             }
6030         }
6031     }
6032     else if (split_type == string) {
6033         char *ptr = RSTRING_PTR(str);
6034         char *temp = ptr;
6035         char *eptr = RSTRING_END(str);
6036         char *sptr = RSTRING_PTR(spat);
6037         long slen = RSTRING_LEN(spat);
6038 
6039         if (is_broken_string(str)) {
6040             rb_raise(rb_eArgError, "invalid byte sequence in %s", rb_enc_name(STR_ENC_GET(str)));
6041         }
6042         if (is_broken_string(spat)) {
6043             rb_raise(rb_eArgError, "invalid byte sequence in %s", rb_enc_name(STR_ENC_GET(spat)));
6044         }
6045         enc = rb_enc_check(str, spat);
6046         while (ptr < eptr &&
6047                (end = rb_memsearch(sptr, slen, ptr, eptr - ptr, enc)) >= 0) {
6048             /* Check we are at the start of a char */
6049             char *t = rb_enc_right_char_head(ptr, ptr + end, eptr, enc);
6050             if (t != ptr + end) {
6051                 ptr = t;
6052                 continue;
6053             }
6054             rb_ary_push(result, rb_str_subseq(str, ptr - temp, end));
6055             ptr += end + slen;
6056             if (!NIL_P(limit) && lim <= ++i) break;
6057         }
6058         beg = ptr - temp;
6059     }
6060     else {
6061         char *ptr = RSTRING_PTR(str);
6062         long len = RSTRING_LEN(str);
6063         long start = beg;
6064         long idx;
6065         int last_null = 0;
6066         struct re_registers *regs;
6067 
6068         while ((end = rb_reg_search(spat, str, start, 0)) >= 0) {
6069             regs = RMATCH_REGS(rb_backref_get());
6070             if (start == end && BEG(0) == END(0)) {
6071                 if (!ptr) {
6072                     rb_ary_push(result, str_new_empty(str));
6073                     break;
6074                 }
6075                 else if (last_null == 1) {
6076                     rb_ary_push(result, rb_str_subseq(str, beg,
6077                                                       rb_enc_fast_mbclen(ptr+beg,
6078                                                                          ptr+len,
6079                                                                          enc)));
6080                     beg = start;
6081                 }
6082                 else {
6083                     if (ptr+start == ptr+len)
6084                         start++;
6085                     else
6086                         start += rb_enc_fast_mbclen(ptr+start,ptr+len,enc);
6087                     last_null = 1;
6088                     continue;
6089                 }
6090             }
6091             else {
6092                 rb_ary_push(result, rb_str_subseq(str, beg, end-beg));
6093                 beg = start = END(0);
6094             }
6095             last_null = 0;
6096 
6097             for (idx=1; idx < regs->num_regs; idx++) {
6098                 if (BEG(idx) == -1) continue;
6099                 if (BEG(idx) == END(idx))
6100                     tmp = str_new_empty(str);
6101                 else
6102                     tmp = rb_str_subseq(str, BEG(idx), END(idx)-BEG(idx));
6103                 rb_ary_push(result, tmp);
6104             }
6105             if (!NIL_P(limit) && lim <= ++i) break;
6106         }
6107     }
6108     if (RSTRING_LEN(str) > 0 && (!NIL_P(limit) || RSTRING_LEN(str) > beg || lim < 0)) {
6109         if (RSTRING_LEN(str) == beg)
6110             tmp = str_new_empty(str);
6111         else
6112             tmp = rb_str_subseq(str, beg, RSTRING_LEN(str)-beg);
6113         rb_ary_push(result, tmp);
6114     }
6115     if (NIL_P(limit) && lim == 0) {
6116         long len;
6117         while ((len = RARRAY_LEN(result)) > 0 &&
6118                (tmp = RARRAY_PTR(result)[len-1], RSTRING_LEN(tmp) == 0))
6119             rb_ary_pop(result);
6120     }
6121 
6122     return result;
6123 }
6124 
6125 VALUE
6126 rb_str_split(VALUE str, const char *sep0)
6127 {
6128     VALUE sep;
6129 
6130     StringValue(str);
6131     sep = rb_str_new2(sep0);
6132     return rb_str_split_m(1, &sep, str);
6133 }
6134 
6135 
/*
 * Shared implementation of String#each_line (wantarray == 0) and
 * String#lines (wantarray != 0).
 *
 * The record separator is rb_rs when no argument is passed, otherwise the
 * single optional argument.  Each line handed out keeps its trailing
 * separator.  Lines are either yielded to the block or pushed onto an
 * array; the array is returned when collecting, the original receiver
 * otherwise.
 *
 *   - nil separator:   the whole string is produced as one line.
 *   - empty separator: paragraph mode, lines end at runs of newlines.
 */
6136 static VALUE
6137 rb_str_enumerate_lines(int argc, VALUE *argv, VALUE str, int wantarray)
6138 {
6139     rb_encoding *enc;
6140     VALUE rs;
6141     unsigned int newline;
6142     const char *p, *pend, *s, *ptr;
6143     long len, rslen;
6144     VALUE line;
6145     int n;
6146     VALUE orig = str;
6147     VALUE UNINITIALIZED_VAR(ary);
6148 
6149     if (argc == 0) {
6150         rs = rb_rs;
6151     }
6152     else {
6153         rb_scan_args(argc, argv, "01", &rs);
6154     }
6155 
         /*
          * STRING_ENUMERATORS_WANTARRAY is defined as 0 at the top of this
          * file, so a block passed to the array-returning form only triggers
          * a deprecation warning and the call falls back to yielding.
          */
6156     if (rb_block_given_p()) {
6157         if (wantarray) {
6158 #if STRING_ENUMERATORS_WANTARRAY
6159             rb_warn("given block not used");
6160             ary = rb_ary_new();
6161 #else
6162             rb_warning("passing a block to String#lines is deprecated");
6163             wantarray = 0;
6164 #endif
6165         }
6166     }
6167     else {
6168         if (wantarray)
6169             ary = rb_ary_new();
6170         else
6171             RETURN_ENUMERATOR(str, argc, argv);
6172     }
6173 
         /* nil separator: the receiver itself is the one and only line */
6174     if (NIL_P(rs)) {
6175         if (wantarray) {
6176             rb_ary_push(ary, str);
6177             return ary;
6178         }
6179         else {
6180             rb_yield(str);
6181             return orig;
6182         }
6183     }
         /*
          * From here on the scan runs over the result of rb_str_new4, not the
          * receiver; orig is kept for the return value.
          * NOTE(review): presumably rb_str_new4 gives a frozen snapshot so a
          * block mutating the receiver cannot disturb the scan -- confirm.
          *
          * ptr: start of the buffer, s: start of the current line,
          * p: scan position, pend: end of the buffer.
          */
6184     str = rb_str_new4(str);
6185     ptr = p = s = RSTRING_PTR(str);
6186     pend = p + RSTRING_LEN(str);
6187     len = RSTRING_LEN(str);
6188     StringValue(rs);
         /*
          * Fast path when rs is the very object rb_default_rs: find '\n'
          * bytes with memchr, then let the encoding confirm that the byte
          * belongs to a real newline character before cutting a line.
          */
6189     if (rs == rb_default_rs) {
6190         enc = rb_enc_get(str);
6191         while (p < pend) {
6192             char *p0;
6193 
6194             p = memchr(p, '\n', pend - p);
6195             if (!p) break;
6196             p0 = rb_enc_left_char_head(s, p, pend, enc);
6197             if (!rb_enc_is_newline(p0, pend, enc)) {
6198                 p++;
6199                 continue;
6200             }
                 /* the line ends right after the newline character */
6201             p = p0 + rb_enc_mbclen(p0, pend, enc);
6202             line = rb_str_subseq(str, s - ptr, p - s);
6203             if (wantarray)
6204                 rb_ary_push(ary, line);
6205             else
6206                 rb_yield(line);
                 /* NOTE(review): presumably raises if str's buffer changed
                  * during the yield -- confirm against str_mod_check */
6207             str_mod_check(str, ptr, len);
6208             s = p;
6209         }
6210         goto finish;
6211     }
6212 
         /*
          * General path.  newline is '\n' in paragraph mode (empty rs),
          * otherwise the first codepoint of rs.
          */
6213     enc = rb_enc_check(str, rs);
6214     rslen = RSTRING_LEN(rs);
6215     if (rslen == 0) {
6216         newline = '\n';
6217     }
6218     else {
6219         newline = rb_enc_codepoint(RSTRING_PTR(rs), RSTRING_END(rs), enc);
6220     }
6221 
6222     while (p < pend) {
6223         unsigned int c = rb_enc_codepoint_len(p, pend, &n, enc);
6224 
6225       again:
             /*
              * Paragraph mode only: on a newline, step past it.  If the next
              * character is not a newline, re-dispatch on that character via
              * `again` (a lone newline does not end a paragraph).  Otherwise
              * (another newline, or end of string) skip the run of newlines
              * and step back by one character so the separator test below
              * fires on the last newline of the run.
              */
6226         if (rslen == 0 && c == newline) {
6227             p += n;
6228             if (p < pend && (c = rb_enc_codepoint_len(p, pend, &n, enc)) != newline) {
6229                 goto again;
6230             }
6231             while (p < pend && rb_enc_codepoint(p, pend, enc) == newline) {
6232                 p += n;
6233             }
6234             p -= n;
6235         }
             /*
              * c matches the separator's first codepoint; when rs is longer
              * than one byte the complete byte sequence is compared as well.
              */
6236         if (c == newline &&
6237             (rslen <= 1 ||
6238              (pend - p >= rslen && memcmp(RSTRING_PTR(rs), p, rslen) == 0))) {
                 /* pp: first byte after the separator (n bytes wide in
                  * paragraph mode, rslen bytes otherwise) */
6239             const char *pp = p + (rslen ? rslen : n);
6240             line = rb_str_subseq(str, s - ptr, pp - s);
6241             if (wantarray)
6242                 rb_ary_push(ary, line);
6243             else
6244                 rb_yield(line);
6245             str_mod_check(str, ptr, len);
6246             s = pp;
6247         }
6248         p += n;
6249     }
6250 
6251   finish:
         /* whatever follows the last separator forms the final line */
6252     if (s != pend) {
6253         line = rb_str_subseq(str, s - ptr, pend - s);
6254         if (wantarray)
6255             rb_ary_push(ary, line);
6256         else
6257             rb_yield(line);
6258         RB_GC_GUARD(str);
6259     }
6260 
6261     if (wantarray)
6262         return ary;
6263     else
6264         return orig;
6265 }
6266 
6267 /*
6268  *  call-seq:
6269  *     str.each_line(separator=$/) {|substr| block }   -> str
6270  *     str.each_line(separator=$/)                     -> an_enumerator
6271  *
6272  *  Splits <i>str</i> using the supplied parameter as the record
6273  *  separator (<code>$/</code> by default), passing each substring in
6274  *  turn to the supplied block.  If a zero-length record separator is
6275  *  supplied, the string is split into paragraphs delimited by
6276  *  multiple successive newlines.
6277  *
6278  *  If no block is given, an enumerator is returned instead.
6279  *
6280  *     print "Example one\n"
6281  *     "hello\nworld".each_line {|s| p s}
6282  *     print "Example two\n"
6283  *     "hello\nworld".each_line('l') {|s| p s}
6284  *     print "Example three\n"
6285  *     "hello\n\n\nworld".each_line('') {|s| p s}
6286  *
6287  *  <em>produces:</em>
6288  *
6289  *     Example one
6290  *     "hello\n"
6291  *     "world"
6292  *     Example two
6293  *     "hel"
6294  *     "l"
6295  *     "o\nworl"
6296  *     "d"
6297  *     Example three
6298  *     "hello\n\n\n"
6299  *     "world"
6300  */
6301 
6302 static VALUE
6303 rb_str_each_line(int argc, VALUE *argv, VALUE str)
6304 {
6305     return rb_str_enumerate_lines(argc, argv, str, 0);
6306 }
6307 
6308 /*
6309  *  call-seq:
6310  *     str.lines(separator=$/)  -> an_array
6311  *
6312  *  Returns an array of lines in <i>str</i> split using the supplied
6313  *  record separator (<code>$/</code> by default).  This is a
6314  *  shorthand for <code>str.each_line(separator).to_a</code>.
6315  *
6316  *  If a block is given, which is a deprecated form, works the same as
6317  *  <code>each_line</code>.
6318  */
6319 
6320 static VALUE
6321 rb_str_lines(int argc, VALUE *argv, VALUE str)
6322 {
6323     return rb_str_enumerate_lines(argc, argv, str, 1);
6324 }
6325 
6326 static VALUE
6327 rb_str_each_byte_size(VALUE str, VALUE args)
6328 {
6329     return LONG2FIX(RSTRING_LEN(str));
6330 }
6331 
6332 static VALUE
6333 rb_str_enumerate_bytes(VALUE str, int wantarray)
6334 {
6335     long i;
6336     VALUE UNINITIALIZED_VAR(ary);
6337 
6338     if (rb_block_given_p()) {
6339         if (wantarray) {
6340 #if STRING_ENUMERATORS_WANTARRAY
6341             rb_warn("given block not used");
6342             ary = rb_ary_new();
6343 #else
6344             rb_warning("passing a block to String#bytes is deprecated");
6345             wantarray = 0;
6346 #endif
6347         }
6348     }
6349     else {
6350         if (wantarray)
6351             ary = rb_ary_new2(RSTRING_LEN(str));
6352         else
6353             RETURN_SIZED_ENUMERATOR(str, 0, 0, rb_str_each_byte_size);
6354     }
6355 
6356     for (i=0; i<RSTRING_LEN(str); i++) {
6357         if (wantarray)
6358             rb_ary_push(ary, INT2FIX(RSTRING_PTR(str)[i] & 0xff));
6359         else
6360             rb_yield(INT2FIX(RSTRING_PTR(str)[i] & 0xff));
6361     }
6362     if (wantarray)
6363         return ary;
6364     else
6365         return str;
6366 }
6367 
6368 /*
6369  *  call-seq:
6370  *     str.each_byte {|fixnum| block }    -> str
6371  *     str.each_byte                      -> an_enumerator
6372  *
6373  *  Passes each byte in <i>str</i> to the given block, or returns an
6374  *  enumerator if no block is given.
6375  *
6376  *     "hello".each_byte {|c| print c, ' ' }
6377  *
6378  *  <em>produces:</em>
6379  *
6380  *     104 101 108 108 111
6381  */
6382 
6383 static VALUE
6384 rb_str_each_byte(VALUE str)
6385 {
6386     return rb_str_enumerate_bytes(str, 0);
6387 }
6388 
6389 /*
6390  *  call-seq:
6391  *     str.bytes    -> an_array
6392  *
6393  *  Returns an array of bytes in <i>str</i>.  This is a shorthand for
6394  *  <code>str.each_byte.to_a</code>.
6395  *
6396  *  If a block is given, which is a deprecated form, works the same as
6397  *  <code>each_byte</code>.
6398  */
6399 
6400 static VALUE
6401 rb_str_bytes(VALUE str)
6402 {
6403     return rb_str_enumerate_bytes(str, 1);
6404 }
6405 
6406 static VALUE
6407 rb_str_each_char_size(VALUE str)
6408 {
6409     long len = RSTRING_LEN(str);
6410     if (!single_byte_optimizable(str)) {
6411         const char *ptr = RSTRING_PTR(str);
6412         rb_encoding *enc = rb_enc_get(str);
6413         const char *end_ptr = ptr + len;
6414         for (len = 0; ptr < end_ptr; ++len) {
6415             ptr += rb_enc_mbclen(ptr, end_ptr, enc);
6416         }
6417     }
6418     return LONG2FIX(len);
6419 }
6420 
6421 static VALUE
6422 rb_str_enumerate_chars(VALUE str, int wantarray)
6423 {
6424     VALUE orig = str;
6425     VALUE substr;
6426     long i, len, n;
6427     const char *ptr;
6428     rb_encoding *enc;
6429     VALUE UNINITIALIZED_VAR(ary);
6430 
6431     if (rb_block_given_p()) {
6432         if (wantarray) {
6433 #if STRING_ENUMERATORS_WANTARRAY
6434             rb_warn("given block not used");
6435             ary = rb_ary_new();
6436 #else
6437             rb_warning("passing a block to String#chars is deprecated");
6438             wantarray = 0;
6439 #endif
6440         }
6441     }
6442     else {
6443         if (wantarray)
6444             ary = rb_ary_new();
6445         else
6446             RETURN_SIZED_ENUMERATOR(str, 0, 0, rb_str_each_char_size);
6447     }
6448 
6449     str = rb_str_new4(str);
6450     ptr = RSTRING_PTR(str);
6451     len = RSTRING_LEN(str);
6452     enc = rb_enc_get(str);
6453     switch (ENC_CODERANGE(str)) {
6454       case ENC_CODERANGE_VALID:
6455       case ENC_CODERANGE_7BIT:
6456         for (i = 0; i < len; i += n) {
6457             n = rb_enc_fast_mbclen(ptr + i, ptr + len, enc);
6458             substr = rb_str_subseq(str, i, n);
6459             if (wantarray)
6460                 rb_ary_push(ary, substr);
6461             else
6462                 rb_yield(substr);
6463         }
6464         break;
6465       default:
6466         for (i = 0; i < len; i += n) {
6467             n = rb_enc_mbclen(ptr + i, ptr + len, enc);
6468             substr = rb_str_subseq(str, i, n);
6469             if (wantarray)
6470                 rb_ary_push(ary, substr);
6471             else
6472                 rb_yield(substr);
6473         }
6474     }
6475     RB_GC_GUARD(str);
6476     if (wantarray)
6477         return ary;
6478     else
6479         return orig;
6480 }
6481 
6482 /*
6483  *  call-seq:
6484  *     str.each_char {|cstr| block }    -> str
6485  *     str.each_char                    -> an_enumerator
6486  *
6487  *  Passes each character in <i>str</i> to the given block, or returns
6488  *  an enumerator if no block is given.
6489  *
6490  *     "hello".each_char {|c| print c, ' ' }
6491  *
6492  *  <em>produces:</em>
6493  *
6494  *     h e l l o
6495  */
6496 
6497 static VALUE
6498 rb_str_each_char(VALUE str)
6499 {
6500     return rb_str_enumerate_chars(str, 0);
6501 }
6502 
6503 /*
6504  *  call-seq:
6505  *     str.chars    -> an_array
6506  *
6507  *  Returns an array of characters in <i>str</i>.  This is a shorthand
6508  *  for <code>str.each_char.to_a</code>.
6509  *
6510  *  If a block is given, which is a deprecated form, works the same as
6511  *  <code>each_char</code>.
6512  */
6513 
6514 static VALUE
6515 rb_str_chars(VALUE str)
6516 {
6517     return rb_str_enumerate_chars(str, 1);
6518 }
6519 
6520 
6521 static VALUE
6522 rb_str_enumerate_codepoints(VALUE str, int wantarray)
6523 {
6524     VALUE orig = str;
6525     int n;
6526     unsigned int c;
6527     const char *ptr, *end;
6528     rb_encoding *enc;
6529     VALUE UNINITIALIZED_VAR(ary);
6530 
6531     if (single_byte_optimizable(str))
6532         return rb_str_enumerate_bytes(str, wantarray);
6533 
6534     if (rb_block_given_p()) {
6535         if (wantarray) {
6536 #if STRING_ENUMERATORS_WANTARRAY
6537             rb_warn("given block not used");
6538             ary = rb_ary_new();
6539 #else
6540             rb_warning("passing a block to String#codepoints is deprecated");
6541             wantarray = 0;
6542 #endif
6543         }
6544     }
6545     else {
6546         if (wantarray)
6547             ary = rb_ary_new();
6548         else
6549             RETURN_SIZED_ENUMERATOR(str, 0, 0, rb_str_each_char_size);
6550     }
6551 
6552     str = rb_str_new4(str);
6553     ptr = RSTRING_PTR(str);
6554     end = RSTRING_END(str);
6555     enc = STR_ENC_GET(str);
6556     while (ptr < end) {
6557         c = rb_enc_codepoint_len(ptr, end, &n, enc);
6558         if (wantarray)
6559             rb_ary_push(ary, UINT2NUM(c));
6560         else
6561             rb_yield(UINT2NUM(c));
6562         ptr += n;
6563     }
6564     RB_GC_GUARD(str);
6565     if (wantarray)
6566         return ary;
6567     else
6568         return orig;
6569 }
6570 
6571 /*
6572  *  call-seq:
6573  *     str.each_codepoint {|integer| block }    -> str
6574  *     str.each_codepoint                       -> an_enumerator
6575  *
6576  *  Passes the <code>Integer</code> ordinal of each character in <i>str</i>,
6577  *  also known as a <i>codepoint</i> when applied to Unicode strings, to the
6578  *  given block.
6579  *
6580  *  If no block is given, an enumerator is returned instead.
6581  *
6582  *     "hello\u0639".each_codepoint {|c| print c, ' ' }
6583  *
6584  *  <em>produces:</em>
6585  *
6586  *     104 101 108 108 111 1593
6587  */
6588 
6589 static VALUE
6590 rb_str_each_codepoint(VALUE str)
6591 {
6592     return rb_str_enumerate_codepoints(str, 0);
6593 }
6594 
6595 /*
6596  *  call-seq:
6597  *     str.codepoints   -> an_array
6598  *
6599  *  Returns an array of the <code>Integer</code> ordinals of the
6600  *  characters in <i>str</i>.  This is a shorthand for
6601  *  <code>str.each_codepoint.to_a</code>.
6602  *
6603  *  If a block is given, which is a deprecated form, works the same as
6604  *  <code>each_codepoint</code>.
6605  */
6606 
6607 static VALUE
6608 rb_str_codepoints(VALUE str)
6609 {
6610     return rb_str_enumerate_codepoints(str, 1);
6611 }
6612 
6613 
6614 static long
6615 chopped_length(VALUE str)
6616 {
6617     rb_encoding *enc = STR_ENC_GET(str);
6618     const char *p, *p2, *beg, *end;
6619 
6620     beg = RSTRING_PTR(str);
6621     end = beg + RSTRING_LEN(str);
6622     if (beg > end) return 0;
6623     p = rb_enc_prev_char(beg, end, end, enc);
6624     if (!p) return 0;
6625     if (p > beg && rb_enc_ascget(p, end, 0, enc) == '\n') {
6626         p2 = rb_enc_prev_char(beg, p, end, enc);
6627         if (p2 && rb_enc_ascget(p2, end, 0, enc) == '\r') p = p2;
6628     }
6629     return p - beg;
6630 }
6631 
6632 /*
6633  *  call-seq:
6634  *     str.chop!   -> str or nil
6635  *
6636  *  Processes <i>str</i> as for <code>String#chop</code>, returning <i>str</i>,
6637  *  or <code>nil</code> if <i>str</i> is the empty string.  See also
6638  *  <code>String#chomp!</code>.
6639  */
6640 
6641 static VALUE
6642 rb_str_chop_bang(VALUE str)
6643 {
6644     str_modify_keep_cr(str);
6645     if (RSTRING_LEN(str) > 0) {
6646         long len;
6647         len = chopped_length(str);
6648         STR_SET_LEN(str, len);
6649         RSTRING_PTR(str)[len] = '\0';
6650         if (ENC_CODERANGE(str) != ENC_CODERANGE_7BIT) {
6651             ENC_CODERANGE_CLEAR(str);
6652         }
6653         return str;
6654     }
6655     return Qnil;
6656 }
6657 
6658 
6659 /*
6660  *  call-seq:
6661  *     str.chop   -> new_str
6662  *
6663  *  Returns a new <code>String</code> with the last character removed.  If the
6664  *  string ends with <code>\r\n</code>, both characters are removed. Applying
6665  *  <code>chop</code> to an empty string returns an empty
6666  *  string. <code>String#chomp</code> is often a safer alternative, as it leaves
6667  *  the string unchanged if it doesn't end in a record separator.
6668  *
6669  *     "string\r\n".chop   #=> "string"
6670  *     "string\n\r".chop   #=> "string\n"
6671  *     "string\n".chop     #=> "string"
6672  *     "string".chop       #=> "strin"
6673  *     "x".chop.chop       #=> ""
6674  */
6675 
6676 static VALUE
6677 rb_str_chop(VALUE str)
6678 {
6679     return rb_str_subseq(str, 0, chopped_length(str));
6680 }
6681 
6682 
6683 /*
6684  *  call-seq:
6685  *     str.chomp!(separator=$/)   -> str or nil
6686  *
6687  *  Modifies <i>str</i> in place as described for <code>String#chomp</code>,
6688  *  returning <i>str</i>, or <code>nil</code> if no modifications were made.
6689  */
6690 
6691 static VALUE
6692 rb_str_chomp_bang(int argc, VALUE *argv, VALUE str)
6693 {
6694     rb_encoding *enc;
6695     VALUE rs;
6696     int newline;
6697     char *p, *pp, *e;
6698     long len, rslen;
6699 
6700     str_modify_keep_cr(str);
6701     len = RSTRING_LEN(str);
         /* empty string: nothing can be removed */
6702     if (len == 0) return Qnil;
         /* p: start of the buffer, e: one past its last byte */
6703     p = RSTRING_PTR(str);
6704     e = p + len;
6705     if (argc == 0) {
6706         rs = rb_rs;
6707         if (rs == rb_default_rs) {
                 /*
                  * Default separator: remove one trailing "\n", "\r\n" or
                  * "\r".  This label is also the target of the goto further
                  * down, taken when an explicit separator is exactly "\n".
                  */
6708           smart_chomp:
6709             enc = rb_enc_get(str);
                 /*
                  * Encodings whose minimum character width exceeds one
                  * byte: find the trailing characters through the encoding
                  * instead of looking at single bytes.  e is pulled back
                  * over a trailing newline and then over a trailing "\r".
                  */
6710             if (rb_enc_mbminlen(enc) > 1) {
6711                 pp = rb_enc_left_char_head(p, e-rb_enc_mbminlen(enc), e, enc);
6712                 if (rb_enc_is_newline(pp, e, enc)) {
6713                     e = pp;
6714                 }
6715                 pp = e - rb_enc_mbminlen(enc);
6716                 if (pp >= p) {
6717                     pp = rb_enc_left_char_head(p, pp, e, enc);
6718                     if (rb_enc_ascget(pp, e, 0, enc) == '\r') {
6719                         e = pp;
6720                     }
6721                 }
                     /* e never moved: the string had no line ending */
6722                 if (e == RSTRING_END(str)) {
6723                     return Qnil;
6724                 }
6725                 len = e - RSTRING_PTR(str);
6726                 STR_SET_LEN(str, len);
6727             }
6728             else {
                     /* byte-wise check of the last one or two bytes */
6729                 if (RSTRING_PTR(str)[len-1] == '\n') {
6730                     STR_DEC_LEN(str);
6731                     if (RSTRING_LEN(str) > 0 &&
6732                         RSTRING_PTR(str)[RSTRING_LEN(str)-1] == '\r') {
6733                         STR_DEC_LEN(str);
6734                     }
6735                 }
6736                 else if (RSTRING_PTR(str)[len-1] == '\r') {
6737                     STR_DEC_LEN(str);
6738                 }
6739                 else {
6740                     return Qnil;
6741                 }
6742             }
                 /* re-terminate the shortened buffer */
6743             RSTRING_PTR(str)[RSTRING_LEN(str)] = '\0';
6744             return str;
6745         }
6746     }
6747     else {
6748         rb_scan_args(argc, argv, "01", &rs);
6749     }
         /* nil separator: nothing is removed */
6750     if (NIL_P(rs)) return Qnil;
6751     StringValue(rs);
6752     rslen = RSTRING_LEN(rs);
         /*
          * Empty separator: strip every trailing "\n" or "\r\n".  A "\r"
          * that is not followed by "\n" is left in place.
          */
6753     if (rslen == 0) {
6754         while (len>0 && p[len-1] == '\n') {
6755             len--;
6756             if (len>0 && p[len-1] == '\r')
6757                 len--;
6758         }
6759         if (len < RSTRING_LEN(str)) {
6760             STR_SET_LEN(str, len);
6761             RSTRING_PTR(str)[len] = '\0';
6762             return str;
6763         }
6764         return Qnil;
6765     }
         /* a separator longer than the string cannot match */
6766     if (rslen > len) return Qnil;
         /* last byte of the separator; used as a cheap pre-check before the
          * full memcmp below */
6767     newline = RSTRING_PTR(rs)[rslen-1];
6768     if (rslen == 1 && newline == '\n')
6769         goto smart_chomp;
6770 
6771     enc = rb_enc_check(str, rs);
6772     if (is_broken_string(rs)) {
6773         return Qnil;
6774     }
         /* pp: where the separator would have to start inside str */
6775     pp = e - rslen;
6776     if (p[len-1] == newline &&
6777         (rslen <= 1 ||
6778          memcmp(RSTRING_PTR(rs), pp, rslen) == 0)) {
             /* the match must begin on a character boundary of str */
6779         if (rb_enc_left_char_head(p, pp, e, enc) != pp)
6780             return Qnil;
             /* a 7bit code range is kept; any other cached value is cleared */
6781         if (ENC_CODERANGE(str) != ENC_CODERANGE_7BIT) {
6782             ENC_CODERANGE_CLEAR(str);
6783         }
6784         STR_SET_LEN(str, RSTRING_LEN(str) - rslen);
6785         RSTRING_PTR(str)[RSTRING_LEN(str)] = '\0';
6786         return str;
6787     }
6788     return Qnil;
6789 }
6790 
6791 
6792 /*
6793  *  call-seq:
6794  *     str.chomp(separator=$/)   -> new_str
6795  *
6796  *  Returns a new <code>String</code> with the given record separator removed
6797  *  from the end of <i>str</i> (if present). If <code>$/</code> has not been
6798  *  changed from the default Ruby record separator, then <code>chomp</code> also
6799  *  removes carriage return characters (that is it will remove <code>\n</code>,
6800  *  <code>\r</code>, and <code>\r\n</code>).
6801  *
6802  *     "hello".chomp            #=> "hello"
6803  *     "hello\n".chomp          #=> "hello"
6804  *     "hello\r\n".chomp        #=> "hello"
6805  *     "hello\n\r".chomp        #=> "hello\n"
6806  *     "hello\r".chomp          #=> "hello"
6807  *     "hello \n there".chomp   #=> "hello \n there"
6808  *     "hello".chomp("llo")     #=> "he"
6809  */
6810 
6811 static VALUE
6812 rb_str_chomp(int argc, VALUE *argv, VALUE str)
6813 {
6814     str = rb_str_dup(str);
6815     rb_str_chomp_bang(argc, argv, str);
6816     return str;
6817 }
6818 
6819 /*
6820  *  call-seq:
6821  *     str.lstrip!   -> self or nil
6822  *
6823  *  Removes leading whitespace from <i>str</i>, returning <code>nil</code> if no
6824  *  change was made. See also <code>String#rstrip!</code> and
6825  *  <code>String#strip!</code>.
6826  *
6827  *     "  hello  ".lstrip!  #=> "hello  "
6828  *     "hello".lstrip!      #=> nil
6829  */
6830 
6831 static VALUE
6832 rb_str_lstrip_bang(VALUE str)
6833 {
6834     rb_encoding *enc;
6835     char *s, *t, *e;
6836 
6837     str_modify_keep_cr(str);
6838     enc = STR_ENC_GET(str);
6839     s = RSTRING_PTR(str);
6840     if (!s || RSTRING_LEN(str) == 0) return Qnil;
6841     e = t = RSTRING_END(str);
6842     /* remove spaces at head */
6843     while (s < e) {
6844         int n;
6845         unsigned int cc = rb_enc_codepoint_len(s, e, &n, enc);
6846 
6847         if (!rb_isspace(cc)) break;
6848         s += n;
6849     }
6850 
6851     if (s > RSTRING_PTR(str)) {
6852         STR_SET_LEN(str, t-s);
6853         memmove(RSTRING_PTR(str), s, RSTRING_LEN(str));
6854         RSTRING_PTR(str)[RSTRING_LEN(str)] = '\0';
6855         return str;
6856     }
6857     return Qnil;
6858 }
6859 
6860 
6861 /*
6862  *  call-seq:
6863  *     str.lstrip   -> new_str
6864  *
6865  *  Returns a copy of <i>str</i> with leading whitespace removed. See also
6866  *  <code>String#rstrip</code> and <code>String#strip</code>.
6867  *
6868  *     "  hello  ".lstrip   #=> "hello  "
6869  *     "hello".lstrip       #=> "hello"
6870  */
6871 
6872 static VALUE
6873 rb_str_lstrip(VALUE str)
6874 {
6875     str = rb_str_dup(str);
6876     rb_str_lstrip_bang(str);
6877     return str;
6878 }
6879 
6880 
6881 /*
6882  *  call-seq:
6883  *     str.rstrip!   -> self or nil
6884  *
6885  *  Removes trailing whitespace from <i>str</i>, returning <code>nil</code> if
6886  *  no change was made. See also <code>String#lstrip!</code> and
6887  *  <code>String#strip!</code>.
6888  *
6889  *     "  hello  ".rstrip!  #=> "  hello"
6890  *     "hello".rstrip!      #=> nil
6891  */
6892 
6893 static VALUE
6894 rb_str_rstrip_bang(VALUE str)
6895 {
6896     rb_encoding *enc;
6897     char *s, *t, *e;
6898 
6899     str_modify_keep_cr(str);