The Ruby Cross Reference

Implementation: mri jruby rubinius
Version: 1.8.7-p374 1.9.1-p431 1.9.2-p381 1.9.3-p547 2.0.0-p481 2.1.0-p0 2.1.1 2.1.2 HEAD
001 /**********************************************************************
002 
003   string.c -
004 
005   $Author$
006   created at: Mon Aug  9 17:12:58 JST 1993
007 
008   Copyright (C) 1993-2007 Yukihiro Matsumoto
009   Copyright (C) 2000  Network Applied Communication Laboratory, Inc.
010   Copyright (C) 2000  Information-technology Promotion Agency, Japan
011 
012 **********************************************************************/
013 
014 #include "ruby/ruby.h"
015 #include "ruby/re.h"
016 #include "ruby/encoding.h"
017 #include "vm_core.h"
018 #include "internal.h"
019 #include "probes.h"
020 #include <assert.h>
021 
/* Start and end offsets of capture group `no`.  Both macros expand against
 * a variable named `regs` that must be in scope wherever they are used. */
022 #define BEG(no) (regs->beg[(no)])
023 #define END(no) (regs->end[(no)])
024 
025 #include <math.h>
026 #include <ctype.h>
027 
028 #ifdef HAVE_UNISTD_H
029 #include <unistd.h>
030 #endif
031 
/* Compile-time switch, currently 0; the trailing note earmarks it for the
 * next major version. */
032 #define STRING_ENUMERATORS_WANTARRAY 0 /* next major */
033 
/* Remove any macro definitions of these names.  Several of them are defined
 * as real functions further down in this file. */
034 #undef rb_str_new_cstr
035 #undef rb_tainted_str_new_cstr
036 #undef rb_usascii_str_new_cstr
037 #undef rb_external_str_new_cstr
038 #undef rb_locale_str_new_cstr
039 #undef rb_str_dup_frozen
040 #undef rb_str_buf_new_cstr
041 #undef rb_str_buf_cat2
042 #undef rb_str_cat2
043 
/* Forward declaration: rb_free_tmp_buffer() calls rb_str_clear() before its
 * definition appears. */
044 static VALUE rb_str_clear(VALUE str);
045 
/* Class objects used throughout this file (rb_cString is the default class
 * for newly created strings).  They are not assigned in this part of the
 * file. */
046 VALUE rb_cString;
047 VALUE rb_cSymbol;
048 
/* NOTE(review): presumably an upper bound on the bytes in one character;
 * it is not referenced in the visible part of the file -- confirm. */
049 #define RUBY_MAX_CHAR_LEN 16
/*
 * Per-string flag bits.
 *   STR_NOEMBED - set when the string uses the as.heap representation
 *                 (STR_EMBED_P is its negation).
 *   STR_SHARED  - with STR_NOEMBED: as.heap.aux.shared names the string
 *                 whose buffer is borrowed.
 *   STR_ASSOC   - with STR_NOEMBED: as.heap.aux.shared holds an associated
 *                 object.
 *   STR_TMPLOCK - not used in the visible part of the file.
 */
050 #define STR_TMPLOCK FL_USER7
051 #define STR_NOEMBED FL_USER1
052 #define STR_SHARED  FL_USER2 /* = ELTS_SHARED */
053 #define STR_ASSOC   FL_USER3
/* Both predicates require STR_NOEMBED in addition to their own flag. */
054 #define STR_SHARED_P(s) FL_ALL((s), STR_NOEMBED|ELTS_SHARED)
055 #define STR_ASSOC_P(s)  FL_ALL((s), STR_NOEMBED|STR_ASSOC)
/* "No capacity": a heap string that is shared or associated uses its aux
 * slot for aux.shared, so aux.capa must not be read for it. */
056 #define STR_NOCAPA  (STR_NOEMBED|ELTS_SHARED|STR_ASSOC)
057 #define STR_NOCAPA_P(s) (FL_TEST((s),STR_NOEMBED) && FL_ANY((s),ELTS_SHARED|STR_ASSOC))
/* Clears the shared/assoc bits of a heap string; embedded strings are left
 * untouched. */
058 #define STR_UNSET_NOCAPA(s) do {\
059     if (FL_TEST((s),STR_NOEMBED)) FL_UNSET((s),(ELTS_SHARED|STR_ASSOC));\
060 } while (0)
061 
/* Switches `str` to the heap representation: sets STR_NOEMBED and zeroes
 * the embedded-length bits held in the flags word. */
062 #define STR_SET_NOEMBED(str) do {\
063     FL_SET((str), STR_NOEMBED);\
064     STR_SET_EMBED_LEN((str), 0);\
065 } while (0)
/* Switches `str` to the embedded representation (flag only; the length bits
 * are not touched). */
066 #define STR_SET_EMBED(str) FL_UNSET((str), STR_NOEMBED)
067 #define STR_EMBED_P(str) (!FL_TEST((str), STR_NOEMBED))
/* Stores `n` in the embedded-length bit field of the flags word.  `n` is
 * evaluated once (into tmp_n); `str` is evaluated twice. */
068 #define STR_SET_EMBED_LEN(str, n) do { \
069     long tmp_n = (n);\
070     RBASIC(str)->flags &= ~RSTRING_EMBED_LEN_MASK;\
071     RBASIC(str)->flags |= (tmp_n) << RSTRING_EMBED_LEN_SHIFT;\
072 } while (0)
073 
/* Sets the byte length of `str` to `n`, writing to the flags bit field for
 * an embedded string and to as.heap.len otherwise.  Only the length is
 * changed; no buffer is resized and no terminator is written. */
074 #define STR_SET_LEN(str, n) do { \
075     if (STR_EMBED_P(str)) {\
076         STR_SET_EMBED_LEN((str), (n));\
077     }\
078     else {\
079         RSTRING(str)->as.heap.len = (n);\
080     }\
081 } while (0)
082 
/* Decrements the byte length of `str` by one, in whichever representation
 * (embedded or heap) the string currently uses.  There is no check for a
 * length that is already zero. */
083 #define STR_DEC_LEN(str) do {\
084     if (STR_EMBED_P(str)) {\
085         long n = RSTRING_LEN(str);\
086         n--;\
087         STR_SET_EMBED_LEN((str), n);\
088     }\
089     else {\
090         RSTRING(str)->as.heap.len--;\
091     }\
092 } while (0)
093 
/* Terminator width for `str`: the minimum character length of its
 * encoding. */
094 #define TERM_LEN(str) rb_enc_mbminlen(rb_enc_get(str))
/* Writes a terminator of `termlen` zero bytes at `ptr`.  The first byte is
 * always stored directly; memset covers the whole terminator only in the
 * (unlikely) case that it is wider than one byte.  Each argument is
 * evaluated exactly once. */
095 #define TERM_FILL(ptr, termlen) do {\
096     char *const term_fill_ptr = (ptr);\
097     const int term_fill_len = (termlen);\
098     *term_fill_ptr = '\0';\
099     if (UNLIKELY(term_fill_len > 1))\
100         memset(term_fill_ptr, 0, term_fill_len);\
101 } while (0)
102 
/*
 * Changes the buffer capacity of `str` to `capacity` bytes plus room for
 * the encoding's terminator.
 *
 *  - Embedded string: nothing happens unless `capacity` exceeds
 *    RSTRING_EMBED_LEN_MAX.  In that case a heap buffer is allocated, the
 *    current RSTRING_LEN bytes are copied into it (the terminator is not
 *    copied) and the string is switched to the heap representation.
 *  - Heap string: the buffer is reallocated in place; aux.capa is updated
 *    only when the aux slot actually holds a capacity (!STR_NOCAPA_P).
 *
 * `capacity` and `str` are evaluated several times, and the expansion
 * declares locals named termlen, tmp and tlen.
 */
103 #define RESIZE_CAPA(str,capacity) do {\
104     const int termlen = TERM_LEN(str);\
105     if (STR_EMBED_P(str)) {\
106         if ((capacity) > RSTRING_EMBED_LEN_MAX) {\
107             char *const tmp = ALLOC_N(char, (capacity)+termlen);\
108             const long tlen = RSTRING_LEN(str);\
109             memcpy(tmp, RSTRING_PTR(str), tlen);\
110             RSTRING(str)->as.heap.ptr = tmp;\
111             RSTRING(str)->as.heap.len = tlen;\
112             STR_SET_NOEMBED(str);\
113             RSTRING(str)->as.heap.aux.capa = (capacity);\
114         }\
115     }\
116     else {\
117         REALLOC_N(RSTRING(str)->as.heap.ptr, char, (capacity)+termlen);\
118         if (!STR_NOCAPA_P(str))\
119             RSTRING(str)->as.heap.aux.capa = (capacity);\
120     }\
121 } while (0)
122 
/* Records `shared_str` in str's aux.shared slot (through OBJ_WRITE) and
 * then sets ELTS_SHARED on `str`.  STR_NOEMBED is not set here; callers set
 * it separately. */
123 #define STR_SET_SHARED(str, shared_str) do { \
124     OBJ_WRITE((str), &RSTRING(str)->as.heap.aux.shared, (shared_str)); \
125     FL_SET((str), ELTS_SHARED); \
126 } while (0)
127 
/* Coderange predicates.  Both go through rb_enc_str_coderange(), which
 * scans the string and caches the result when the coderange is unknown. */
128 #define is_ascii_string(str) (rb_enc_str_coderange(str) == ENC_CODERANGE_7BIT)
129 #define is_broken_string(str) (rb_enc_str_coderange(str) == ENC_CODERANGE_BROKEN)
130 
/* Encoding object for `str`, looked up from the index returned by
 * ENCODING_GET. */
131 #define STR_ENC_GET(str) rb_enc_from_index(ENCODING_GET(str))
132 
133 static inline int
134 single_byte_optimizable(VALUE str)
135 {
136     rb_encoding *enc;
137 
138     /* Conservative.  It may be ENC_CODERANGE_UNKNOWN. */
139     if (ENC_CODERANGE(str) == ENC_CODERANGE_7BIT)
140         return 1;
141 
142     enc = STR_ENC_GET(str);
143     if (rb_enc_mbmaxlen(enc) == 1)
144         return 1;
145 
146     /* Conservative.  Possibly single byte.
147      * "\xa1" in Shift_JIS for example. */
148     return 0;
149 }
150 
/* Global VALUE; it is neither assigned nor read in the visible part of the
 * file.  NOTE(review): presumably the default field separator -- confirm. */
151 VALUE rb_fs;
152 
/* One bit per byte of a VALUE-sized word: the high bit, which is set exactly
 * in non-ASCII bytes.  Left undefined when VALUE is neither 4 nor 8 bytes;
 * later code tests for it with #ifdef. */
#if SIZEOF_VALUE == 8
# define NONASCII_MASK 0x8080808080808080ULL
#elif SIZEOF_VALUE == 4
# define NONASCII_MASK 0x80808080UL
#endif

/*
 * Returns a pointer to the first non-ASCII byte in [p, e), or NULL when
 * every byte in the range is ASCII.
 *
 * When NONASCII_MASK is available and the range is longer than two words,
 * the bulk of the range is examined one aligned VALUE at a time; the
 * unaligned head and the tail (including the word in which a hit was
 * detected) are examined byte by byte.
 */
static inline const char *
search_nonascii(const char *p, const char *e)
{
#ifdef NONASCII_MASK
    if ((int)sizeof(VALUE) * 2 < e - p) {
        const VALUE align_mask = sizeof(VALUE) - 1;
        /* first word boundary at or after p, last word boundary at or before e */
        const VALUE *word = (const VALUE*)(~align_mask & ((VALUE)p + align_mask));
        const VALUE *const word_end = (const VALUE*)(~align_mask & (VALUE)e);

        /* unaligned head */
        for (; p < (const char *)word; p++) {
            if (!ISASCII(*p))
                return p;
        }
        /* aligned body: stop at the first word containing a high bit */
        while (word < word_end && !(*word & NONASCII_MASK)) {
            word++;
        }
        /* resume the byte-wise scan at that word (or at the aligned end) */
        p = (const char *)word;
    }
#endif
    for (; p < e; p++) {
        if (!ISASCII(*p))
            return p;
    }
    return NULL;
}
189 
190 static int
191 coderange_scan(const char *p, long len, rb_encoding *enc)
192 {
193     const char *e = p + len;
194 
195     if (rb_enc_to_index(enc) == 0) {
196         /* enc is ASCII-8BIT.  ASCII-8BIT string never be broken. */
197         p = search_nonascii(p, e);
198         return p ? ENC_CODERANGE_VALID : ENC_CODERANGE_7BIT;
199     }
200 
201     if (rb_enc_asciicompat(enc)) {
202         p = search_nonascii(p, e);
203         if (!p) {
204             return ENC_CODERANGE_7BIT;
205         }
206         while (p < e) {
207             int ret = rb_enc_precise_mbclen(p, e, enc);
208             if (!MBCLEN_CHARFOUND_P(ret)) {
209                 return ENC_CODERANGE_BROKEN;
210             }
211             p += MBCLEN_CHARFOUND_LEN(ret);
212             if (p < e) {
213                 p = search_nonascii(p, e);
214                 if (!p) {
215                     return ENC_CODERANGE_VALID;
216                 }
217             }
218         }
219         if (e < p) {
220             return ENC_CODERANGE_BROKEN;
221         }
222         return ENC_CODERANGE_VALID;
223     }
224 
225     while (p < e) {
226         int ret = rb_enc_precise_mbclen(p, e, enc);
227 
228         if (!MBCLEN_CHARFOUND_P(ret)) {
229             return ENC_CODERANGE_BROKEN;
230         }
231         p += MBCLEN_CHARFOUND_LEN(ret);
232     }
233     if (e < p) {
234         return ENC_CODERANGE_BROKEN;
235     }
236     return ENC_CODERANGE_VALID;
237 }
238 
239 long
240 rb_str_coderange_scan_restartable(const char *s, const char *e, rb_encoding *enc, int *cr)
241 {
242     const char *p = s;
243 
244     if (*cr == ENC_CODERANGE_BROKEN)
245         return e - s;
246 
247     if (rb_enc_to_index(enc) == 0) {
248         /* enc is ASCII-8BIT.  ASCII-8BIT string never be broken. */
249         p = search_nonascii(p, e);
250         *cr = (!p && *cr != ENC_CODERANGE_VALID) ? ENC_CODERANGE_7BIT : ENC_CODERANGE_VALID;
251         return e - s;
252     }
253     else if (rb_enc_asciicompat(enc)) {
254         p = search_nonascii(p, e);
255         if (!p) {
256             if (*cr != ENC_CODERANGE_VALID) *cr = ENC_CODERANGE_7BIT;
257             return e - s;
258         }
259         while (p < e) {
260             int ret = rb_enc_precise_mbclen(p, e, enc);
261             if (!MBCLEN_CHARFOUND_P(ret)) {
262                 *cr = MBCLEN_INVALID_P(ret) ? ENC_CODERANGE_BROKEN: ENC_CODERANGE_UNKNOWN;
263                 return p - s;
264             }
265             p += MBCLEN_CHARFOUND_LEN(ret);
266             if (p < e) {
267                 p = search_nonascii(p, e);
268                 if (!p) {
269                     *cr = ENC_CODERANGE_VALID;
270                     return e - s;
271                 }
272             }
273         }
274         *cr = e < p ? ENC_CODERANGE_BROKEN: ENC_CODERANGE_VALID;
275         return p - s;
276     }
277     else {
278         while (p < e) {
279             int ret = rb_enc_precise_mbclen(p, e, enc);
280             if (!MBCLEN_CHARFOUND_P(ret)) {
281                 *cr = MBCLEN_INVALID_P(ret) ? ENC_CODERANGE_BROKEN: ENC_CODERANGE_UNKNOWN;
282                 return p - s;
283             }
284             p += MBCLEN_CHARFOUND_LEN(ret);
285         }
286         *cr = e < p ? ENC_CODERANGE_BROKEN: ENC_CODERANGE_VALID;
287         return p - s;
288     }
289 }
290 
291 static inline void
292 str_enc_copy(VALUE str1, VALUE str2)
293 {
294     rb_enc_set_index(str1, ENCODING_GET(str2));
295 }
296 
297 static void
298 rb_enc_cr_str_copy_for_substr(VALUE dest, VALUE src)
299 {
300     /* this function is designed for copying encoding and coderange
301      * from src to new string "dest" which is made from the part of src.
302      */
303     str_enc_copy(dest, src);
304     if (RSTRING_LEN(dest) == 0) {
305         if (!rb_enc_asciicompat(STR_ENC_GET(src)))
306             ENC_CODERANGE_SET(dest, ENC_CODERANGE_VALID);
307         else
308             ENC_CODERANGE_SET(dest, ENC_CODERANGE_7BIT);
309         return;
310     }
311     switch (ENC_CODERANGE(src)) {
312       case ENC_CODERANGE_7BIT:
313         ENC_CODERANGE_SET(dest, ENC_CODERANGE_7BIT);
314         break;
315       case ENC_CODERANGE_VALID:
316         if (!rb_enc_asciicompat(STR_ENC_GET(src)) ||
317             search_nonascii(RSTRING_PTR(dest), RSTRING_END(dest)))
318             ENC_CODERANGE_SET(dest, ENC_CODERANGE_VALID);
319         else
320             ENC_CODERANGE_SET(dest, ENC_CODERANGE_7BIT);
321         break;
322       default:
323         break;
324     }
325 }
326 
327 static void
328 rb_enc_cr_str_exact_copy(VALUE dest, VALUE src)
329 {
330     str_enc_copy(dest, src);
331     ENC_CODERANGE_SET(dest, ENC_CODERANGE(src));
332 }
333 
334 int
335 rb_enc_str_coderange(VALUE str)
336 {
337     int cr = ENC_CODERANGE(str);
338 
339     if (cr == ENC_CODERANGE_UNKNOWN) {
340         rb_encoding *enc = STR_ENC_GET(str);
341         cr = coderange_scan(RSTRING_PTR(str), RSTRING_LEN(str), enc);
342         ENC_CODERANGE_SET(str, cr);
343     }
344     return cr;
345 }
346 
347 int
348 rb_enc_str_asciionly_p(VALUE str)
349 {
350     rb_encoding *enc = STR_ENC_GET(str);
351 
352     if (!rb_enc_asciicompat(enc))
353         return FALSE;
354     else if (rb_enc_str_coderange(str) == ENC_CODERANGE_7BIT)
355         return TRUE;
356     return FALSE;
357 }
358 
359 static inline void
360 str_mod_check(VALUE s, const char *p, long len)
361 {
362     if (RSTRING_PTR(s) != p || RSTRING_LEN(s) != len){
363         rb_raise(rb_eRuntimeError, "string modified");
364     }
365 }
366 
367 size_t
368 rb_str_capacity(VALUE str)
369 {
370     if (STR_EMBED_P(str)) {
371         return RSTRING_EMBED_LEN_MAX;
372     }
373     else if (STR_NOCAPA_P(str)) {
374         return RSTRING(str)->as.heap.len;
375     }
376     else {
377         return RSTRING(str)->as.heap.aux.capa;
378     }
379 }
380 
381 static inline VALUE
382 str_alloc(VALUE klass)
383 {
384     NEWOBJ_OF(str, struct RString, klass, T_STRING | (RGENGC_WB_PROTECTED_STRING ? FL_WB_PROTECTED : 0));
385 
386     str->as.heap.ptr = 0;
387     str->as.heap.len = 0;
388     str->as.heap.aux.capa = 0;
389 
390     return (VALUE)str;
391 }
392 
393 static inline VALUE
394 empty_str_alloc(VALUE klass)
395 {
396     if (RUBY_DTRACE_STRING_CREATE_ENABLED()) {
397         RUBY_DTRACE_STRING_CREATE(0, rb_sourcefile(), rb_sourceline());
398     }
399     return str_alloc(klass);
400 }
401 
402 static VALUE
403 str_new0(VALUE klass, const char *ptr, long len, int termlen)
404 {
405     VALUE str;
406 
407     if (len < 0) {
408         rb_raise(rb_eArgError, "negative string size (or size too big)");
409     }
410 
411     if (RUBY_DTRACE_STRING_CREATE_ENABLED()) {
412         RUBY_DTRACE_STRING_CREATE(len, rb_sourcefile(), rb_sourceline());
413     }
414 
415     str = str_alloc(klass);
416     if (len > RSTRING_EMBED_LEN_MAX) {
417         RSTRING(str)->as.heap.aux.capa = len;
418         RSTRING(str)->as.heap.ptr = ALLOC_N(char, len + termlen);
419         STR_SET_NOEMBED(str);
420     }
421     else if (len == 0) {
422         ENC_CODERANGE_SET(str, ENC_CODERANGE_7BIT);
423     }
424     if (ptr) {
425         memcpy(RSTRING_PTR(str), ptr, len);
426     }
427     STR_SET_LEN(str, len);
428     TERM_FILL(RSTRING_PTR(str) + len, termlen);
429     return str;
430 }
431 
432 static VALUE
433 str_new(VALUE klass, const char *ptr, long len)
434 {
435     return str_new0(klass, ptr, len, 1);
436 }
437 
438 VALUE
439 rb_str_new(const char *ptr, long len)
440 {
441     return str_new(rb_cString, ptr, len);
442 }
443 
444 VALUE
445 rb_usascii_str_new(const char *ptr, long len)
446 {
447     VALUE str = rb_str_new(ptr, len);
448     ENCODING_CODERANGE_SET(str, rb_usascii_encindex(), ENC_CODERANGE_7BIT);
449     return str;
450 }
451 
452 VALUE
453 rb_enc_str_new(const char *ptr, long len, rb_encoding *enc)
454 {
455     VALUE str;
456 
457     if (!enc) return rb_str_new(ptr, len);
458 
459     str = str_new0(rb_cString, ptr, len, rb_enc_mbminlen(enc));
460     rb_enc_associate(str, enc);
461     return str;
462 }
463 
464 VALUE
465 rb_str_new_cstr(const char *ptr)
466 {
467     if (!ptr) {
468         rb_raise(rb_eArgError, "NULL pointer given");
469     }
470     return rb_str_new(ptr, strlen(ptr));
471 }
472 
473 VALUE
474 rb_usascii_str_new_cstr(const char *ptr)
475 {
476     VALUE str = rb_str_new2(ptr);
477     ENCODING_CODERANGE_SET(str, rb_usascii_encindex(), ENC_CODERANGE_7BIT);
478     return str;
479 }
480 
481 VALUE
482 rb_tainted_str_new(const char *ptr, long len)
483 {
484     VALUE str = rb_str_new(ptr, len);
485 
486     OBJ_TAINT(str);
487     return str;
488 }
489 
490 VALUE
491 rb_tainted_str_new_cstr(const char *ptr)
492 {
493     VALUE str = rb_str_new2(ptr);
494 
495     OBJ_TAINT(str);
496     return str;
497 }
498 
499 VALUE
500 rb_str_conv_enc_opts(VALUE str, rb_encoding *from, rb_encoding *to, int ecflags, VALUE ecopts)
501 {
502     extern VALUE rb_cEncodingConverter;
503     rb_econv_t *ec;
504     rb_econv_result_t ret;
505     long len, olen;
506     VALUE econv_wrapper;
507     VALUE newstr;
508     const unsigned char *start, *sp;
509     unsigned char *dest, *dp;
510     size_t converted_output = 0;
511 
512     if (!to) return str;
513     if (!from) from = rb_enc_get(str);
514     if (from == to) return str;
515     if ((rb_enc_asciicompat(to) && ENC_CODERANGE(str) == ENC_CODERANGE_7BIT) ||
516         to == rb_ascii8bit_encoding()) {
517         if (STR_ENC_GET(str) != to) {
518             str = rb_str_dup(str);
519             rb_enc_associate(str, to);
520         }
521         return str;
522     }
523 
524     len = RSTRING_LEN(str);
525     newstr = rb_str_new(0, len);
526     OBJ_INFECT(newstr, str);
527     olen = len;
528 
529     econv_wrapper = rb_obj_alloc(rb_cEncodingConverter);
530     RBASIC_CLEAR_CLASS(econv_wrapper);
531     ec = rb_econv_open_opts(from->name, to->name, ecflags, ecopts);
532     if (!ec) return str;
533     DATA_PTR(econv_wrapper) = ec;
534 
535     sp = (unsigned char*)RSTRING_PTR(str);
536     start = sp;
537     while ((dest = (unsigned char*)RSTRING_PTR(newstr)),
538            (dp = dest + converted_output),
539            (ret = rb_econv_convert(ec, &sp, start + len, &dp, dest + olen, 0)),
540            ret == econv_destination_buffer_full) {
541         /* destination buffer short */
542         size_t converted_input = sp - start;
543         size_t rest = len - converted_input;
544         converted_output = dp - dest;
545         rb_str_set_len(newstr, converted_output);
546         if (converted_input && converted_output &&
547             rest < (LONG_MAX / converted_output)) {
548             rest = (rest * converted_output) / converted_input;
549         }
550         else {
551             rest = olen;
552         }
553         olen += rest < 2 ? 2 : rest;
554         rb_str_resize(newstr, olen);
555     }
556     DATA_PTR(econv_wrapper) = 0;
557     rb_econv_close(ec);
558     rb_gc_force_recycle(econv_wrapper);
559     switch (ret) {
560       case econv_finished:
561         len = dp - (unsigned char*)RSTRING_PTR(newstr);
562         rb_str_set_len(newstr, len);
563         rb_enc_associate(newstr, to);
564         return newstr;
565 
566       default:
567         /* some error, return original */
568         return str;
569     }
570 }
571 
572 VALUE
573 rb_str_conv_enc(VALUE str, rb_encoding *from, rb_encoding *to)
574 {
575     return rb_str_conv_enc_opts(str, from, to, 0, Qnil);
576 }
577 
578 VALUE
579 rb_external_str_new_with_enc(const char *ptr, long len, rb_encoding *eenc)
580 {
581     VALUE str;
582 
583     str = rb_tainted_str_new(ptr, len);
584     if (eenc == rb_usascii_encoding() &&
585         rb_enc_str_coderange(str) != ENC_CODERANGE_7BIT) {
586         rb_enc_associate(str, rb_ascii8bit_encoding());
587         return str;
588     }
589     rb_enc_associate(str, eenc);
590     return rb_str_conv_enc(str, eenc, rb_default_internal_encoding());
591 }
592 
593 VALUE
594 rb_external_str_new(const char *ptr, long len)
595 {
596     return rb_external_str_new_with_enc(ptr, len, rb_default_external_encoding());
597 }
598 
599 VALUE
600 rb_external_str_new_cstr(const char *ptr)
601 {
602     return rb_external_str_new_with_enc(ptr, strlen(ptr), rb_default_external_encoding());
603 }
604 
605 VALUE
606 rb_locale_str_new(const char *ptr, long len)
607 {
608     return rb_external_str_new_with_enc(ptr, len, rb_locale_encoding());
609 }
610 
611 VALUE
612 rb_locale_str_new_cstr(const char *ptr)
613 {
614     return rb_external_str_new_with_enc(ptr, strlen(ptr), rb_locale_encoding());
615 }
616 
617 VALUE
618 rb_filesystem_str_new(const char *ptr, long len)
619 {
620     return rb_external_str_new_with_enc(ptr, len, rb_filesystem_encoding());
621 }
622 
623 VALUE
624 rb_filesystem_str_new_cstr(const char *ptr)
625 {
626     return rb_external_str_new_with_enc(ptr, strlen(ptr), rb_filesystem_encoding());
627 }
628 
629 VALUE
630 rb_str_export(VALUE str)
631 {
632     return rb_str_conv_enc(str, STR_ENC_GET(str), rb_default_external_encoding());
633 }
634 
635 VALUE
636 rb_str_export_locale(VALUE str)
637 {
638     return rb_str_conv_enc(str, STR_ENC_GET(str), rb_locale_encoding());
639 }
640 
641 VALUE
642 rb_str_export_to_enc(VALUE str, rb_encoding *enc)
643 {
644     return rb_str_conv_enc(str, STR_ENC_GET(str), enc);
645 }
646 
647 static VALUE
648 str_replace_shared_without_enc(VALUE str2, VALUE str)
649 {
650     if (RSTRING_LEN(str) <= RSTRING_EMBED_LEN_MAX) {
651         STR_SET_EMBED(str2);
652         memcpy(RSTRING_PTR(str2), RSTRING_PTR(str), RSTRING_LEN(str)+1);
653         STR_SET_EMBED_LEN(str2, RSTRING_LEN(str));
654     }
655     else {
656         str = rb_str_new_frozen(str);
657         FL_SET(str2, STR_NOEMBED);
658         RSTRING(str2)->as.heap.len = RSTRING_LEN(str);
659         RSTRING(str2)->as.heap.ptr = RSTRING_PTR(str);
660         STR_SET_SHARED(str2, str);
661     }
662     return str2;
663 }
664 
665 static VALUE
666 str_replace_shared(VALUE str2, VALUE str)
667 {
668     str_replace_shared_without_enc(str2, str);
669     rb_enc_cr_str_exact_copy(str2, str);
670     return str2;
671 }
672 
673 static VALUE
674 str_new_shared(VALUE klass, VALUE str)
675 {
676     return str_replace_shared(str_alloc(klass), str);
677 }
678 
679 static VALUE
680 str_new3(VALUE klass, VALUE str)
681 {
682     return str_new_shared(klass, str);
683 }
684 
685 VALUE
686 rb_str_new_shared(VALUE str)
687 {
688     VALUE str2 = str_new3(rb_obj_class(str), str);
689 
690     OBJ_INFECT(str2, str);
691     return str2;
692 }
693 
694 static VALUE
695 str_new4(VALUE klass, VALUE str)
696 {
697     VALUE str2;
698 
699     str2 = str_alloc(klass);
700     STR_SET_NOEMBED(str2);
701     RSTRING(str2)->as.heap.len = RSTRING_LEN(str);
702     RSTRING(str2)->as.heap.ptr = RSTRING_PTR(str);
703     if (STR_SHARED_P(str)) {
704         VALUE shared = RSTRING(str)->as.heap.aux.shared;
705         assert(OBJ_FROZEN(shared));
706         STR_SET_SHARED(str2, shared); /* TODO: WB is not needed because str2 is *new* object */
707     }
708     else {
709         STR_SET_SHARED(str, str2);
710     }
711     rb_enc_cr_str_exact_copy(str2, str);
712     OBJ_INFECT(str2, str);
713     return str2;
714 }
715 
716 VALUE
717 rb_str_new_frozen(VALUE orig)
718 {
719     VALUE klass, str;
720 
721     if (OBJ_FROZEN(orig)) return orig;
722     klass = rb_obj_class(orig);
723     if (STR_SHARED_P(orig) && (str = RSTRING(orig)->as.heap.aux.shared)) {
724         long ofs;
725         assert(OBJ_FROZEN(str));
726         ofs = RSTRING_LEN(str) - RSTRING_LEN(orig);
727         if ((ofs > 0) || (klass != RBASIC(str)->klass) ||
728             ((RBASIC(str)->flags ^ RBASIC(orig)->flags) & FL_TAINT) ||
729             ENCODING_GET(str) != ENCODING_GET(orig)) {
730             str = str_new3(klass, str);
731             RSTRING(str)->as.heap.ptr += ofs;
732             RSTRING(str)->as.heap.len -= ofs;
733             rb_enc_cr_str_exact_copy(str, orig);
734             OBJ_INFECT(str, orig);
735         }
736     }
737     else if (STR_EMBED_P(orig)) {
738         str = str_new(klass, RSTRING_PTR(orig), RSTRING_LEN(orig));
739         rb_enc_cr_str_exact_copy(str, orig);
740         OBJ_INFECT(str, orig);
741     }
742     else if (STR_ASSOC_P(orig)) {
743         VALUE assoc = RSTRING(orig)->as.heap.aux.shared;
744         FL_UNSET(orig, STR_ASSOC);
745         str = str_new4(klass, orig);
746         FL_SET(str, STR_ASSOC);
747         OBJ_WRITE(str, &RSTRING(str)->as.heap.aux.shared, assoc);
748         /* TODO: WB is not needed because str is new object */
749     }
750     else {
751         str = str_new4(klass, orig);
752     }
753     OBJ_FREEZE(str);
754     return str;
755 }
756 
757 VALUE
758 rb_str_new_with_class(VALUE obj, const char *ptr, long len)
759 {
760     return str_new(rb_obj_class(obj), ptr, len);
761 }
762 
763 static VALUE
764 str_new_empty(VALUE str)
765 {
766     VALUE v = rb_str_new5(str, 0, 0);
767     rb_enc_copy(v, str);
768     OBJ_INFECT(v, str);
769     return v;
770 }
771 
772 #define STR_BUF_MIN_SIZE 128
773 
774 VALUE
775 rb_str_buf_new(long capa)
776 {
777     VALUE str = str_alloc(rb_cString);
778 
779     if (capa < STR_BUF_MIN_SIZE) {
780         capa = STR_BUF_MIN_SIZE;
781     }
782     FL_SET(str, STR_NOEMBED);
783     RSTRING(str)->as.heap.aux.capa = capa;
784     RSTRING(str)->as.heap.ptr = ALLOC_N(char, capa+1);
785     RSTRING(str)->as.heap.ptr[0] = '\0';
786 
787     return str;
788 }
789 
790 VALUE
791 rb_str_buf_new_cstr(const char *ptr)
792 {
793     VALUE str;
794     long len = strlen(ptr);
795 
796     str = rb_str_buf_new(len);
797     rb_str_buf_cat(str, ptr, len);
798 
799     return str;
800 }
801 
802 VALUE
803 rb_str_tmp_new(long len)
804 {
805     return str_new(0, 0, len);
806 }
807 
808 void *
809 rb_alloc_tmp_buffer(volatile VALUE *store, long len)
810 {
811     VALUE s = rb_str_tmp_new(len);
812     *store = s;
813     return RSTRING_PTR(s);
814 }
815 
816 void
817 rb_free_tmp_buffer(volatile VALUE *store)
818 {
819     VALUE s = *store;
820     *store = 0;
821     if (s) rb_str_clear(s);
822 }
823 
824 void
825 rb_str_free(VALUE str)
826 {
827     if (!STR_EMBED_P(str) && !STR_SHARED_P(str)) {
828         xfree(RSTRING(str)->as.heap.ptr);
829     }
830 }
831 
832 RUBY_FUNC_EXPORTED size_t
833 rb_str_memsize(VALUE str)
834 {
835     if (!STR_EMBED_P(str) && !STR_SHARED_P(str)) {
836         return RSTRING(str)->as.heap.aux.capa;
837     }
838     else {
839         return 0;
840     }
841 }
842 
843 VALUE
844 rb_str_to_str(VALUE str)
845 {
846     return rb_convert_type(str, T_STRING, "String", "to_str");
847 }
848 
849 static inline void str_discard(VALUE str);
850 
851 void
852 rb_str_shared_replace(VALUE str, VALUE str2)
853 {
854     rb_encoding *enc;
855     int cr;
856     if (str == str2) return;
857     enc = STR_ENC_GET(str2);
858     cr = ENC_CODERANGE(str2);
859     str_discard(str);
860     OBJ_INFECT(str, str2);
861     if (RSTRING_LEN(str2) <= RSTRING_EMBED_LEN_MAX) {
862         STR_SET_EMBED(str);
863         memcpy(RSTRING_PTR(str), RSTRING_PTR(str2), RSTRING_LEN(str2)+1);
864         STR_SET_EMBED_LEN(str, RSTRING_LEN(str2));
865         rb_enc_associate(str, enc);
866         ENC_CODERANGE_SET(str, cr);
867         return;
868     }
869     STR_SET_NOEMBED(str);
870     STR_UNSET_NOCAPA(str);
871     RSTRING(str)->as.heap.ptr = RSTRING_PTR(str2);
872     RSTRING(str)->as.heap.len = RSTRING_LEN(str2);
873     if (STR_NOCAPA_P(str2)) {
874         VALUE shared = RSTRING(str2)->as.heap.aux.shared;
875         FL_SET(str, RBASIC(str2)->flags & STR_NOCAPA);
876         OBJ_WRITE(str, &RSTRING(str)->as.heap.aux.shared, shared);
877     }
878     else {
879         RSTRING(str)->as.heap.aux.capa = RSTRING(str2)->as.heap.aux.capa;
880     }
881     STR_SET_EMBED(str2);        /* abandon str2 */
882     RSTRING_PTR(str2)[0] = 0;
883     STR_SET_EMBED_LEN(str2, 0);
884     rb_enc_associate(str, enc);
885     ENC_CODERANGE_SET(str, cr);
886 }
887 
888 static ID id_to_s;
889 
890 VALUE
891 rb_obj_as_string(VALUE obj)
892 {
893     VALUE str;
894 
895     if (RB_TYPE_P(obj, T_STRING)) {
896         return obj;
897     }
898     str = rb_funcall(obj, id_to_s, 0);
899     if (!RB_TYPE_P(str, T_STRING))
900         return rb_any_to_s(obj);
901     if (OBJ_TAINTED(obj)) OBJ_TAINT(str);
902     return str;
903 }
904 
905 static VALUE
906 str_replace(VALUE str, VALUE str2)
907 {
908     long len;
909 
910     len = RSTRING_LEN(str2);
911     if (STR_ASSOC_P(str2)) {
912         str2 = rb_str_new4(str2);
913     }
914     if (STR_SHARED_P(str2)) {
915         VALUE shared = RSTRING(str2)->as.heap.aux.shared;
916         assert(OBJ_FROZEN(shared));
917         STR_SET_NOEMBED(str);
918         RSTRING(str)->as.heap.len = len;
919         RSTRING(str)->as.heap.ptr = RSTRING_PTR(str2);
920         FL_SET(str, ELTS_SHARED);
921         FL_UNSET(str, STR_ASSOC);
922         STR_SET_SHARED(str, shared);
923     }
924     else {
925         str_replace_shared(str, str2);
926     }
927 
928     OBJ_INFECT(str, str2);
929     rb_enc_cr_str_exact_copy(str, str2);
930     return str;
931 }
932 
933 static VALUE
934 str_duplicate(VALUE klass, VALUE str)
935 {
936     VALUE dup = str_alloc(klass);
937     str_replace(dup, str);
938     return dup;
939 }
940 
941 VALUE
942 rb_str_dup(VALUE str)
943 {
944     return str_duplicate(rb_obj_class(str), str);
945 }
946 
947 VALUE
948 rb_str_resurrect(VALUE str)
949 {
950     if (RUBY_DTRACE_STRING_CREATE_ENABLED()) {
951         RUBY_DTRACE_STRING_CREATE(RSTRING_LEN(str),
952                                   rb_sourcefile(), rb_sourceline());
953     }
954     return str_replace(str_alloc(rb_cString), str);
955 }
956 
957 /*
958  *  call-seq:
959  *     String.new(str="")   -> new_str
960  *
961  *  Returns a new string object containing a copy of <i>str</i>.
962  */
963 
964 static VALUE
965 rb_str_init(int argc, VALUE *argv, VALUE str)
966 {
967     VALUE orig;
968 
969     if (argc > 0 && rb_scan_args(argc, argv, "01", &orig) == 1)
970         rb_str_replace(str, orig);
971     return str;
972 }
973 
974 static inline long
975 enc_strlen(const char *p, const char *e, rb_encoding *enc, int cr)
976 {
977     long c;
978     const char *q;
979 
980     if (rb_enc_mbmaxlen(enc) == rb_enc_mbminlen(enc)) {
981         return (e - p + rb_enc_mbminlen(enc) - 1) / rb_enc_mbminlen(enc);
982     }
983     else if (rb_enc_asciicompat(enc)) {
984         c = 0;
985         if (cr == ENC_CODERANGE_7BIT || cr == ENC_CODERANGE_VALID) {
986             while (p < e) {
987                 if (ISASCII(*p)) {
988                     q = search_nonascii(p, e);
989                     if (!q)
990                         return c + (e - p);
991                     c += q - p;
992                     p = q;
993                 }
994                 p += rb_enc_fast_mbclen(p, e, enc);
995                 c++;
996             }
997         }
998         else {
999             while (p < e) {
1000                 if (ISASCII(*p)) {
1001                     q = search_nonascii(p, e);
1002                     if (!q)
1003                         return c + (e - p);
1004                     c += q - p;
1005                     p = q;
1006                 }
1007                 p += rb_enc_mbclen(p, e, enc);
1008                 c++;
1009             }
1010         }
1011         return c;
1012     }
1013 
1014     for (c=0; p<e; c++) {
1015         p += rb_enc_mbclen(p, e, enc);
1016     }
1017     return c;
1018 }
1019 
1020 long
1021 rb_enc_strlen(const char *p, const char *e, rb_encoding *enc)
1022 {
1023     return enc_strlen(p, e, enc, ENC_CODERANGE_UNKNOWN);
1024 }
1025 
1026 long
1027 rb_enc_strlen_cr(const char *p, const char *e, rb_encoding *enc, int *cr)
1028 {
1029     long c;
1030     const char *q;
1031     int ret;
1032 
1033     *cr = 0;
1034     if (rb_enc_mbmaxlen(enc) == rb_enc_mbminlen(enc)) {
1035         return (e - p + rb_enc_mbminlen(enc) - 1) / rb_enc_mbminlen(enc);
1036     }
1037     else if (rb_enc_asciicompat(enc)) {
1038         c = 0;
1039         while (p < e) {
1040             if (ISASCII(*p)) {
1041                 q = search_nonascii(p, e);
1042                 if (!q) {
1043                     if (!*cr) *cr = ENC_CODERANGE_7BIT;
1044                     return c + (e - p);
1045                 }
1046                 c += q - p;
1047                 p = q;
1048             }
1049             ret = rb_enc_precise_mbclen(p, e, enc);
1050             if (MBCLEN_CHARFOUND_P(ret)) {
1051                 *cr |= ENC_CODERANGE_VALID;
1052                 p += MBCLEN_CHARFOUND_LEN(ret);
1053             }
1054             else {
1055                 *cr = ENC_CODERANGE_BROKEN;
1056                 p++;
1057             }
1058             c++;
1059         }
1060         if (!*cr) *cr = ENC_CODERANGE_7BIT;
1061         return c;
1062     }
1063 
1064     for (c=0; p<e; c++) {
1065         ret = rb_enc_precise_mbclen(p, e, enc);
1066         if (MBCLEN_CHARFOUND_P(ret)) {
1067             *cr |= ENC_CODERANGE_VALID;
1068             p += MBCLEN_CHARFOUND_LEN(ret);
1069         }
1070         else {
1071             *cr = ENC_CODERANGE_BROKEN;
1072             if (p + rb_enc_mbminlen(enc) <= e)
1073                 p += rb_enc_mbminlen(enc);
1074             else
1075                 p = e;
1076         }
1077     }
1078     if (!*cr) *cr = ENC_CODERANGE_7BIT;
1079     return c;
1080 }
1081 
#ifdef NONASCII_MASK
/* True for every byte that is not a UTF-8 continuation byte (10xxxxxx). */
#define is_utf8_lead_byte(c) (((c)&0xC0) != 0x80)

/*
 * A UTF-8 leading byte has the bit pattern 0xxxxxxx or 11xxxxxx
 * (see http://en.wikipedia.org/wiki/UTF-8), so the following pseudo code
 * detects one:
 *
 * if (!(byte & 0x80))
 *   byte |= 0x40;          // turn on bit 6
 * return ((byte>>6) & 1);  // bit 6 now tells whether it is a leading byte
 *
 * This function applies that logic to every byte of the word `*s' at the
 * same time and returns the number of leading bytes found in the word.
 */
static inline VALUE
count_utf8_lead_bytes_with_word(const VALUE *s)
{
    VALUE bits = *s;

    /* Reduce each byte to a single flag in its bit 0: 1 for a leading
     * byte, 0 for a continuation byte. */
    bits |= ~(bits>>1);
    bits >>= 6;
    bits &= NONASCII_MASK >> 7;

    /* Add the per-byte flags together into the lowest byte. */
    bits += (bits>>8);
    bits += (bits>>16);
#if SIZEOF_VALUE == 8
    bits += (bits>>32);
#endif
    return (bits&0xF);
}
#endif
1116 
/*
 * Returns the length of str in characters.
 *
 * enc may be NULL, in which case the encoding recorded in str is used.
 * When the generic scan at the bottom reports a non-zero code range,
 * that code range is stored back into str.
 */
1117 static long
1118 str_strlen(VALUE str, rb_encoding *enc)
1119 {
1120     const char *p, *e;
1121     long n;
1122     int cr;
1123 
     /* One byte per character: the byte length is the answer. */
1124     if (single_byte_optimizable(str)) return RSTRING_LEN(str);
1125     if (!enc) enc = STR_ENC_GET(str);
1126     p = RSTRING_PTR(str);
1127     e = RSTRING_END(str);
1128     cr = ENC_CODERANGE(str);
1129 #ifdef NONASCII_MASK
     /* Fast path for UTF-8 already known to be valid: the character count
      * equals the number of bytes that are not continuation bytes. */
1130     if (ENC_CODERANGE(str) == ENC_CODERANGE_VALID &&
1131         enc == rb_utf8_encoding()) {
1132 
1133         VALUE len = 0;
1134         if ((int)sizeof(VALUE) * 2 < e - p) {
1135             const VALUE *s, *t;
1136             const VALUE lowbits = sizeof(VALUE) - 1;
             /* s: p rounded up, t: e rounded down, to a VALUE-sized boundary */
1137             s = (const VALUE*)(~lowbits & ((VALUE)p + lowbits));
1138             t = (const VALUE*)(~lowbits & (VALUE)e);
             /* unaligned head, one byte at a time */
1139             while (p < (const char *)s) {
1140                 if (is_utf8_lead_byte(*p)) len++;
1141                 p++;
1142             }
             /* aligned middle, one word at a time */
1143             while (s < t) {
1144                 len += count_utf8_lead_bytes_with_word(s);
1145                 s++;
1146             }
1147             p = (const char *)s;
1148         }
         /* remaining tail (or the whole string when it is short) */
1149         while (p < e) {
1150             if (is_utf8_lead_byte(*p)) len++;
1151             p++;
1152         }
1153         return (long)len;
1154     }
1155 #endif
     /* Generic path: count characters and learn the code range on the way. */
1156     n = rb_enc_strlen_cr(p, e, enc, &cr);
1157     if (cr) {
1158         ENC_CODERANGE_SET(str, cr);
1159     }
1160     return n;
1161 }
1162 
1163 long
1164 rb_str_strlen(VALUE str)
1165 {
1166     return str_strlen(str, STR_ENC_GET(str));
1167 }
1168 
1169 /*
1170  *  call-seq:
1171  *     str.length   -> integer
1172  *     str.size     -> integer
1173  *
1174  *  Returns the character length of <i>str</i>.
1175  */
1176 
1177 VALUE
1178 rb_str_length(VALUE str)
1179 {
1180     long len;
1181 
1182     len = str_strlen(str, STR_ENC_GET(str));
1183     return LONG2NUM(len);
1184 }
1185 
1186 /*
1187  *  call-seq:
1188  *     str.bytesize  -> integer
1189  *
1190  *  Returns the length of +str+ in bytes.
1191  *
1192  *    "\x80\u3042".bytesize  #=> 4
1193  *    "hello".bytesize       #=> 5
1194  */
1195 
1196 static VALUE
1197 rb_str_bytesize(VALUE str)
1198 {
1199     return LONG2NUM(RSTRING_LEN(str));
1200 }
1201 
1202 /*
1203  *  call-seq:
1204  *     str.empty?   -> true or false
1205  *
1206  *  Returns <code>true</code> if <i>str</i> has a length of zero.
1207  *
1208  *     "hello".empty?   #=> false
1209  *     " ".empty?       #=> false
1210  *     "".empty?        #=> true
1211  */
1212 
1213 static VALUE
1214 rb_str_empty(VALUE str)
1215 {
1216     if (RSTRING_LEN(str) == 0)
1217         return Qtrue;
1218     return Qfalse;
1219 }
1220 
1221 /*
1222  *  call-seq:
1223  *     str + other_str   -> new_str
1224  *
1225  *  Concatenation---Returns a new <code>String</code> containing
1226  *  <i>other_str</i> concatenated to <i>str</i>.
1227  *
1228  *     "Hello from " + self.to_s   #=> "Hello from main"
1229  */
1230 
1231 VALUE
1232 rb_str_plus(VALUE str1, VALUE str2)
1233 {
1234     VALUE str3;
1235     rb_encoding *enc;
1236 
1237     StringValue(str2);
1238     enc = rb_enc_check(str1, str2);
1239     str3 = rb_str_new(0, RSTRING_LEN(str1)+RSTRING_LEN(str2));
1240     memcpy(RSTRING_PTR(str3), RSTRING_PTR(str1), RSTRING_LEN(str1));
1241     memcpy(RSTRING_PTR(str3) + RSTRING_LEN(str1),
1242            RSTRING_PTR(str2), RSTRING_LEN(str2));
1243     RSTRING_PTR(str3)[RSTRING_LEN(str3)] = '\0';
1244 
1245     if (OBJ_TAINTED(str1) || OBJ_TAINTED(str2))
1246         OBJ_TAINT(str3);
1247     ENCODING_CODERANGE_SET(str3, rb_enc_to_index(enc),
1248                            ENC_CODERANGE_AND(ENC_CODERANGE(str1), ENC_CODERANGE(str2)));
1249     return str3;
1250 }
1251 
1252 /*
1253  *  call-seq:
1254  *     str * integer   -> new_str
1255  *
1256  *  Copy --- Returns a new String containing +integer+ copies of the receiver.
1257  *  +integer+ must be greater than or equal to 0.
1258  *
1259  *     "Ho! " * 3   #=> "Ho! Ho! Ho! "
1260  *     "Ho! " * 0   #=> ""
1261  */
1262 
1263 VALUE
1264 rb_str_times(VALUE str, VALUE times)
1265 {
1266     VALUE str2;
1267     long n, len;
1268     char *ptr2;
1269 
1270     len = NUM2LONG(times);
1271     if (len < 0) {
1272         rb_raise(rb_eArgError, "negative argument");
1273     }
1274     if (len && LONG_MAX/len <  RSTRING_LEN(str)) {
1275         rb_raise(rb_eArgError, "argument too big");
1276     }
1277 
1278     str2 = rb_str_new5(str, 0, len *= RSTRING_LEN(str));
1279     ptr2 = RSTRING_PTR(str2);
1280     if (len) {
1281         n = RSTRING_LEN(str);
1282         memcpy(ptr2, RSTRING_PTR(str), n);
1283         while (n <= len/2) {
1284             memcpy(ptr2 + n, ptr2, n);
1285             n *= 2;
1286         }
1287         memcpy(ptr2 + n, ptr2, len-n);
1288     }
1289     ptr2[RSTRING_LEN(str2)] = '\0';
1290     OBJ_INFECT(str2, str);
1291     rb_enc_cr_str_copy_for_substr(str2, str);
1292 
1293     return str2;
1294 }
1295 
1296 /*
1297  *  call-seq:
1298  *     str % arg   -> new_str
1299  *
1300  *  Format---Uses <i>str</i> as a format specification, and returns the result
1301  *  of applying it to <i>arg</i>. If the format specification contains more than
1302  *  one substitution, then <i>arg</i> must be an <code>Array</code> or <code>Hash</code>
1303  *  containing the values to be substituted. See <code>Kernel::sprintf</code> for
1304  *  details of the format string.
1305  *
1306  *     "%05d" % 123                              #=> "00123"
1307  *     "%-5s: %08x" % [ "ID", self.object_id ]   #=> "ID   : 200e14d6"
1308  *     "foo = %{foo}" % { :foo => 'bar' }        #=> "foo = bar"
1309  */
1310 
1311 static VALUE
1312 rb_str_format_m(VALUE str, VALUE arg)
1313 {
1314     volatile VALUE tmp = rb_check_array_type(arg);
1315 
1316     if (!NIL_P(tmp)) {
1317         return rb_str_format(RARRAY_LENINT(tmp), RARRAY_PTR(tmp), str);
1318     }
1319     return rb_str_format(1, &arg, str);
1320 }
1321 
1322 static inline void
1323 str_modifiable(VALUE str)
1324 {
1325     if (FL_TEST(str, STR_TMPLOCK)) {
1326         rb_raise(rb_eRuntimeError, "can't modify string; temporarily locked");
1327     }
1328     rb_check_frozen(str);
1329 }
1330 
1331 static inline int
1332 str_independent(VALUE str)
1333 {
1334     str_modifiable(str);
1335     if (!STR_SHARED_P(str)) return 1;
1336     if (STR_EMBED_P(str)) return 1;
1337     return 0;
1338 }
1339 
1340 static void
1341 str_make_independent_expand(VALUE str, long expand)
1342 {
1343     char *ptr;
1344     long len = RSTRING_LEN(str);
1345     const int termlen = TERM_LEN(str);
1346     long capa = len + expand;
1347 
1348     if (len > capa) len = capa;
1349     ptr = ALLOC_N(char, capa + termlen);
1350     if (RSTRING_PTR(str)) {
1351         memcpy(ptr, RSTRING_PTR(str), len);
1352     }
1353     STR_SET_NOEMBED(str);
1354     STR_UNSET_NOCAPA(str);
1355     TERM_FILL(ptr + len, termlen);
1356     RSTRING(str)->as.heap.ptr = ptr;
1357     RSTRING(str)->as.heap.len = len;
1358     RSTRING(str)->as.heap.aux.capa = capa;
1359 }
1360 
1361 #define str_make_independent(str) str_make_independent_expand((str), 0L)
1362 
1363 void
1364 rb_str_modify(VALUE str)
1365 {
1366     if (!str_independent(str))
1367         str_make_independent(str);
1368     ENC_CODERANGE_CLEAR(str);
1369 }
1370 
1371 void
1372 rb_str_modify_expand(VALUE str, long expand)
1373 {
1374     if (expand < 0) {
1375         rb_raise(rb_eArgError, "negative expanding string size");
1376     }
1377     if (!str_independent(str)) {
1378         str_make_independent_expand(str, expand);
1379     }
1380     else if (expand > 0) {
1381         long len = RSTRING_LEN(str);
1382         long capa = len + expand;
1383         int termlen = TERM_LEN(str);
1384         if (!STR_EMBED_P(str)) {
1385             REALLOC_N(RSTRING(str)->as.heap.ptr, char, capa + termlen);
1386             RSTRING(str)->as.heap.aux.capa = capa;
1387         }
1388         else if (capa + termlen > RSTRING_EMBED_LEN_MAX + 1) {
1389             str_make_independent_expand(str, expand);
1390         }
1391     }
1392     ENC_CODERANGE_CLEAR(str);
1393 }
1394 
1395 /* As rb_str_modify(), but don't clear coderange */
1396 static void
1397 str_modify_keep_cr(VALUE str)
1398 {
1399     if (!str_independent(str))
1400         str_make_independent(str);
1401     if (ENC_CODERANGE(str) == ENC_CODERANGE_BROKEN)
1402         /* Force re-scan later */
1403         ENC_CODERANGE_CLEAR(str);
1404 }
1405 
1406 static inline void
1407 str_discard(VALUE str)
1408 {
1409     str_modifiable(str);
1410     if (!STR_SHARED_P(str) && !STR_EMBED_P(str)) {
1411         xfree(RSTRING_PTR(str));
1412         RSTRING(str)->as.heap.ptr = 0;
1413         RSTRING(str)->as.heap.len = 0;
1414     }
1415 }
1416 
/*
 * Attaches the array `add' to str as its associated object.
 *
 * If str already carries an association, `add' is concatenated onto the
 * existing array.  Otherwise str is converted to a non-shared heap string,
 * flagged STR_ASSOC, and `add' is stored in the aux.shared slot.
 * Raises (via rb_check_frozen()) when str is frozen.
 */
1417 void
1418 rb_str_associate(VALUE str, VALUE add)
1419 {
1420     /* sanity check */
1421     rb_check_frozen(str);
1422     if (STR_ASSOC_P(str)) {
1423         /* already associated */
1424         rb_ary_concat(RSTRING(str)->as.heap.aux.shared, add);
1425     }
1426     else {
1427         if (STR_SHARED_P(str)) {
             /* remember the shared root before str gets its own buffer */
1428             VALUE assoc = RSTRING(str)->as.heap.aux.shared;
1429             str_make_independent(str);
             /* the root itself was associated: extend its array and reuse it */
1430             if (STR_ASSOC_P(assoc)) {
1431                 assoc = RSTRING(assoc)->as.heap.aux.shared;
1432                 rb_ary_concat(assoc, add);
1433                 add = assoc;
1434             }
1435         }
1436         else if (STR_EMBED_P(str)) {
1437             str_make_independent(str);
1438         }
1439         else if (RSTRING(str)->as.heap.aux.capa != RSTRING_LEN(str)) {
             /* make the capacity equal to the length before the aux slot is
              * reused below to hold `add' */
1440             RESIZE_CAPA(str, RSTRING_LEN(str));
1441         }
1442         FL_SET(str, STR_ASSOC);
1443         RBASIC_CLEAR_CLASS(add);
1444         OBJ_WRITE(str, &RSTRING(str)->as.heap.aux.shared, add);
1445     }
1446 }
1447 
1448 VALUE
1449 rb_str_associated(VALUE str)
1450 {
1451     if (STR_SHARED_P(str)) str = RSTRING(str)->as.heap.aux.shared;
1452     if (STR_ASSOC_P(str)) {
1453         return RSTRING(str)->as.heap.aux.shared;
1454     }
1455     return Qfalse;
1456 }
1457 
1458 void
1459 rb_must_asciicompat(VALUE str)
1460 {
1461     rb_encoding *enc = rb_enc_get(str);
1462     if (!rb_enc_asciicompat(enc)) {
1463         rb_raise(rb_eEncCompatError, "ASCII incompatible encoding: %s", rb_enc_name(enc));
1464     }
1465 }
1466 
1467 VALUE
1468 rb_string_value(volatile VALUE *ptr)
1469 {
1470     VALUE s = *ptr;
1471     if (!RB_TYPE_P(s, T_STRING)) {
1472         s = rb_str_to_str(s);
1473         *ptr = s;
1474     }
1475     return s;
1476 }
1477 
1478 char *
1479 rb_string_value_ptr(volatile VALUE *ptr)
1480 {
1481     VALUE str = rb_string_value(ptr);
1482     return RSTRING_PTR(str);
1483 }
1484 
1485 static const char *
1486 str_null_char(const char *s, long len, const int minlen, rb_encoding *enc)
1487 {
1488     int n;
1489     const char *e = s + len;
1490 
1491     for (; s + minlen <= e; s += n) {
1492         if (!rb_enc_codepoint_len(s, e, &n, enc)) return s;
1493     }
1494     return 0;
1495 }
1496 
/*
 * Ensures that the len bytes of str (currently at s) are followed by a
 * terminator of termlen bytes, and returns the possibly relocated data
 * pointer.  enc is the encoding whose minimum character length is taken
 * as the terminator length str currently has room for.
 */
1497 static char *
1498 str_fill_term(VALUE str, char *s, long len, int termlen, rb_encoding *enc)
1499 {
1500     int oldtermlen = rb_enc_mbminlen(enc);
1501     long capa = rb_str_capacity(str) + oldtermlen;
1502     int n;
1503 
     /* Not enough room for the content plus the new terminator: grow. */
1504     if (capa < len + termlen) {
1505         rb_str_modify_expand(str, len + termlen - capa);
1506     }
1507     else {
1508         const char *e = s + len;
1509         int diff = 0;
1510         if (termlen > oldtermlen) diff = termlen - oldtermlen;
         /* Nothing to do when the terminator does not grow, the buffer is
          * private, and rb_enc_ascget() yields 0 for the bytes at e. */
1511         if (!diff && str_independent(str) &&
1512             !rb_enc_ascget(e, e + oldtermlen, &n, enc)) {
1513             return s;
1514         }
1515         str_make_independent_expand(str, diff);
1516     }
     /* The buffer may have moved; re-read it before writing the terminator. */
1517     s = RSTRING_PTR(str);
1518     TERM_FILL(s + len, termlen);
1519     return s;
1520 }
1521 
1522 char *
1523 rb_string_value_cstr(volatile VALUE *ptr)
1524 {
1525     VALUE str = rb_string_value(ptr);
1526     char *s = RSTRING_PTR(str);
1527     long len = RSTRING_LEN(str);
1528     rb_encoding *enc = rb_enc_get(str);
1529     const int minlen = rb_enc_mbminlen(enc);
1530 
1531     if (minlen > 1) {
1532         if (str_null_char(s, len, minlen, enc)) {
1533             rb_raise(rb_eArgError, "string contains null char");
1534         }
1535         return str_fill_term(str, s, len, minlen, enc);
1536     }
1537     if (!s || memchr(s, 0, len)) {
1538         rb_raise(rb_eArgError, "string contains null byte");
1539     }
1540     if (s[len]) {
1541         rb_str_modify(str);
1542         s = RSTRING_PTR(str);
1543         s[RSTRING_LEN(str)] = 0;
1544     }
1545     return s;
1546 }
1547 
1548 void
1549 rb_str_fill_terminator(VALUE str, const int newminlen)
1550 {
1551     char *s = RSTRING_PTR(str);
1552     long len = RSTRING_LEN(str);
1553     rb_encoding *enc = rb_enc_get(str);
1554     str_fill_term(str, s, len, newminlen, enc);
1555 }
1556 
1557 VALUE
1558 rb_check_string_type(VALUE str)
1559 {
1560     str = rb_check_convert_type(str, T_STRING, "String", "to_str");
1561     return str;
1562 }
1563 
1564 /*
1565  *  call-seq:
1566  *     String.try_convert(obj) -> string or nil
1567  *
1568  *  Try to convert <i>obj</i> into a String, using to_str method.
1569  *  Returns converted string or nil if <i>obj</i> cannot be converted
1570  *  for any reason.
1571  *
1572  *     String.try_convert("str")     #=> "str"
1573  *     String.try_convert(/re/)      #=> nil
1574  */
1575 static VALUE
1576 rb_str_s_try_convert(VALUE dummy, VALUE str)
1577 {
1578     return rb_check_string_type(str);
1579 }
1580 
/*
 * Advances from p by up to *nthp characters without going past e and
 * returns the resulting position.
 *
 * In the ASCII-compatible variable-width branch *nthp receives the number
 * of characters that could not be skipped.  In the two fixed-width
 * branches nth is never decremented, so *nthp is stored back unchanged.
 */
1581 static char*
1582 str_nth_len(const char *p, const char *e, long *nthp, rb_encoding *enc)
1583 {
1584     long nth = *nthp;
1585     if (rb_enc_mbmaxlen(enc) == 1) {
         /* one byte per character */
1586         p += nth;
1587     }
1588     else if (rb_enc_mbmaxlen(enc) == rb_enc_mbminlen(enc)) {
         /* fixed-width characters */
1589         p += nth * rb_enc_mbmaxlen(enc);
1590     }
1591     else if (rb_enc_asciicompat(enc)) {
1592         const char *p2, *e2;
1593         int n;
1594 
1595         while (p < e && 0 < nth) {
             /* e2: where we would land if every remaining character were
              * a single byte */
1596             e2 = p + nth;
1597             if (e < e2) {
                 /* fewer bytes than characters remain: cannot reach nth */
1598                 *nthp = nth;
1599                 return (char *)e;
1600             }
1601             if (ISASCII(*p)) {
                 /* skip a whole run of ASCII bytes at once */
1602                 p2 = search_nonascii(p, e2);
1603                 if (!p2) {
1604                     nth -= e2 - p;
1605                     *nthp = nth;
1606                     return (char *)e2;
1607                 }
1608                 nth -= p2 - p;
1609                 p = p2;
1610             }
             /* step over one multibyte character */
1611             n = rb_enc_mbclen(p, e, enc);
1612             p += n;
1613             nth--;
1614         }
1615         *nthp = nth;
1616         if (nth != 0) {
1617             return (char *)e;
1618         }
1619         return (char *)p;
1620     }
1621     else {
         /* NOTE: because of the post-decrement, nth is left at -1 when the
          * loop stops because the requested count was used up. */
1622         while (p < e && nth--) {
1623             p += rb_enc_mbclen(p, e, enc);
1624         }
1625     }
     /* clamp the fixed-width / generic results to the end of the buffer */
1626     if (p > e) p = e;
1627     *nthp = nth;
1628     return (char*)p;
1629 }
1630 
1631 char*
1632 rb_enc_nth(const char *p, const char *e, long nth, rb_encoding *enc)
1633 {
1634     return str_nth_len(p, e, &nth, enc);
1635 }
1636 
1637 static char*
1638 str_nth(const char *p, const char *e, long nth, rb_encoding *enc, int singlebyte)
1639 {
1640     if (singlebyte)
1641         p += nth;
1642     else {
1643         p = str_nth_len(p, e, &nth, enc);
1644     }
1645     if (!p) return 0;
1646     if (p > e) p = e;
1647     return (char *)p;
1648 }
1649 
1650 /* char offset to byte offset */
1651 static long
1652 str_offset(const char *p, const char *e, long nth, rb_encoding *enc, int singlebyte)
1653 {
1654     const char *pp = str_nth(p, e, nth, enc, singlebyte);
1655     if (!pp) return e - p;
1656     return pp - p;
1657 }
1658 
1659 long
1660 rb_str_offset(VALUE str, long pos)
1661 {
1662     return str_offset(RSTRING_PTR(str), RSTRING_END(str), pos,
1663                       STR_ENC_GET(str), single_byte_optimizable(str));
1664 }
1665 
1666 #ifdef NONASCII_MASK
/*
 * UTF-8 specialisation of str_nth_len(): returns the position of the
 * character *nthp characters after p (or e when the buffer ends first) and
 * stores the number of characters that could not be skipped in *nthp.
 * Characters are counted by their leading bytes only.
 */
1667 static char *
1668 str_utf8_nth(const char *p, const char *e, long *nthp)
1669 {
1670     long nth = *nthp;
     /* Word-at-a-time skipping only when both the buffer and the distance
      * are longer than two words. */
1671     if ((int)SIZEOF_VALUE * 2 < e - p && (int)SIZEOF_VALUE * 2 < nth) {
1672         const VALUE *s, *t;
1673         const VALUE lowbits = sizeof(VALUE) - 1;
         /* s: p rounded up, t: e rounded down, to a VALUE-sized boundary */
1674         s = (const VALUE*)(~lowbits & ((VALUE)p + lowbits));
1675         t = (const VALUE*)(~lowbits & (VALUE)e);
1676         while (p < (const char *)s) {
1677             if (is_utf8_lead_byte(*p)) nth--;
1678             p++;
1679         }
         /* Consume whole words while at least sizeof(VALUE) characters are
          * still to be skipped, so a word can never overshoot the target. */
1680         do {
1681             nth -= count_utf8_lead_bytes_with_word(s);
1682             s++;
1683         } while (s < t && (int)sizeof(VALUE) <= nth);
1684         p = (char *)s;
1685     }
     /* Finish byte by byte; stop on the leading byte of the target. */
1686     while (p < e) {
1687         if (is_utf8_lead_byte(*p)) {
1688             if (nth == 0) break;
1689             nth--;
1690         }
1691         p++;
1692     }
1693     *nthp = nth;
1694     return (char *)p;
1695 }
1696 
/* Converts a character offset (nth) into a byte offset from p for UTF-8
 * data, using str_utf8_nth(). */
static long
str_utf8_offset(const char *p, const char *e, long nth)
{
    long remaining = nth;
    const char *target = str_utf8_nth(p, e, &remaining);

    return target - p;
}
1703 #endif
1704 
1705 /* byte offset to char offset */
1706 long
1707 rb_str_sublen(VALUE str, long pos)
1708 {
1709     if (single_byte_optimizable(str) || pos < 0)
1710         return pos;
1711     else {
1712         char *p = RSTRING_PTR(str);
1713         return enc_strlen(p, p + pos, STR_ENC_GET(str), ENC_CODERANGE(str));
1714     }
1715 }
1716 
1717 VALUE
1718 rb_str_subseq(VALUE str, long beg, long len)
1719 {
1720     VALUE str2;
1721 
1722     if (RSTRING_LEN(str) == beg + len &&
1723         RSTRING_EMBED_LEN_MAX < len) {
1724         str2 = rb_str_new_shared(rb_str_new_frozen(str));
1725         rb_str_drop_bytes(str2, beg);
1726     }
1727     else {
1728         str2 = rb_str_new5(str, RSTRING_PTR(str)+beg, len);
1729         RB_GC_GUARD(str);
1730     }
1731 
1732     rb_enc_cr_str_copy_for_substr(str2, str);
1733     OBJ_INFECT(str2, str);
1734 
1735     return str2;
1736 }
1737 
/*
 * Resolves a character-based range of str into a byte range.
 *
 * beg is a character index; a negative beg counts back from the end.
 * On entry *lenp is the requested length in characters.  On success the
 * function returns a pointer to the first byte of the range and overwrites
 * *lenp with the length of the range in bytes.  It returns 0 when *lenp is
 * negative or when beg lies outside the string.
 */
1738 char *
1739 rb_str_subpos(VALUE str, long beg, long *lenp)
1740 {
1741     long len = *lenp;
1742     long slen = -1L;
1743     long blen = RSTRING_LEN(str);
1744     rb_encoding *enc = STR_ENC_GET(str);
1745     char *p, *s = RSTRING_PTR(str), *e = s + blen;
1746 
1747     if (len < 0) return 0;
1748     if (!blen) {
1749         len = 0;
1750     }
     /* One byte per character: plain index arithmetic. */
1751     if (single_byte_optimizable(str)) {
1752         if (beg > blen) return 0;
1753         if (beg < 0) {
1754             beg += blen;
1755             if (beg < 0) return 0;
1756         }
1757         if (beg + len > blen)
1758             len = blen - beg;
1759         if (len < 0) return 0;
1760         p = s + beg;
1761         goto end;
1762     }
1763     if (beg < 0) {
         /* the range cannot extend past the end of the string */
1764         if (len > -beg) len = -beg;
         /* Close to the end of a long string: walk backwards from e
          * instead of counting every character from the front. */
1765         if (-beg * rb_enc_mbmaxlen(enc) < RSTRING_LEN(str) / 8) {
1766             beg = -beg;
             /* move e back to the end of the requested range ... */
1767             while (beg-- > len && (e = rb_enc_prev_char(s, e, e, enc)) != 0);
1768             p = e;
1769             if (!p) return 0;
             /* ... then move p back len characters to its start */
1770             while (len-- > 0 && (p = rb_enc_prev_char(s, p, e, enc)) != 0);
1771             if (!p) return 0;
1772             len = e - p;
1773             goto end;
1774         }
1775         else {
             /* turn the negative index into a non-negative character index */
1776             slen = str_strlen(str, enc);
1777             beg += slen;
1778             if (beg < 0) return 0;
             /* NOTE(review): beg is a character index but is added as a byte
              * offset here; p is only used as is on the len == 0 exit -- confirm */
1779             p = s + beg;
1780             if (len == 0) goto end;
1781         }
1782     }
     /* more characters requested than the string has bytes */
1783     else if (beg > 0 && beg > RSTRING_LEN(str)) {
1784         return 0;
1785     }
1786     if (len == 0) {
1787         if (beg > str_strlen(str, enc)) return 0;
1788         p = s + beg;
1789     }
1790 #ifdef NONASCII_MASK
     /* UTF-8 known to be valid: use the lead-byte counting helpers. */
1791     else if (ENC_CODERANGE(str) == ENC_CODERANGE_VALID &&
1792         enc == rb_utf8_encoding()) {
1793         p = str_utf8_nth(s, e, &beg);
         /* characters left over: beg is beyond the end */
1794         if (beg > 0) return 0;
1795         len = str_utf8_offset(p, e, len);
1796     }
1797 #endif
     /* fixed-width encoding: scale indexes by the character size */
1798     else if (rb_enc_mbmaxlen(enc) == rb_enc_mbminlen(enc)) {
1799         int char_sz = rb_enc_mbmaxlen(enc);
1800 
1801         p = s + beg * char_sz;
1802         if (p > e) {
1803             return 0;
1804         }
1805         else if (len * char_sz > e - p)
1806             len = e - p;
1807         else
1808             len *= char_sz;
1809     }
     /* generic variable-width encoding */
1810     else if ((p = str_nth_len(s, e, &beg, enc)) == e) {
1811         if (beg > 0) return 0;
1812         len = 0;
1813     }
1814     else {
1815         len = str_offset(p, e, len, enc, 0);
1816     }
1817   end:
1818     *lenp = len;
1819     RB_GC_GUARD(str);
1820     return p;
1821 }
1822 
1823 VALUE
1824 rb_str_substr(VALUE str, long beg, long len)
1825 {
1826     VALUE str2;
1827     char *p = rb_str_subpos(str, beg, &len);
1828 
1829     if (!p) return Qnil;
1830     if (len > RSTRING_EMBED_LEN_MAX && p + len == RSTRING_END(str)) {
1831         str2 = rb_str_new4(str);
1832         str2 = str_new3(rb_obj_class(str2), str2);
1833         RSTRING(str2)->as.heap.ptr += RSTRING(str2)->as.heap.len - len;
1834         RSTRING(str2)->as.heap.len = len;
1835     }
1836     else {
1837         str2 = rb_str_new5(str, p, len);
1838         rb_enc_cr_str_copy_for_substr(str2, str);
1839         OBJ_INFECT(str2, str);
1840         RB_GC_GUARD(str);
1841     }
1842 
1843     return str2;
1844 }
1845 
1846 VALUE
1847 rb_str_freeze(VALUE str)
1848 {
1849     if (STR_ASSOC_P(str)) {
1850         VALUE ary = RSTRING(str)->as.heap.aux.shared;
1851         OBJ_FREEZE(ary);
1852     }
1853     return rb_obj_freeze(str);
1854 }
1855 
/*
 * NOTE(review): RUBY_ALIAS_FUNCTION is defined outside this part of the
 * file; presumably it emits an exported rb_str_dup_frozen() that forwards
 * to rb_str_new_frozen() -- confirm against its definition.  The #define
 * below makes every later use of rb_str_dup_frozen in this file name
 * rb_str_new_frozen directly.
 */
1856 RUBY_ALIAS_FUNCTION(rb_str_dup_frozen(VALUE str), rb_str_new_frozen, (str))
1857 #define rb_str_dup_frozen rb_str_new_frozen
1858 
1859 VALUE
1860 rb_str_locktmp(VALUE str)
1861 {
1862     if (FL_TEST(str, STR_TMPLOCK)) {
1863         rb_raise(rb_eRuntimeError, "temporal locking already locked string");
1864     }
1865     FL_SET(str, STR_TMPLOCK);
1866     return str;
1867 }
1868 
1869 VALUE
1870 rb_str_unlocktmp(VALUE str)
1871 {
1872     if (!FL_TEST(str, STR_TMPLOCK)) {
1873         rb_raise(rb_eRuntimeError, "temporal unlocking already unlocked string");
1874     }
1875     FL_UNSET(str, STR_TMPLOCK);
1876     return str;
1877 }
1878 
1879 void
1880 rb_str_set_len(VALUE str, long len)
1881 {
1882     long capa;
1883     const int termlen = TERM_LEN(str);
1884 
1885     str_modifiable(str);
1886     if (STR_SHARED_P(str)) {
1887         rb_raise(rb_eRuntimeError, "can't set length of shared string");
1888     }
1889     if (len + termlen - 1 > (capa = (long)rb_str_capacity(str))) {
1890         rb_bug("probable buffer overflow: %ld for %ld", len, capa);
1891     }
1892     STR_SET_LEN(str, len);
1893     TERM_FILL(&RSTRING_PTR(str)[len], termlen);
1894 }
1895 
/*
 * Changes the byte length of str to len and returns str.
 *
 * The string is switched between embedded and heap storage as the new
 * length requires, and the terminator is rewritten after the new end.
 * The cached code range is always cleared.  Raises ArgumentError when len
 * is negative; raises (via str_independent()) when str is frozen or
 * temporarily locked.
 */
1896 VALUE
1897 rb_str_resize(VALUE str, long len)
1898 {
1899     long slen;
1900     int independent;
1901 
1902     if (len < 0) {
1903         rb_raise(rb_eArgError, "negative string size (or size too big)");
1904     }
1905 
1906     independent = str_independent(str);
1907     ENC_CODERANGE_CLEAR(str);
1908     slen = RSTRING_LEN(str);
1909     if (len != slen) {
1910         const int termlen = TERM_LEN(str);
1911         if (STR_EMBED_P(str)) {
             /* embedded and still fits: just adjust the length */
1912             if (len + termlen <= RSTRING_EMBED_LEN_MAX + 1) {
1913                 STR_SET_EMBED_LEN(str, len);
1914                 TERM_FILL(RSTRING(str)->as.ary + len, termlen);
1915                 return str;
1916             }
             /* embedded but too long now: move to a heap buffer */
1917             str_make_independent_expand(str, len - slen);
1918             STR_SET_NOEMBED(str);
1919         }
1920         else if (len + termlen <= RSTRING_EMBED_LEN_MAX + 1) {
             /* heap string that now fits in the embedded area */
1921             char *ptr = RSTRING(str)->as.heap.ptr;
1922             STR_SET_EMBED(str);
1923             if (slen > len) slen = len;
1924             if (slen > 0) MEMCPY(RSTRING(str)->as.ary, ptr, char, slen);
1925             TERM_FILL(RSTRING(str)->as.ary + len, termlen);
1926             STR_SET_EMBED_LEN(str, len);
             /* the old buffer is released only when str owned it */
1927             if (independent) xfree(ptr);
1928             return str;
1929         }
1930         else if (!independent) {
1931             str_make_independent_expand(str, len - slen);
1932         }
         /* reallocate when growing, or when shrinking by more than 1024 bytes */
1933         else if (slen < len || slen - len > 1024) {
1934             REALLOC_N(RSTRING(str)->as.heap.ptr, char, len + termlen);
1935         }
1936         if (!STR_NOCAPA_P(str)) {
1937             RSTRING(str)->as.heap.aux.capa = len;
1938         }
1939         RSTRING(str)->as.heap.len = len;
1940         TERM_FILL(RSTRING(str)->as.heap.ptr + len, termlen); /* sentinel */
1941     }
1942     return str;
1943 }
1944 
1945 static VALUE
1946 str_buf_cat(VALUE str, const char *ptr, long len)
1947 {
1948     long capa, total, off = -1;
1949     const int termlen = TERM_LEN(str);
1950 
1951     if (ptr >= RSTRING_PTR(str) && ptr <= RSTRING_END(str)) {
1952         off = ptr - RSTRING_PTR(str);
1953     }
1954     rb_str_modify(str);
1955     if (len == 0) return 0;
1956     if (STR_ASSOC_P(str)) {
1957         FL_UNSET(str, STR_ASSOC);
1958         capa = RSTRING(str)->as.heap.aux.capa = RSTRING_LEN(str);
1959     }
1960     else if (STR_EMBED_P(str)) {
1961         capa = RSTRING_EMBED_LEN_MAX;
1962     }
1963     else {
1964         capa = RSTRING(str)->as.heap.aux.capa;
1965     }
1966     if (RSTRING_LEN(str) >= LONG_MAX - len) {
1967         rb_raise(rb_eArgError, "string sizes too big");
1968     }
1969     total = RSTRING_LEN(str)+len;
1970     if (capa <= total) {
1971         while (total > capa) {
1972             if (capa + termlen >= LONG_MAX / 2) {
1973                 capa = (total + 4095) / 4096;
1974                 break;
1975             }
1976             capa = (capa + termlen) * 2;
1977         }
1978         RESIZE_CAPA(str, capa);
1979     }
1980     if (off != -1) {
1981         ptr = RSTRING_PTR(str) + off;
1982     }
1983     memcpy(RSTRING_PTR(str) + RSTRING_LEN(str), ptr, len);
1984     STR_SET_LEN(str, total);
1985     RSTRING_PTR(str)[total] = '\0'; /* sentinel */
1986 
1987     return str;
1988 }
1989 
1990 #define str_buf_cat2(str, ptr) str_buf_cat((str), (ptr), strlen(ptr))
1991 
1992 VALUE
1993 rb_str_buf_cat(VALUE str, const char *ptr, long len)
1994 {
1995     if (len == 0) return str;
1996     if (len < 0) {
1997         rb_raise(rb_eArgError, "negative string size (or size too big)");
1998     }
1999     return str_buf_cat(str, ptr, len);
2000 }
2001 
2002 VALUE
2003 rb_str_buf_cat2(VALUE str, const char *ptr)
2004 {
2005     return rb_str_buf_cat(str, ptr, strlen(ptr));
2006 }
2007 
2008 VALUE
2009 rb_str_cat(VALUE str, const char *ptr, long len)
2010 {
2011     if (len < 0) {
2012         rb_raise(rb_eArgError, "negative string size (or size too big)");
2013     }
2014     if (STR_ASSOC_P(str)) {
2015         char *p;
2016         rb_str_modify_expand(str, len);
2017         p = RSTRING(str)->as.heap.ptr;
2018         memcpy(p + RSTRING(str)->as.heap.len, ptr, len);
2019         len = RSTRING(str)->as.heap.len += len;
2020         TERM_FILL(p, TERM_LEN(str)); /* sentinel */
2021         return str;
2022     }
2023 
2024     return rb_str_buf_cat(str, ptr, len);
2025 }
2026 
2027 VALUE
2028 rb_str_cat2(VALUE str, const char *ptr)
2029 {
2030     return rb_str_cat(str, ptr, strlen(ptr));
2031 }
2032 
/*
 * Appends the len bytes at ptr, whose encoding index is ptr_encindex and
 * whose code range is ptr_cr, to str, then updates the encoding index and
 * code range of str to describe the combined content.
 *
 * When ptr_cr_ret is not NULL and the function gets past the encoding
 * checks, it receives the code range finally assumed for ptr (scanned here
 * if it was passed in as UNKNOWN).
 *
 * Raises Encoding::CompatibilityError when the two encodings cannot be
 * combined and ArgumentError when len is negative.
 */
2033 static VALUE
2034 rb_enc_cr_str_buf_cat(VALUE str, const char *ptr, long len,
2035     int ptr_encindex, int ptr_cr, int *ptr_cr_ret)
2036 {
2037     int str_encindex = ENCODING_GET(str);
2038     int res_encindex;
2039     int str_cr, res_cr;
2040 
     /* an empty receiver is treated as 7bit */
2041     str_cr = RSTRING_LEN(str) ? ENC_CODERANGE(str) : ENC_CODERANGE_7BIT;
2042 
2043     if (str_encindex == ptr_encindex) {
         /* Same encoding: scan ptr only when the receiver's code range is
          * known, since otherwise the result is UNKNOWN anyway. */
2044         if (str_cr == ENC_CODERANGE_UNKNOWN)
2045             ptr_cr = ENC_CODERANGE_UNKNOWN;
2046         else if (ptr_cr == ENC_CODERANGE_UNKNOWN) {
2047             ptr_cr = coderange_scan(ptr, len, rb_enc_from_index(ptr_encindex));
2048         }
2049     }
2050     else {
2051         rb_encoding *str_enc = rb_enc_from_index(str_encindex);
2052         rb_encoding *ptr_enc = rb_enc_from_index(ptr_encindex);
         /* Different encodings, at least one not ASCII compatible: only an
          * empty side can be accepted. */
2053         if (!rb_enc_asciicompat(str_enc) || !rb_enc_asciicompat(ptr_enc)) {
2054             if (len == 0)
2055                 return str;
2056             if (RSTRING_LEN(str) == 0) {
                 /* empty receiver simply takes over ptr's encoding */
2057                 rb_str_buf_cat(str, ptr, len);
2058                 ENCODING_CODERANGE_SET(str, ptr_encindex, ptr_cr);
2059                 return str;
2060             }
2061             goto incompatible;
2062         }
2063         if (ptr_cr == ENC_CODERANGE_UNKNOWN) {
2064             ptr_cr = coderange_scan(ptr, len, ptr_enc);
2065         }
2066         if (str_cr == ENC_CODERANGE_UNKNOWN) {
2067             if (ENCODING_IS_ASCII8BIT(str) || ptr_cr != ENC_CODERANGE_7BIT) {
2068                 str_cr = rb_enc_str_coderange(str);
2069             }
2070         }
2071     }
2072     if (ptr_cr_ret)
2073         *ptr_cr_ret = ptr_cr;
2074 
     /* Different encodings can be mixed only if at least one side is 7bit. */
2075     if (str_encindex != ptr_encindex &&
2076         str_cr != ENC_CODERANGE_7BIT &&
2077         ptr_cr != ENC_CODERANGE_7BIT) {
2078       incompatible:
2079         rb_raise(rb_eEncCompatError, "incompatible character encodings: %s and %s",
2080             rb_enc_name(rb_enc_from_index(str_encindex)),
2081             rb_enc_name(rb_enc_from_index(ptr_encindex)));
2082     }
2083 
     /* Work out the encoding index and code range of the result. */
2084     if (str_cr == ENC_CODERANGE_UNKNOWN) {
2085         res_encindex = str_encindex;
2086         res_cr = ENC_CODERANGE_UNKNOWN;
2087     }
2088     else if (str_cr == ENC_CODERANGE_7BIT) {
2089         if (ptr_cr == ENC_CODERANGE_7BIT) {
2090             res_encindex = str_encindex;
2091             res_cr = ENC_CODERANGE_7BIT;
2092         }
2093         else {
             /* a 7bit receiver adopts the encoding of non-7bit appended data */
2094             res_encindex = ptr_encindex;
2095             res_cr = ptr_cr;
2096         }
2097     }
2098     else if (str_cr == ENC_CODERANGE_VALID) {
2099         res_encindex = str_encindex;
2100         if (ptr_cr == ENC_CODERANGE_7BIT || ptr_cr == ENC_CODERANGE_VALID)
2101             res_cr = str_cr;
2102         else
2103             res_cr = ptr_cr;
2104     }
2105     else { /* str_cr == ENC_CODERANGE_BROKEN */
2106         res_encindex = str_encindex;
2107         res_cr = str_cr;
         /* appended bytes may complete a broken sequence: re-scan later */
2108         if (0 < len) res_cr = ENC_CODERANGE_UNKNOWN;
2109     }
2110 
     /* NOTE(review): len is validated only here, after it may already have
      * been passed to coderange_scan() above. */
2111     if (len < 0) {
2112         rb_raise(rb_eArgError, "negative string size (or size too big)");
2113     }
2114     str_buf_cat(str, ptr, len);
2115     ENCODING_CODERANGE_SET(str, res_encindex, res_cr);
2116     return str;
2117 }
2118 
2119 VALUE
2120 rb_enc_str_buf_cat(VALUE str, const char *ptr, long len, rb_encoding *ptr_enc)
2121 {
2122     return rb_enc_cr_str_buf_cat(str, ptr, len,
2123         rb_enc_to_index(ptr_enc), ENC_CODERANGE_UNKNOWN, NULL);
2124 }
2125 
2126 VALUE
2127 rb_str_buf_cat_ascii(VALUE str, const char *ptr)
2128 {
2129     /* ptr must reference NUL terminated ASCII string. */
2130     int encindex = ENCODING_GET(str);
2131     rb_encoding *enc = rb_enc_from_index(encindex);
2132     if (rb_enc_asciicompat(enc)) {
2133         return rb_enc_cr_str_buf_cat(str, ptr, strlen(ptr),
2134             encindex, ENC_CODERANGE_7BIT, 0);
2135     }
2136     else {
2137         char *buf = ALLOCA_N(char, rb_enc_mbmaxlen(enc));
2138         while (*ptr) {
2139             unsigned int c = (unsigned char)*ptr;
2140             int len = rb_enc_codelen(c, enc);
2141             rb_enc_mbcput(c, buf, enc);
2142             rb_enc_cr_str_buf_cat(str, buf, len,
2143                 encindex, ENC_CODERANGE_VALID, 0);
2144             ptr++;
2145         }
2146         return str;
2147     }
2148 }
2149 
2150 VALUE
2151 rb_str_buf_append(VALUE str, VALUE str2)
2152 {
2153     int str2_cr;
2154 
2155     str2_cr = ENC_CODERANGE(str2);
2156 
2157     rb_enc_cr_str_buf_cat(str, RSTRING_PTR(str2), RSTRING_LEN(str2),
2158         ENCODING_GET(str2), str2_cr, &str2_cr);
2159 
2160     OBJ_INFECT(str, str2);
2161     ENC_CODERANGE_SET(str2, str2_cr);
2162 
2163     return str;
2164 }
2165 
2166 VALUE
2167 rb_str_append(VALUE str, VALUE str2)
2168 {
2169     rb_encoding *enc;
2170     int cr, cr2;
2171     long len2;
2172 
2173     StringValue(str2);
2174     if ((len2 = RSTRING_LEN(str2)) > 0 && STR_ASSOC_P(str)) {
2175         long len1 = RSTRING(str)->as.heap.len, len = len1 + len2;
2176         enc = rb_enc_check(str, str2);
2177         cr = ENC_CODERANGE(str);
2178         if ((cr2 = ENC_CODERANGE(str2)) > cr || RSTRING_LEN(str) == 0)
2179             cr = cr2;
2180         rb_str_modify_expand(str, len2);
2181         memcpy(RSTRING(str)->as.heap.ptr + len1, RSTRING_PTR(str2), len2);
2182         TERM_FILL(RSTRING(str)->as.heap.ptr + len, rb_enc_mbminlen(enc));
2183         RSTRING(str)->as.heap.len = len;
2184         rb_enc_associate(str, enc);
2185         ENC_CODERANGE_SET(str, cr);
2186         OBJ_INFECT(str, str2);
2187         return str;
2188     }
2189     return rb_str_buf_append(str, str2);
2190 }
2191 
2192 /*
2193  *  call-seq:
2194  *     str << integer       -> str
2195  *     str.concat(integer)  -> str
2196  *     str << obj           -> str
2197  *     str.concat(obj)      -> str
2198  *
2199  *  Append---Concatenates the given object to <i>str</i>. If the object is a
2200  *  <code>Integer</code>, it is considered as a codepoint, and is converted
2201  *  to a character before concatenation.
2202  *
2203  *     a = "hello "
2204  *     a << "world"   #=> "hello world"
2205  *     a.concat(33)   #=> "hello world!"
2206  */
2207 
2208 VALUE
2209 rb_str_concat(VALUE str1, VALUE str2)
2210 {
2211     unsigned int code;
2212     rb_encoding *enc = STR_ENC_GET(str1);
2213 
2214     if (FIXNUM_P(str2) || RB_TYPE_P(str2, T_BIGNUM)) {
2215         if (rb_num_to_uint(str2, &code) == 0) {
2216         }
2217         else if (FIXNUM_P(str2)) {
2218             rb_raise(rb_eRangeError, "%ld out of char range", FIX2LONG(str2));
2219         }
2220         else {
2221             rb_raise(rb_eRangeError, "bignum out of char range");
2222         }
2223     }
2224     else {
2225         return rb_str_append(str1, str2);
2226     }
2227 
2228     if (enc == rb_usascii_encoding()) {
2229         /* US-ASCII automatically extended to ASCII-8BIT */
2230         char buf[1];
2231         buf[0] = (char)code;
2232         if (code > 0xFF) {
2233             rb_raise(rb_eRangeError, "%u out of char range", code);
2234         }
2235         rb_str_cat(str1, buf, 1);
2236         if (code > 127) {
2237             rb_enc_associate(str1, rb_ascii8bit_encoding());
2238             ENC_CODERANGE_SET(str1, ENC_CODERANGE_VALID);
2239         }
2240     }
2241     else {
2242         long pos = RSTRING_LEN(str1);
2243         int cr = ENC_CODERANGE(str1);
2244         int len;
2245         char *buf;
2246 
2247         switch (len = rb_enc_codelen(code, enc)) {
2248           case ONIGERR_INVALID_CODE_POINT_VALUE:
2249             rb_raise(rb_eRangeError, "invalid codepoint 0x%X in %s", code, rb_enc_name(enc));
2250             break;
2251           case ONIGERR_TOO_BIG_WIDE_CHAR_VALUE:
2252           case 0:
2253             rb_raise(rb_eRangeError, "%u out of char range", code);
2254             break;
2255         }
2256         buf = ALLOCA_N(char, len + 1);
2257         rb_enc_mbcput(code, buf, enc);
2258         if (rb_enc_precise_mbclen(buf, buf + len + 1, enc) != len) {
2259             rb_raise(rb_eRangeError, "invalid codepoint 0x%X in %s", code, rb_enc_name(enc));
2260         }
2261         rb_str_resize(str1, pos+len);
2262         memcpy(RSTRING_PTR(str1) + pos, buf, len);
2263         if (cr == ENC_CODERANGE_7BIT && code > 127)
2264             cr = ENC_CODERANGE_VALID;
2265         ENC_CODERANGE_SET(str1, cr);
2266     }
2267     return str1;
2268 }
2269 
2270 /*
2271  *  call-seq:
2272  *     str.prepend(other_str)  -> str
2273  *
2274  *  Prepend---Prepend the given string to <i>str</i>.
2275  *
2276  *     a = "world"
2277  *     a.prepend("hello ") #=> "hello world"
2278  *     a                   #=> "hello world"
2279  */
2280 
2281 static VALUE
2282 rb_str_prepend(VALUE str, VALUE str2)
2283 {
2284     StringValue(str2);
2285     StringValue(str);
2286     rb_str_update(str, 0L, 0L, str2);
2287     return str;
2288 }
2289 
2290 st_index_t
2291 rb_str_hash(VALUE str)
2292 {
2293     int e = ENCODING_GET(str);
2294     if (e && rb_enc_str_coderange(str) == ENC_CODERANGE_7BIT) {
2295         e = 0;
2296     }
2297     return rb_memhash((const void *)RSTRING_PTR(str), RSTRING_LEN(str)) ^ e;
2298 }
2299 
2300 int
2301 rb_str_hash_cmp(VALUE str1, VALUE str2)
2302 {
2303     long len;
2304 
2305     if (!rb_str_comparable(str1, str2)) return 1;
2306     if (RSTRING_LEN(str1) == (len = RSTRING_LEN(str2)) &&
2307         memcmp(RSTRING_PTR(str1), RSTRING_PTR(str2), len) == 0) {
2308         return 0;
2309     }
2310     return 1;
2311 }
2312 
2313 /*
2314  * call-seq:
2315  *    str.hash   -> fixnum
2316  *
2317  * Return a hash based on the string's length and content.
2318  */
2319 
2320 static VALUE
2321 rb_str_hash_m(VALUE str)
2322 {
2323     st_index_t hval = rb_str_hash(str);
2324     return INT2FIX(hval);
2325 }
2326 
/* Smaller of the two arguments (yields a when they compare equal).
 * Both arguments may be evaluated more than once. */
#define lesser(a,b) (((b)<(a))?(b):(a))
2328 
2329 int
2330 rb_str_comparable(VALUE str1, VALUE str2)
2331 {
2332     int idx1, idx2;
2333     int rc1, rc2;
2334 
2335     if (RSTRING_LEN(str1) == 0) return TRUE;
2336     if (RSTRING_LEN(str2) == 0) return TRUE;
2337     idx1 = ENCODING_GET(str1);
2338     idx2 = ENCODING_GET(str2);
2339     if (idx1 == idx2) return TRUE;
2340     rc1 = rb_enc_str_coderange(str1);
2341     rc2 = rb_enc_str_coderange(str2);
2342     if (rc1 == ENC_CODERANGE_7BIT) {
2343         if (rc2 == ENC_CODERANGE_7BIT) return TRUE;
2344         if (rb_enc_asciicompat(rb_enc_from_index(idx2)))
2345             return TRUE;
2346     }
2347     if (rc2 == ENC_CODERANGE_7BIT) {
2348         if (rb_enc_asciicompat(rb_enc_from_index(idx1)))
2349             return TRUE;
2350     }
2351     return FALSE;
2352 }
2353 
2354 int
2355 rb_str_cmp(VALUE str1, VALUE str2)
2356 {
2357     long len1, len2;
2358     const char *ptr1, *ptr2;
2359     int retval;
2360 
2361     if (str1 == str2) return 0;
2362     RSTRING_GETMEM(str1, ptr1, len1);
2363     RSTRING_GETMEM(str2, ptr2, len2);
2364     if (ptr1 == ptr2 || (retval = memcmp(ptr1, ptr2, lesser(len1, len2))) == 0) {
2365         if (len1 == len2) {
2366             if (!rb_str_comparable(str1, str2)) {
2367                 if (ENCODING_GET(str1) > ENCODING_GET(str2))
2368                     return 1;
2369                 return -1;
2370             }
2371             return 0;
2372         }
2373         if (len1 > len2) return 1;
2374         return -1;
2375     }
2376     if (retval > 0) return 1;
2377     return -1;
2378 }
2379 
2380 /* expect tail call optimization */
2381 static VALUE
2382 str_eql(const VALUE str1, const VALUE str2)
2383 {
2384     const long len = RSTRING_LEN(str1);
2385     const char *ptr1, *ptr2;
2386 
2387     if (len != RSTRING_LEN(str2)) return Qfalse;
2388     if (!rb_str_comparable(str1, str2)) return Qfalse;
2389     if ((ptr1 = RSTRING_PTR(str1)) == (ptr2 = RSTRING_PTR(str2)))
2390         return Qtrue;
2391     if (memcmp(ptr1, ptr2, len) == 0)
2392         return Qtrue;
2393     return Qfalse;
2394 }
2395 
2396 /*
2397  *  call-seq:
2398  *     str == obj    -> true or false
2399  *     str === obj   -> true or false
2400  *
2401  *  Equality---If <i>obj</i> is not a <code>String</code>, returns
2402  *  <code>false</code>. Otherwise, returns <code>true</code> if <i>str</i>
2403  *  <code><=></code> <i>obj</i> returns zero.
2404  */
2405 
2406 VALUE
2407 rb_str_equal(VALUE str1, VALUE str2)
2408 {
2409     if (str1 == str2) return Qtrue;
2410     if (!RB_TYPE_P(str2, T_STRING)) {
2411         if (!rb_respond_to(str2, rb_intern("to_str"))) {
2412             return Qfalse;
2413         }
2414         return rb_equal(str2, str1);
2415     }
2416     return str_eql(str1, str2);
2417 }
2418 
2419 /*
2420  * call-seq:
2421  *   str.eql?(other)   -> true or false
2422  *
2423  * Two strings are equal if they have the same length and content.
2424  */
2425 
2426 static VALUE
2427 rb_str_eql(VALUE str1, VALUE str2)
2428 {
2429     if (str1 == str2) return Qtrue;
2430     if (!RB_TYPE_P(str2, T_STRING)) return Qfalse;
2431     return str_eql(str1, str2);
2432 }
2433 
2434 /*
2435  *  call-seq:
2436  *     string <=> other_string   -> -1, 0, +1 or nil
2437  *
2438  *
2439  *  Comparison---Returns -1, 0, +1 or nil depending on whether +string+ is less
2440  *  than, equal to, or greater than +other_string+.
2441  *
2442  *  +nil+ is returned if the two values are incomparable.
2443  *
2444  *  If the strings are of different lengths, and the strings are equal when
2445  *  compared up to the shortest length, then the longer string is considered
2446  *  greater than the shorter one.
2447  *
2448  *  <code><=></code> is the basis for the methods <code><</code>,
2449  *  <code><=</code>, <code>></code>, <code>>=</code>, and
2450  *  <code>between?</code>, included from module Comparable. The method
2451  *  String#== does not use Comparable#==.
2452  *
2453  *     "abcdef" <=> "abcde"     #=> 1
2454  *     "abcdef" <=> "abcdef"    #=> 0
2455  *     "abcdef" <=> "abcdefg"   #=> -1
2456  *     "abcdef" <=> "ABCDEF"    #=> 1
2457  */
2458 
2459 static VALUE
2460 rb_str_cmp_m(VALUE str1, VALUE str2)
2461 {
2462     int result;
2463 
2464     if (!RB_TYPE_P(str2, T_STRING)) {
2465         VALUE tmp = rb_check_funcall(str2, rb_intern("to_str"), 0, 0);
2466         if (RB_TYPE_P(tmp, T_STRING)) {
2467             result = rb_str_cmp(str1, tmp);
2468         }
2469         else {
2470             return rb_invcmp(str1, str2);
2471         }
2472     }
2473     else {
2474         result = rb_str_cmp(str1, str2);
2475     }
2476     return INT2FIX(result);
2477 }
2478 
2479 /*
2480  *  call-seq:
2481  *     str.casecmp(other_str)   -> -1, 0, +1 or nil
2482  *
2483  *  Case-insensitive version of <code>String#<=></code>.
2484  *
2485  *     "abcdef".casecmp("abcde")     #=> 1
2486  *     "aBcDeF".casecmp("abcdef")    #=> 0
2487  *     "abcdef".casecmp("abcdefg")   #=> -1
2488  *     "abcdef".casecmp("ABCDEF")    #=> 0
2489  */
2490 
2491 static VALUE
2492 rb_str_casecmp(VALUE str1, VALUE str2)
2493 {
2494     long len;
2495     rb_encoding *enc;
2496     char *p1, *p1end, *p2, *p2end;
2497 
2498     StringValue(str2);
2499     enc = rb_enc_compatible(str1, str2);
2500     if (!enc) {
2501         return Qnil;
2502     }
2503 
2504     p1 = RSTRING_PTR(str1); p1end = RSTRING_END(str1);
2505     p2 = RSTRING_PTR(str2); p2end = RSTRING_END(str2);
2506     if (single_byte_optimizable(str1) && single_byte_optimizable(str2)) {
2507         while (p1 < p1end && p2 < p2end) {
2508             if (*p1 != *p2) {
2509                 unsigned int c1 = TOUPPER(*p1 & 0xff);
2510                 unsigned int c2 = TOUPPER(*p2 & 0xff);
2511                 if (c1 != c2)
2512                     return INT2FIX(c1 < c2 ? -1 : 1);
2513             }
2514             p1++;
2515             p2++;
2516         }
2517     }
2518     else {
2519         while (p1 < p1end && p2 < p2end) {
2520             int l1, c1 = rb_enc_ascget(p1, p1end, &l1, enc);
2521             int l2, c2 = rb_enc_ascget(p2, p2end, &l2, enc);
2522 
2523             if (0 <= c1 && 0 <= c2) {
2524                 c1 = TOUPPER(c1);
2525                 c2 = TOUPPER(c2);
2526                 if (c1 != c2)
2527                     return INT2FIX(c1 < c2 ? -1 : 1);
2528             }
2529             else {
2530                 int r;
2531                 l1 = rb_enc_mbclen(p1, p1end, enc);
2532                 l2 = rb_enc_mbclen(p2, p2end, enc);
2533                 len = l1 < l2 ? l1 : l2;
2534                 r = memcmp(p1, p2, len);
2535                 if (r != 0)
2536                     return INT2FIX(r < 0 ? -1 : 1);
2537                 if (l1 != l2)
2538                     return INT2FIX(l1 < l2 ? -1 : 1);
2539             }
2540             p1 += l1;
2541             p2 += l2;
2542         }
2543     }
2544     if (RSTRING_LEN(str1) == RSTRING_LEN(str2)) return INT2FIX(0);
2545     if (RSTRING_LEN(str1) > RSTRING_LEN(str2)) return INT2FIX(1);
2546     return INT2FIX(-1);
2547 }
2548 
2549 static long
2550 rb_str_index(VALUE str, VALUE sub, long offset)
2551 {
2552     char *s, *sptr, *e;
2553     long pos, len, slen;
2554     int single_byte = single_byte_optimizable(str);
2555     rb_encoding *enc;
2556 
2557     enc = rb_enc_check(str, sub);
2558     if (is_broken_string(sub)) return -1;
2559 
2560     len = single_byte ? RSTRING_LEN(str) : str_strlen(str, enc);
2561     slen = str_strlen(sub, enc);
2562     if (offset < 0) {
2563         offset += len;
2564         if (offset < 0) return -1;
2565     }
2566     if (len - offset < slen) return -1;
2567 
2568     s = RSTRING_PTR(str);
2569     e = RSTRING_END(str);
2570     if (offset) {
2571         offset = str_offset(s, e, offset, enc, single_byte);
2572         s += offset;
2573     }
2574     if (slen == 0) return offset;
2575     /* need proceed one character at a time */
2576     sptr = RSTRING_PTR(sub);
2577     slen = RSTRING_LEN(sub);
2578     len = RSTRING_LEN(str) - offset;
2579     for (;;) {
2580         char *t;
2581         pos = rb_memsearch(sptr, slen, s, len, enc);
2582         if (pos < 0) return pos;
2583         t = rb_enc_right_char_head(s, s+pos, e, enc);
2584         if (t == s + pos) break;
2585         len -= t - s;
2586         if (len <= 0) return -1;
2587         offset += t - s;
2588         s = t;
2589     }
2590     return pos + offset;
2591 }
2592 
2593 
2594 /*
2595  *  call-seq:
2596  *     str.index(substring [, offset])   -> fixnum or nil
2597  *     str.index(regexp [, offset])      -> fixnum or nil
2598  *
2599  *  Returns the index of the first occurrence of the given <i>substring</i> or
2600  *  pattern (<i>regexp</i>) in <i>str</i>. Returns <code>nil</code> if not
2601  *  found. If the second parameter is present, it specifies the position in the
2602  *  string to begin the search.
2603  *
2604  *     "hello".index('e')             #=> 1
2605  *     "hello".index('lo')            #=> 3
2606  *     "hello".index('a')             #=> nil
2607  *     "hello".index(?e)              #=> 1
2608  *     "hello".index(/[aeiou]/, -3)   #=> 4
2609  */
2610 
2611 static VALUE
2612 rb_str_index_m(int argc, VALUE *argv, VALUE str)
2613 {
2614     VALUE sub;
2615     VALUE initpos;
2616     long pos;
2617 
2618     if (rb_scan_args(argc, argv, "11", &sub, &initpos) == 2) {
2619         pos = NUM2LONG(initpos);
2620     }
2621     else {
2622         pos = 0;
2623     }
2624     if (pos < 0) {
2625         pos += str_strlen(str, STR_ENC_GET(str));
2626         if (pos < 0) {
2627             if (RB_TYPE_P(sub, T_REGEXP)) {
2628                 rb_backref_set(Qnil);
2629             }
2630             return Qnil;
2631         }
2632     }
2633 
2634     if (SPECIAL_CONST_P(sub)) goto generic;
2635     switch (BUILTIN_TYPE(sub)) {
2636       case T_REGEXP:
2637         if (pos > str_strlen(str, STR_ENC_GET(str)))
2638             return Qnil;
2639         pos = str_offset(RSTRING_PTR(str), RSTRING_END(str), pos,
2640                          rb_enc_check(str, sub), single_byte_optimizable(str));
2641 
2642         pos = rb_reg_search(sub, str, pos, 0);
2643         pos = rb_str_sublen(str, pos);
2644         break;
2645 
2646       generic:
2647       default: {
2648         VALUE tmp;
2649 
2650         tmp = rb_check_string_type(sub);
2651         if (NIL_P(tmp)) {
2652             rb_raise(rb_eTypeError, "type mismatch: %s given",
2653                      rb_obj_classname(sub));
2654         }
2655         sub = tmp;
2656       }
2657         /* fall through */
2658       case T_STRING:
2659         pos = rb_str_index(str, sub, pos);
2660         pos = rb_str_sublen(str, pos);
2661         break;
2662     }
2663 
2664     if (pos == -1) return Qnil;
2665     return LONG2NUM(pos);
2666 }
2667 
2668 static long
2669 rb_str_rindex(VALUE str, VALUE sub, long pos)
2670 {
2671     long len, slen;
2672     char *s, *sbeg, *e, *t;
2673     rb_encoding *enc;
2674     int singlebyte = single_byte_optimizable(str);
2675 
2676     enc = rb_enc_check(str, sub);
2677     if (is_broken_string(sub)) {
2678         return -1;
2679     }
2680     len = str_strlen(str, enc);
2681     slen = str_strlen(sub, enc);
2682     /* substring longer than string */
2683     if (len < slen) return -1;
2684     if (len - pos < slen) {
2685         pos = len - slen;
2686     }
2687     if (len == 0) {
2688         return pos;
2689     }
2690     sbeg = RSTRING_PTR(str);
2691     e = RSTRING_END(str);
2692     t = RSTRING_PTR(sub);
2693     slen = RSTRING_LEN(sub);
2694     s = str_nth(sbeg, e, pos, enc, singlebyte);
2695     while (s) {
2696         if (memcmp(s, t, slen) == 0) {
2697             return pos;
2698         }
2699         if (pos == 0) break;
2700         pos--;
2701         s = rb_enc_prev_char(sbeg, s, e, enc);
2702     }
2703     return -1;
2704 }
2705 
2706 
2707 /*
2708  *  call-seq:
2709  *     str.rindex(substring [, fixnum])   -> fixnum or nil
2710  *     str.rindex(regexp [, fixnum])   -> fixnum or nil
2711  *
2712  *  Returns the index of the last occurrence of the given <i>substring</i> or
2713  *  pattern (<i>regexp</i>) in <i>str</i>. Returns <code>nil</code> if not
2714  *  found. If the second parameter is present, it specifies the position in the
2715  *  string to end the search---characters beyond this point will not be
2716  *  considered.
2717  *
2718  *     "hello".rindex('e')             #=> 1
2719  *     "hello".rindex('l')             #=> 3
2720  *     "hello".rindex('a')             #=> nil
2721  *     "hello".rindex(?e)              #=> 1
2722  *     "hello".rindex(/[aeiou]/, -2)   #=> 1
2723  */
2724 
2725 static VALUE
2726 rb_str_rindex_m(int argc, VALUE *argv, VALUE str)
2727 {
2728     VALUE sub;
2729     VALUE vpos;
2730     rb_encoding *enc = STR_ENC_GET(str);
2731     long pos, len = str_strlen(str, enc);
2732 
2733     if (rb_scan_args(argc, argv, "11", &sub, &vpos) == 2) {
2734         pos = NUM2LONG(vpos);
2735         if (pos < 0) {
2736             pos += len;
2737             if (pos < 0) {
2738                 if (RB_TYPE_P(sub, T_REGEXP)) {
2739                     rb_backref_set(Qnil);
2740                 }
2741                 return Qnil;
2742             }
2743         }
2744         if (pos > len) pos = len;
2745     }
2746     else {
2747         pos = len;
2748     }
2749 
2750     if (SPECIAL_CONST_P(sub)) goto generic;
2751     switch (BUILTIN_TYPE(sub)) {
2752       case T_REGEXP:
2753         /* enc = rb_get_check(str, sub); */
2754         pos = str_offset(RSTRING_PTR(str), RSTRING_END(str), pos,
2755                          STR_ENC_GET(str), single_byte_optimizable(str));
2756 
2757         if (!RREGEXP(sub)->ptr || RREGEXP_SRC_LEN(sub)) {
2758             pos = rb_reg_search(sub, str, pos, 1);
2759             pos = rb_str_sublen(str, pos);
2760         }
2761         if (pos >= 0) return LONG2NUM(pos);
2762         break;
2763 
2764       generic:
2765       default: {
2766         VALUE tmp;
2767 
2768         tmp = rb_check_string_type(sub);
2769         if (NIL_P(tmp)) {
2770             rb_raise(rb_eTypeError, "type mismatch: %s given",
2771                      rb_obj_classname(sub));
2772         }
2773         sub = tmp;
2774       }
2775         /* fall through */
2776       case T_STRING:
2777         pos = rb_str_rindex(str, sub, pos);
2778         if (pos >= 0) return LONG2NUM(pos);
2779         break;
2780     }
2781     return Qnil;
2782 }
2783 
2784 /*
2785  *  call-seq:
2786  *     str =~ obj   -> fixnum or nil
2787  *
2788  *  Match---If <i>obj</i> is a <code>Regexp</code>, use it as a pattern to match
2789  *  against <i>str</i>, and returns the position where the match starts, or
2790  *  <code>nil</code> if there is no match. Otherwise, invokes
2791  *  <i>obj.=~</i>, passing <i>str</i> as an argument. The default
2792  *  <code>=~</code> in <code>Object</code> returns <code>nil</code>.
2793  *
2794  *  Note: <code>str =~ regexp</code> is not the same as
2795  *  <code>regexp =~ str</code>. Strings captured from named capture groups
2796  *  are assigned to local variables only in the second case.
2797  *
2798  *     "cat o' 9 tails" =~ /\d/   #=> 7
2799  *     "cat o' 9 tails" =~ 9      #=> nil
2800  */
2801 
2802 static VALUE
2803 rb_str_match(VALUE x, VALUE y)
2804 {
2805     if (SPECIAL_CONST_P(y)) goto generic;
2806     switch (BUILTIN_TYPE(y)) {
2807       case T_STRING:
2808         rb_raise(rb_eTypeError, "type mismatch: String given");
2809 
2810       case T_REGEXP:
2811         return rb_reg_match(y, x);
2812 
2813       generic:
2814       default:
2815         return rb_funcall(y, rb_intern("=~"), 1, x);
2816     }
2817 }
2818 
2819 
2820 static VALUE get_pat(VALUE, int);
2821 
2822 
2823 /*
2824  *  call-seq:
2825  *     str.match(pattern)        -> matchdata or nil
2826  *     str.match(pattern, pos)   -> matchdata or nil
2827  *
2828  *  Converts <i>pattern</i> to a <code>Regexp</code> (if it isn't already one),
2829  *  then invokes its <code>match</code> method on <i>str</i>.  If the second
2830  *  parameter is present, it specifies the position in the string to begin the
2831  *  search.
2832  *
2833  *     'hello'.match('(.)\1')      #=> #<MatchData "ll" 1:"l">
2834  *     'hello'.match('(.)\1')[0]   #=> "ll"
2835  *     'hello'.match(/(.)\1/)[0]   #=> "ll"
2836  *     'hello'.match('xx')         #=> nil
2837  *
2838  *  If a block is given, invoke the block with MatchData if the match succeeds, so
2839  *  that you can write
2840  *
2841  *     str.match(pat) {|m| ...}
2842  *
2843  *  instead of
2844  *
2845  *     if m = str.match(pat)
2846  *       ...
2847  *     end
2848  *
2849  *  The return value is a value from block execution in this case.
2850  */
2851 
2852 static VALUE
2853 rb_str_match_m(int argc, VALUE *argv, VALUE str)
2854 {
2855     VALUE re, result;
2856     if (argc < 1)
2857         rb_check_arity(argc, 1, 2);
2858     re = argv[0];
2859     argv[0] = str;
2860     result = rb_funcall2(get_pat(re, 0), rb_intern("match"), argc, argv);
2861     if (!NIL_P(result) && rb_block_given_p()) {
2862         return rb_yield(result);
2863     }
2864     return result;
2865 }
2866 
/* Result of stepping a single character to its successor/predecessor. */
enum neighbor_char {
    NEIGHBOR_NOT_CHAR = 0,  /* no usable neighbor character */
    NEIGHBOR_FOUND    = 1,  /* the neighbor was found and stored */
    NEIGHBOR_WRAPPED  = 2   /* stepping wrapped around; a carry is needed */
};
2872 
2873 static enum neighbor_char
2874 enc_succ_char(char *p, long len, rb_encoding *enc)
2875 {
2876     long i;
2877     int l;
2878     while (1) {
2879         for (i = len-1; 0 <= i && (unsigned char)p[i] == 0xff; i--)
2880             p[i] = '\0';
2881         if (i < 0)
2882             return NEIGHBOR_WRAPPED;
2883         ++((unsigned char*)p)[i];
2884         l = rb_enc_precise_mbclen(p, p+len, enc);
2885         if (MBCLEN_CHARFOUND_P(l)) {
2886             l = MBCLEN_CHARFOUND_LEN(l);
2887             if (l == len) {
2888                 return NEIGHBOR_FOUND;
2889             }
2890             else {
2891                 memset(p+l, 0xff, len-l);
2892             }
2893         }
2894         if (MBCLEN_INVALID_P(l) && i < len-1) {
2895             long len2;
2896             int l2;
2897             for (len2 = len-1; 0 < len2; len2--) {
2898                 l2 = rb_enc_precise_mbclen(p, p+len2, enc);
2899                 if (!MBCLEN_INVALID_P(l2))
2900                     break;
2901             }
2902             memset(p+len2+1, 0xff, len-(len2+1));
2903         }
2904     }
2905 }
2906 
2907 static enum neighbor_char
2908 enc_pred_char(char *p, long len, rb_encoding *enc)
2909 {
2910     long i;
2911     int l;
2912     while (1) {
2913         for (i = len-1; 0 <= i && (unsigned char)p[i] == 0; i--)
2914             p[i] = '\xff';
2915         if (i < 0)
2916             return NEIGHBOR_WRAPPED;
2917         --((unsigned char*)p)[i];
2918         l = rb_enc_precise_mbclen(p, p+len, enc);
2919         if (MBCLEN_CHARFOUND_P(l)) {
2920             l = MBCLEN_CHARFOUND_LEN(l);
2921             if (l == len) {
2922                 return NEIGHBOR_FOUND;
2923             }
2924             else {
2925                 memset(p+l, 0, len-l);
2926             }
2927         }
2928         if (MBCLEN_INVALID_P(l) && i < len-1) {
2929             long len2;
2930             int l2;
2931             for (len2 = len-1; 0 < len2; len2--) {
2932                 l2 = rb_enc_precise_mbclen(p, p+len2, enc);
2933                 if (!MBCLEN_INVALID_P(l2))
2934                     break;
2935             }
2936             memset(p+len2+1, 0, len-(len2+1));
2937         }
2938     }
2939 }
2940 
2941 /*
2942   overwrite +p+ by succeeding letter in +enc+ and returns
2943   NEIGHBOR_FOUND or NEIGHBOR_WRAPPED.
2944   When NEIGHBOR_WRAPPED, carried-out letter is stored into carry.
2945   assuming each ranges are successive, and mbclen
2946   never change in each ranges.
2947   NEIGHBOR_NOT_CHAR is returned if invalid character or the range has only one
2948   character.
2949  */
2950 static enum neighbor_char
2951 enc_succ_alnum_char(char *p, long len, rb_encoding *enc, char *carry)
2952 {
2953     enum neighbor_char ret;
2954     unsigned int c;
2955     int ctype;
2956     int range;
2957     char save[ONIGENC_CODE_TO_MBC_MAXLEN];
2958 
2959     c = rb_enc_mbc_to_codepoint(p, p+len, enc);
2960     if (rb_enc_isctype(c, ONIGENC_CTYPE_DIGIT, enc))
2961         ctype = ONIGENC_CTYPE_DIGIT;
2962     else if (rb_enc_isctype(c, ONIGENC_CTYPE_ALPHA, enc))
2963         ctype = ONIGENC_CTYPE_ALPHA;
2964     else
2965         return NEIGHBOR_NOT_CHAR;
2966 
2967     MEMCPY(save, p, char, len);
2968     ret = enc_succ_char(p, len, enc);
2969     if (ret == NEIGHBOR_FOUND) {
2970         c = rb_enc_mbc_to_codepoint(p, p+len, enc);
2971         if (rb_enc_isctype(c, ctype, enc))
2972             return NEIGHBOR_FOUND;
2973     }
2974     MEMCPY(p, save, char, len);
2975     range = 1;
2976     while (1) {
2977         MEMCPY(save, p, char, len);
2978         ret = enc_pred_char(p, len, enc);
2979         if (ret == NEIGHBOR_FOUND) {
2980             c = rb_enc_mbc_to_codepoint(p, p+len, enc);
2981             if (!rb_enc_isctype(c, ctype, enc)) {
2982                 MEMCPY(p, save, char, len);
2983                 break;
2984             }
2985         }
2986         else {
2987             MEMCPY(p, save, char, len);
2988             break;
2989         }
2990         range++;
2991     }
2992     if (range == 1) {
2993         return NEIGHBOR_NOT_CHAR;
2994     }
2995 
2996     if (ctype != ONIGENC_CTYPE_DIGIT) {
2997         MEMCPY(carry, p, char, len);
2998         return NEIGHBOR_WRAPPED;
2999     }
3000 
3001     MEMCPY(carry, p, char, len);
3002     enc_succ_char(carry, len, enc);
3003     return NEIGHBOR_WRAPPED;
3004 }
3005 
3006 
3007 /*
3008  *  call-seq:
3009  *     str.succ   -> new_str
3010  *     str.next   -> new_str
3011  *
3012  *  Returns the successor to <i>str</i>. The successor is calculated by
3013  *  incrementing characters starting from the rightmost alphanumeric (or
3014  *  the rightmost character if there are no alphanumerics) in the
3015  *  string. Incrementing a digit always results in another digit, and
3016  *  incrementing a letter results in another letter of the same case.
3017  *  Incrementing nonalphanumerics uses the underlying character set's
3018  *  collating sequence.
3019  *
3020  *  If the increment generates a ``carry,'' the character to the left of
3021  *  it is incremented. This process repeats until there is no carry,
3022  *  adding an additional character if necessary.
3023  *
3024  *     "abcd".succ        #=> "abce"
3025  *     "THX1138".succ     #=> "THX1139"
3026  *     "<<koala>>".succ   #=> "<<koalb>>"
3027  *     "1999zzz".succ     #=> "2000aaa"
3028  *     "ZZZ9999".succ     #=> "AAAA0000"
3029  *     "***".succ         #=> "**+"
3030  */
3031 
3032 VALUE
3033 rb_str_succ(VALUE orig)
3034 {
3035     rb_encoding *enc;
3036     VALUE str;
3037     char *sbeg, *s, *e, *last_alnum = 0;
3038     int c = -1;
3039     long l;
3040     char carry[ONIGENC_CODE_TO_MBC_MAXLEN] = "\1";
3041     long carry_pos = 0, carry_len = 1;
3042     enum neighbor_char neighbor = NEIGHBOR_FOUND;
3043 
3044     str = rb_str_new5(orig, RSTRING_PTR(orig), RSTRING_LEN(orig));
3045     rb_enc_cr_str_copy_for_substr(str, orig);
3046     OBJ_INFECT(str, orig);
3047     if (RSTRING_LEN(str) == 0) return str;
3048 
3049     enc = STR_ENC_GET(orig);
3050     sbeg = RSTRING_PTR(str);
3051     s = e = sbeg + RSTRING_LEN(str);
3052 
3053     while ((s = rb_enc_prev_char(sbeg, s, e, enc)) != 0) {
3054         if (neighbor == NEIGHBOR_NOT_CHAR && last_alnum) {
3055             if (ISALPHA(*last_alnum) ? ISDIGIT(*s) :
3056                 ISDIGIT(*last_alnum) ? ISALPHA(*s) : 0) {
3057                 s = last_alnum;
3058                 break;
3059             }
3060         }
3061         l = rb_enc_precise_mbclen(s, e, enc);
3062         if (!ONIGENC_MBCLEN_CHARFOUND_P(l)) continue;
3063         l = ONIGENC_MBCLEN_CHARFOUND_LEN(l);
3064         neighbor = enc_succ_alnum_char(s, l, enc, carry);
3065         switch (neighbor) {
3066           case NEIGHBOR_NOT_CHAR:
3067             continue;
3068           case NEIGHBOR_FOUND:
3069             return str;
3070           case NEIGHBOR_WRAPPED:
3071             last_alnum = s;
3072             break;
3073         }
3074         c = 1;
3075         carry_pos = s - sbeg;
3076         carry_len = l;
3077     }
3078     if (c == -1) {              /* str contains no alnum */
3079         s = e;
3080         while ((s = rb_enc_prev_char(sbeg, s, e, enc)) != 0) {
3081             enum neighbor_char neighbor;
3082             l = rb_enc_precise_mbclen(s, e, enc);
3083             if (!ONIGENC_MBCLEN_CHARFOUND_P(l)) continue;
3084             l = ONIGENC_MBCLEN_CHARFOUND_LEN(l);
3085             neighbor = enc_succ_char(s, l, enc);
3086             if (neighbor == NEIGHBOR_FOUND)
3087                 return str;
3088             if (rb_enc_precise_mbclen(s, s+l, enc) != l) {
3089                 /* wrapped to \0...\0.  search next valid char. */
3090                 enc_succ_char(s, l, enc);
3091             }
3092             if (!rb_enc_asciicompat(enc)) {
3093                 MEMCPY(carry, s, char, l);
3094                 carry_len = l;
3095             }
3096             carry_pos = s - sbeg;
3097         }
3098     }
3099     RESIZE_CAPA(str, RSTRING_LEN(str) + carry_len);
3100     s = RSTRING_PTR(str) + carry_pos;
3101     memmove(s + carry_len, s, RSTRING_LEN(str) - carry_pos);
3102     memmove(s, carry, carry_len);
3103     STR_SET_LEN(str, RSTRING_LEN(str) + carry_len);
3104     RSTRING_PTR(str)[RSTRING_LEN(str)] = '\0';
3105     rb_enc_str_coderange(str);
3106     return str;
3107 }
3108 
3109 
3110 /*
3111  *  call-seq:
3112  *     str.succ!   -> str
3113  *     str.next!   -> str
3114  *
3115  *  Equivalent to <code>String#succ</code>, but modifies the receiver in
3116  *  place.
3117  */
3118 
3119 static VALUE
3120 rb_str_succ_bang(VALUE str)
3121 {
         /* Build the successor with rb_str_succ() and hand it, together with
          * the receiver, to rb_str_shared_replace() so that the receiver is
          * the object that ends up modified. */
3122     rb_str_shared_replace(str, rb_str_succ(str));
3123 
         /* The receiver itself (not the temporary successor) is returned. */
3124     return str;
3125 }
3126 
3127 
3128 /*
3129  *  call-seq:
3130  *     str.upto(other_str, exclusive=false) {|s| block }   -> str
3131  *     str.upto(other_str, exclusive=false)                -> an_enumerator
3132  *
3133  *  Iterates through successive values, starting at <i>str</i> and
3134  *  ending at <i>other_str</i> inclusive, passing each value in turn to
3135  *  the block. The <code>String#succ</code> method is used to generate
3136  *  each value.  If optional second argument exclusive is omitted or is false,
3137  *  the last value will be included; otherwise it will be excluded.
3138  *
3139  *  If no block is given, an enumerator is returned instead.
3140  *
3141  *     "a8".upto("b6") {|s| print s, ' ' }
3142  *     for s in "a8".."b6"
3143  *       print s, ' '
3144  *     end
3145  *
3146  *  <em>produces:</em>
3147  *
3148  *     a8 a9 b0 b1 b2 b3 b4 b5 b6
3149  *     a8 a9 b0 b1 b2 b3 b4 b5 b6
3150  *
3151  *  If <i>str</i> and <i>other_str</i> contain only ASCII numeric characters,
3152  *  both are recognized as decimal numbers. In addition, the width of
3153  *  string (e.g. leading zeros) is handled appropriately.
3154  *
3155  *     "9".upto("11").to_a   #=> ["9", "10", "11"]
3156  *     "25".upto("5").to_a   #=> []
3157  *     "07".upto("11").to_a  #=> ["07", "08", "09", "10", "11"]
3158  */
3159 
3160 static VALUE
3161 rb_str_upto(int argc, VALUE *argv, VALUE beg)
3162 {
         /*
          * Implementation outline.  After argument parsing, three strategies
          * are tried in order; each of them returns the receiver (beg):
          *   1. both ends are one-byte ASCII strings: step a C char;
          *   2. both ends consist solely of ASCII digits: count numerically
          *      and print each value with at least as many digits as the
          *      receiver has bytes (this is what keeps leading zeros);
          *   3. otherwise: walk from a copy of the receiver by calling #succ.
          */
3163     VALUE end, exclusive;
3164     VALUE current, after_end;
3165     ID succ;
3166     int n, excl, ascii;
3167     rb_encoding *enc;
3168 
         /* "11": one required argument (end), one optional (exclusive) */
3169     rb_scan_args(argc, argv, "11", &end, &exclusive);
3170     RETURN_ENUMERATOR(beg, argc, argv);
3171     excl = RTEST(exclusive);
3172     CONST_ID(succ, "succ");
3173     StringValue(end);
3174     enc = rb_enc_check(beg, end);
3175     ascii = (is_ascii_string(beg) && is_ascii_string(end));
3176     /* single character */
3177     if (RSTRING_LEN(beg) == 1 && RSTRING_LEN(end) == 1 && ascii) {
3178         char c = RSTRING_PTR(beg)[0];
3179         char e = RSTRING_PTR(end)[0];
3180 
3181         if (c > e || (excl && c == e)) return beg;
3182         for (;;) {
3183             rb_yield(rb_enc_str_new(&c, 1, enc));
                 /* inclusive range: stop right after yielding the end char */
3184             if (!excl && c == e) break;
3185             c++;
                 /* exclusive range: stop before the end char is yielded */
3186             if (excl && c == e) break;
3187         }
3188         return beg;
3189     }
3190     /* both edges are all digits */
3191     if (ascii && ISDIGIT(RSTRING_PTR(beg)[0]) && ISDIGIT(RSTRING_PTR(end)[0])) {
3192         char *s, *send;
3193         VALUE b, e;
3194         int width;
3195 
             /* The first bytes are digits; verify the rest of both strings
              * and fall back to the generic walk on the first non-digit. */
3196         s = RSTRING_PTR(beg); send = RSTRING_END(beg);
             /* byte length of the receiver = minimum number of digits printed */
3197         width = rb_long2int(send - s);
3198         while (s < send) {
3199             if (!ISDIGIT(*s)) goto no_digits;
3200             s++;
3201         }
3202         s = RSTRING_PTR(end); send = RSTRING_END(end);
3203         while (s < send) {
3204             if (!ISDIGIT(*s)) goto no_digits;
3205             s++;
3206         }
3207         b = rb_str_to_inum(beg, 10, FALSE);
3208         e = rb_str_to_inum(end, 10, FALSE);
3209         if (FIXNUM_P(b) && FIXNUM_P(e)) {
                 /* both bounds fit in a Fixnum: iterate with plain C longs */
3210             long bi = FIX2LONG(b);
3211             long ei = FIX2LONG(e);
3212             rb_encoding *usascii = rb_usascii_encoding();
3213 
3214             while (bi <= ei) {
3215                 if (excl && bi == ei) break;
3216                 rb_yield(rb_enc_sprintf(usascii, "%.*ld", width, bi));
3217                 bi++;
3218             }
3219         }
3220         else {
                 /* at least one bound is not a Fixnum: compare and step by
                  * sending the comparison operator and #succ to the numbers */
3221             ID op = excl ? '<' : rb_intern("<=");
3222             VALUE args[2], fmt = rb_obj_freeze(rb_usascii_str_new_cstr("%.*d"));
3223 
3224             args[0] = INT2FIX(width);
                 /* The comparison result is a Ruby VALUE, so it is tested with
                  * RTEST: Qfalse is 0 in C but Qnil is not, and a bare C
                  * truth test would treat a nil result as true and never
                  * leave the loop. */
3225             while (RTEST(rb_funcall(b, op, 1, e))) {
3226                 args[1] = b;
3227                 rb_yield(rb_str_format(numberof(args), args, fmt));
3228                 b = rb_funcall(b, succ, 0);
3229             }
3230         }
3231         return beg;
3232     }
3233     /* normal case */
3234   no_digits:
3235     n = rb_str_cmp(beg, end);
3236     if (n > 0 || (excl && n == 0)) return beg;
3237 
         /* the walk stops as soon as it reaches the successor of end */
3238     after_end = rb_funcall(end, succ, 0);
3239     current = rb_str_dup(beg);
3240     while (!rb_str_equal(current, after_end)) {
3241         VALUE next = Qnil;
             /* The successor is computed before current is yielded
              * (presumably so that a block mutating the yielded string
              * cannot disturb the walk).  For an inclusive range it is not
              * computed once current equals end, which leaves next nil and
              * ends the loop right after the final yield. */
3242         if (excl || !rb_str_equal(current, end))
3243             next = rb_funcall(current, succ, 0);
3244         rb_yield(current);
3245         if (NIL_P(next)) break;
3246         current = next;
3247         StringValue(current);
3248         if (excl && rb_str_equal(current, end)) break;
             /* give up once the candidate is longer than end, or empty */
3249         if (RSTRING_LEN(current) > RSTRING_LEN(end) || RSTRING_LEN(current) == 0)
3250             break;
3251     }
3252 
3253     return beg;
3254 }
3255 
/*
 * Searches str for re from the start and returns the text of the capture
 * identified by backref (a group number or name, resolved by
 * rb_reg_backref_number()).  Returns nil when the regexp does not match.
 */
3256 static VALUE
3257 rb_str_subpat(VALUE str, VALUE re, VALUE backref)
3258 {
3259     VALUE md;
3260     int group;
         /* no match at all: nothing to extract */
3261     if (rb_reg_search(re, str, 0, 0) < 0) return Qnil;
         /* the match data of the search just performed */
3262     md = rb_backref_get();
3263     group = rb_reg_backref_number(md, backref);
3264     return rb_reg_nth_match(group, md);
3265 }
3266 
/*
 * Single-argument element reference (rb_str_aref_m() delegates its
 * one-argument form here).  Dispatches on the type of indx:
 *
 *  - Fixnum: the one-character substring at that index, obtained with
 *    rb_str_substr(); an empty result is turned into nil.
 *  - Regexp: the whole match (group 0) via rb_str_subpat().
 *  - String: a duplicate of indx if rb_str_index() finds it in str,
 *    nil otherwise.
 *  - anything else: offered to rb_range_beg_len() first; if that returns
 *    Qfalse (presumably "not a Range"), indx is converted with NUM2LONG()
 *    and handled like a Fixnum index.
 */
3267 static VALUE
3268 rb_str_aref(VALUE str, VALUE indx)
3269 {
3270     long idx;
3271 
3272     if (FIXNUM_P(indx)) {
3273         idx = FIX2LONG(indx);
3274 
             /* also entered by goto from the generic branch below, after
              * indx has been converted to a long */
3275       num_index:
3276         str = rb_str_substr(str, idx, 1);
3277         if (!NIL_P(str) && RSTRING_LEN(str) == 0) return Qnil;
3278         return str;
3279     }
3280 
         /* special constants skip BUILTIN_TYPE() and go straight to the
          * generic (Range / numeric conversion) path */
3281     if (SPECIAL_CONST_P(indx)) goto generic;
3282     switch (BUILTIN_TYPE(indx)) {
3283       case T_REGEXP:
3284         return rb_str_subpat(str, indx, INT2FIX(0));
3285 
3286       case T_STRING:
3287         if (rb_str_index(str, indx, 0) != -1)
3288             return rb_str_dup(indx);
3289         return Qnil;
3290 
3291       generic:
3292       default:
3293         /* check if indx is Range */
3294         {
3295             long beg, len;
3296             VALUE tmp;
3297 
                 /* len starts out as the character length of str and is
                  * overwritten by rb_range_beg_len() with the slice length */
3298             len = str_strlen(str, STR_ENC_GET(str));
3299             switch (rb_range_beg_len(indx, &beg, &len, len, 0)) {
3300               case Qfalse:
                     /* fall out of the inner switch to the NUM2LONG() below */
3301                 break;
3302               case Qnil:
3303                 return Qnil;
3304               default:
3305                 tmp = rb_str_substr(str, beg, len);
3306                 return tmp;
3307             }
3308         }
3309         idx = NUM2LONG(indx);
3310         goto num_index;
3311     }
3312 
         /* every path above either returns or jumps to num_index */
3313     UNREACHABLE;
3314 }
3315 
3316 
3317 /*
3318  *  call-seq:
3319  *     str[index]                 -> new_str or nil
3320  *     str[start, length]         -> new_str or nil
3321  *     str[range]                 -> new_str or nil
3322  *     str[regexp]                -> new_str or nil
3323  *     str[regexp, capture]       -> new_str or nil
3324  *     str[match_str]             -> new_str or nil
3325  *     str.slice(index)           -> new_str or nil
3326  *     str.slice(start, length)   -> new_str or nil
3327  *     str.slice(range)           -> new_str or nil
3328  *     str.slice(regexp)          -> new_str or nil
3329  *     str.slice(regexp, capture) -> new_str or nil
3330  *     str.slice(match_str)       -> new_str or nil
3331  *
3332  *  Element Reference --- If passed a single +index+, returns a substring of
3333  *  one character at that index. If passed a +start+ index and a +length+,
3334  *  returns a substring containing +length+ characters starting at the
3335  *  +index+. If passed a +range+, its beginning and end are interpreted as
3336  *  offsets delimiting the substring to be returned.
3337  *
3338  *  In these three cases, if an index is negative, it is counted from the end
3339  *  of the string.  For the +start+ and +range+ cases the starting index
3340  *  denotes the position just before a character, so an index equal to the
3341  *  string's size is also valid: an empty string is returned when the
3342  *  starting index of a character range is at the end of the string.
3343  *
3344  *  Returns +nil+ if the initial index falls outside the string or the length
3345  *  is negative.
3346  *
3347  *  If a +Regexp+ is supplied, the matching portion of the string is
3348  *  returned.  If a +capture+, which may be a capture group index or name,
3349  *  follows the regular expression, that component of the MatchData is
3350  *  returned instead.
3351  *
3352  *  If a +match_str+ is given, that string is returned if it occurs in
3353  *  the string.
3354  *
3355  *  Returns +nil+ if the regular expression does not match or the match string
3356  *  cannot be found.
3357  *
3358  *     a = "hello there"
3359  *
3360  *     a[1]                   #=> "e"
3361  *     a[2, 3]                #=> "llo"
3362  *     a[2..3]                #=> "ll"
3363  *
3364  *     a[-3, 2]               #=> "er"
3365  *     a[7..-2]               #=> "her"
3366  *     a[-4..-2]              #=> "her"
3367  *     a[-2..-4]              #=> ""
3368  *
3369  *     a[11, 0]               #=> ""
3370  *     a[11]                  #=> nil
3371  *     a[12, 0]               #=> nil
3372  *     a[12..-1]              #=> nil
3373  *
3374  *     a[/[aeiou](.)\1/]      #=> "ell"
3375  *     a[/[aeiou](.)\1/, 0]   #=> "ell"
3376  *     a[/[aeiou](.)\1/, 1]   #=> "l"
3377  *     a[/[aeiou](.)\1/, 2]   #=> nil
3378  *
3379  *     a[/(?<vowel>[aeiou])(?<non_vowel>[^aeiou])/, "non_vowel"] #=> "l"
3380  *     a[/(?<vowel>[aeiou])(?<non_vowel>[^aeiou])/, "vowel"]     #=> "e"
3381  *
3382  *     a["lo"]                #=> "lo"
3383  *     a["bye"]               #=> nil
3384  */
3385 
3386 static VALUE
3387 rb_str_aref_m(int argc, VALUE *argv, VALUE str)
3388 {
         /* Anything but the two-argument form: validate the argument count
          * (1..2) and let rb_str_aref() dispatch on the single index. */
3389     if (argc != 2) {
3390         rb_check_arity(argc, 1, 2);
3391         return rb_str_aref(str, argv[0]);
3392     }
3393 
         /* Two arguments: (regexp, capture) or (start, length). */
3394     return RB_TYPE_P(argv[0], T_REGEXP)
3395         ? rb_str_subpat(str, argv[0], argv[1])
3396         : rb_str_substr(str, NUM2LONG(argv[0]), NUM2LONG(argv[1]));
3397 }
3398 
/*
 * Removes the first len bytes of str in place and returns str.
 *
 * A len larger than the current byte length is clamped to it.
 * NOTE(review): a negative len is not checked here; callers presumably
 * always pass len >= 0 -- confirm.
 *
 * The terminating NUL is rewritten and the cached code range is cleared
 * on every path.
 */
3399 VALUE
3400 rb_str_drop_bytes(VALUE str, long len)
3401 {
3402     char *ptr = RSTRING_PTR(str);
3403     long olen = RSTRING_LEN(str), nlen;
3404 
3405     str_modifiable(str);
3406     if (len > olen) len = olen;
3407     nlen = olen - len;
3408     if (nlen <= RSTRING_EMBED_LEN_MAX) {
             /* The remainder fits into the embedded buffer: switch to the
              * embedded representation and copy the surviving tail there. */
3409         char *oldptr = ptr;
             /* storage flags as they were before STR_SET_EMBED() */
3410         int fl = (int)(RBASIC(str)->flags & (STR_NOEMBED|ELTS_SHARED));
3411         STR_SET_EMBED(str);
3412         STR_SET_EMBED_LEN(str, nlen);
3413         ptr = RSTRING(str)->as.ary;
             /* memmove: when str was already embedded, oldptr and ptr refer
              * to the same buffer and the regions overlap */
3414         memmove(ptr, oldptr + len, nlen);
             /* release the old buffer only if it was a heap buffer that was
              * not shared (STR_NOEMBED set, ELTS_SHARED clear) */
3415         if (fl == STR_NOEMBED) xfree(oldptr);
3416     }
3417     else {
             /* NOTE(review): the result of rb_str_new4() is discarded; the
              * call presumably turns str into a shared string so that
              * advancing the heap pointer below does not lose the start of
              * the allocation -- confirm against rb_str_new4(). */
3418         if (!STR_SHARED_P(str)) rb_str_new4(str);
3419         ptr = RSTRING(str)->as.heap.ptr += len;
3420         RSTRING(str)->as.heap.len = nlen;
3421     }
3422     ptr[nlen] = 0;
3423     ENC_CODERANGE_CLEAR(str);
3424     return str;
3425 }
3426 
/*
 * Low-level splice: replaces the len bytes of str that start at offset beg
 * with the bytes of val, then propagates taint from val to str.
 *
 * beg and len are used directly in pointer arithmetic on RSTRING_PTR(str),
 * i.e. they are byte positions (rb_str_splice() passes what it calls the
 * "physical" position and length).  No encoding handling is done here.
 */
3427 static void
3428 rb_str_splice_0(VALUE str, long beg, long len, VALUE val)
3429 {
         /* Deleting a prefix (nothing to insert at offset 0) is delegated
          * to rb_str_drop_bytes(). */
3430     if (beg == 0 && RSTRING_LEN(val) == 0) {
3431         rb_str_drop_bytes(str, len);
3432         OBJ_INFECT(str, val);
3433         return;
3434     }
3435 
3436     rb_str_modify(str);
3437     if (len < RSTRING_LEN(val)) {
3438         /* expand string */
3439         RESIZE_CAPA(str, RSTRING_LEN(str) + RSTRING_LEN(val) - len + TERM_LEN(str));
3440     }
3441 
         /* Replacement and replaced span differ in size: shift the tail that
          * follows the replaced span to its new position. */
3442     if (RSTRING_LEN(val) != len) {
3443         memmove(RSTRING_PTR(str) + beg + RSTRING_LEN(val),
3444                 RSTRING_PTR(str) + beg + len,
3445                 RSTRING_LEN(str) - (beg + len));
3446     }
         /* NOTE(review): only reachable with a negative len.  The callers
          * visible in this file pass len >= 0 (rb_str_splice() rejects
          * negative lengths), so this looks like a legacy path -- confirm. */
3447     if (RSTRING_LEN(val) < beg && len < 0) {
3448         MEMZERO(RSTRING_PTR(str) + RSTRING_LEN(str), char, -len);
3449     }
3450     if (RSTRING_LEN(val) > 0) {
3451         memmove(RSTRING_PTR(str)+beg, RSTRING_PTR(val), RSTRING_LEN(val));
3452     }
3453     STR_SET_LEN(str, RSTRING_LEN(str) + RSTRING_LEN(val) - len);
         /* re-terminate at the new length */
3454     if (RSTRING_PTR(str)) {
3455         RSTRING_PTR(str)[RSTRING_LEN(str)] = '\0';
3456     }
3457     OBJ_INFECT(str, val);
3458 }
3459 
/*
 * Character-level splice: replaces len characters of str, starting at
 * character index beg, with val (converted with StringValue()).
 *
 * A negative beg counts from the end of the string.  A len that reaches
 * past the end is clipped to the end.
 *
 * Raises IndexError for a negative len, for beg greater than the character
 * length, and for a negative beg whose magnitude exceeds the length.
 * rb_enc_check() is applied to str and val before anything is modified.
 */
3460 static void
3461 rb_str_splice(VALUE str, long beg, long len, VALUE val)
3462 {
3463     long slen;
3464     char *p, *e;
3465     rb_encoding *enc;
         /* determined up front, before val is converted or str is touched */
3466     int singlebyte = single_byte_optimizable(str);
3467     int cr;
3468 
3469     if (len < 0) rb_raise(rb_eIndexError, "negative length %ld", len);
3470 
3471     StringValue(val);
3472     enc = rb_enc_check(str, val);
3473     slen = str_strlen(str, enc);
3474 
3475     if (slen < beg) {
3476       out_of_range:
3477         rb_raise(rb_eIndexError, "index %ld out of string", beg);
3478     }
3479     if (beg < 0) {
3480         if (-beg > slen) {
3481             goto out_of_range;
3482         }
3483         beg += slen;
3484     }
         /* clip the span to the end of the string */
3485     if (slen < len || slen < beg + len) {
3486         len = slen - beg;
3487     }
3488     str_modify_keep_cr(str);
         /* locate the first and one-past-last byte of the character span;
          * a null result from str_nth() is replaced by the end of the string */
3489     p = str_nth(RSTRING_PTR(str), RSTRING_END(str), beg, enc, singlebyte);
3490     if (!p) p = RSTRING_END(str);
3491     e = str_nth(p, RSTRING_END(str), len, enc, singlebyte);
3492     if (!e) e = RSTRING_END(str);
3493     /* convert the character span to a byte offset and byte length */
3494     beg = p - RSTRING_PTR(str); /* physical position */
3495     len = e - p;                /* physical length */
3496     rb_str_splice_0(str, beg, len, val);
3497     rb_enc_associate(str, enc);
         /* combine the code ranges of both strings; the result is stored
          * unless the combination is BROKEN */
3498     cr = ENC_CODERANGE_AND(ENC_CODERANGE(str), ENC_CODERANGE(val));
3499     if (cr != ENC_CODERANGE_BROKEN)
3500         ENC_CODERANGE_SET(str, cr);
3501 }
3502 
/*
 * Non-static entry point that forwards unchanged to the file-local
 * rb_str_splice(): replaces len characters of str starting at character
 * index beg with val.  Argument checking and the IndexError cases are
 * those of rb_str_splice().
 */
3503 void
3504 rb_str_update(VALUE str, long beg, long len, VALUE val)
3505 {
3506     rb_str_splice(str, beg, len, val);
3507 }
3508 
/*
 * Replaces, in str, the text matched by one capture of re with val.
 *
 * backref selects the capture (resolved to a group number by
 * rb_reg_backref_number()); a negative number counts back from the number
 * of registers of the match.
 *
 * Raises IndexError when the regexp does not match ("regexp not matched"),
 * when the group number is out of range ("index %d out of regexp"), or
 * when the selected group took no part in the match ("regexp group %d not
 * matched").
 */
3509 static void
3510 rb_str_subpat_set(VALUE str, VALUE re, VALUE backref, VALUE val)
3511 {
3512     int nth;
3513     VALUE match;
3514     long start, end, len;
3515     rb_encoding *enc;
         /* read by the BEG()/END() macros defined at the top of the file */
3516     struct re_registers *regs;
3517 
3518     if (rb_reg_search(re, str, 0, 0) < 0) {
3519         rb_raise(rb_eIndexError, "regexp not matched");
3520     }
3521     match = rb_backref_get();
3522     nth = rb_reg_backref_number(match, backref);
3523     regs = RMATCH_REGS(match);
3524     if (nth >= regs->num_regs) {
3525       out_of_range:
3526         rb_raise(rb_eIndexError, "index %d out of regexp", nth);
3527     }
3528     if (nth < 0) {
3529         if (-nth >= regs->num_regs) {
3530             goto out_of_range;
3531         }
3532         nth += regs->num_regs;
3533     }
3534 
3535     start = BEG(nth);
         /* -1 marks a group that did not participate in the match */
3536     if (start == -1) {
3537         rb_raise(rb_eIndexError, "regexp group %d not matched", nth);
3538     }
3539     end = END(nth);
3540     len = end - start;
3541     StringValue(val);
3542     enc = rb_enc_check(str, val);
         /* the register offsets are handed to the byte-level splice as-is */
3543     rb_str_splice_0(str, start, len, val);
3544     rb_enc_associate(str, enc);
3545 }
3546 
/*
 * Single-index element assignment (rb_str_aset_m() delegates its
 * two-argument form here).  Dispatches on the type of indx and always
 * returns val:
 *
 *  - Fixnum: replaces the single character at that index.
 *  - Regexp: replaces the whole match (group 0) via rb_str_subpat_set().
 *  - String: replaces the first occurrence of indx in str; raises
 *    IndexError ("string not matched") if it does not occur.
 *  - anything else: tried as a Range with rb_range_beg_len(); if that
 *    returns a false value, indx is converted with NUM2LONG() and handled
 *    like a Fixnum index.
 */
3547 static VALUE
3548 rb_str_aset(VALUE str, VALUE indx, VALUE val)
3549 {
3550     long idx, beg;
3551 
3552     if (FIXNUM_P(indx)) {
3553         idx = FIX2LONG(indx);
             /* also entered by goto from the generic branch below */
3554       num_index:
3555         rb_str_splice(str, idx, 1, val);
3556         return val;
3557     }
3558 
         /* special constants skip the type switch and go to the generic path */
3559     if (SPECIAL_CONST_P(indx)) goto generic;
3560     switch (TYPE(indx)) {
3561       case T_REGEXP:
3562         rb_str_subpat_set(str, indx, INT2FIX(0), val);
3563         return val;
3564 
3565       case T_STRING:
3566         beg = rb_str_index(str, indx, 0);
3567         if (beg < 0) {
3568             rb_raise(rb_eIndexError, "string not matched");
3569         }
             /* presumably converts the offset found by rb_str_index() into
              * the character index that rb_str_splice() expects -- confirm */
3570         beg = rb_str_sublen(str, beg);
3571         rb_str_splice(str, beg, str_strlen(indx, 0), val);
3572         return val;
3573 
3574       generic:
3575       default:
3576         /* check if indx is Range */
3577         {
                 /* this beg shadows the function-level beg */
3578             long beg, len;
3579             if (rb_range_beg_len(indx, &beg, &len, str_strlen(str, 0), 2)) {
3580                 rb_str_splice(str, beg, len, val);
3581                 return val;
3582             }
3583         }
3584         idx = NUM2LONG(indx);
3585         goto num_index;
3586     }
3587 }
3588 
3589 /*
3590  *  call-seq:
3591  *     str[fixnum] = new_str
3592  *     str[fixnum, fixnum] = new_str
3593  *     str[range] = aString
3594  *     str[regexp] = new_str
3595  *     str[regexp, fixnum] = new_str
3596  *     str[regexp, name] = new_str
3597  *     str[other_str] = new_str
3598  *
3599  *  Element Assignment---Replaces some or all of the content of <i>str</i>. The
3600  *  portion of the string affected is determined using the same criteria as
3601  *  <code>String#[]</code>. If the replacement string is not the same length as
3602  *  the text it is replacing, the string will be adjusted accordingly. If the
3603  *  regular expression or string used as the index doesn't match a position
3604  *  in the string, <code>IndexError</code> is raised. If the regular expression
3605  *  form is used, the optional second <code>Fixnum</code> allows you to specify
3606  *  which portion of the match to replace (effectively using the
3607  *  <code>MatchData</code> indexing rules). The forms that take a
3608  *  <code>Fixnum</code> will raise an <code>IndexError</code> if the value is
3609  *  out of range; the <code>Range</code> form will raise a
3610  *  <code>RangeError</code>, and the <code>Regexp</code> and <code>String</code>
3611  *  will raise an <code>IndexError</code> on negative match.
3612  */
3613 
3614 static VALUE
3615 rb_str_aset_m(int argc, VALUE *argv, VALUE str)
3616 {
         /* Anything but the three-argument form: validate the argument count
          * (2..3) and let rb_str_aset() dispatch on the single index. */
3617     if (argc != 3) {
3618         rb_check_arity(argc, 2, 3);
3619         return rb_str_aset(str, argv[0], argv[1]);
3620     }
3621 
         /* (start, length, value): character-level splice */
3622     if (!RB_TYPE_P(argv[0], T_REGEXP)) {
3623         rb_str_splice(str, NUM2LONG(argv[0]), NUM2LONG(argv[1]), argv[2]);
3624         return argv[2];
3625     }
         /* (regexp, capture, value): replace the selected capture */
3626     rb_str_subpat_set(str, argv[0], argv[1], argv[2]);
3627     return argv[2];
3628 }
3629 
3630 /*
3631  *  call-seq:
3632  *     str.insert(index, other_str)   -> str
3633  *
3634  *  Inserts <i>other_str</i> before the character at the given
3635  *  <i>index</i>, modifying <i>str</i>. Negative indices count from the
3636  *  end of the string, and insert <em>after</em> the given character.
3637  *  The intent is to insert <i>other_str</i> so that it starts at the given
3638  *  <i>index</i>.
3639  *
3640  *     "abcd".insert(0, 'X')    #=> "Xabcd"
3641  *     "abcd".insert(3, 'X')    #=> "abcXd"
3642  *     "abcd".insert(4, 'X')    #=> "abcdX"
3643  *     "abcd".insert(-3, 'X')   #=> "abXcd"
3644  *     "abcd".insert(-1, 'X')   #=> "abcdX"
3645  */
3646 
3647 static VALUE
3648 rb_str_insert(VALUE str, VALUE idx, VALUE str2)
3649 {
3650     long at = NUM2LONG(idx);
3651 
         /* Every index except -1 becomes a zero-length splice. */
3652     if (at != -1) {
             /* a negative index inserts after the character it names, so
              * move one position to the right */
3653         if (at < 0) {
3654             at += 1;
3655         }
3656         rb_str_splice(str, at, 0, str2);
3657         return str;
3658     }
         /* -1 means "after the last character": plain append */
3659     return rb_str_append(str, str2);
3660 }
3661 
3662 
3663 /*
3664  *  call-seq:
3665  *     str.slice!(fixnum)           -> new_str or nil
3666  *     str.slice!(fixnum, fixnum)   -> new_str or nil
3667  *     str.slice!(range)            -> new_str or nil
3668  *     str.slice!(regexp)           -> new_str or nil
3669  *     str.slice!(other_str)        -> new_str or nil
3670  *
3671  *  Deletes the specified portion from <i>str</i>, and returns the portion
3672  *  deleted.
3673  *
3674  *     string = "this is a string"
3675  *     string.slice!(2)        #=> "i"
3676  *     string.slice!(3..6)     #=> " is "
3677  *     string.slice!(/s.*t/)   #=> "sa st"
3678  *     string.slice!("r")      #=> "r"
3679  *     string                  #=> "thing"
3680  */
3681 
3682 static VALUE
3683 rb_str_slice_bang(int argc, VALUE *argv, VALUE str)
3684 {
         /* room for the caller's one or two index arguments plus the empty
          * replacement string appended below */
3685     VALUE args[3];
3686     VALUE removed;
3687 
3688     rb_check_arity(argc, 1, 2);
3689     MEMCPY(args, argv, VALUE, argc);
3690     str_modify_keep_cr(str);
3691 
         /* look the portion up first, exactly as String#[] would */
3692     removed = rb_str_aref_m(argc, args, str);
3693     if (NIL_P(removed)) {
3694         return Qnil;
3695     }
3696 
         /* then delete it by assigning an empty string to the same index */
3697     args[argc] = rb_str_new(0,0);
3698     rb_str_aset_m(argc+1, args, str);
3699     return removed;
3700 }
3701 
/*
 * Turns a pattern argument into a Regexp.
 *
 * A Regexp is returned as it is.  A String, or an object that
 * rb_check_string_type() can convert to one, is compiled with
 * rb_reg_regcomp(); if quote is non-zero it is first passed through
 * rb_reg_quote().  For any other object Check_Type(pat, T_REGEXP) is
 * invoked to report the type error.
 */
3702 static VALUE
3703 get_pat(VALUE pat, int quote)
3704 {
3705     int kind;
3706 
3707     kind = TYPE(pat);
3708 
3709     if (kind == T_REGEXP) {
3710         return pat;
3711     }
3712 
3713     if (kind != T_STRING) {
3714         VALUE converted = rb_check_string_type(pat);
3715 
             /* not string-like either */
3716         if (NIL_P(converted)) {
3717             Check_Type(pat, T_REGEXP);
3718         }
3719         pat = converted;
3720     }
3721 
3722     if (quote) {
3723         pat = rb_reg_quote(pat);
3724     }
3725 
3726     return rb_reg_regcomp(pat);
3727 }
3728 
3729 
3730 /*
3731  *  call-seq:
3732  *     str.sub!(pattern, replacement)          -> str or nil
3733  *     str.sub!(pattern) {|match| block }      -> str or nil
3734  *
3735  *  Performs the same substitution as String#sub in-place.
3736  *
3737  *  Returns +str+ if a substitution was performed or +nil+ if no substitution
3738  *  was performed.
3739  */
3740 
3741 static VALUE
3742 rb_str_sub_bang(int argc, VALUE *argv, VALUE str)
3743 {
3744     VALUE pat, repl, hash = Qnil;
3745     int iter = 0;
3746     int tainted = 0;
3747     long plen;
         /* the replacement argument may only be omitted when a block is given */
3748     int min_arity = rb_block_given_p() ? 1 : 2;
3749 
3750     rb_check_arity(argc, min_arity, 2);
3751     if (argc == 1) {
             /* block form: the replacement comes from rb_yield() below */
3752         iter = 1;
3753     }
3754     else {
             /* the second argument is used as a Hash if it converts to one;
              * otherwise it has to be string-like */
3755         repl = argv[1];
3756         hash = rb_check_hash_type(argv[1]);
3757         if (NIL_P(hash)) {
3758             StringValue(repl);
3759         }
3760         if (OBJ_TAINTED(repl)) tainted = 1;
3761     }
3762 
         /* quote = 1: get_pat() runs a String pattern through rb_reg_quote()
          * before compiling it */
3763     pat = get_pat(argv[0], 1);
3764     str_modifiable(str);
3765     if (rb_reg_search(pat, str, 0, 0) >= 0) {
3766         rb_encoding *enc;
3767         int cr = ENC_CODERANGE(str);
3768         VALUE match = rb_backref_get();
             /* regs is what the BEG()/END() macros read */
3769         struct re_registers *regs = RMATCH_REGS(match);
3770         long beg0 = BEG(0);
3771         long end0 = END(0);
3772         char *p, *rp;
3773         long len, rlen;
3774 
3775         if (iter || !NIL_P(hash)) {
                 /* Remember pointer and length so that str_mod_check() can
                  * verify afterwards that the block / hash lookup did not
                  * modify str. */
3776             p = RSTRING_PTR(str); len = RSTRING_LEN(str);
3777 
3778             if (iter) {
3779                 repl = rb_obj_as_string(rb_yield(rb_reg_nth_match(0, match)));
3780             }
3781             else {
                     /* the matched text is the hash key */
3782                 repl = rb_hash_aref(hash, rb_str_subseq(str, beg0, end0 - beg0));
3783                 repl = rb_obj_as_string(repl);
3784             }
3785             str_mod_check(str, p, len);
3786             rb_check_frozen(str);
3787         }
3788         else {
                 /* string replacement; presumably this is where the
                  * back-references described in the String#sub docs are
                  * expanded -- confirm against rb_reg_regsub() */
3789             repl = rb_reg_regsub(repl, str, regs, pat);
3790         }
3791         enc = rb_enc_compatible(str, repl);
3792         if (!enc) {
                 /* No common encoding.  The substitution is still allowed
                  * when everything outside the matched span scans as 7-bit;
                  * the result then takes the encoding of repl. */
3793             rb_encoding *str_enc = STR_ENC_GET(str);
3794             p = RSTRING_PTR(str); len = RSTRING_LEN(str);
3795             if (coderange_scan(p, beg0, str_enc) != ENC_CODERANGE_7BIT ||
3796                 coderange_scan(p+end0, len-end0, str_enc) != ENC_CODERANGE_7BIT) {
3797                 rb_raise(rb_eEncCompatError, "incompatible character encodings: %s and %s",
3798                          rb_enc_name(str_enc),
3799                          rb_enc_name(STR_ENC_GET(repl)));
3800             }
3801             enc = STR_ENC_GET(repl);
3802         }
3803         rb_str_modify(str);
3804         rb_enc_associate(str, enc);
3805         if (OBJ_TAINTED(repl)) tainted = 1;
             /* Only when the old code range of str lies strictly between
              * UNKNOWN and BROKEN is a new one derived: it becomes UNKNOWN
              * if repl is BROKEN, or if str was VALID and repl is 7BIT;
              * otherwise it becomes the code range of repl. */
3806         if (ENC_CODERANGE_UNKNOWN < cr && cr < ENC_CODERANGE_BROKEN) {
3807             int cr2 = ENC_CODERANGE(repl);
3808             if (cr2 == ENC_CODERANGE_BROKEN ||
3809                 (cr == ENC_CODERANGE_VALID && cr2 == ENC_CODERANGE_7BIT))
3810                 cr = ENC_CODERANGE_UNKNOWN;
3811             else
3812                 cr = cr2;
3813         }
             /* Replace the matched bytes [beg0, end0) with repl: grow the
              * buffer if needed, shift the tail, copy repl in. */
3814         plen = end0 - beg0;
3815         rp = RSTRING_PTR(repl); rlen = RSTRING_LEN(repl);
3816         len = RSTRING_LEN(str);
3817         if (rlen > plen) {
3818             RESIZE_CAPA(str, len + rlen - plen);
3819         }
             /* re-read the pointer after the possible resize */
3820         p = RSTRING_PTR(str);
3821         if (rlen != plen) {
3822             memmove(p + beg0 + rlen, p + beg0 + plen, len - beg0 - plen);
3823         }
3824         memcpy(p + beg0, rp, rlen);
3825         len += rlen - plen;
3826         STR_SET_LEN(str, len);
3827         RSTRING_PTR(str)[len] = '\0';
3828         ENC_CODERANGE_SET(str, cr);
3829         if (tainted) OBJ_TAINT(str);
3830 
3831         return str;
3832     }
         /* no match: nothing was substituted */
3833     return Qnil;
3834 }
3835 
3836 
3837 /*
3838  *  call-seq:
3839  *     str.sub(pattern, replacement)         -> new_str
3840  *     str.sub(pattern, hash)                -> new_str
3841  *     str.sub(pattern) {|match| block }     -> new_str
3842  *
3843  *  Returns a copy of +str+ with the _first_ occurrence of +pattern+
3844  *  replaced by the second argument. The +pattern+ is typically a Regexp; if
3845  *  given as a String, any regular expression metacharacters it contains will
3846  *  be interpreted literally, e.g. <code>'\\\d'</code> will match a backslash
3847  *  followed by 'd', instead of a digit.
3848  *
3849  *  If +replacement+ is a String it will be substituted for the matched text.
3850  *  It may contain back-references to the pattern's capture groups of the form
3851  *  <code>"\\d"</code>, where <i>d</i> is a group number, or
3852  *  <code>"\\k<n>"</code>, where <i>n</i> is a group name. If it is a
3853  *  double-quoted string, both back-references must be preceded by an
3854  *  additional backslash. However, within +replacement+ the special match
3855  *  variables, such as <code>$&</code>, will not refer to the current match.
3856  *
3857  *  If the second argument is a Hash, and the matched text is one of its keys,
3858  *  the corresponding value is the replacement string.
3859  *
3860  *  In the block form, the current match string is passed in as a parameter,
3861  *  and variables such as <code>$1</code>, <code>$2</code>, <code>$`</code>,
3862  *  <code>$&</code>, and <code>$'</code> will be set appropriately. The value
3863  *  returned by the block will be substituted for the match on each call.
3864  *
3865  *  The result inherits any tainting in the original string or any supplied
3866  *  replacement string.
3867  *
3868  *     "hello".sub(/[aeiou]/, '*')                  #=> "h*llo"
3869  *     "hello".sub(/([aeiou])/, '<\1>')             #=> "h<e>llo"
3870  *     "hello".sub(/./) {|s| s.ord.to_s + ' ' }     #=> "104 ello"
3871  *     "hello".sub(/(?<foo>[aeiou])/, '*\k<foo>*')  #=> "h*e*llo"
3872  *     'Is SHELL your preferred shell?'.sub(/[[:upper:]]{2,}/, ENV)
3873  *      #=> "Is /bin/bash your preferred shell?"
3874  */
3875 
3876 static VALUE
3877 rb_str_sub(int argc, VALUE *argv, VALUE str)
3878 {
         /* Run the in-place substitution on a duplicate.  The result of
          * rb_str_sub_bang() (nil when nothing matched) is ignored, so the
          * duplicate is returned either way. */
3879     str = rb_str_dup(str);
3880     rb_str_sub_bang(argc, argv, str);
3881     return str;
3882 }
3883 
/*
 * Shared implementation of String#gsub (bang == 0) and String#gsub!
 * (bang != 0).
 *
 * argv[0] is the pattern; the replacement is either argv[1] (a Hash or a
 * string-like object) or, with a single argument, the block.  With a
 * single argument and no block an enumerator is returned.
 *
 * The result is assembled in a separate buffer (dest).  When there is no
 * match, nil is returned for bang != 0 and a duplicate of str otherwise.
 * When there is a match, dest replaces the content of str for bang != 0;
 * otherwise dest is returned after being given the class of str and
 * infected by it.
 */
3884 static VALUE
3885 str_gsub(int argc, VALUE *argv, VALUE str, int bang)
3886 {
3887     VALUE pat, val, repl, match, dest, hash = Qnil;
3888     struct re_registers *regs;
3889     long beg, n;
3890     long beg0, end0;
3891     long offset, blen, slen, len, last;
3892     int iter = 0;
3893     char *sp, *cp;
3894     int tainted = 0;
3895     rb_encoding *str_enc;
3896 
3897     switch (argc) {
3898       case 1:
3899         RETURN_ENUMERATOR(str, argc, argv);
3900         iter = 1;
3901         break;
3902       case 2:
             /* the second argument is used as a Hash if it converts to one;
              * otherwise it has to be string-like */
3903         repl = argv[1];
3904         hash = rb_check_hash_type(argv[1]);
3905         if (NIL_P(hash)) {
3906             StringValue(repl);
3907         }
3908         if (OBJ_TAINTED(repl)) tainted = 1;
3909         break;
3910       default:
             /* argc is neither 1 nor 2 here, so this check is what reports
              * the wrong argument count */
3911         rb_check_arity(argc, 1, 2);
3912     }
3913 
         /* quote = 1: get_pat() runs a String pattern through rb_reg_quote() */
3914     pat = get_pat(argv[0], 1);
3915     beg = rb_reg_search(pat, str, 0, 0);
3916     if (beg < 0) {
3917         if (bang) return Qnil;  /* no match, no substitution */
3918         return rb_str_dup(str);
3919     }
3920 
         /* offset: byte position in str up to which input has been consumed */
3921     offset = 0;
3922     n = 0;
3923     blen = RSTRING_LEN(str) + 30; /* len + margin */
3924     dest = rb_str_buf_new(blen);
         /* sp/slen let str_mod_check() detect a block that modifies str */
3925     sp = RSTRING_PTR(str);
3926     slen = RSTRING_LEN(str);
         /* cp: start of the not-yet-copied part of str */
3927     cp = sp;
3928     str_enc = STR_ENC_GET(str);
3929     rb_enc_associate(dest, str_enc);
3930     ENC_CODERANGE_SET(dest, rb_enc_asciicompat(str_enc) ? ENC_CODERANGE_7BIT : ENC_CODERANGE_VALID);
3931 
         /* one iteration per match; beg holds the position of the current match */
3932     do {
3933         n++;
3934         match = rb_backref_get();
             /* regs is what the BEG()/END() macros read */
3935         regs = RMATCH_REGS(match);
3936         beg0 = BEG(0);
3937         end0 = END(0);
3938         if (iter || !NIL_P(hash)) {
3939             if (iter) {
3940                 val = rb_obj_as_string(rb_yield(rb_reg_nth_match(0, match)));
3941             }
3942             else {
                     /* the matched text is the hash key */
3943                 val = rb_hash_aref(hash, rb_str_subseq(str, BEG(0), END(0) - BEG(0)));
3944                 val = rb_obj_as_string(val);
3945             }
3946             str_mod_check(str, sp, slen);
3947             if (val == dest) {  /* paranoid check [ruby-dev:24827] */
3948                 rb_raise(rb_eRuntimeError, "block should not cheat");
3949             }
3950         }
3951         else {
3952             val = rb_reg_regsub(repl, str, regs, pat);
3953         }
3954 
3955         if (OBJ_TAINTED(val)) tainted = 1;
3956 
3957         len = beg - offset;     /* copy pre-match substr */
3958         if (len) {
3959             rb_enc_str_buf_cat(dest, cp, len, str_enc);
3960         }
3961 
3962         rb_str_buf_append(dest, val);
3963 
             /* keep the offset this search started from; it is used after
              * the loop to repeat the last successful search */
3964         last = offset;
3965         offset = end0;
3966         if (beg0 == end0) {
3967             /*
3968              * Always consume at least one character of the input string
3969              * in order to prevent infinite loops.
3970              */
3971             if (RSTRING_LEN(str) <= end0) break;
3972             len = rb_enc_fast_mbclen(RSTRING_PTR(str)+end0, RSTRING_END(str), str_enc);
3973             rb_enc_str_buf_cat(dest, RSTRING_PTR(str)+end0, len, str_enc);
3974             offset = end0 + len;
3975         }
3976         cp = RSTRING_PTR(str) + offset;
3977         if (offset > RSTRING_LEN(str)) break;
3978         beg = rb_reg_search(pat, str, offset, 0);
3979     } while (beg >= 0);
         /* copy whatever follows the last match */
3980     if (RSTRING_LEN(str) > offset) {
3981         rb_enc_str_buf_cat(dest, cp, RSTRING_LEN(str) - offset, str_enc);
3982     }
         /* NOTE(review): the result is ignored; the search is presumably
          * repeated from `last` so that the backref ends up describing the
          * last successful match instead of the final failed search --
          * confirm. */
3983     rb_reg_search(pat, str, last, 0);
3984     if (bang) {
3985         rb_str_shared_replace(str, dest);
3986     }
3987     else {
3988         RBASIC_SET_CLASS(dest, rb_obj_class(str));
3989         OBJ_INFECT(dest, str);
3990         str = dest;
3991     }
3992 
3993     if (tainted) OBJ_TAINT(str);
3994     return str;
3995 }
3996 
3997 
3998 /*
3999  *  call-seq:
4000  *     str.gsub!(pattern, replacement)        -> str or nil
4001  *     str.gsub!(pattern) {|match| block }    -> str or nil
4002  *     str.gsub!(pattern)                     -> an_enumerator
4003  *
4004  *  Performs the substitutions of <code>String#gsub</code> in place, returning
4005  *  <i>str</i>, or <code>nil</code> if no substitutions were performed.
4006  *  If no block and no <i>replacement</i> is given, an enumerator is returned instead.
4007  */
4008 
4009 static VALUE
4010 rb_str_gsub_bang(int argc, VALUE *argv, VALUE str)
4011 {
         /* str_modify_keep_cr() is called on the receiver before any
          * matching is done; the work itself is str_gsub() with bang = 1,
          * which returns nil when nothing matched. */
4012     str_modify_keep_cr(str);
4013     return str_gsub(argc, argv, str, 1);
4014 }
4015 
4016 
4017 /*
4018  *  call-seq:
4019  *     str.gsub(pattern, replacement)       -> new_str
4020  *     str.gsub(pattern, hash)              -> new_str
4021  *     str.gsub(pattern) {|match| block }   -> new_str
4022  *     str.gsub(pattern)                    -> enumerator
4023  *
4024  *  Returns a copy of <i>str</i> with <em>all</em> occurrences of
4025  *  <i>pattern</i> substituted for the second argument. The <i>pattern</i> is
4026  *  typically a <code>Regexp</code>; if given as a <code>String</code>, any
4027  *  regular expression metacharacters it contains will be interpreted
4028  *  literally, e.g. <code>'\\\d'</code> will match a backslash followed by 'd',
4029  *  instead of a digit.
4030  *
4031  *  If <i>replacement</i> is a <code>String</code> it will be substituted for
4032  *  the matched text. It may contain back-references to the pattern's capture
4033  *  groups of the form <code>\\\d</code>, where <i>d</i> is a group number, or
4034  *  <code>\\\k<n></code>, where <i>n</i> is a group name. If it is a
4035  *  double-quoted string, both back-references must be preceded by an
4036  *  additional backslash. However, within <i>replacement</i> the special match
4037  *  variables, such as <code>$&</code>, will not refer to the current match.
4038  *
4039  *  If the second argument is a <code>Hash</code>, and the matched text is one
4040  *  of its keys, the corresponding value is the replacement string.
4041  *
4042  *  In the block form, the current match string is passed in as a parameter,
4043  *  and variables such as <code>$1</code>, <code>$2</code>, <code>$`</code>,
4044  *  <code>$&</code>, and <code>$'</code> will be set appropriately. The value
4045  *  returned by the block will be substituted for the match on each call.
4046  *
4047  *  The result inherits any tainting in the original string or any supplied
4048  *  replacement string.
4049  *
4050  *  When neither a block nor a second argument is supplied, an
4051  *  <code>Enumerator</code> is returned.
4052  *
4053  *     "hello".gsub(/[aeiou]/, '*')                  #=> "h*ll*"
4054  *     "hello".gsub(/([aeiou])/, '<\1>')             #=> "h<e>ll<o>"
4055  *     "hello".gsub(/./) {|s| s.ord.to_s + ' '}      #=> "104 101 108 108 111 "
4056  *     "hello".gsub(/(?<foo>[aeiou])/, '{\k<foo>}')  #=> "h{e}ll{o}"
4057  *     'hello'.gsub(/[eo]/, 'e' => 3, 'o' => '*')    #=> "h3ll*"
4058  */
4059 
4060 static VALUE
4061 rb_str_gsub(int argc, VALUE *argv, VALUE str)
4062 {
4063     return str_gsub(argc, argv, str, 0);
4064 }
4065 
4066 
4067 /*
4068  *  call-seq:
4069  *     str.replace(other_str)   -> str
4070  *
4071  *  Replaces the contents and taintedness of <i>str</i> with the corresponding
4072  *  values in <i>other_str</i>.
4073  *
4074  *     s = "hello"         #=> "hello"
4075  *     s.replace "world"   #=> "world"
4076  */
4077 
4078 VALUE
4079 rb_str_replace(VALUE str, VALUE str2)
4080 {
4081     str_modifiable(str);
4082     if (str == str2) return str;
4083 
4084     StringValue(str2);
4085     str_discard(str);
4086     return str_replace(str, str2);
4087 }
4088 
4089 /*
4090  *  call-seq:
4091  *     string.clear    ->  string
4092  *
4093  *  Makes string empty.
4094  *
4095  *     a = "abcde"
4096  *     a.clear    #=> ""
4097  */
4098 
4099 static VALUE
4100 rb_str_clear(VALUE str)
4101 {
4102     str_discard(str);
4103     STR_SET_EMBED(str);
4104     STR_SET_EMBED_LEN(str, 0);
4105     RSTRING_PTR(str)[0] = 0;
4106     if (rb_enc_asciicompat(STR_ENC_GET(str)))
4107         ENC_CODERANGE_SET(str, ENC_CODERANGE_7BIT);
4108     else
4109         ENC_CODERANGE_SET(str, ENC_CODERANGE_VALID);
4110     return str;
4111 }
4112 
4113 /*
4114  *  call-seq:
4115  *     string.chr    ->  string
4116  *
4117  *  Returns a one-character string at the beginning of the string.
4118  *
4119  *     a = "abcde"
4120  *     a.chr    #=> "a"
4121  */
4122 
4123 static VALUE
4124 rb_str_chr(VALUE str)
4125 {
4126     return rb_str_substr(str, 0, 1);
4127 }
4128 
4129 /*
4130  *  call-seq:
4131  *     str.getbyte(index)          -> 0 .. 255
4132  *
4133  *  Returns the <i>index</i>th byte as an integer, or <code>nil</code> if <i>index</i> is out of range.
4134  */
4135 static VALUE
4136 rb_str_getbyte(VALUE str, VALUE index)
4137 {
4138     long pos = NUM2LONG(index);
4139 
4140     if (pos < 0)
4141         pos += RSTRING_LEN(str);
4142     if (pos < 0 ||  RSTRING_LEN(str) <= pos)
4143         return Qnil;
4144 
4145     return INT2FIX((unsigned char)RSTRING_PTR(str)[pos]);
4146 }
4147 
4148 /*
4149  *  call-seq:
4150  *     str.setbyte(index, integer) -> integer
4151  *
4152  *  Sets the <i>index</i>th byte to <i>integer</i> and returns <i>integer</i>.
4153  */
4154 static VALUE
4155 rb_str_setbyte(VALUE str, VALUE index, VALUE value)
4156 {
4157     long pos = NUM2LONG(index);
4158     int byte = NUM2INT(value);
4159 
4160     rb_str_modify(str);
4161 
4162     if (pos < -RSTRING_LEN(str) || RSTRING_LEN(str) <= pos)
4163         rb_raise(rb_eIndexError, "index %ld out of string", pos);
4164     if (pos < 0)
4165         pos += RSTRING_LEN(str);
4166 
4167     RSTRING_PTR(str)[pos] = byte;
4168 
4169     return value;
4170 }
4171 
4172 static VALUE
4173 str_byte_substr(VALUE str, long beg, long len)
4174 {
4175     char *p, *s = RSTRING_PTR(str);
4176     long n = RSTRING_LEN(str);
4177     VALUE str2;
4178 
4179     if (beg > n || len < 0) return Qnil;
4180     if (beg < 0) {
4181         beg += n;
4182         if (beg < 0) return Qnil;
4183     }
4184     if (beg + len > n)
4185         len = n - beg;
4186     if (len <= 0) {
4187         len = 0;
4188         p = 0;
4189     }
4190     else
4191         p = s + beg;
4192 
4193     if (len > RSTRING_EMBED_LEN_MAX && beg + len == n) {
4194         str2 = rb_str_new4(str);
4195         str2 = str_new3(rb_obj_class(str2), str2);
4196         RSTRING(str2)->as.heap.ptr += RSTRING(str2)->as.heap.len - len;
4197         RSTRING(str2)->as.heap.len = len;
4198     }
4199     else {
4200         str2 = rb_str_new5(str, p, len);
4201     }
4202 
4203     str_enc_copy(str2, str);
4204 
4205     if (RSTRING_LEN(str2) == 0) {
4206         if (!rb_enc_asciicompat(STR_ENC_GET(str)))
4207             ENC_CODERANGE_SET(str2, ENC_CODERANGE_VALID);
4208         else
4209             ENC_CODERANGE_SET(str2, ENC_CODERANGE_7BIT);
4210     }
4211     else {
4212         switch (ENC_CODERANGE(str)) {
4213           case ENC_CODERANGE_7BIT:
4214             ENC_CODERANGE_SET(str2, ENC_CODERANGE_7BIT);
4215             break;
4216           default:
4217             ENC_CODERANGE_SET(str2, ENC_CODERANGE_UNKNOWN);
4218             break;
4219         }
4220     }
4221 
4222     OBJ_INFECT(str2, str);
4223 
4224     return str2;
4225 }
4226 
4227 static VALUE
4228 str_byte_aref(VALUE str, VALUE indx)
4229 {
4230     long idx;
4231     switch (TYPE(indx)) {
4232       case T_FIXNUM:
4233         idx = FIX2LONG(indx);
4234 
4235       num_index:
4236         str = str_byte_substr(str, idx, 1);
4237         if (NIL_P(str) || RSTRING_LEN(str) == 0) return Qnil;
4238         return str;
4239 
4240       default:
4241         /* check if indx is Range */
4242         {
4243             long beg, len = RSTRING_LEN(str);
4244 
4245             switch (rb_range_beg_len(indx, &beg, &len, len, 0)) {
4246               case Qfalse:
4247                 break;
4248               case Qnil:
4249                 return Qnil;
4250               default:
4251                 return str_byte_substr(str, beg, len);
4252             }
4253         }
4254         idx = NUM2LONG(indx);
4255         goto num_index;
4256     }
4257 
4258     UNREACHABLE;
4259 }
4260 
4261 /*
4262  *  call-seq:
4263  *     str.byteslice(fixnum)           -> new_str or nil
4264  *     str.byteslice(fixnum, fixnum)   -> new_str or nil
4265  *     str.byteslice(range)            -> new_str or nil
4266  *
4267  *  Byte Reference---If passed a single <code>Fixnum</code>, returns a
4268  *  substring of one byte at that position. If passed two <code>Fixnum</code>
4269  *  objects, returns a substring starting at the offset given by the first, and
4270  *  a length given by the second. If given a <code>Range</code>, a substring containing
4271  *  bytes at offsets given by the range is returned. In all three cases, if
4272  *  an offset is negative, it is counted from the end of <i>str</i>. Returns
4273  *  <code>nil</code> if the initial offset falls outside the string, the length
4274  *  is negative, or the beginning of the range is greater than the end.
4275  *  The resulting string keeps the encoding of the original string.
4276  *
4277  *     "hello".byteslice(1)     #=> "e"
4278  *     "hello".byteslice(-1)    #=> "o"
4279  *     "hello".byteslice(1, 2)  #=> "el"
4280  *     "\x80\u3042".byteslice(1, 3) #=> "\u3042"
4281  *     "\x03\u3042\xff".byteslice(1..3) #=> "\u3042"
4282  */
4283 
4284 static VALUE
4285 rb_str_byteslice(int argc, VALUE *argv, VALUE str)
4286 {
4287     if (argc == 2) {
4288         return str_byte_substr(str, NUM2LONG(argv[0]), NUM2LONG(argv[1]));
4289     }
4290     rb_check_arity(argc, 1, 2);
4291     return str_byte_aref(str, argv[0]);
4292 }
4293 
4294 /*
4295  *  call-seq:
4296  *     str.reverse   -> new_str
4297  *
4298  *  Returns a new string with the characters from <i>str</i> in reverse order.
4299  *
4300  *     "stressed".reverse   #=> "desserts"
4301  */
4302 
4303 static VALUE
4304 rb_str_reverse(VALUE str)
4305 {
4306     rb_encoding *enc;
4307     VALUE rev;
4308     char *s, *e, *p;
4309     int single = 1;
4310 
4311     if (RSTRING_LEN(str) <= 1) return rb_str_dup(str);
4312     enc = STR_ENC_GET(str);
4313     rev = rb_str_new5(str, 0, RSTRING_LEN(str));
4314     s = RSTRING_PTR(str); e = RSTRING_END(str);
4315     p = RSTRING_END(rev);
4316 
4317     if (RSTRING_LEN(str) > 1) {
4318         if (single_byte_optimizable(str)) {
4319             while (s < e) {
4320                 *--p = *s++;
4321             }
4322         }
4323         else if (ENC_CODERANGE(str) == ENC_CODERANGE_VALID) {
4324             while (s < e) {
4325                 int clen = rb_enc_fast_mbclen(s, e, enc);
4326 
4327                 if (clen > 1 || (*s & 0x80)) single = 0;
4328                 p -= clen;
4329                 memcpy(p, s, clen);
4330                 s += clen;
4331             }
4332         }
4333         else {
4334             while (s < e) {
4335                 int clen = rb_enc_mbclen(s, e, enc);
4336 
4337                 if (clen > 1 || (*s & 0x80)) single = 0;
4338                 p -= clen;
4339                 memcpy(p, s, clen);
4340                 s += clen;
4341             }
4342         }
4343     }
4344     STR_SET_LEN(rev, RSTRING_LEN(str));
4345     OBJ_INFECT(rev, str);
4346     if (ENC_CODERANGE(str) == ENC_CODERANGE_UNKNOWN) {
4347         if (single) {
4348             ENC_CODERANGE_SET(str, ENC_CODERANGE_7BIT);
4349         }
4350         else {
4351             ENC_CODERANGE_SET(str, ENC_CODERANGE_VALID);
4352         }
4353     }
4354     rb_enc_cr_str_copy_for_substr(rev, str);
4355 
4356     return rev;
4357 }
4358 
4359 
4360 /*
4361  *  call-seq:
4362  *     str.reverse!   -> str
4363  *
4364  *  Reverses <i>str</i> in place.
4365  */
4366 
4367 static VALUE
4368 rb_str_reverse_bang(VALUE str)
4369 {
4370     if (RSTRING_LEN(str) > 1) {
4371         if (single_byte_optimizable(str)) {
4372             char *s, *e, c;
4373 
4374             str_modify_keep_cr(str);
4375             s = RSTRING_PTR(str);
4376             e = RSTRING_END(str) - 1;
4377             while (s < e) {
4378                 c = *s;
4379                 *s++ = *e;
4380                 *e-- = c;
4381             }
4382         }
4383         else {
4384             rb_str_shared_replace(str, rb_str_reverse(str));
4385         }
4386     }
4387     else {
4388         str_modify_keep_cr(str);
4389     }
4390     return str;
4391 }
4392 
4393 
4394 /*
4395  *  call-seq:
4396  *     str.include? other_str   -> true or false
4397  *
4398  *  Returns <code>true</code> if <i>str</i> contains the given string or
4399  *  character.
4400  *
4401  *     "hello".include? "lo"   #=> true
4402  *     "hello".include? "ol"   #=> false
4403  *     "hello".include? ?h     #=> true
4404  */
4405 
4406 static VALUE
4407 rb_str_include(VALUE str, VALUE arg)
4408 {
4409     long i;
4410 
4411     StringValue(arg);
4412     i = rb_str_index(str, arg, 0);
4413 
4414     if (i == -1) return Qfalse;
4415     return Qtrue;
4416 }
4417 
4418 
4419 /*
4420  *  call-seq:
4421  *     str.to_i(base=10)   -> integer
4422  *
4423  *  Returns the result of interpreting leading characters in <i>str</i> as an
4424  *  integer base <i>base</i> (between 2 and 36). Extraneous characters past the
4425  *  end of a valid number are ignored. If there is not a valid number at the
4426  *  start of <i>str</i>, <code>0</code> is returned. This method never raises an
4427  *  exception when <i>base</i> is valid.
4428  *
4429  *     "12345".to_i             #=> 12345
4430  *     "99 red balloons".to_i   #=> 99
4431  *     "0a".to_i                #=> 0
4432  *     "0a".to_i(16)            #=> 10
4433  *     "hello".to_i             #=> 0
4434  *     "1100101".to_i(2)        #=> 101
4435  *     "1100101".to_i(8)        #=> 294977
4436  *     "1100101".to_i(10)       #=> 1100101
4437  *     "1100101".to_i(16)       #=> 17826049
4438  */
4439 
4440 static VALUE
4441 rb_str_to_i(int argc, VALUE *argv, VALUE str)
4442 {
4443     int base;
4444 
4445     if (argc == 0) base = 10;
4446     else {
4447         VALUE b;
4448 
4449         rb_scan_args(argc, argv, "01", &b);
4450         base = NUM2INT(b);
4451     }
4452     if (base < 0) {
4453         rb_raise(rb_eArgError, "invalid radix %d", base);
4454     }
4455     return rb_str_to_inum(str, base, FALSE);
4456 }
4457 
4458 
4459 /*
4460  *  call-seq:
4461  *     str.to_f   -> float
4462  *
4463  *  Returns the result of interpreting leading characters in <i>str</i> as a
4464  *  floating point number. Extraneous characters past the end of a valid number
4465  *  are ignored. If there is not a valid number at the start of <i>str</i>,
4466  *  <code>0.0</code> is returned. This method never raises an exception.
4467  *
4468  *     "123.45e1".to_f        #=> 1234.5
4469  *     "45.67 degrees".to_f   #=> 45.67
4470  *     "thx1138".to_f         #=> 0.0
4471  */
4472 
4473 static VALUE
4474 rb_str_to_f(VALUE str)
4475 {
4476     return DBL2NUM(rb_str_to_dbl(str, FALSE));
4477 }
4478 
4479 
4480 /*
4481  *  call-seq:
4482  *     str.to_s     -> str
4483  *     str.to_str   -> str
4484  *
4485  *  Returns the receiver. If called on an instance of a String subclass, returns a String copy of the receiver instead.
4486  */
4487 
4488 static VALUE
4489 rb_str_to_s(VALUE str)
4490 {
4491     if (rb_obj_class(str) != rb_cString) {
4492         return str_duplicate(rb_cString, str);
4493     }
4494     return str;
4495 }
4496 
#if 0
/* Compiled out.  Encodes code point c in enc and appends the bytes to str. */
static void
str_cat_char(VALUE str, unsigned int c, rb_encoding *enc)
{
    char encoded[RUBY_MAX_CHAR_LEN];
    const int nbytes = rb_enc_codelen(c, enc);

    rb_enc_mbcput(c, encoded, enc);
    rb_enc_str_buf_cat(str, encoded, nbytes, enc);
}
#endif
4508 
4509 #define CHAR_ESC_LEN 13 /* sizeof(\x{ hex of 32bit unsigned int } \0) */
4510 
4511 int
4512 rb_str_buf_cat_escaped_char(VALUE result, unsigned int c, int unicode_p)
4513 {
4514     char buf[CHAR_ESC_LEN + 1];
4515     int l;
4516 
4517 #if SIZEOF_INT > 4
4518     c &= 0xffffffff;
4519 #endif
4520     if (unicode_p) {
4521         if (c < 0x7F && ISPRINT(c)) {
4522             snprintf(buf, CHAR_ESC_LEN, "%c", c);
4523         }
4524         else if (c < 0x10000) {
4525             snprintf(buf, CHAR_ESC_LEN, "\\u%04X", c);
4526         }
4527         else {
4528             snprintf(buf, CHAR_ESC_LEN, "\\u{%X}", c);
4529         }
4530     }
4531     else {
4532         if (c < 0x100) {
4533             snprintf(buf, CHAR_ESC_LEN, "\\x%02X", c);
4534         }
4535         else {
4536             snprintf(buf, CHAR_ESC_LEN, "\\x{%X}", c);
4537         }
4538     }
4539     l = (int)strlen(buf);       /* CHAR_ESC_LEN cannot exceed INT_MAX */
4540     rb_str_buf_cat(result, buf, l);
4541     return l;
4542 }
4543 
4544 /*
4545  * call-seq:
4546  *   str.inspect   -> string
4547  *
4548  * Returns a printable version of _str_, surrounded by quote marks,
4549  * with special characters escaped.
4550  *
4551  *    str = "hello"
4552  *    str[3] = "\b"
4553  *    str.inspect       #=> "\"hel\\bo\""
4554  */
4555 
4556 VALUE
4557 rb_str_inspect(VALUE str)
4558 {
4559     rb_encoding *enc = STR_ENC_GET(str);
4560     int encidx = rb_enc_to_index(enc);
4561     const char *p, *pend, *prev;
4562     char buf[CHAR_ESC_LEN + 1];
4563     VALUE result = rb_str_buf_new(0);
4564     rb_encoding *resenc = rb_default_internal_encoding();
4565     int unicode_p = rb_enc_unicode_p(enc);
4566     int asciicompat = rb_enc_asciicompat(enc);
4567 
4568     if (resenc == NULL) resenc = rb_default_external_encoding();
4569     if (!rb_enc_asciicompat(resenc)) resenc = rb_usascii_encoding();
4570     rb_enc_associate(result, resenc);
4571     str_buf_cat2(result, "\"");
4572 
4573     p = RSTRING_PTR(str); pend = RSTRING_END(str);
4574     prev = p;
4575     if (encidx == ENCINDEX_UTF_16) {
4576         const unsigned char *q = (const unsigned char *)p;
4577         if (q[0] == 0xFE && q[1] == 0xFF)
4578             enc = rb_enc_from_index(ENCINDEX_UTF_16BE);
4579         else if (q[0] == 0xFF && q[1] == 0xFE)
4580             enc = rb_enc_from_index(ENCINDEX_UTF_16LE);
4581         else
4582             unicode_p = 0;
4583     }
4584     else if (encidx == ENCINDEX_UTF_32) {
4585         const unsigned char *q = (const unsigned char *)p;
4586         if (q[0] == 0 && q[1] == 0 && q[2] == 0xFE && q[3] == 0xFF)
4587             enc = rb_enc_from_index(ENCINDEX_UTF_32BE);
4588         else if (q[3] == 0 && q[2] == 0 && q[1] == 0xFE && q[0] == 0xFF)
4589             enc = rb_enc_from_index(ENCINDEX_UTF_32LE);
4590         else
4591             unicode_p = 0;
4592     }
4593     while (p < pend) {
4594         unsigned int c, cc;
4595         int n;
4596 
4597         n = rb_enc_precise_mbclen(p, pend, enc);
4598         if (!MBCLEN_CHARFOUND_P(n)) {
4599             if (p > prev) str_buf_cat(result, prev, p - prev);
4600             n = rb_enc_mbminlen(enc);
4601             if (pend < p + n)
4602                 n = (int)(pend - p);
4603             while (n--) {
4604                 snprintf(buf, CHAR_ESC_LEN, "\\x%02X", *p & 0377);
4605                 str_buf_cat(result, buf, strlen(buf));
4606                 prev = ++p;
4607             }
4608             continue;
4609         }
4610         n = MBCLEN_CHARFOUND_LEN(n);
4611         c = rb_enc_mbc_to_codepoint(p, pend, enc);
4612         p += n;
4613         if ((asciicompat || unicode_p) &&
4614           (c == '"'|| c == '\\' ||
4615             (c == '#' &&
4616              p < pend &&
4617              MBCLEN_CHARFOUND_P(rb_enc_precise_mbclen(p,pend,enc)) &&
4618              (cc = rb_enc_codepoint(p,pend,enc),
4619               (cc == '$' || cc == '@' || cc == '{'))))) {
4620             if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
4621             str_buf_cat2(result, "\\");
4622             if (asciicompat || enc == resenc) {
4623                 prev = p - n;
4624                 continue;
4625             }
4626         }
4627         switch (c) {
4628           case '\n': cc = 'n'; break;
4629           case '\r': cc = 'r'; break;
4630           case '\t': cc = 't'; break;
4631           case '\f': cc = 'f'; break;
4632           case '\013': cc = 'v'; break;
4633           case '\010': cc = 'b'; break;
4634           case '\007': cc = 'a'; break;
4635           case 033: cc = 'e'; break;
4636           default: cc = 0; break;
4637         }
4638         if (cc) {
4639             if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
4640             buf[0] = '\\';
4641             buf[1] = (char)cc;
4642             str_buf_cat(result, buf, 2);
4643             prev = p;
4644             continue;
4645         }
4646         if ((enc == resenc && rb_enc_isprint(c, enc)) ||
4647             (asciicompat && rb_enc_isascii(c, enc) && ISPRINT(c))) {
4648             continue;
4649         }
4650         else {
4651             if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
4652             rb_str_buf_cat_escaped_char(result, c, unicode_p);
4653             prev = p;
4654             continue;
4655         }
4656     }
4657     if (p > prev) str_buf_cat(result, prev, p - prev);
4658     str_buf_cat2(result, "\"");
4659 
4660     OBJ_INFECT(result, str);
4661     return result;
4662 }
4663 
/* True when p lies before e and the byte at p is '$', '@' or '{'.
 * Evaluates p more than once, so pass an expression without side effects. */
#define IS_EVSTR(p,e) \
    ((p) < (e) && memchr("$@{", *(p), 3) != NULL)
4665 
4666 /*
4667  *  call-seq:
4668  *     str.dump   -> new_str
4669  *
4670  *  Produces a version of +str+ with all non-printing characters replaced by
4671  *  <code>\nnn</code> notation and all special characters escaped.
4672  *
4673  *    "hello \n ''".dump  #=> "\"hello \\n ''\""
4674  */
4675 
4676 VALUE
4677 rb_str_dump(VALUE str)
4678 {
4679     rb_encoding *enc = rb_enc_get(str);
4680     long len;
4681     const char *p, *pend;
4682     char *q, *qend;
4683     VALUE result;
4684     int u8 = (enc == rb_utf8_encoding());
4685 
4686     len = 2;                    /* "" */
4687     p = RSTRING_PTR(str); pend = p + RSTRING_LEN(str);
4688     while (p < pend) {
4689         unsigned char c = *p++;
4690         switch (c) {
4691           case '"':  case '\\':
4692           case '\n': case '\r':
4693           case '\t': case '\f':
4694           case '\013': case '\010': case '\007': case '\033':
4695             len += 2;
4696             break;
4697 
4698           case '#':
4699             len += IS_EVSTR(p, pend) ? 2 : 1;
4700             break;
4701 
4702           default:
4703             if (ISPRINT(c)) {
4704                 len++;
4705             }
4706             else {
4707                 if (u8 && c > 0x7F) {   /* \u{NN} */
4708                     int n = rb_enc_precise_mbclen(p-1, pend, enc);
4709                     if (MBCLEN_CHARFOUND_P(n)) {
4710                         unsigned int cc = rb_enc_mbc_to_codepoint(p-1, pend, enc);
4711                         while (cc >>= 4) len++;
4712                         len += 5;
4713                         p += MBCLEN_CHARFOUND_LEN(n)-1;
4714                         break;
4715                     }
4716                 }
4717                 len += 4;       /* \xNN */
4718             }
4719             break;
4720         }
4721     }
4722     if (!rb_enc_asciicompat(enc)) {
4723         len += 19;              /* ".force_encoding('')" */
4724         len += strlen(enc->name);
4725     }
4726 
4727     result = rb_str_new5(str, 0, len);
4728     p = RSTRING_PTR(str); pend = p + RSTRING_LEN(str);
4729     q = RSTRING_PTR(result); qend = q + len + 1;
4730 
4731     *q++ = '"';
4732     while (p < pend) {
4733         unsigned char c = *p++;
4734 
4735         if (c == '"' || c == '\\') {
4736             *q++ = '\\';
4737             *q++ = c;
4738         }
4739         else if (c == '#') {
4740             if (IS_EVSTR(p, pend)) *q++ = '\\';
4741             *q++ = '#';
4742         }
4743         else if (c == '\n') {
4744             *q++ = '\\';
4745             *q++ = 'n';
4746         }
4747         else if (c == '\r') {
4748             *q++ = '\\';
4749             *q++ = 'r';
4750         }
4751         else if (c == '\t') {
4752             *q++ = '\\';
4753             *q++ = 't';
4754         }
4755         else if (c == '\f') {
4756             *q++ = '\\';
4757             *q++ = 'f';
4758         }
4759         else if (c == '\013') {
4760             *q++ = '\\';
4761             *q++ = 'v';
4762         }
4763         else if (c == '\010') {
4764             *q++ = '\\';
4765             *q++ = 'b';
4766         }
4767         else if (c == '\007') {
4768             *q++ = '\\';
4769             *q++ = 'a';
4770         }
4771         else if (c == '\033') {
4772             *q++ = '\\';
4773             *q++ = 'e';
4774         }
4775         else if (ISPRINT(c)) {
4776             *q++ = c;
4777         }
4778         else {
4779             *q++ = '\\';
4780             if (u8) {
4781                 int n = rb_enc_precise_mbclen(p-1, pend, enc) - 1;
4782                 if (MBCLEN_CHARFOUND_P(n)) {
4783                     int cc = rb_enc_mbc_to_codepoint(p-1, pend, enc);
4784                     p += n;
4785                     snprintf(q, qend-q, "u{%x}", cc);
4786                     q += strlen(q);
4787                     continue;
4788                 }
4789             }
4790             snprintf(q, qend-q, "x%02X", c);
4791             q += 3;
4792         }
4793     }
4794     *q++ = '"';
4795     *q = '\0';
4796     if (!rb_enc_asciicompat(enc)) {
4797         snprintf(q, qend-q, ".force_encoding(\"%s\")", enc->name);
4798         enc = rb_ascii8bit_encoding();
4799     }
4800     OBJ_INFECT(result, str);
4801     /* result from dump is ASCII */
4802     rb_enc_associate(result, enc);
4803     ENC_CODERANGE_SET(result, ENC_CODERANGE_7BIT);
4804     return result;
4805 }
4806 
4807 
4808 static void
4809 rb_str_check_dummy_enc(rb_encoding *enc)
4810 {
4811     if (rb_enc_dummy_p(enc)) {
4812         rb_raise(rb_eEncCompatError, "incompatible encoding with this operation: %s",
4813                  rb_enc_name(enc));
4814     }
4815 }
4816 
4817 /*
4818  *  call-seq:
4819  *     str.upcase!   -> str or nil
4820  *
4821  *  Upcases the contents of <i>str</i>, returning <code>nil</code> if no changes
4822  *  were made.
4823  *  Note: case replacement is effective only in ASCII region.
4824  */
4825 
4826 static VALUE
4827 rb_str_upcase_bang(VALUE str)
4828 {
4829     rb_encoding *enc;
4830     char *s, *send;
4831     int modify = 0;
4832     int n;
4833 
4834     str_modify_keep_cr(str);
4835     enc = STR_ENC_GET(str);
4836     rb_str_check_dummy_enc(enc);
4837     s = RSTRING_PTR(str); send = RSTRING_END(str);
4838     if (single_byte_optimizable(str)) {
4839         while (s < send) {
4840             unsigned int c = *(unsigned char*)s;
4841 
4842             if (rb_enc_isascii(c, enc) && 'a' <= c && c <= 'z') {
4843                 *s = 'A' + (c - 'a');
4844                 modify = 1;
4845             }
4846             s++;
4847         }
4848     }
4849     else {
4850         int ascompat = rb_enc_asciicompat(enc);
4851 
4852         while (s < send) {
4853             unsigned int c;
4854 
4855             if (ascompat && (c = *(unsigned char*)s) < 0x80) {
4856                 if (rb_enc_isascii(c, enc) && 'a' <= c && c <= 'z') {
4857                     *s = 'A' + (c - 'a');
4858                     modify = 1;
4859                 }
4860                 s++;
4861             }
4862             else {
4863                 c = rb_enc_codepoint_len(s, send, &n, enc);
4864                 if (rb_enc_islower(c, enc)) {
4865                     /* assuming toupper returns codepoint with same size */
4866                     rb_enc_mbcput(rb_enc_toupper(c, enc), s, enc);
4867                     modify = 1;
4868                 }
4869                 s += n;
4870             }
4871         }
4872     }
4873 
4874     if (modify) return str;
4875     return Qnil;
4876 }
4877 
4878 
4879 /*
4880  *  call-seq:
4881  *     str.upcase   -> new_str
4882  *
4883  *  Returns a copy of <i>str</i> with all lowercase letters replaced with their
4884  *  uppercase counterparts. The operation is locale insensitive---only
4885  *  characters ``a'' to ``z'' are affected.
4886  *  Note: case replacement is effective only in ASCII region.
4887  *
4888  *     "hEllO".upcase   #=> "HELLO"
4889  */
4890 
4891 static VALUE
4892 rb_str_upcase(VALUE str)
4893 {
4894     str = rb_str_dup(str);
4895     rb_str_upcase_bang(str);
4896     return str;
4897 }
4898 
4899 
4900 /*
4901  *  call-seq:
4902  *     str.downcase!   -> str or nil
4903  *
4904  *  Downcases the contents of <i>str</i>, returning <code>nil</code> if no
4905  *  changes were made.
4906  *  Note: case replacement is effective only in ASCII region.
4907  */
4908 
4909 static VALUE
4910 rb_str_downcase_bang(VALUE str)
4911 {
4912     rb_encoding *enc;
4913     char *s, *send;
4914     int modify = 0;
4915 
4916     str_modify_keep_cr(str);
4917     enc = STR_ENC_GET(str);
4918     rb_str_check_dummy_enc(enc);
4919     s = RSTRING_PTR(str); send = RSTRING_END(str);
4920     if (single_byte_optimizable(str)) {
4921         while (s < send) {
4922             unsigned int c = *(unsigned char*)s;
4923 
4924             if (rb_enc_isascii(c, enc) && 'A' <= c && c <= 'Z') {
4925                 *s = 'a' + (c - 'A');
4926                 modify = 1;
4927             }
4928             s++;
4929         }
4930     }
4931     else {
4932         int ascompat = rb_enc_asciicompat(enc);
4933 
4934         while (s < send) {
4935             unsigned int c;
4936             int n;
4937 
4938             if (ascompat && (c = *(unsigned char*)s) < 0x80) {
4939                 if (rb_enc_isascii(c, enc) && 'A' <= c && c <= 'Z') {
4940                     *s = 'a' + (c - 'A');
4941                     modify = 1;
4942                 }
4943                 s++;
4944             }
4945             else {
4946                 c = rb_enc_codepoint_len(s, send, &n, enc);
4947                 if (rb_enc_isupper(c, enc)) {
4948                     /* assuming toupper returns codepoint with same size */
4949                     rb_enc_mbcput(rb_enc_tolower(c, enc), s, enc);
4950                     modify = 1;
4951                 }
4952                 s += n;
4953             }
4954         }
4955     }
4956 
4957     if (modify) return str;
4958     return Qnil;
4959 }
4960 
4961 
4962 /*
4963  *  call-seq:
4964  *     str.downcase   -> new_str
4965  *
4966  *  Returns a copy of <i>str</i> with all uppercase letters replaced with their
4967  *  lowercase counterparts. The operation is locale insensitive---only
4968  *  characters ``A'' to ``Z'' are affected.
4969  *  Note: case replacement is effective only in ASCII region.
4970  *
4971  *     "hEllO".downcase   #=> "hello"
4972  */
4973 
4974 static VALUE
4975 rb_str_downcase(VALUE str)
4976 {
4977     str = rb_str_dup(str);
4978     rb_str_downcase_bang(str);
4979     return str;
4980 }
4981 
4982 
4983 /*
4984  *  call-seq:
4985  *     str.capitalize!   -> str or nil
4986  *
4987  *  Modifies <i>str</i> by converting the first character to uppercase and the
4988  *  remainder to lowercase. Returns <code>nil</code> if no changes are made.
4989  *  Note: case conversion is effective only in ASCII region.
4990  *
4991  *     a = "hello"
4992  *     a.capitalize!   #=> "Hello"
4993  *     a               #=> "Hello"
4994  *     a.capitalize!   #=> nil
4995  */
4996 
4997 static VALUE
4998 rb_str_capitalize_bang(VALUE str)
4999 {
5000     rb_encoding *enc;
5001     char *s, *send;
5002     int modify = 0;
5003     unsigned int c;
5004     int n;
5005 
5006     str_modify_keep_cr(str);
5007     enc = STR_ENC_GET(str);
5008     rb_str_check_dummy_enc(enc);
5009     if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str)) return Qnil;
5010     s = RSTRING_PTR(str); send = RSTRING_END(str);
5011 
5012     c = rb_enc_codepoint_len(s, send, &n, enc);
5013     if (rb_enc_islower(c, enc)) {
5014         rb_enc_mbcput(rb_enc_toupper(c, enc), s, enc);
5015         modify = 1;
5016     }
5017     s += n;
5018     while (s < send) {
5019         c = rb_enc_codepoint_len(s, send, &n, enc);
5020         if (rb_enc_isupper(c, enc)) {
5021             rb_enc_mbcput(rb_enc_tolower(c, enc), s, enc);
5022             modify = 1;
5023         }
5024         s += n;
5025     }
5026 
5027     if (modify) return str;
5028     return Qnil;
5029 }
5030 
5031 
5032 /*
5033  *  call-seq:
5034  *     str.capitalize   -> new_str
5035  *
5036  *  Returns a copy of <i>str</i> with the first character converted to uppercase
5037  *  and the remainder to lowercase.
5038  *  Note: case conversion is effective only in ASCII region.
5039  *
5040  *     "hello".capitalize    #=> "Hello"
5041  *     "HELLO".capitalize    #=> "Hello"
5042  *     "123ABC".capitalize   #=> "123abc"
5043  */
5044 
5045 static VALUE
5046 rb_str_capitalize(VALUE str)
5047 {
5048     str = rb_str_dup(str);
5049     rb_str_capitalize_bang(str);
5050     return str;
5051 }
5052 
5053 
5054 /*
5055  *  call-seq:
5056  *     str.swapcase!   -> str or nil
5057  *
5058  *  Equivalent to <code>String#swapcase</code>, but modifies the receiver in
5059  *  place, returning <i>str</i>, or <code>nil</code> if no changes were made.
5060  *  Note: case conversion is effective only in ASCII region.
5061  */
5062 
5063 static VALUE
5064 rb_str_swapcase_bang(VALUE str)
5065 {
5066     rb_encoding *enc;
5067     char *s, *send;
5068     int modify = 0;
5069     int n;
5070 
5071     str_modify_keep_cr(str);
5072     enc = STR_ENC_GET(str);
5073     rb_str_check_dummy_enc(enc);
5074     s = RSTRING_PTR(str); send = RSTRING_END(str);
5075     while (s < send) {
5076         unsigned int c = rb_enc_codepoint_len(s, send, &n, enc);
5077 
5078         if (rb_enc_isupper(c, enc)) {
5079             /* assuming toupper returns codepoint with same size */
5080             rb_enc_mbcput(rb_enc_tolower(c, enc), s, enc);
5081             modify = 1;
5082         }
5083         else if (rb_enc_islower(c, enc)) {
5084             /* assuming tolower returns codepoint with same size */
5085             rb_enc_mbcput(rb_enc_toupper(c, enc), s, enc);
5086             modify = 1;
5087         }
5088         s += n;
5089     }
5090 
5091     if (modify) return str;
5092     return Qnil;
5093 }
5094 
5095 
5096 /*
5097  *  call-seq:
5098  *     str.swapcase   -> new_str
5099  *
5100  *  Returns a copy of <i>str</i> with uppercase alphabetic characters converted
5101  *  to lowercase and lowercase characters converted to uppercase.
5102  *  Note: case conversion is effective only in ASCII region.
5103  *
5104  *     "Hello".swapcase          #=> "hELLO"
5105  *     "cYbEr_PuNk11".swapcase   #=> "CyBeR_pUnK11"
5106  */
5107 
5108 static VALUE
5109 rb_str_swapcase(VALUE str)
5110 {
5111     str = rb_str_dup(str);
5112     rb_str_swapcase_bang(str);
5113     return str;
5114 }
5115 
/* Pointer to unsigned string bytes. */
typedef unsigned char *USTR;

/*
 * Read cursor over a tr-style character specification such as "a-z",
 * consumed one code point at a time by trnext().
 */
struct tr {
    int gen;            /* non-zero while a "c1-c2" range is being expanded */
    unsigned int now;   /* code point produced most recently */
    unsigned int max;   /* upper bound of the range being expanded */
    char *p;            /* next unread byte of the specification */
    char *pend;         /* one past the last byte of the specification */
};
5123 
5124 static unsigned int
5125 trnext(struct tr *t, rb_encoding *enc)
5126 {
5127     int n;
5128 
5129     for (;;) {
5130         if (!t->gen) {
5131 nextpart:
5132             if (t->p == t->pend) return -1;
5133             if (rb_enc_ascget(t->p, t->pend, &n, enc) == '\\' && t->p + n < t->pend) {
5134                 t->p += n;
5135             }
5136             t->now = rb_enc_codepoint_len(t->p, t->pend, &n, enc);
5137             t->p += n;
5138             if (rb_enc_ascget(t->p, t->pend, &n, enc) == '-' && t->p + n < t->pend) {
5139                 t->p += n;
5140                 if (t->p < t->pend) {
5141                     unsigned int c = rb_enc_codepoint_len(t->p, t->pend, &n, enc);
5142                     t->p += n;
5143                     if (t->now > c) {
5144                         if (t->now < 0x80 && c < 0x80) {
5145                             rb_raise(rb_eArgError,
5146                                      "invalid range \"%c-%c\" in string transliteration",
5147                                      t->now, c);
5148                         }
5149                         else {
5150                             rb_raise(rb_eArgError, "invalid range in string transliteration");
5151                         }
5152                         continue; /* not reached */
5153                     }
5154                     t->gen = 1;
5155                     t->max = c;
5156                 }
5157             }
5158             return t->now;
5159         }
5160         else {
5161             while (ONIGENC_CODE_TO_MBCLEN(enc, ++t->now) <= 0) {
5162                 if (t->now == t->max) {
5163                     t->gen = 0;
5164                     goto nextpart;
5165                 }
5166             }
5167             if (t->now < t->max) {
5168                 return t->now;
5169             }
5170             else {
5171                 t->gen = 0;
5172                 return t->max;
5173             }
5174         }
5175     }
5176 }
5177 
5178 static VALUE rb_str_delete_bang(int,VALUE*,VALUE);
5179 
5180 static VALUE
5181 tr_trans(VALUE str, VALUE src, VALUE repl, int sflag)
5182 {
5183     const unsigned int errc = -1;
5184     unsigned int trans[256];
5185     rb_encoding *enc, *e1, *e2;
5186     struct tr trsrc, trrepl;
5187     int cflag = 0;
5188     unsigned int c, c0, last = 0;
5189     int modify = 0, i, l;
5190     char *s, *send;
5191     VALUE hash = 0;
5192     int singlebyte = single_byte_optimizable(str);
5193     int cr;
5194 
5195 #define CHECK_IF_ASCII(c) \
5196     (void)((cr == ENC_CODERANGE_7BIT && !rb_isascii(c)) ? \
5197            (cr = ENC_CODERANGE_VALID) : 0)
5198 
5199     StringValue(src);
5200     StringValue(repl);
5201     if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str)) return Qnil;
5202     if (RSTRING_LEN(repl) == 0) {
5203         return rb_str_delete_bang(1, &src, str);
5204     }
5205 
5206     cr = ENC_CODERANGE(str);
5207     e1 = rb_enc_check(str, src);
5208     e2 = rb_enc_check(str, repl);
5209     if (e1 == e2) {
5210         enc = e1;
5211     }
5212     else {
5213         enc = rb_enc_check(src, repl);
5214     }
5215     trsrc.p = RSTRING_PTR(src); trsrc.pend = trsrc.p + RSTRING_LEN(src);
5216     if (RSTRING_LEN(src) > 1 &&
5217         rb_enc_ascget(trsrc.p, trsrc.pend, &l, enc) == '^' &&
5218         trsrc.p + l < trsrc.pend) {
5219         cflag = 1;
5220         trsrc.p += l;
5221     }
5222     trrepl.p = RSTRING_PTR(repl);
5223     trrepl.pend = trrepl.p + RSTRING_LEN(repl);
5224     trsrc.gen = trrepl.gen = 0;
5225     trsrc.now = trrepl.now = 0;
5226     trsrc.max = trrepl.max = 0;
5227 
5228     if (cflag) {
5229         for (i=0; i<256; i++) {
5230             trans[i] = 1;
5231         }
5232         while ((c = trnext(&trsrc, enc)) != errc) {
5233             if (c < 256) {
5234                 trans[c] = errc;
5235             }
5236             else {
5237                 if (!hash) hash = rb_hash_new();
5238                 rb_hash_aset(hash, UINT2NUM(c), Qtrue);
5239             }
5240         }
5241         while ((c = trnext(&trrepl, enc)) != errc)
5242             /* retrieve last replacer */;
5243         last = trrepl.now;
5244         for (i=0; i<256; i++) {
5245             if (trans[i] != errc) {
5246                 trans[i] = last;
5247             }
5248         }
5249     }
5250     else {
5251         unsigned int r;
5252 
5253         for (i=0; i<256; i++) {
5254             trans[i] = errc;
5255         }
5256         while ((c = trnext(&trsrc, enc)) != errc) {
5257             r = trnext(&trrepl, enc);
5258             if (r == errc) r = trrepl.now;
5259             if (c < 256) {
5260                 trans[c] = r;
5261                 if (rb_enc_codelen(r, enc) != 1) singlebyte = 0;
5262             }
5263             else {
5264                 if (!hash) hash = rb_hash_new();
5265                 rb_hash_aset(hash, UINT2NUM(c), UINT2NUM(r));
5266             }
5267         }
5268     }
5269 
5270     if (cr == ENC_CODERANGE_VALID)
5271         cr = ENC_CODERANGE_7BIT;
5272     str_modify_keep_cr(str);
5273     s = RSTRING_PTR(str); send = RSTRING_END(str);
5274     if (sflag) {
5275         int clen, tlen;
5276         long offset, max = RSTRING_LEN(str);
5277         unsigned int save = -1;
5278         char *buf = ALLOC_N(char, max), *t = buf;
5279 
5280         while (s < send) {
5281             int may_modify = 0;
5282 
5283             c0 = c = rb_enc_codepoint_len(s, send, &clen, e1);
5284             tlen = enc == e1 ? clen : rb_enc_codelen(c, enc);
5285 
5286             s += clen;
5287             if (c < 256) {
5288                 c = trans[c];
5289             }
5290             else if (hash) {
5291                 VALUE tmp = rb_hash_lookup(hash, UINT2NUM(c));
5292                 if (NIL_P(tmp)) {
5293                     if (cflag) c = last;
5294                     else c = errc;
5295                 }
5296                 else if (cflag) c = errc;
5297                 else c = NUM2INT(tmp);
5298             }
5299             else {
5300                 c = errc;
5301             }
5302             if (c != (unsigned int)-1) {
5303                 if (save == c) {
5304                     CHECK_IF_ASCII(c);
5305                     continue;
5306                 }
5307                 save = c;
5308                 tlen = rb_enc_codelen(c, enc);
5309                 modify = 1;
5310             }
5311             else {
5312                 save = -1;
5313                 c = c0;
5314                 if (enc != e1) may_modify = 1;
5315             }
5316             while (t - buf + tlen >= max) {
5317                 offset = t - buf;
5318                 max *= 2;
5319                 REALLOC_N(buf, char, max);
5320                 t = buf + offset;
5321             }
5322             rb_enc_mbcput(c, t, enc);
5323             if (may_modify && memcmp(s, t, tlen) != 0) {
5324                 modify = 1;
5325             }
5326             CHECK_IF_ASCII(c);
5327             t += tlen;
5328         }
5329         if (!STR_EMBED_P(str)) {
5330             xfree(RSTRING(str)->as.heap.ptr);
5331         }
5332         *t = '\0';
5333         RSTRING(str)->as.heap.ptr = buf;
5334         RSTRING(str)->as.heap.len = t - buf;
5335         STR_SET_NOEMBED(str);
5336         RSTRING(str)->as.heap.aux.capa = max;
5337     }
5338     else if (rb_enc_mbmaxlen(enc) == 1 || (singlebyte && !hash)) {
5339         while (s < send) {
5340             c = (unsigned char)*s;
5341             if (trans[c] != errc) {
5342                 if (!cflag) {
5343                     c = trans[c];
5344                     *s = c;
5345                     modify = 1;
5346                 }
5347                 else {
5348                     *s = last;
5349                     modify = 1;
5350                 }
5351             }
5352             CHECK_IF_ASCII(c);
5353             s++;
5354         }
5355     }
5356     else {
5357         int clen, tlen, max = (int)(RSTRING_LEN(str) * 1.2);
5358         long offset;
5359         char *buf = ALLOC_N(char, max), *t = buf;
5360 
5361         while (s < send) {
5362             int may_modify = 0;
5363             c0 = c = rb_enc_codepoint_len(s, send, &clen, e1);
5364             tlen = enc == e1 ? clen : rb_enc_codelen(c, enc);
5365 
5366             if (c < 256) {
5367                 c = trans[c];
5368             }
5369             else if (hash) {
5370                 VALUE tmp = rb_hash_lookup(hash, UINT2NUM(c));
5371                 if (NIL_P(tmp)) {
5372                     if (cflag) c = last;
5373                     else c = errc;
5374                 }
5375                 else if (cflag) c = errc;
5376                 else c = NUM2INT(tmp);
5377             }
5378             else {
5379                 c = cflag ? last : errc;
5380             }
5381             if (c != errc) {
5382                 tlen = rb_enc_codelen(c, enc);
5383                 modify = 1;
5384             }
5385             else {
5386                 c = c0;
5387                 if (enc != e1) may_modify = 1;
5388             }
5389             while (t - buf + tlen >= max) {
5390                 offset = t - buf;
5391                 max *= 2;
5392                 REALLOC_N(buf, char, max);
5393                 t = buf + offset;
5394             }
5395             if (s != t) {
5396                 rb_enc_mbcput(c, t, enc);
5397                 if (may_modify && memcmp(s, t, tlen) != 0) {
5398                     modify = 1;
5399                 }
5400             }
5401             CHECK_IF_ASCII(c);
5402             s += clen;
5403             t += tlen;
5404         }
5405         if (!STR_EMBED_P(str)) {
5406             xfree(RSTRING(str)->as.heap.ptr);
5407         }
5408         *t = '\0';
5409         RSTRING(str)->as.heap.ptr = buf;
5410         RSTRING(str)->as.heap.len = t - buf;
5411         STR_SET_NOEMBED(str);
5412         RSTRING(str)->as.heap.aux.capa = max;
5413     }
5414 
5415     if (modify) {
5416         if (cr != ENC_CODERANGE_BROKEN)
5417             ENC_CODERANGE_SET(str, cr);
5418         rb_enc_associate(str, enc);
5419         return str;
5420     }
5421     return Qnil;
5422 }
5423 
5424 
5425 /*
5426  *  call-seq:
5427  *     str.tr!(from_str, to_str)   -> str or nil
5428  *
5429  *  Translates <i>str</i> in place, using the same rules as
5430  *  <code>String#tr</code>. Returns <i>str</i>, or <code>nil</code> if no
5431  *  changes were made.
5432  */
5433 
5434 static VALUE
5435 rb_str_tr_bang(VALUE str, VALUE src, VALUE repl)
5436 {
5437     return tr_trans(str, src, repl, 0);
5438 }
5439 
5440 
5441 /*
5442  *  call-seq:
5443  *     str.tr(from_str, to_str)   => new_str
5444  *
5445  *  Returns a copy of +str+ with the characters in +from_str+ replaced by the
5446  *  corresponding characters in +to_str+.  If +to_str+ is shorter than
5447  *  +from_str+, it is padded with its last character in order to maintain the
5448  *  correspondence.
5449  *
5450  *     "hello".tr('el', 'ip')      #=> "hippo"
5451  *     "hello".tr('aeiou', '*')    #=> "h*ll*"
5452  *     "hello".tr('aeiou', 'AA*')  #=> "hAll*"
5453  *
5454  *  Both strings may use the <code>c1-c2</code> notation to denote ranges of
5455  *  characters, and +from_str+ may start with a <code>^</code>, which denotes
5456  *  all characters except those listed.
5457  *
5458  *     "hello".tr('a-y', 'b-z')    #=> "ifmmp"
5459  *     "hello".tr('^aeiou', '*')   #=> "*e**o"
5460  *
5461  *  The backslash character <code>\</code> can be used to escape
5462  *  <code>^</code> or <code>-</code> and is otherwise ignored unless it
5463  *  appears at the end of a range or the end of the +from_str+ or +to_str+:
5464  *
5465  *     "hello^world".tr("\\^aeiou", "*") #=> "h*ll**w*rld"
5466  *     "hello-world".tr("a\\-eo", "*")   #=> "h*ll**w*rld"
5467  *
5468  *     "hello\r\nworld".tr("\r", "")   #=> "hello\nworld"
5469  *     "hello\r\nworld".tr("\\r", "")  #=> "hello\r\nwold"
5470  *     "hello\r\nworld".tr("\\\r", "") #=> "hello\nworld"
5471  *
5472  *     "X['\\b']".tr("X\\", "")   #=> "['b']"
5473  *     "X['\\b']".tr("X-\\]", "") #=> "'b'"
5474  */
5475 
5476 static VALUE
5477 rb_str_tr(VALUE str, VALUE src, VALUE repl)
5478 {
5479     str = rb_str_dup(str);
5480     tr_trans(str, src, repl, 0);
5481     return str;
5482 }
5483 
5484 #define TR_TABLE_SIZE 257
5485 static void
5486 tr_setup_table(VALUE str, char stable[TR_TABLE_SIZE], int first,
5487                VALUE *tablep, VALUE *ctablep, rb_encoding *enc)
5488 {
5489     const unsigned int errc = -1;
5490     char buf[256];
5491     struct tr tr;
5492     unsigned int c;
5493     VALUE table = 0, ptable = 0;
5494     int i, l, cflag = 0;
5495 
5496     tr.p = RSTRING_PTR(str); tr.pend = tr.p + RSTRING_LEN(str);
5497     tr.gen = tr.now = tr.max = 0;
5498 
5499     if (RSTRING_LEN(str) > 1 && rb_enc_ascget(tr.p, tr.pend, &l, enc) == '^') {
5500         cflag = 1;
5501         tr.p += l;
5502     }
5503     if (first) {
5504         for (i=0; i<256; i++) {
5505             stable[i] = 1;
5506         }
5507         stable[256] = cflag;
5508     }
5509     else if (stable[256] && !cflag) {
5510         stable[256] = 0;
5511     }
5512     for (i=0; i<256; i++) {
5513         buf[i] = cflag;
5514     }
5515 
5516     while ((c = trnext(&tr, enc)) != errc) {
5517         if (c < 256) {
5518             buf[c & 0xff] = !cflag;
5519         }
5520         else {
5521             VALUE key = UINT2NUM(c);
5522 
5523             if (!table && (first || *tablep || stable[256])) {
5524                 if (cflag) {
5525                     ptable = *ctablep;
5526                     table = ptable ? ptable : rb_hash_new();
5527                     *ctablep = table;
5528                 }
5529                 else {
5530                     table = rb_hash_new();
5531                     ptable = *tablep;
5532                     *tablep = table;
5533                 }
5534             }
5535             if (table && (!ptable || (cflag ^ !NIL_P(rb_hash_aref(ptable, key))))) {
5536                 rb_hash_aset(table, key, Qtrue);
5537             }
5538         }
5539     }
5540     for (i=0; i<256; i++) {
5541         stable[i] = stable[i] && buf[i];
5542     }
5543     if (!table && !cflag) {
5544         *tablep = 0;
5545     }
5546 }
5547 
5548 
5549 static int
5550 tr_find(unsigned int c, char table[TR_TABLE_SIZE], VALUE del, VALUE nodel)
5551 {
5552     if (c < 256) {
5553         return table[c] != 0;
5554     }
5555     else {
5556         VALUE v = UINT2NUM(c);
5557 
5558         if (del) {
5559             if (!NIL_P(rb_hash_lookup(del, v)) &&
5560                     (!nodel || NIL_P(rb_hash_lookup(nodel, v)))) {
5561                 return TRUE;
5562             }
5563         }
5564         else if (nodel && !NIL_P(rb_hash_lookup(nodel, v))) {
5565             return FALSE;
5566         }
5567         return table[256] ? TRUE : FALSE;
5568     }
5569 }
5570 
5571 /*
5572  *  call-seq:
5573  *     str.delete!([other_str]+)   -> str or nil
5574  *
5575  *  Performs a <code>delete</code> operation in place, returning <i>str</i>, or
5576  *  <code>nil</code> if <i>str</i> was not modified.
5577  */
5578 
5579 static VALUE
5580 rb_str_delete_bang(int argc, VALUE *argv, VALUE str)
5581 {
5582     char squeez[TR_TABLE_SIZE];
5583     rb_encoding *enc = 0;
5584     char *s, *send, *t;
5585     VALUE del = 0, nodel = 0;
5586     int modify = 0;
5587     int i, ascompat, cr;
5588 
5589     if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str)) return Qnil;
5590     rb_check_arity(argc, 1, UNLIMITED_ARGUMENTS);
5591     for (i=0; i<argc; i++) {
5592         VALUE s = argv[i];
5593 
5594         StringValue(s);
5595         enc = rb_enc_check(str, s);
5596         tr_setup_table(s, squeez, i==0, &del, &nodel, enc);
5597     }
5598 
5599     str_modify_keep_cr(str);
5600     ascompat = rb_enc_asciicompat(enc);
5601     s = t = RSTRING_PTR(str);
5602     send = RSTRING_END(str);
5603     cr = ascompat ? ENC_CODERANGE_7BIT : ENC_CODERANGE_VALID;
5604     while (s < send) {
5605         unsigned int c;
5606         int clen;
5607 
5608         if (ascompat && (c = *(unsigned char*)s) < 0x80) {
5609             if (squeez[c]) {
5610                 modify = 1;
5611             }
5612             else {
5613                 if (t != s) *t = c;
5614                 t++;
5615             }
5616             s++;
5617         }
5618         else {
5619             c = rb_enc_codepoint_len(s, send, &clen, enc);
5620 
5621             if (tr_find(c, squeez, del, nodel)) {
5622                 modify = 1;
5623             }
5624             else {
5625                 if (t != s) rb_enc_mbcput(c, t, enc);
5626                 t += clen;
5627                 if (cr == ENC_CODERANGE_7BIT) cr = ENC_CODERANGE_VALID;
5628             }
5629             s += clen;
5630         }
5631     }
5632     *t = '\0';
5633     STR_SET_LEN(str, t - RSTRING_PTR(str));
5634     ENC_CODERANGE_SET(str, cr);
5635 
5636     if (modify) return str;
5637     return Qnil;
5638 }
5639 
5640 
5641 /*
5642  *  call-seq:
5643  *     str.delete([other_str]+)   -> new_str
5644  *
5645  *  Returns a copy of <i>str</i> with all characters in the intersection of its
5646  *  arguments deleted. Uses the same rules for building the set of characters as
5647  *  <code>String#count</code>.
5648  *
5649  *     "hello".delete "l","lo"        #=> "heo"
5650  *     "hello".delete "lo"            #=> "he"
5651  *     "hello".delete "aeiou", "^e"   #=> "hell"
5652  *     "hello".delete "ej-m"          #=> "ho"
5653  */
5654 
5655 static VALUE
5656 rb_str_delete(int argc, VALUE *argv, VALUE str)
5657 {
5658     str = rb_str_dup(str);
5659     rb_str_delete_bang(argc, argv, str);
5660     return str;
5661 }
5662 
5663 
5664 /*
5665  *  call-seq:
5666  *     str.squeeze!([other_str]*)   -> str or nil
5667  *
5668  *  Squeezes <i>str</i> in place, returning either <i>str</i>, or
5669  *  <code>nil</code> if no changes were made.
5670  */
5671 
5672 static VALUE
5673 rb_str_squeeze_bang(int argc, VALUE *argv, VALUE str)
5674 {
5675     char squeez[TR_TABLE_SIZE];
5676     rb_encoding *enc = 0;
5677     VALUE del = 0, nodel = 0;
5678     char *s, *send, *t;
5679     int i, modify = 0;
5680     int ascompat, singlebyte = single_byte_optimizable(str);
5681     unsigned int save;
5682 
5683     if (argc == 0) {
5684         enc = STR_ENC_GET(str);
5685     }
5686     else {
5687         for (i=0; i<argc; i++) {
5688             VALUE s = argv[i];
5689 
5690             StringValue(s);
5691             enc = rb_enc_check(str, s);
5692             if (singlebyte && !single_byte_optimizable(s))
5693                 singlebyte = 0;
5694             tr_setup_table(s, squeez, i==0, &del, &nodel, enc);
5695         }
5696     }
5697 
5698     str_modify_keep_cr(str);
5699     s = t = RSTRING_PTR(str);
5700     if (!s || RSTRING_LEN(str) == 0) return Qnil;
5701     send = RSTRING_END(str);
5702     save = -1;
5703     ascompat = rb_enc_asciicompat(enc);
5704 
5705     if (singlebyte) {
5706         while (s < send) {
5707             unsigned int c = *(unsigned char*)s++;
5708             if (c != save || (argc > 0 && !squeez[c])) {
5709                 *t++ = save = c;
5710             }
5711         }
5712     } else {
5713         while (s < send) {
5714             unsigned int c;
5715             int clen;
5716 
5717             if (ascompat && (c = *(unsigned char*)s) < 0x80) {
5718                 if (c != save || (argc > 0 && !squeez[c])) {
5719                     *t++ = save = c;
5720                 }
5721                 s++;
5722             }
5723             else {
5724                 c = rb_enc_codepoint_len(s, send, &clen, enc);
5725 
5726                 if (c != save || (argc > 0 && !tr_find(c, squeez, del, nodel))) {
5727                     if (t != s) rb_enc_mbcput(c, t, enc);
5728                     save = c;
5729                     t += clen;
5730                 }
5731                 s += clen;
5732             }
5733         }
5734     }
5735 
5736     *t = '\0';
5737     if (t - RSTRING_PTR(str) != RSTRING_LEN(str)) {
5738         STR_SET_LEN(str, t - RSTRING_PTR(str));
5739         modify = 1;
5740     }
5741 
5742     if (modify) return str;
5743     return Qnil;
5744 }
5745 
5746 
5747 /*
5748  *  call-seq:
5749  *     str.squeeze([other_str]*)    -> new_str
5750  *
5751  *  Builds a set of characters from the <i>other_str</i> parameter(s) using the
5752  *  procedure described for <code>String#count</code>. Returns a new string
5753  *  where runs of the same character that occur in this set are replaced by a
5754  *  single character. If no arguments are given, all runs of identical
5755  *  characters are replaced by a single character.
5756  *
5757  *     "yellow moon".squeeze                  #=> "yelow mon"
5758  *     "  now   is  the".squeeze(" ")         #=> " now is the"
5759  *     "putters shoot balls".squeeze("m-z")   #=> "puters shot balls"
5760  */
5761 
5762 static VALUE
5763 rb_str_squeeze(int argc, VALUE *argv, VALUE str)
5764 {
5765     str = rb_str_dup(str);
5766     rb_str_squeeze_bang(argc, argv, str);
5767     return str;
5768 }
5769 
5770 
5771 /*
5772  *  call-seq:
5773  *     str.tr_s!(from_str, to_str)   -> str or nil
5774  *
5775  *  Performs <code>String#tr_s</code> processing on <i>str</i> in place,
5776  *  returning <i>str</i>, or <code>nil</code> if no changes were made.
5777  */
5778 
5779 static VALUE
5780 rb_str_tr_s_bang(VALUE str, VALUE src, VALUE repl)
5781 {
5782     return tr_trans(str, src, repl, 1);
5783 }
5784 
5785 
5786 /*
5787  *  call-seq:
5788  *     str.tr_s(from_str, to_str)   -> new_str
5789  *
5790  *  Processes a copy of <i>str</i> as described under <code>String#tr</code>,
5791  *  then removes duplicate characters in regions that were affected by the
5792  *  translation.
5793  *
5794  *     "hello".tr_s('l', 'r')     #=> "hero"
5795  *     "hello".tr_s('el', '*')    #=> "h*o"
5796  *     "hello".tr_s('el', 'hx')   #=> "hhxo"
5797  */
5798 
5799 static VALUE
5800 rb_str_tr_s(VALUE str, VALUE src, VALUE repl)
5801 {
5802     str = rb_str_dup(str);
5803     tr_trans(str, src, repl, 1);
5804     return str;
5805 }
5806 
5807 
5808 /*
5809  *  call-seq:
5810  *     str.count([other_str]+)   -> fixnum
5811  *
5812  *  Each +other_str+ parameter defines a set of characters to count.  The
5813  *  intersection of these sets defines the characters to count in +str+.  Any
5814  *  +other_str+ that starts with a caret <code>^</code> is negated.  The
5815  *  sequence <code>c1-c2</code> means all characters between c1 and c2.  The
5816  *  backslash character <code>\</code> can be used to escape <code>^</code> or
5817  *  <code>-</code> and is otherwise ignored unless it appears at the end of a
5818  *  sequence or the end of a +other_str+.
5819  *
5820  *     a = "hello world"
5821  *     a.count "lo"                   #=> 5
5822  *     a.count "lo", "o"              #=> 2
5823  *     a.count "hello", "^l"          #=> 4
5824  *     a.count "ej-m"                 #=> 4
5825  *
5826  *     "hello^world".count "\\^aeiou" #=> 4
5827  *     "hello-world".count "a\\-eo"   #=> 4
5828  *
5829  *     c = "hello world\\r\\n"
5830  *     c.count "\\"                   #=> 2
5831  *     c.count "\\A"                  #=> 0
5832  *     c.count "X-\\w"                #=> 3
5833  */
5834 
5835 static VALUE
5836 rb_str_count(int argc, VALUE *argv, VALUE str)
5837 {
5838     char table[TR_TABLE_SIZE];
5839     rb_encoding *enc = 0;
5840     VALUE del = 0, nodel = 0;
5841     char *s, *send;
5842     int i;
5843     int ascompat;
5844 
5845     rb_check_arity(argc, 1, UNLIMITED_ARGUMENTS);
5846     for (i=0; i<argc; i++) {
5847         VALUE tstr = argv[i];
5848         unsigned char c;
5849 
5850         StringValue(tstr);
5851         enc = rb_enc_check(str, tstr);
5852         if (argc == 1 && RSTRING_LEN(tstr) == 1 && rb_enc_asciicompat(enc) &&
5853             (c = RSTRING_PTR(tstr)[0]) < 0x80 && !is_broken_string(str)) {
5854             int n = 0;
5855 
5856             s = RSTRING_PTR(str);
5857             if (!s || RSTRING_LEN(str) == 0) return INT2FIX(0);
5858             send = RSTRING_END(str);
5859             while (s < send) {
5860                 if (*(unsigned char*)s++ == c) n++;
5861             }
5862             return INT2NUM(n);
5863         }
5864         tr_setup_table(tstr, table, i==0, &del, &nodel, enc);
5865     }
5866 
5867     s = RSTRING_PTR(str);
5868     if (!s || RSTRING_LEN(str) == 0) return INT2FIX(0);
5869     send = RSTRING_END(str);
5870     ascompat = rb_enc_asciicompat(enc);
5871     i = 0;
5872     while (s < send) {
5873         unsigned int c;
5874 
5875         if (ascompat && (c = *(unsigned char*)s) < 0x80) {
5876             if (table[c]) {
5877                 i++;
5878             }
5879             s++;
5880         }
5881         else {
5882             int clen;
5883             c = rb_enc_codepoint_len(s, send, &clen, enc);
5884             if (tr_find(c, table, del, nodel)) {
5885                 i++;
5886             }
5887             s += clen;
5888         }
5889     }
5890 
5891     return INT2NUM(i);
5892 }
5893 
/*
 * Lookup table for ASCII whitespace: non-zero for TAB, LF, VT, FF, CR and
 * space, zero for every other byte value.  Only the leading 33 entries are
 * spelled out; the remaining elements of the array are zero-initialised by
 * the language.
 */
static const char isspacetable[256] = {
    /* 0x00 - 0x08 */
    0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x09 - 0x0d: \t \n \v \f \r */
    1, 1, 1, 1, 1,
    /* 0x0e - 0x1f */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x20: space */
    1
    /* 0x21 - 0xff: implicitly zero */
};

/* the argument is reduced to an unsigned char before indexing the table */
#define ascii_isspace(c) isspacetable[(unsigned char)(c)]
5914 
5915 /*
5916  *  call-seq:
5917  *     str.split(pattern=$;, [limit])   -> anArray
5918  *
5919  *  Divides <i>str</i> into substrings based on a delimiter, returning an array
5920  *  of these substrings.
5921  *
5922  *  If <i>pattern</i> is a <code>String</code>, then its contents are used as
5923  *  the delimiter when splitting <i>str</i>. If <i>pattern</i> is a single
5924  *  space, <i>str</i> is split on whitespace, with leading whitespace and runs
5925  *  of contiguous whitespace characters ignored.
5926  *
5927  *  If <i>pattern</i> is a <code>Regexp</code>, <i>str</i> is divided where the
5928  *  pattern matches. Whenever the pattern matches a zero-length string,
5929  *  <i>str</i> is split into individual characters. If <i>pattern</i> contains
5930  *  groups, the respective matches will be returned in the array as well.
5931  *
5932  *  If <i>pattern</i> is omitted, the value of <code>$;</code> is used.  If
5933  *  <code>$;</code> is <code>nil</code> (which is the default), <i>str</i> is
5934  *  split on whitespace as if ` ' were specified.
5935  *
5936  *  If the <i>limit</i> parameter is omitted, trailing null fields are
5937  *  suppressed. If <i>limit</i> is a positive number, at most that number of
5938  *  fields will be returned (if <i>limit</i> is <code>1</code>, the entire
5939  *  string is returned as the only entry in an array). If negative, there is no
5940  *  limit to the number of fields returned, and trailing null fields are not
5941  *  suppressed.
5942  *
5943  *  When the input +str+ is empty an empty Array is returned as the string is
5944  *  considered to have no fields to split.
5945  *
5946  *     " now's  the time".split        #=> ["now's", "the", "time"]
5947  *     " now's  the time".split(' ')   #=> ["now's", "the", "time"]
5948  *     " now's  the time".split(/ /)   #=> ["", "now's", "", "the", "time"]
5949  *     "1, 2.34,56, 7".split(%r{,\s*}) #=> ["1", "2.34", "56", "7"]
5950  *     "hello".split(//)               #=> ["h", "e", "l", "l", "o"]
5951  *     "hello".split(//, 3)            #=> ["h", "e", "llo"]
5952  *     "hi mom".split(%r{\s*})         #=> ["h", "i", "m", "o", "m"]
5953  *
5954  *     "mellow yellow".split("ello")   #=> ["m", "w y", "w"]
5955  *     "1,2,,3,4,,".split(',')         #=> ["1", "2", "", "3", "4"]
5956  *     "1,2,,3,4,,".split(',', 4)      #=> ["1", "2", "", "3,4,,"]
5957  *     "1,2,,3,4,,".split(',', -4)     #=> ["1", "2", "", "3", "4", "", ""]
5958  *
5959  *     "".split(',', -1)               #=> []
5960  */
5961 
5962 static VALUE
5963 rb_str_split_m(int argc, VALUE *argv, VALUE str)
5964 {
5965     rb_encoding *enc;
5966     VALUE spat;
5967     VALUE limit;
5968     enum {awk, string, regexp} split_type;
5969     long beg, end, i = 0;
5970     int lim = 0;
5971     VALUE result, tmp;
5972
         /*
          * Implementation of String#split.  argv holds up to two optional
          * arguments: the pattern (spat) and the limit.  The pattern is
          * classified as awk (runs of whitespace), string (literal search) or
          * regexp, and each kind has its own scanning loop below.  The pieces
          * are collected in result, which is returned.
          *
          * lim is the numeric limit and i counts the fields accounted for so
          * far; a scanning loop stops once lim <= i so that the common tail
          * can push the unsplit remainder as the last field.
          */
5973     if (rb_scan_args(argc, argv, "02", &spat, &limit) == 2) {
5974         lim = NUM2INT(limit);
             /*
              * A non-positive limit removes the field cap (limit becomes nil);
              * lim keeps its value so the tail can tell 0 from a negative
              * limit.  A limit of 1 means "do not split": the result is []
              * for an empty receiver and [str] otherwise.
              */
5975         if (lim <= 0) limit = Qnil;
5976         else if (lim == 1) {
5977             if (RSTRING_LEN(str) == 0)
5978                 return rb_ary_new2(0);
5979             return rb_ary_new3(1, str);
5980         }
             /* Start at 1 so that the final remainder counts as a field. */
5981         i = 1;
5982     }
5983
5984     enc = STR_ENC_GET(str);
5985     if (NIL_P(spat)) {
             /* No pattern: use rb_fs when it is set, else awk-style splitting. */
5986         if (!NIL_P(rb_fs)) {
5987             spat = rb_fs;
5988             goto fs_set;
5989         }
5990         split_type = awk;
5991     }
5992     else {
5993       fs_set:
5994         if (RB_TYPE_P(spat, T_STRING)) {
5995             rb_encoding *enc2 = STR_ENC_GET(spat);
5996
5997             split_type = string;
5998             if (RSTRING_LEN(spat) == 0) {
5999                 /* Special case - split into chars */
6000                 spat = rb_reg_regcomp(spat);
6001                 split_type = regexp;
6002             }
6003             else if (rb_enc_asciicompat(enc2) == 1) {
                     /* A pattern that is exactly one " " byte selects awk mode. */
6004                 if (RSTRING_LEN(spat) == 1 && RSTRING_PTR(spat)[0] == ' '){
6005                     split_type = awk;
6006                 }
6007             }
6008             else {
                     /*
                      * Pattern encoding is not ASCII-compatible: read the first
                      * character with rb_enc_ascget(), which stores its byte
                      * length in l.  A pattern made of exactly one space
                      * character selects awk mode here as well.
                      */
6009                 int l;
6010                 if (rb_enc_ascget(RSTRING_PTR(spat), RSTRING_END(spat), &l, enc2) == ' ' &&
6011                     RSTRING_LEN(spat) == l) {
6012                     split_type = awk;
6013                 }
6014             }
6015         }
6016         else {
                 /* Non-String patterns go through get_pat() and the regexp loop. */
6017             spat = get_pat(spat, 1);
6018             split_type = regexp;
6019         }
6020     }
6021
6022     result = rb_ary_new();
6023     beg = 0;
6024     if (split_type == awk) {
             /*
              * awk mode.  skip is 1 while consuming the whitespace in front of
              * a field and 0 while inside a field; beg and end are the byte
              * offsets of the current field relative to bptr.
              */
6025         char *ptr = RSTRING_PTR(str);
6026         char *eptr = RSTRING_END(str);
6027         char *bptr = ptr;
6028         int skip = 1;
6029         unsigned int c;
6030
6031         end = beg;
6032         if (is_ascii_string(str)) {
                 /* Byte-at-a-time loop for strings is_ascii_string() accepts. */
6033             while (ptr < eptr) {
6034                 c = (unsigned char)*ptr++;
6035                 if (skip) {
6036                     if (ascii_isspace(c)) {
6037                         beg = ptr - bptr;
6038                     }
6039                     else {
6040                         end = ptr - bptr;
6041                         skip = 0;
                             /*
                              * Limit reached at the start of a new field: stop
                              * and let the tail push everything from beg on.
                              */
6042                         if (!NIL_P(limit) && lim <= i) break;
6043                     }
6044                 }
6045                 else if (ascii_isspace(c)) {
                         /* Whitespace ends the current field. */
6046                     rb_ary_push(result, rb_str_subseq(str, beg, end-beg));
6047                     skip = 1;
6048                     beg = ptr - bptr;
6049                     if (!NIL_P(limit)) ++i;
6050                 }
6051                 else {
6052                     end = ptr - bptr;
6053                 }
6054             }
6055         }
6056         else {
                 /* Same state machine, but decoding one character (n bytes) per step. */
6057             while (ptr < eptr) {
6058                 int n;
6059
6060                 c = rb_enc_codepoint_len(ptr, eptr, &n, enc);
6061                 ptr += n;
6062                 if (skip) {
6063                     if (rb_isspace(c)) {
6064                         beg = ptr - bptr;
6065                     }
6066                     else {
6067                         end = ptr - bptr;
6068                         skip = 0;
6069                         if (!NIL_P(limit) && lim <= i) break;
6070                     }
6071                 }
6072                 else if (rb_isspace(c)) {
6073                     rb_ary_push(result, rb_str_subseq(str, beg, end-beg));
6074                     skip = 1;
6075                     beg = ptr - bptr;
6076                     if (!NIL_P(limit)) ++i;
6077                 }
6078                 else {
6079                     end = ptr - bptr;
6080                 }
6081             }
6082         }
6083     }
6084     else if (split_type == string) {
             /*
              * Literal-string mode: repeatedly search for spat's bytes from
              * ptr.  temp remembers the start of the buffer so that offsets
              * into str can be computed.
              */
6085         char *ptr = RSTRING_PTR(str);
6086         char *temp = ptr;
6087         char *eptr = RSTRING_END(str);
6088         char *sptr = RSTRING_PTR(spat);
6089         long slen = RSTRING_LEN(spat);
6090
             /* Both the receiver and the pattern must be validly encoded. */
6091         if (is_broken_string(str)) {
6092             rb_raise(rb_eArgError, "invalid byte sequence in %s", rb_enc_name(STR_ENC_GET(str)));
6093         }
6094         if (is_broken_string(spat)) {
6095             rb_raise(rb_eArgError, "invalid byte sequence in %s", rb_enc_name(STR_ENC_GET(spat)));
6096         }
6097         enc = rb_enc_check(str, spat);
             /* Here end is the match offset relative to ptr, not to the string start. */
6098         while (ptr < eptr &&
6099                (end = rb_memsearch(sptr, slen, ptr, eptr - ptr, enc)) >= 0) {
6100             /* Check we are at the start of a char */
6101             char *t = rb_enc_right_char_head(ptr, ptr + end, eptr, enc);
6102             if (t != ptr + end) {
                     /* The hit is not on a character head: resume the search at t. */
6103                 ptr = t;
6104                 continue;
6105             }
6106             rb_ary_push(result, rb_str_subseq(str, ptr - temp, end));
6107             ptr += end + slen;
6108             if (!NIL_P(limit) && lim <= ++i) break;
6109         }
             /* Offset of the unsplit remainder, consumed by the common tail. */
6110         beg = ptr - temp;
6111     }
6112     else {
             /*
              * Regexp mode.  start is where the next search begins and beg is
              * where the next field begins; last_null records that the
              * previous search yielded an empty match at the search position.
              */
6113         char *ptr = RSTRING_PTR(str);
6114         long len = RSTRING_LEN(str);
6115         long start = beg;
6116         long idx;
6117         int last_null = 0;
6118         struct re_registers *regs;
6119
6120         while ((end = rb_reg_search(spat, str, start, 0)) >= 0) {
6121             regs = RMATCH_REGS(rb_backref_get());
6122             if (start == end && BEG(0) == END(0)) {
                     /* Empty match located exactly at the search position. */
6123                 if (!ptr) {
                         /* The receiver has no buffer: push one empty string and stop. */
6124                     rb_ary_push(result, str_new_empty(str));
6125                     break;
6126                 }
6127                 else if (last_null == 1) {
                         /*
                          * The retry after advancing start produced an empty
                          * match again: emit the single character at beg as a
                          * field and move beg up to start.
                          */
6128                     rb_ary_push(result, rb_str_subseq(str, beg,
6129                                                       rb_enc_fast_mbclen(ptr+beg,
6130                                                                          ptr+len,
6131                                                                          enc)));
6132                     beg = start;
6133                 }
6134                 else {
                         /*
                          * First empty match here: advance start by one
                          * character (by one byte at the end of the string)
                          * and search again without pushing anything.
                          */
6135                     if (ptr+start == ptr+len)
6136                         start++;
6137                     else
6138                         start += rb_enc_fast_mbclen(ptr+start,ptr+len,enc);
6139                     last_null = 1;
6140                     continue;
6141                 }
6142             }
6143             else {
                     /* Push the text in front of the match and continue after it. */
6144                 rb_ary_push(result, rb_str_subseq(str, beg, end-beg));
6145                 beg = start = END(0);
6146             }
6147             last_null = 0;
6148
                 /* Append the text of each capture group; groups whose BEG is -1 are skipped. */
6149             for (idx=1; idx < regs->num_regs; idx++) {
6150                 if (BEG(idx) == -1) continue;
6151                 if (BEG(idx) == END(idx))
6152                     tmp = str_new_empty(str);
6153                 else
6154                     tmp = rb_str_subseq(str, BEG(idx), END(idx)-BEG(idx));
6155                 rb_ary_push(result, tmp);
6156             }
6157             if (!NIL_P(limit) && lim <= ++i) break;
6158         }
6159     }
         /*
          * Common tail: push the remainder that starts at beg.  This happens
          * only for a non-empty receiver, and only when limit is non-nil, when
          * bytes remain after beg, or when the limit was negative.
          */
6160     if (RSTRING_LEN(str) > 0 && (!NIL_P(limit) || RSTRING_LEN(str) > beg || lim < 0)) {
6161         if (RSTRING_LEN(str) == beg)
6162             tmp = str_new_empty(str);
6163         else
6164             tmp = rb_str_subseq(str, beg, RSTRING_LEN(str)-beg);
6165         rb_ary_push(result, tmp);
6166     }
         /* With limit nil and lim == 0, trailing empty strings are dropped. */
6167     if (NIL_P(limit) && lim == 0) {
6168         long len;
6169         while ((len = RARRAY_LEN(result)) > 0 &&
6170                (tmp = RARRAY_AREF(result, len-1), RSTRING_LEN(tmp) == 0))
6171             rb_ary_pop(result);
6172     }
6173
6174     return result;
6175 }
6176 
6177 VALUE
6178 rb_str_split(VALUE str, const char *sep0)
6179 {
6180     VALUE sep;
6181 
6182     StringValue(str);
6183     sep = rb_str_new2(sep0);
6184     return rb_str_split_m(1, &sep, str);
6185 }
6186 
6187 
/*
 * Shared worker for rb_str_each_line() (wantarray == 0) and rb_str_lines()
 * (wantarray == 1).  Cuts str into records that end with the separator rs and
 * either yields each record to the block or appends it to an array.
 *
 * argc/argv carry the optional separator; with no argument rb_rs is used.
 * Returns the array when collecting, otherwise the original receiver.
 */
6188 static VALUE
6189 rb_str_enumerate_lines(int argc, VALUE *argv, VALUE str, int wantarray)
6190 {
6191     rb_encoding *enc;
6192     VALUE rs;
6193     unsigned int newline;
6194     const char *p, *pend, *s, *ptr;
6195     long len, rslen;
6196     VALUE line;
6197     int n;
6198     VALUE orig = str;
6199     VALUE UNINITIALIZED_VAR(ary);
6200
6201     if (argc == 0) {
6202         rs = rb_rs;
6203     }
6204     else {
6205         rb_scan_args(argc, argv, "01", &rs);
6206     }
6207
         /*
          * STRING_ENUMERATORS_WANTARRAY is defined as 0 near the top of this
          * file, so the #else arm is the one compiled: a block passed to the
          * array-returning form only triggers a deprecation warning and the
          * call then behaves like the yielding form.
          */
6208     if (rb_block_given_p()) {
6209         if (wantarray) {
6210 #if STRING_ENUMERATORS_WANTARRAY
6211             rb_warn("given block not used");
6212             ary = rb_ary_new();
6213 #else
6214             rb_warning("passing a block to String#lines is deprecated");
6215             wantarray = 0;
6216 #endif
6217         }
6218     }
6219     else {
6220         if (wantarray)
6221             ary = rb_ary_new();
6222         else
6223             RETURN_ENUMERATOR(str, argc, argv);
6224     }
6225
         /* A nil separator makes the whole string a single record. */
6226     if (NIL_P(rs)) {
6227         if (wantarray) {
6228             rb_ary_push(ary, str);
6229             return ary;
6230         }
6231         else {
6232             rb_yield(str);
6233             return orig;
6234         }
6235     }
         /*
          * From here on the scan runs over the value returned by rb_str_new4()
          * rather than over the receiver (presumably a frozen snapshot -- TODO
          * confirm); orig still holds the receiver for the return value.
          * ptr is the start of the buffer, s the start of the current record
          * and p the scan position.
          */
6236     str = rb_str_new4(str);
6237     ptr = p = s = RSTRING_PTR(str);
6238     pend = p + RSTRING_LEN(str);
6239     len = RSTRING_LEN(str);
6240     StringValue(rs);
6241     if (rs == rb_default_rs) {
             /*
              * Default separator: find '\n' bytes with memchr(), then use the
              * encoding helpers to check that the character containing that
              * byte really is a newline before cutting a record after it.
              */
6242         enc = rb_enc_get(str);
6243         while (p < pend) {
6244             char *p0;
6245
6246             p = memchr(p, '\n', pend - p);
6247             if (!p) break;
6248             p0 = rb_enc_left_char_head(s, p, pend, enc);
6249             if (!rb_enc_is_newline(p0, pend, enc)) {
6250                 p++;
6251                 continue;
6252             }
6253             p = p0 + rb_enc_mbclen(p0, pend, enc);
6254             line = rb_str_subseq(str, s - ptr, p - s);
6255             if (wantarray)
6256                 rb_ary_push(ary, line);
6257             else
6258                 rb_yield(line);
                 /* NOTE(review): presumably detects a buffer changed by the block -- confirm. */
6259             str_mod_check(str, ptr, len);
6260             s = p;
6261         }
6262         goto finish;
6263     }
6264
         /*
          * Explicit separator.  An empty rs selects paragraph mode, where
          * records are delimited by runs of successive newlines; otherwise
          * newline holds the first character of rs.
          */
6265     enc = rb_enc_check(str, rs);
6266     rslen = RSTRING_LEN(rs);
6267     if (rslen == 0) {
6268         newline = '\n';
6269     }
6270     else {
6271         newline = rb_enc_codepoint(RSTRING_PTR(rs), RSTRING_END(rs), enc);
6272     }
6273
6274     while (p < pend) {
6275         unsigned int c = rb_enc_codepoint_len(p, pend, &n, enc);
6276
6277       again:
6278         if (rslen == 0 && c == newline) {
                 /*
                  * Paragraph mode: a newline followed by a different character
                  * is not a record boundary, so scanning simply continues with
                  * that character.  Otherwise the whole run of newlines is
                  * consumed and p steps back onto the last one, so that the
                  * record emitted below ends after it.
                  */
6279             p += n;
6280             if (p < pend && (c = rb_enc_codepoint_len(p, pend, &n, enc)) != newline) {
6281                 goto again;
6282             }
6283             while (p < pend && rb_enc_codepoint(p, pend, enc) == newline) {
6284                 p += n;
6285             }
6286             p -= n;
6287         }
             /*
              * Record boundary: the current character equals newline and, for
              * separators longer than one byte, the bytes at p match rs in
              * full.
              */
6288         if (c == newline &&
6289             (rslen <= 1 ||
6290              (pend - p >= rslen && memcmp(RSTRING_PTR(rs), p, rslen) == 0))) {
6291             const char *pp = p + (rslen ? rslen : n);
6292             line = rb_str_subseq(str, s - ptr, pp - s);
6293             if (wantarray)
6294                 rb_ary_push(ary, line);
6295             else
6296                 rb_yield(line);
6297             str_mod_check(str, ptr, len);
6298             s = pp;
6299         }
6300         p += n;
6301     }
6302
6303   finish:
         /* Emit whatever follows the last separator as a final record. */
6304     if (s != pend) {
6305         line = rb_str_subseq(str, s - ptr, pend - s);
6306         if (wantarray)
6307             rb_ary_push(ary, line);
6308         else
6309             rb_yield(line);
6310         RB_GC_GUARD(str);
6311     }
6312
6313     if (wantarray)
6314         return ary;
6315     else
6316         return orig;
6317 }
6318 
6319 /*
6320  *  call-seq:
6321  *     str.each_line(separator=$/) {|substr| block }   -> str
6322  *     str.each_line(separator=$/)                     -> an_enumerator
6323  *
6324  *  Splits <i>str</i> using the supplied parameter as the record
6325  *  separator (<code>$/</code> by default), passing each substring in
6326  *  turn to the supplied block.  If a zero-length record separator is
6327  *  supplied, the string is split into paragraphs delimited by
6328  *  multiple successive newlines.
6329  *
6330  *  If no block is given, an enumerator is returned instead.
6331  *
6332  *     print "Example one\n"
6333  *     "hello\nworld".each_line {|s| p s}
6334  *     print "Example two\n"
6335  *     "hello\nworld".each_line('l') {|s| p s}
6336  *     print "Example three\n"
6337  *     "hello\n\n\nworld".each_line('') {|s| p s}
6338  *
6339  *  <em>produces:</em>
6340  *
6341  *     Example one
6342  *     "hello\n"
6343  *     "world"
6344  *     Example two
6345  *     "hel"
6346  *     "l"
6347  *     "o\nworl"
6348  *     "d"
6349  *     Example three
6350  *     "hello\n\n\n"
6351  *     "world"
6352  */
6353 
6354 static VALUE
6355 rb_str_each_line(int argc, VALUE *argv, VALUE str)
6356 {
6357     return rb_str_enumerate_lines(argc, argv, str, 0);
6358 }
6359 
6360 /*
6361  *  call-seq:
6362  *     str.lines(separator=$/)  -> an_array
6363  *
6364  *  Returns an array of lines in <i>str</i> split using the supplied
6365  *  record separator (<code>$/</code> by default).  This is a
6366  *  shorthand for <code>str.each_line(separator).to_a</code>.
6367  *
6368  *  If a block is given, which is a deprecated form, works the same as
6369  *  <code>each_line</code>.
6370  */
6371 
6372 static VALUE
6373 rb_str_lines(int argc, VALUE *argv, VALUE str)
6374 {
6375     return rb_str_enumerate_lines(argc, argv, str, 1);
6376 }
6377 
6378 static VALUE
6379 rb_str_each_byte_size(VALUE str, VALUE args, VALUE eobj)
6380 {
6381     return LONG2FIX(RSTRING_LEN(str));
6382 }
6383 
6384 static VALUE
6385 rb_str_enumerate_bytes(VALUE str, int wantarray)
6386 {
6387     long i;
6388     VALUE UNINITIALIZED_VAR(ary);
6389 
6390     if (rb_block_given_p()) {
6391         if (wantarray) {
6392 #if STRING_ENUMERATORS_WANTARRAY
6393             rb_warn("given block not used");
6394             ary = rb_ary_new();
6395 #else
6396             rb_warning("passing a block to String#bytes is deprecated");
6397             wantarray = 0;
6398 #endif
6399         }
6400     }
6401     else {
6402         if (wantarray)
6403             ary = rb_ary_new2(RSTRING_LEN(str));
6404         else
6405             RETURN_SIZED_ENUMERATOR(str, 0, 0, rb_str_each_byte_size);
6406     }
6407 
6408     for (i=0; i<RSTRING_LEN(str); i++) {
6409         if (wantarray)
6410             rb_ary_push(ary, INT2FIX(RSTRING_PTR(str)[i] & 0xff));
6411         else
6412             rb_yield(INT2FIX(RSTRING_PTR(str)[i] & 0xff));
6413     }
6414     if (wantarray)
6415         return ary;
6416     else
6417         return str;
6418 }
6419 
6420 /*
6421  *  call-seq:
6422  *     str.each_byte {|fixnum| block }    -> str
6423  *     str.each_byte                      -> an_enumerator
6424  *
6425  *  Passes each byte in <i>str</i> to the given block, or returns an
6426  *  enumerator if no block is given.
6427  *
6428  *     "hello".each_byte {|c| print c, ' ' }
6429  *
6430  *  <em>produces:</em>
6431  *
6432  *     104 101 108 108 111
6433  */
6434 
6435 static VALUE
6436 rb_str_each_byte(VALUE str)
6437 {
6438     return rb_str_enumerate_bytes(str, 0);
6439 }
6440 
6441 /*
6442  *  call-seq:
6443  *     str.bytes    -> an_array
6444  *
6445  *  Returns an array of bytes in <i>str</i>.  This is a shorthand for
6446  *  <code>str.each_byte.to_a</code>.
6447  *
6448  *  If a block is given, which is a deprecated form, works the same as
6449  *  <code>each_byte</code>.
6450  */
6451 
6452 static VALUE
6453 rb_str_bytes(VALUE str)
6454 {
6455     return rb_str_enumerate_bytes(str, 1);
6456 }
6457 
6458 static VALUE
6459 rb_str_each_char_size(VALUE str, VALUE args, VALUE eobj)
6460 {
6461     long len = RSTRING_LEN(str);
6462     if (!single_byte_optimizable(str)) {
6463         const char *ptr = RSTRING_PTR(str);
6464         rb_encoding *enc = rb_enc_get(str);
6465         const char *end_ptr = ptr + len;
6466         for (len = 0; ptr < end_ptr; ++len) {
6467             ptr += rb_enc_mbclen(ptr, end_ptr, enc);
6468         }
6469     }
6470     return LONG2FIX(len);
6471 }
6472 
6473 static VALUE
6474 rb_str_enumerate_chars(VALUE str, int wantarray)
6475 {
6476     VALUE orig = str;
6477     VALUE substr;
6478     long i, len, n;
6479     const char *ptr;
6480     rb_encoding *enc;
6481     VALUE UNINITIALIZED_VAR(ary);
6482 
6483     if (rb_block_given_p()) {
6484         if (wantarray) {
6485 #if STRING_ENUMERATORS_WANTARRAY
6486             rb_warn("given block not used");
6487             ary = rb_ary_new();
6488 #else
6489             rb_warning("passing a block to String#chars is deprecated");
6490             wantarray = 0;
6491 #endif
6492         }
6493     }
6494     else {
6495         if (wantarray)
6496             ary = rb_ary_new();
6497         else
6498             RETURN_SIZED_ENUMERATOR(str, 0, 0, rb_str_each_char_size);
6499     }
6500 
6501     str = rb_str_new4(str);
6502     ptr = RSTRING_PTR(str);
6503     len = RSTRING_LEN(str);
6504     enc = rb_enc_get(str);
6505     switch (ENC_CODERANGE(str)) {
6506       case ENC_CODERANGE_VALID:
6507       case ENC_CODERANGE_7BIT:
6508         for (i = 0; i < len; i += n) {
6509             n = rb_enc_fast_mbclen(ptr + i, ptr + len, enc);
6510             substr = rb_str_subseq(str, i, n);
6511             if (wantarray)
6512                 rb_ary_push(ary, substr);
6513             else
6514                 rb_yield(substr);
6515         }
6516         break;
6517       default:
6518         for (i = 0; i < len; i += n) {
6519             n = rb_enc_mbclen(ptr + i, ptr + len, enc);
6520             substr = rb_str_subseq(str, i, n);
6521             if (wantarray)
6522                 rb_ary_push(ary, substr);
6523             else
6524                 rb_yield(substr);
6525         }
6526     }
6527     RB_GC_GUARD(str);
6528     if (wantarray)
6529         return ary;
6530     else
6531         return orig;
6532 }
6533 
6534 /*
6535  *  call-seq:
6536  *     str.each_char {|cstr| block }    -> str
6537  *     str.each_char                    -> an_enumerator
6538  *
6539  *  Passes each character in <i>str</i> to the given block, or returns
6540  *  an enumerator if no block is given.
6541  *
6542  *     "hello".each_char {|c| print c, ' ' }
6543  *
6544  *  <em>produces:</em>
6545  *
6546  *     h e l l o
6547  */
6548 
6549 static VALUE
6550 rb_str_each_char(VALUE str)
6551 {
6552     return rb_str_enumerate_chars(str, 0);
6553 }
6554 
6555 /*
6556  *  call-seq:
6557  *     str.chars    -> an_array
6558  *
6559  *  Returns an array of characters in <i>str</i>.  This is a shorthand
6560  *  for <code>str.each_char.to_a</code>.
6561  *
6562  *  If a block is given, which is a deprecated form, works the same as
6563  *  <code>each_char</code>.
6564  */
6565 
6566 static VALUE
6567 rb_str_chars(VALUE str)
6568 {
6569     return rb_str_enumerate_chars(str, 1);
6570 }
6571 
6572 
6573 static VALUE
6574 rb_str_enumerate_codepoints(VALUE str, int wantarray)
6575 {
6576     VALUE orig = str;
6577     int n;
6578     unsigned int c;
6579     const char *ptr, *end;
6580     rb_encoding *enc;
6581     VALUE UNINITIALIZED_VAR(ary);
6582 
6583     if (single_byte_optimizable(str))
6584         return rb_str_enumerate_bytes(str, wantarray);
6585 
6586     if (rb_block_given_p()) {
6587         if (wantarray) {
6588 #if STRING_ENUMERATORS_WANTARRAY
6589             rb_warn("given block not used");
6590             ary = rb_ary_new();
6591 #else
6592             rb_warning("passing a block to String#codepoints is deprecated");
6593             wantarray = 0;
6594 #endif
6595         }
6596     }
6597     else {
6598         if (wantarray)
6599             ary = rb_ary_new();
6600         else
6601             RETURN_SIZED_ENUMERATOR(str, 0, 0, rb_str_each_char_size);
6602     }
6603 
6604     str = rb_str_new4(str);
6605     ptr = RSTRING_PTR(str);
6606     end = RSTRING_END(str);
6607     enc = STR_ENC_GET(str);
6608     while (ptr < end) {
6609         c = rb_enc_codepoint_len(ptr, end, &n, enc);
6610         if (wantarray)
6611             rb_ary_push(ary, UINT2NUM(c));
6612         else
6613             rb_yield(UINT2NUM(c));
6614         ptr += n;
6615     }
6616     RB_GC_GUARD(str);
6617     if (wantarray)
6618         return ary;
6619     else
6620         return orig;
6621 }
6622 
6623 /*
6624  *  call-seq:
6625  *     str.each_codepoint {|integer| block }    -> str
6626  *     str.each_codepoint                       -> an_enumerator
6627  *
6628  *  Passes the <code>Integer</code> ordinal of each character in <i>str</i>
6629  *  (also known as a <i>codepoint</i> when applied to Unicode strings) to the
6630  *  given block.
6631  *
6632  *  If no block is given, an enumerator is returned instead.
6633  *
6634  *     "hello\u0639".each_codepoint {|c| print c, ' ' }
6635  *
6636  *  <em>produces:</em>
6637  *
6638  *     104 101 108 108 111 1593
6639  */
6640 
6641 static VALUE
6642 rb_str_each_codepoint(VALUE str)
6643 {
6644     return rb_str_enumerate_codepoints(str, 0);
6645 }
6646 
6647 /*
6648  *  call-seq:
6649  *     str.codepoints   -> an_array
6650  *
6651  *  Returns an array of the <code>Integer</code> ordinals of the
6652  *  characters in <i>str</i>.  This is a shorthand for
6653  *  <code>str.each_codepoint.to_a</code>.
6654  *
6655  *  If a block is given, which is a deprecated form, works the same as
6656  *  <code>each_codepoint</code>.
6657  */
6658 
6659 static VALUE
6660 rb_str_codepoints(VALUE str)
6661 {
6662     return rb_str_enumerate_codepoints(str, 1);
6663 }
6664 
6665 
6666 static long
6667 chopped_length(VALUE str)
6668 {
6669     rb_encoding *enc = STR_ENC_GET(str);
6670     const char *p, *p2, *beg, *end;
6671 
6672     beg = RSTRING_PTR(str);
6673     end = beg + RSTRING_LEN(str);
6674     if (beg > end) return 0;
6675     p = rb_enc_prev_char(beg, end, end, enc);
6676     if (!p) return 0;
6677     if (p > beg && rb_enc_ascget(p, end, 0, enc) == '\n') {
6678         p2 = rb_enc_prev_char(beg, p, end, enc);
6679         if (p2 && rb_enc_ascget(p2, end, 0, enc) == '\r') p = p2;
6680     }
6681     return p - beg;
6682 }
6683 
6684 /*
6685  *  call-seq:
6686  *     str.chop!   -> str or nil
6687  *
6688  *  Processes <i>str</i> as for <code>String#chop</code>, returning <i>str</i>,
6689  *  or <code>nil</code> if <i>str</i> is the empty string.  See also
6690  *  <code>String#chomp!</code>.
6691  */
6692 
6693 static VALUE
6694 rb_str_chop_bang(VALUE str)
6695 {
6696     str_modify_keep_cr(str);
6697     if (RSTRING_LEN(str) > 0) {
6698         long len;
6699         len = chopped_length(str);
6700         STR_SET_LEN(str, len);
6701         RSTRING_PTR(str)[len] = '\0';
6702         if (ENC_CODERANGE(str) != ENC_CODERANGE_7BIT) {
6703             ENC_CODERANGE_CLEAR(str);
6704         }
6705         return str;
6706     }
6707     return Qnil;
6708 }
6709 
6710 
6711 /*
6712  *  call-seq:
6713  *     str.chop   -> new_str
6714  *
6715  *  Returns a new <code>String</code> with the last character removed.  If the
6716  *  string ends with <code>\r\n</code>, both characters are removed. Applying
6717  *  <code>chop</code> to an empty string returns an empty
6718  *  string. <code>String#chomp</code> is often a safer alternative, as it leaves
6719  *  the string unchanged if it doesn't end in a record separator.
6720  *
6721  *     "string\r\n".chop   #=> "string"
6722  *     "string\n\r".chop   #=> "string\n"
6723  *     "string\n".chop     #=> "string"
6724  *     "string".chop       #=> "strin"
6725  *     "x".chop.chop       #=> ""
6726  */
6727 
6728 static VALUE
6729 rb_str_chop(VALUE str)
6730 {
6731     return rb_str_subseq(str, 0, chopped_length(str));
6732 }
6733 
6734 
6735 /*
6736  *  call-seq:
6737  *     str.chomp!(separator=$/)   -> str or nil
6738  *
6739  *  Modifies <i>str</i> in place as described for <code>String#chomp</code>,
6740  *  returning <i>str</i>, or <code>nil</code> if no modifications were made.
6741  */
6742 
6743 static VALUE
6744 rb_str_chomp_bang(int argc, VALUE *argv, VALUE str)
6745 {
6746     rb_encoding *enc;
6747     VALUE rs;
6748     int newline;
6749     char *p, *pp, *e;
6750     long len, rslen;
6751
         /*
          * Implementation of String#chomp!.  argv may carry one optional
          * record separator; without it rb_rs is used.  Returns str when
          * something was removed from its end and nil otherwise.
          *
          * p and e delimit the string's bytes and len is its byte length, all
          * sampled once here.
          */
6752     str_modify_keep_cr(str);
6753     len = RSTRING_LEN(str);
         /* An empty string has nothing to remove. */
6754     if (len == 0) return Qnil;
6755     p = RSTRING_PTR(str);
6756     e = p + len;
6757     if (argc == 0) {
6758         rs = rb_rs;
6759         if (rs == rb_default_rs) {
                 /*
                  * smart_chomp: behaviour for the default separator, also
                  * entered by goto further down when the separator is exactly
                  * "\n".  Removes one trailing "\n", "\r\n" or "\r".
                  */
6760           smart_chomp:
6761             enc = rb_enc_get(str);
6762             if (rb_enc_mbminlen(enc) > 1) {
                     /*
                      * Encodings whose minimum character length exceeds one
                      * byte: step back by characters rather than by bytes.
                      * First drop a trailing newline character, then a "\r"
                      * character that ends the remaining text.
                      */
6763                 pp = rb_enc_left_char_head(p, e-rb_enc_mbminlen(enc), e, enc);
6764                 if (rb_enc_is_newline(pp, e, enc)) {
6765                     e = pp;
6766                 }
6767                 pp = e - rb_enc_mbminlen(enc);
6768                 if (pp >= p) {
6769                     pp = rb_enc_left_char_head(p, pp, e, enc);
6770                     if (rb_enc_ascget(pp, e, 0, enc) == '\r') {
6771                         e = pp;
6772                     }
6773                 }
                     /* e never moved: nothing was trimmed. */
6774                 if (e == RSTRING_END(str)) {
6775                     return Qnil;
6776                 }
6777                 len = e - RSTRING_PTR(str);
6778                 STR_SET_LEN(str, len);
6779             }
6780             else {
                     /* Byte-oriented path: "\n" (plus a preceding "\r"), or a lone "\r". */
6781                 if (RSTRING_PTR(str)[len-1] == '\n') {
6782                     STR_DEC_LEN(str);
6783                     if (RSTRING_LEN(str) > 0 &&
6784                         RSTRING_PTR(str)[RSTRING_LEN(str)-1] == '\r') {
6785                         STR_DEC_LEN(str);
6786                     }
6787                 }
6788                 else if (RSTRING_PTR(str)[len-1] == '\r') {
6789                     STR_DEC_LEN(str);
6790                 }
6791                 else {
6792                     return Qnil;
6793                 }
6794             }
                 /* Re-terminate the shortened buffer. */
6795             RSTRING_PTR(str)[RSTRING_LEN(str)] = '\0';
6796             return str;
6797         }
6798     }
6799     else {
6800         rb_scan_args(argc, argv, "01", &rs);
6801     }
         /* A nil separator means nothing is removed. */
6802     if (NIL_P(rs)) return Qnil;
6803     StringValue(rs);
6804     rslen = RSTRING_LEN(rs);
6805     if (rslen == 0) {
             /*
              * Empty separator: strip every trailing "\n", each one together
              * with a "\r" that directly precedes it.
              */
6806         while (len>0 && p[len-1] == '\n') {
6807             len--;
6808             if (len>0 && p[len-1] == '\r')
6809                 len--;
6810         }
6811         if (len < RSTRING_LEN(str)) {
6812             STR_SET_LEN(str, len);
6813             RSTRING_PTR(str)[len] = '\0';
6814             return str;
6815         }
6816         return Qnil;
6817     }
         /* A separator longer than the string cannot match. */
6818     if (rslen > len) return Qnil;
         /* Last byte of the separator, used as a cheap pre-check before memcmp(). */
6819     newline = RSTRING_PTR(rs)[rslen-1];
6820     if (rslen == 1 && newline == '\n')
6821         goto smart_chomp;
6822
6823     enc = rb_enc_check(str, rs);
         /* An invalidly encoded separator is treated as matching nothing. */
6824     if (is_broken_string(rs)) {
6825         return Qnil;
6826     }
         /* pp: where the separator would have to start for str to end with it. */
6827     pp = e - rslen;
6828     if (p[len-1] == newline &&
6829         (rslen <= 1 ||
6830          memcmp(RSTRING_PTR(rs), pp, rslen) == 0)) {
             /*
              * rb_enc_left_char_head() must return pp itself; otherwise the
              * matching bytes start inside a character (NOTE(review): inferred
              * from the helper's name -- confirm) and nothing is removed.
              */
6831         if (rb_enc_left_char_head(p, pp, e, enc) != pp)
6832             return Qnil;
             /* The cached code range is discarded unless it is 7BIT. */
6833         if (ENC_CODERANGE(str) != ENC_CODERANGE_7BIT) {
6834             ENC_CODERANGE_CLEAR(str);
6835         }
6836         STR_SET_LEN(str, RSTRING_LEN(str) - rslen);
6837         RSTRING_PTR(str)[RSTRING_LEN(str)] = '\0';
6838         return str;
6839     }
6840     return Qnil;
6841 }
6842 
6843 
6844 /*
6845  *  call-seq:
6846  *     str.chomp(separator=$/)   -> new_str
6847  *
6848  *  Returns a new <code>String</code> with the given record separator removed
6849  *  from the end of <i>str</i> (if present). If <code>$/</code> has not been
6850  *  changed from the default Ruby record separator, then <code>chomp</code> also
6851  *  removes carriage return characters (that is it will remove <code>\n</code>,
6852  *  <code>\r</code>, and <code>\r\n</code>).
6853  *
6854  *     "hello".chomp            #=> "hello"
6855  *     "hello\n".chomp          #=> "hello"
6856  *     "hello\r\n".chomp        #=> "hello"
6857  *     "hello\n\r".chomp        #=> "hello\n"
6858  *     "hello\r".chomp          #=> "hello"
6859  *     "hello \n there".chomp   #=> "hello \n there"
6860  *     "hello".chomp("llo")     #=> "he"
6861  */
6862 
6863 static VALUE
6864 rb_str_chomp(int argc, VALUE *argv, VALUE str)
6865 {
6866     str = rb_str_dup(str);
6867     rb_str_chomp_bang(argc, argv, str);
6868     return str;
6869 }
6870 
6871 /*
6872  *  call-seq:
6873  *     str.lstrip!   -> self or nil
6874  *
6875  *  Removes leading whitespace from <i>str</i>, returning <code>nil</code> if no
6876  *  change was made. See also <code>String#rstrip!</code> and
6877  *  <code>String#strip!</code>.
6878  *
6879  *     "  hello  ".lstrip   #=> "hello  "
6880  *     "hello".lstrip!      #=> nil
6881  */
6882 
6883 static VALUE
6884 rb_str_lstrip_bang(VALUE str)
6885 {
6886     rb_encoding *enc;
6887     char *s, *t, *e;
6888 
6889     str_modify_keep_cr(str);
6890     enc = STR_ENC_GET(str);
6891     s = RSTRING_PTR(str);
6892     if (!s || RSTRING_LEN(str) == 0) return Qnil;
6893     e = t = RSTRING_END(str);
6894     /* remove spaces at head */
6895     while (s < e) {
6896         int n;
6897