001
002
003
004
005
006
007
008
009
010
011
012
013
014 #include "ruby/ruby.h"
015 #include "ruby/re.h"
016 #include "ruby/encoding.h"
017 #include "vm_core.h"
018 #include "internal.h"
019 #include "probes.h"
020 #include <assert.h>
021
/*
 * Index the `beg` / `end` arrays of a variable named `regs`, which must be
 * in scope wherever these macros are expanded.
 * NOTE(review): presumably `regs` is a regexp match-register structure --
 * confirm at the use sites.
 */
#define BEG(no) (regs->beg[(no)])
#define END(no) (regs->end[(no)])
024
025 #include <math.h>
026 #include <ctype.h>
027
028 #ifdef HAVE_UNISTD_H
029 #include <unistd.h>
030 #endif
031
/* NOTE(review): not referenced in the visible part of this file; confirm its use. */
#define STRING_ENUMERATORS_WANTARRAY 0

/* Element count of a real array (not a pointer), cast to int. */
#define numberof(array) (int)(sizeof(array) / sizeof((array)[0]))
035
/*
 * Drop any macro versions of these names so the identifiers can be used
 * for the real function definitions and aliases in this file.
 */
#undef rb_str_new_cstr
#undef rb_tainted_str_new_cstr
#undef rb_usascii_str_new_cstr
#undef rb_external_str_new_cstr
#undef rb_locale_str_new_cstr
#undef rb_str_new2
#undef rb_str_new3
#undef rb_str_new4
#undef rb_str_new5
#undef rb_tainted_str_new2
#undef rb_usascii_str_new2
#undef rb_str_dup_frozen
#undef rb_str_buf_new_cstr
#undef rb_str_buf_new2
#undef rb_str_buf_cat2
#undef rb_str_cat2
052
/* Forward declaration: used by rb_free_tmp_buffer() before its definition. */
static VALUE rb_str_clear(VALUE str);

/* Class objects; rb_cString is the class passed to str_new()/str_alloc() by rb_str_new() and friends. */
VALUE rb_cString;
VALUE rb_cSymbol;
057
#define RUBY_MAX_CHAR_LEN 16
/* Flag tested by str_modifiable(): while set, modification raises "temporarily locked". */
#define STR_TMPLOCK FL_USER7
/* Set when the string uses the as.heap representation instead of the embedded one. */
#define STR_NOEMBED FL_USER1
#define STR_SHARED FL_USER2
#define STR_ASSOC FL_USER3
/* Heap string whose as.heap.aux.shared refers to another string owning the buffer. */
#define STR_SHARED_P(s) FL_ALL((s), STR_NOEMBED|ELTS_SHARED)
/* Heap string whose as.heap.aux.shared holds an associated object (see rb_str_associate). */
#define STR_ASSOC_P(s) FL_ALL((s), STR_NOEMBED|STR_ASSOC)
/* Flag combinations under which as.heap.aux does not hold a capacity. */
#define STR_NOCAPA (STR_NOEMBED|ELTS_SHARED|STR_ASSOC)
#define STR_NOCAPA_P(s) (FL_TEST((s),STR_NOEMBED) && FL_ANY((s),ELTS_SHARED|STR_ASSOC))
/* Clear the shared/assoc flags of a heap string so that aux can hold a capacity again. */
#define STR_UNSET_NOCAPA(s) do {\
    if (FL_TEST((s),STR_NOEMBED)) FL_UNSET((s),(ELTS_SHARED|STR_ASSOC));\
} while (0)
070
071
/* Switch to the heap representation and zero the embedded-length bits in flags. */
#define STR_SET_NOEMBED(str) do {\
    FL_SET((str), STR_NOEMBED);\
    STR_SET_EMBED_LEN((str), 0);\
} while (0)
/* Switch to the embedded representation (only clears the flag; the length is set separately). */
#define STR_SET_EMBED(str) FL_UNSET((str), STR_NOEMBED)
#define STR_EMBED_P(str) (!FL_TEST((str), STR_NOEMBED))
/* Store n in the embedded-length bit field of the object's flags; n is evaluated once. */
#define STR_SET_EMBED_LEN(str, n) do { \
    long tmp_n = (n);\
    RBASIC(str)->flags &= ~RSTRING_EMBED_LEN_MASK;\
    RBASIC(str)->flags |= (tmp_n) << RSTRING_EMBED_LEN_SHIFT;\
} while (0)
083
/* Set the byte length, writing to the flags field (embedded) or as.heap.len (heap). */
#define STR_SET_LEN(str, n) do { \
    if (STR_EMBED_P(str)) {\
        STR_SET_EMBED_LEN((str), (n));\
    }\
    else {\
        RSTRING(str)->as.heap.len = (n);\
    }\
} while (0)

/* Decrement the byte length by one in whichever representation is active. */
#define STR_DEC_LEN(str) do {\
    if (STR_EMBED_P(str)) {\
        long n = RSTRING_LEN(str);\
        n--;\
        STR_SET_EMBED_LEN((str), n);\
    }\
    else {\
        RSTRING(str)->as.heap.len--;\
    }\
} while (0)
103
/*
 * Resize the buffer of str to hold `capacity` bytes plus one extra byte.
 * - Embedded string: nothing happens unless capacity exceeds
 *   RSTRING_EMBED_LEN_MAX, in which case the contents are copied into a
 *   freshly allocated heap buffer and the string becomes a heap string.
 * - Heap string: the buffer is reallocated; aux.capa is updated only when
 *   aux is not in use for shared/assoc data (STR_NOCAPA_P).
 * Note: `str` and `capacity` are evaluated more than once.
 */
#define RESIZE_CAPA(str,capacity) do {\
    if (STR_EMBED_P(str)) {\
        if ((capacity) > RSTRING_EMBED_LEN_MAX) {\
            char *tmp = ALLOC_N(char, (capacity)+1);\
            memcpy(tmp, RSTRING_PTR(str), RSTRING_LEN(str));\
            RSTRING(str)->as.heap.ptr = tmp;\
            RSTRING(str)->as.heap.len = RSTRING_LEN(str);\
            STR_SET_NOEMBED(str);\
            RSTRING(str)->as.heap.aux.capa = (capacity);\
        }\
    }\
    else {\
        REALLOC_N(RSTRING(str)->as.heap.ptr, char, (capacity)+1);\
        if (!STR_NOCAPA_P(str))\
            RSTRING(str)->as.heap.aux.capa = (capacity);\
    }\
} while (0)
121
/* Code-range predicates; both may trigger a scan via rb_enc_str_coderange(). */
#define is_ascii_string(str) (rb_enc_str_coderange(str) == ENC_CODERANGE_7BIT)
#define is_broken_string(str) (rb_enc_str_coderange(str) == ENC_CODERANGE_BROKEN)

/* Encoding object of str, looked up from the encoding index stored on the object. */
#define STR_ENC_GET(str) rb_enc_from_index(ENCODING_GET(str))
126
127 static inline int
128 single_byte_optimizable(VALUE str)
129 {
130 rb_encoding *enc;
131
132
133 if (ENC_CODERANGE(str) == ENC_CODERANGE_7BIT)
134 return 1;
135
136 enc = STR_ENC_GET(str);
137 if (rb_enc_mbmaxlen(enc) == 1)
138 return 1;
139
140
141
142 return 0;
143 }
144
/* NOTE(review): global not used in the visible part of the file; presumably the field separator ($;) -- confirm. */
VALUE rb_fs;
146
/*
 * Returns a pointer to the first byte in [p, e) for which ISASCII() is
 * false, or NULL when every byte is ASCII.
 */
static inline const char *
search_nonascii(const char *p, const char *e)
{
#if SIZEOF_VALUE == 8
# define NONASCII_MASK 0x8080808080808080ULL
#elif SIZEOF_VALUE == 4
# define NONASCII_MASK 0x80808080UL
#endif
#ifdef NONASCII_MASK
    /* Word-at-a-time fast path, taken only when more than two words remain. */
    if ((int)sizeof(VALUE) * 2 < e - p) {
        const VALUE *s, *t;
        const VALUE lowbits = sizeof(VALUE) - 1;
        /* s: p rounded up to a VALUE-aligned address */
        s = (const VALUE*)(~lowbits & ((VALUE)p + lowbits));
        /* check the unaligned head one byte at a time */
        while (p < (const char *)s) {
            if (!ISASCII(*p))
                return p;
            p++;
        }
        /* t: e rounded down to a VALUE-aligned address */
        t = (const VALUE*)(~lowbits & (VALUE)e);
        /* test the high bit of every byte of a word at once; stop at the
         * first word that contains a set high bit */
        while (s < t) {
            if (*s & NONASCII_MASK) {
                t = s;
                break;
            }
            s++;
        }
        /* resume byte-wise at the offending word, or at the aligned tail */
        p = (const char *)t;
    }
#endif
    while (p < e) {
        if (!ISASCII(*p))
            return p;
        p++;
    }
    return NULL;
}
183
184 static int
185 coderange_scan(const char *p, long len, rb_encoding *enc)
186 {
187 const char *e = p + len;
188
189 if (rb_enc_to_index(enc) == 0) {
190
191 p = search_nonascii(p, e);
192 return p ? ENC_CODERANGE_VALID : ENC_CODERANGE_7BIT;
193 }
194
195 if (rb_enc_asciicompat(enc)) {
196 p = search_nonascii(p, e);
197 if (!p) {
198 return ENC_CODERANGE_7BIT;
199 }
200 while (p < e) {
201 int ret = rb_enc_precise_mbclen(p, e, enc);
202 if (!MBCLEN_CHARFOUND_P(ret)) {
203 return ENC_CODERANGE_BROKEN;
204 }
205 p += MBCLEN_CHARFOUND_LEN(ret);
206 if (p < e) {
207 p = search_nonascii(p, e);
208 if (!p) {
209 return ENC_CODERANGE_VALID;
210 }
211 }
212 }
213 if (e < p) {
214 return ENC_CODERANGE_BROKEN;
215 }
216 return ENC_CODERANGE_VALID;
217 }
218
219 while (p < e) {
220 int ret = rb_enc_precise_mbclen(p, e, enc);
221
222 if (!MBCLEN_CHARFOUND_P(ret)) {
223 return ENC_CODERANGE_BROKEN;
224 }
225 p += MBCLEN_CHARFOUND_LEN(ret);
226 }
227 if (e < p) {
228 return ENC_CODERANGE_BROKEN;
229 }
230 return ENC_CODERANGE_VALID;
231 }
232
233 long
234 rb_str_coderange_scan_restartable(const char *s, const char *e, rb_encoding *enc, int *cr)
235 {
236 const char *p = s;
237
238 if (*cr == ENC_CODERANGE_BROKEN)
239 return e - s;
240
241 if (rb_enc_to_index(enc) == 0) {
242
243 p = search_nonascii(p, e);
244 *cr = (!p && *cr != ENC_CODERANGE_VALID) ? ENC_CODERANGE_7BIT : ENC_CODERANGE_VALID;
245 return e - s;
246 }
247 else if (rb_enc_asciicompat(enc)) {
248 p = search_nonascii(p, e);
249 if (!p) {
250 if (*cr != ENC_CODERANGE_VALID) *cr = ENC_CODERANGE_7BIT;
251 return e - s;
252 }
253 while (p < e) {
254 int ret = rb_enc_precise_mbclen(p, e, enc);
255 if (!MBCLEN_CHARFOUND_P(ret)) {
256 *cr = MBCLEN_INVALID_P(ret) ? ENC_CODERANGE_BROKEN: ENC_CODERANGE_UNKNOWN;
257 return p - s;
258 }
259 p += MBCLEN_CHARFOUND_LEN(ret);
260 if (p < e) {
261 p = search_nonascii(p, e);
262 if (!p) {
263 *cr = ENC_CODERANGE_VALID;
264 return e - s;
265 }
266 }
267 }
268 *cr = e < p ? ENC_CODERANGE_BROKEN: ENC_CODERANGE_VALID;
269 return p - s;
270 }
271 else {
272 while (p < e) {
273 int ret = rb_enc_precise_mbclen(p, e, enc);
274 if (!MBCLEN_CHARFOUND_P(ret)) {
275 *cr = MBCLEN_INVALID_P(ret) ? ENC_CODERANGE_BROKEN: ENC_CODERANGE_UNKNOWN;
276 return p - s;
277 }
278 p += MBCLEN_CHARFOUND_LEN(ret);
279 }
280 *cr = e < p ? ENC_CODERANGE_BROKEN: ENC_CODERANGE_VALID;
281 return p - s;
282 }
283 }
284
285 static inline void
286 str_enc_copy(VALUE str1, VALUE str2)
287 {
288 rb_enc_set_index(str1, ENCODING_GET(str2));
289 }
290
291 static void
292 rb_enc_cr_str_copy_for_substr(VALUE dest, VALUE src)
293 {
294
295
296
297 str_enc_copy(dest, src);
298 if (RSTRING_LEN(dest) == 0) {
299 if (!rb_enc_asciicompat(STR_ENC_GET(src)))
300 ENC_CODERANGE_SET(dest, ENC_CODERANGE_VALID);
301 else
302 ENC_CODERANGE_SET(dest, ENC_CODERANGE_7BIT);
303 return;
304 }
305 switch (ENC_CODERANGE(src)) {
306 case ENC_CODERANGE_7BIT:
307 ENC_CODERANGE_SET(dest, ENC_CODERANGE_7BIT);
308 break;
309 case ENC_CODERANGE_VALID:
310 if (!rb_enc_asciicompat(STR_ENC_GET(src)) ||
311 search_nonascii(RSTRING_PTR(dest), RSTRING_END(dest)))
312 ENC_CODERANGE_SET(dest, ENC_CODERANGE_VALID);
313 else
314 ENC_CODERANGE_SET(dest, ENC_CODERANGE_7BIT);
315 break;
316 default:
317 break;
318 }
319 }
320
321 static void
322 rb_enc_cr_str_exact_copy(VALUE dest, VALUE src)
323 {
324 str_enc_copy(dest, src);
325 ENC_CODERANGE_SET(dest, ENC_CODERANGE(src));
326 }
327
328 int
329 rb_enc_str_coderange(VALUE str)
330 {
331 int cr = ENC_CODERANGE(str);
332
333 if (cr == ENC_CODERANGE_UNKNOWN) {
334 rb_encoding *enc = STR_ENC_GET(str);
335 cr = coderange_scan(RSTRING_PTR(str), RSTRING_LEN(str), enc);
336 ENC_CODERANGE_SET(str, cr);
337 }
338 return cr;
339 }
340
341 int
342 rb_enc_str_asciionly_p(VALUE str)
343 {
344 rb_encoding *enc = STR_ENC_GET(str);
345
346 if (!rb_enc_asciicompat(enc))
347 return FALSE;
348 else if (rb_enc_str_coderange(str) == ENC_CODERANGE_7BIT)
349 return TRUE;
350 return FALSE;
351 }
352
353 static inline void
354 str_mod_check(VALUE s, const char *p, long len)
355 {
356 if (RSTRING_PTR(s) != p || RSTRING_LEN(s) != len){
357 rb_raise(rb_eRuntimeError, "string modified");
358 }
359 }
360
361 size_t
362 rb_str_capacity(VALUE str)
363 {
364 if (STR_EMBED_P(str)) {
365 return RSTRING_EMBED_LEN_MAX;
366 }
367 else if (STR_NOCAPA_P(str)) {
368 return RSTRING(str)->as.heap.len;
369 }
370 else {
371 return RSTRING(str)->as.heap.aux.capa;
372 }
373 }
374
375 static inline VALUE
376 str_alloc(VALUE klass)
377 {
378 NEWOBJ_OF(str, struct RString, klass, T_STRING);
379
380 str->as.heap.ptr = 0;
381 str->as.heap.len = 0;
382 str->as.heap.aux.capa = 0;
383
384 return (VALUE)str;
385 }
386
387 static inline VALUE
388 empty_str_alloc(VALUE klass)
389 {
390 if (RUBY_DTRACE_STRING_CREATE_ENABLED()) {
391 RUBY_DTRACE_STRING_CREATE(0, rb_sourcefile(), rb_sourceline());
392 }
393 return str_alloc(klass);
394 }
395
396 static VALUE
397 str_new(VALUE klass, const char *ptr, long len)
398 {
399 VALUE str;
400
401 if (len < 0) {
402 rb_raise(rb_eArgError, "negative string size (or size too big)");
403 }
404
405 if (RUBY_DTRACE_STRING_CREATE_ENABLED()) {
406 RUBY_DTRACE_STRING_CREATE(len, rb_sourcefile(), rb_sourceline());
407 }
408
409 str = str_alloc(klass);
410 if (len > RSTRING_EMBED_LEN_MAX) {
411 RSTRING(str)->as.heap.aux.capa = len;
412 RSTRING(str)->as.heap.ptr = ALLOC_N(char,len+1);
413 STR_SET_NOEMBED(str);
414 }
415 else if (len == 0) {
416 ENC_CODERANGE_SET(str, ENC_CODERANGE_7BIT);
417 }
418 if (ptr) {
419 memcpy(RSTRING_PTR(str), ptr, len);
420 }
421 STR_SET_LEN(str, len);
422 RSTRING_PTR(str)[len] = '\0';
423 return str;
424 }
425
426 VALUE
427 rb_str_new(const char *ptr, long len)
428 {
429 return str_new(rb_cString, ptr, len);
430 }
431
432 VALUE
433 rb_usascii_str_new(const char *ptr, long len)
434 {
435 VALUE str = rb_str_new(ptr, len);
436 ENCODING_CODERANGE_SET(str, rb_usascii_encindex(), ENC_CODERANGE_7BIT);
437 return str;
438 }
439
440 VALUE
441 rb_enc_str_new(const char *ptr, long len, rb_encoding *enc)
442 {
443 VALUE str = rb_str_new(ptr, len);
444 rb_enc_associate(str, enc);
445 return str;
446 }
447
448 VALUE
449 rb_str_new_cstr(const char *ptr)
450 {
451 if (!ptr) {
452 rb_raise(rb_eArgError, "NULL pointer given");
453 }
454 return rb_str_new(ptr, strlen(ptr));
455 }
456
457 RUBY_ALIAS_FUNCTION(rb_str_new2(const char *ptr), rb_str_new_cstr, (ptr))
458 #define rb_str_new2 rb_str_new_cstr
459
460 VALUE
461 rb_usascii_str_new_cstr(const char *ptr)
462 {
463 VALUE str = rb_str_new2(ptr);
464 ENCODING_CODERANGE_SET(str, rb_usascii_encindex(), ENC_CODERANGE_7BIT);
465 return str;
466 }
467
468 RUBY_ALIAS_FUNCTION(rb_usascii_str_new2(const char *ptr), rb_usascii_str_new_cstr, (ptr))
469 #define rb_usascii_str_new2 rb_usascii_str_new_cstr
470
471 VALUE
472 rb_tainted_str_new(const char *ptr, long len)
473 {
474 VALUE str = rb_str_new(ptr, len);
475
476 OBJ_TAINT(str);
477 return str;
478 }
479
480 VALUE
481 rb_tainted_str_new_cstr(const char *ptr)
482 {
483 VALUE str = rb_str_new2(ptr);
484
485 OBJ_TAINT(str);
486 return str;
487 }
488
489 RUBY_ALIAS_FUNCTION(rb_tainted_str_new2(const char *ptr), rb_tainted_str_new_cstr, (ptr))
490 #define rb_tainted_str_new2 rb_tainted_str_new_cstr
491
/*
 * Converts str from encoding `from` to encoding `to`.
 *
 * Returns str itself (no conversion) when `to` is NULL, when the two
 * encodings are the same, when no converter can be opened, or when the
 * conversion does not finish cleanly.  When `from` is NULL the string's own
 * encoding is used.  If `to` is ASCII-compatible and str is already known
 * to be 7-bit, or `to` is the ASCII-8BIT encoding, the bytes are not
 * converted: a duplicate re-tagged with `to` is returned (or str itself if
 * it already has that encoding).  Otherwise a new string in `to` is
 * returned.  ecflags/ecopts are passed through to rb_econv_open_opts().
 */
VALUE
rb_str_conv_enc_opts(VALUE str, rb_encoding *from, rb_encoding *to, int ecflags, VALUE ecopts)
{
    extern VALUE rb_cEncodingConverter;
    rb_econv_t *ec;
    rb_econv_result_t ret;
    long len, olen;
    VALUE econv_wrapper;
    VALUE newstr;
    const unsigned char *start, *sp;
    unsigned char *dest, *dp;
    size_t converted_output = 0;

    if (!to) return str;
    if (!from) from = rb_enc_get(str);
    if (from == to) return str;
    /* no byte conversion needed: just re-tag a copy */
    if ((rb_enc_asciicompat(to) && ENC_CODERANGE(str) == ENC_CODERANGE_7BIT) ||
        to == rb_ascii8bit_encoding()) {
        if (STR_ENC_GET(str) != to) {
            str = rb_str_dup(str);
            rb_enc_associate(str, to);
        }
        return str;
    }

    /* start with an output buffer as large as the input */
    len = RSTRING_LEN(str);
    newstr = rb_str_new(0, len);
    olen = len;

    /* hidden (klass == 0) wrapper object that holds the converter while the loop runs;
     * NOTE(review): presumably so the converter is released if something raises -- confirm */
    econv_wrapper = rb_obj_alloc(rb_cEncodingConverter);
    RBASIC(econv_wrapper)->klass = 0;
    ec = rb_econv_open_opts(from->name, to->name, ecflags, ecopts);
    if (!ec) return str;
    DATA_PTR(econv_wrapper) = ec;

    sp = (unsigned char*)RSTRING_PTR(str);
    start = sp;
    /* each iteration re-reads the output buffer address (it may have moved
     * after rb_str_resize) and continues converting where it left off */
    while ((dest = (unsigned char*)RSTRING_PTR(newstr)),
           (dp = dest + converted_output),
           (ret = rb_econv_convert(ec, &sp, start + len, &dp, dest + olen, 0)),
           ret == econv_destination_buffer_full) {
        /* destination buffer too small: grow it */
        size_t converted_input = sp - start;
        size_t rest = len - converted_input;
        converted_output = dp - dest;
        rb_str_set_len(newstr, converted_output);
        /* estimate the remaining output from the ratio observed so far,
         * unless the multiplication could overflow */
        if (converted_input && converted_output &&
            rest < (LONG_MAX / converted_output)) {
            rest = (rest * converted_output) / converted_input;
        }
        else {
            rest = olen;
        }
        /* always grow by at least 2 bytes */
        olen += rest < 2 ? 2 : rest;
        rb_str_resize(newstr, olen);
    }
    /* detach the converter from the wrapper before closing it, then recycle the wrapper */
    DATA_PTR(econv_wrapper) = 0;
    rb_econv_close(ec);
    rb_gc_force_recycle(econv_wrapper);
    switch (ret) {
      case econv_finished:
        len = dp - (unsigned char*)RSTRING_PTR(newstr);
        rb_str_set_len(newstr, len);
        rb_enc_associate(newstr, to);
        return newstr;

      default:
        /* any other result: give back the original string */
        return str;
    }
}
563
564 VALUE
565 rb_str_conv_enc(VALUE str, rb_encoding *from, rb_encoding *to)
566 {
567 return rb_str_conv_enc_opts(str, from, to, 0, Qnil);
568 }
569
570 VALUE
571 rb_external_str_new_with_enc(const char *ptr, long len, rb_encoding *eenc)
572 {
573 VALUE str;
574
575 str = rb_tainted_str_new(ptr, len);
576 if (eenc == rb_usascii_encoding() &&
577 rb_enc_str_coderange(str) != ENC_CODERANGE_7BIT) {
578 rb_enc_associate(str, rb_ascii8bit_encoding());
579 return str;
580 }
581 rb_enc_associate(str, eenc);
582 return rb_str_conv_enc(str, eenc, rb_default_internal_encoding());
583 }
584
585 VALUE
586 rb_external_str_new(const char *ptr, long len)
587 {
588 return rb_external_str_new_with_enc(ptr, len, rb_default_external_encoding());
589 }
590
591 VALUE
592 rb_external_str_new_cstr(const char *ptr)
593 {
594 return rb_external_str_new_with_enc(ptr, strlen(ptr), rb_default_external_encoding());
595 }
596
597 VALUE
598 rb_locale_str_new(const char *ptr, long len)
599 {
600 return rb_external_str_new_with_enc(ptr, len, rb_locale_encoding());
601 }
602
603 VALUE
604 rb_locale_str_new_cstr(const char *ptr)
605 {
606 return rb_external_str_new_with_enc(ptr, strlen(ptr), rb_locale_encoding());
607 }
608
609 VALUE
610 rb_filesystem_str_new(const char *ptr, long len)
611 {
612 return rb_external_str_new_with_enc(ptr, len, rb_filesystem_encoding());
613 }
614
615 VALUE
616 rb_filesystem_str_new_cstr(const char *ptr)
617 {
618 return rb_external_str_new_with_enc(ptr, strlen(ptr), rb_filesystem_encoding());
619 }
620
621 VALUE
622 rb_str_export(VALUE str)
623 {
624 return rb_str_conv_enc(str, STR_ENC_GET(str), rb_default_external_encoding());
625 }
626
627 VALUE
628 rb_str_export_locale(VALUE str)
629 {
630 return rb_str_conv_enc(str, STR_ENC_GET(str), rb_locale_encoding());
631 }
632
633 VALUE
634 rb_str_export_to_enc(VALUE str, rb_encoding *enc)
635 {
636 return rb_str_conv_enc(str, STR_ENC_GET(str), enc);
637 }
638
639 static VALUE
640 str_replace_shared_without_enc(VALUE str2, VALUE str)
641 {
642 if (RSTRING_LEN(str) <= RSTRING_EMBED_LEN_MAX) {
643 STR_SET_EMBED(str2);
644 memcpy(RSTRING_PTR(str2), RSTRING_PTR(str), RSTRING_LEN(str)+1);
645 STR_SET_EMBED_LEN(str2, RSTRING_LEN(str));
646 }
647 else {
648 str = rb_str_new_frozen(str);
649 FL_SET(str2, STR_NOEMBED);
650 RSTRING(str2)->as.heap.len = RSTRING_LEN(str);
651 RSTRING(str2)->as.heap.ptr = RSTRING_PTR(str);
652 RSTRING(str2)->as.heap.aux.shared = str;
653 FL_SET(str2, ELTS_SHARED);
654 }
655 return str2;
656 }
657
658 static VALUE
659 str_replace_shared(VALUE str2, VALUE str)
660 {
661 str_replace_shared_without_enc(str2, str);
662 rb_enc_cr_str_exact_copy(str2, str);
663 return str2;
664 }
665
666 static VALUE
667 str_new_shared(VALUE klass, VALUE str)
668 {
669 return str_replace_shared(str_alloc(klass), str);
670 }
671
672 static VALUE
673 str_new3(VALUE klass, VALUE str)
674 {
675 return str_new_shared(klass, str);
676 }
677
678 VALUE
679 rb_str_new_shared(VALUE str)
680 {
681 VALUE str2 = str_new3(rb_obj_class(str), str);
682
683 OBJ_INFECT(str2, str);
684 return str2;
685 }
686
687 RUBY_ALIAS_FUNCTION(rb_str_new3(VALUE str), rb_str_new_shared, (str))
688 #define rb_str_new3 rb_str_new_shared
689
690 static VALUE
691 str_new4(VALUE klass, VALUE str)
692 {
693 VALUE str2;
694
695 str2 = str_alloc(klass);
696 STR_SET_NOEMBED(str2);
697 RSTRING(str2)->as.heap.len = RSTRING_LEN(str);
698 RSTRING(str2)->as.heap.ptr = RSTRING_PTR(str);
699 if (STR_SHARED_P(str)) {
700 VALUE shared = RSTRING(str)->as.heap.aux.shared;
701 assert(OBJ_FROZEN(shared));
702 FL_SET(str2, ELTS_SHARED);
703 RSTRING(str2)->as.heap.aux.shared = shared;
704 }
705 else {
706 FL_SET(str, ELTS_SHARED);
707 RSTRING(str)->as.heap.aux.shared = str2;
708 }
709 rb_enc_cr_str_exact_copy(str2, str);
710 OBJ_INFECT(str2, str);
711 return str2;
712 }
713
/*
 * Returns a frozen string with the same contents as orig, sharing the
 * buffer where possible.  An already frozen orig is returned unchanged.
 */
VALUE
rb_str_new_frozen(VALUE orig)
{
    VALUE klass, str;

    if (OBJ_FROZEN(orig)) return orig;
    klass = rb_obj_class(orig);
    if (STR_SHARED_P(orig) && (str = RSTRING(orig)->as.heap.aux.shared)) {
        /* orig already shares a frozen root: reuse the root itself unless
         * it differs from orig in length, class, taint/trust flags or
         * encoding */
        long ofs;
        assert(OBJ_FROZEN(str));
        ofs = RSTRING_LEN(str) - RSTRING_LEN(orig);
        if ((ofs > 0) || (klass != RBASIC(str)->klass) ||
            ((RBASIC(str)->flags ^ RBASIC(orig)->flags) & (FL_TAINT|FL_UNTRUSTED)) ||
            ENCODING_GET(str) != ENCODING_GET(orig)) {
            str = str_new3(klass, str);
            /* skip the first ofs bytes of the root so the new string has orig's length */
            RSTRING(str)->as.heap.ptr += ofs;
            RSTRING(str)->as.heap.len -= ofs;
            rb_enc_cr_str_exact_copy(str, orig);
            OBJ_INFECT(str, orig);
        }
    }
    else if (STR_EMBED_P(orig)) {
        /* embedded: nothing to share, copy the bytes */
        str = str_new(klass, RSTRING_PTR(orig), RSTRING_LEN(orig));
        rb_enc_cr_str_exact_copy(str, orig);
        OBJ_INFECT(str, orig);
    }
    else if (STR_ASSOC_P(orig)) {
        /* move the associated object from orig to the new frozen root */
        VALUE assoc = RSTRING(orig)->as.heap.aux.shared;
        FL_UNSET(orig, STR_ASSOC);
        str = str_new4(klass, orig);
        FL_SET(str, STR_ASSOC);
        RSTRING(str)->as.heap.aux.shared = assoc;
    }
    else {
        str = str_new4(klass, orig);
    }
    OBJ_FREEZE(str);
    return str;
}

RUBY_ALIAS_FUNCTION(rb_str_new4(VALUE orig), rb_str_new_frozen, (orig))
#define rb_str_new4 rb_str_new_frozen
756
757 VALUE
758 rb_str_new_with_class(VALUE obj, const char *ptr, long len)
759 {
760 return str_new(rb_obj_class(obj), ptr, len);
761 }
762
763 RUBY_ALIAS_FUNCTION(rb_str_new5(VALUE obj, const char *ptr, long len),
764 rb_str_new_with_class, (obj, ptr, len))
765 #define rb_str_new5 rb_str_new_with_class
766
767 static VALUE
768 str_new_empty(VALUE str)
769 {
770 VALUE v = rb_str_new5(str, 0, 0);
771 rb_enc_copy(v, str);
772 OBJ_INFECT(v, str);
773 return v;
774 }
775
776 #define STR_BUF_MIN_SIZE 128
777
778 VALUE
779 rb_str_buf_new(long capa)
780 {
781 VALUE str = str_alloc(rb_cString);
782
783 if (capa < STR_BUF_MIN_SIZE) {
784 capa = STR_BUF_MIN_SIZE;
785 }
786 FL_SET(str, STR_NOEMBED);
787 RSTRING(str)->as.heap.aux.capa = capa;
788 RSTRING(str)->as.heap.ptr = ALLOC_N(char, capa+1);
789 RSTRING(str)->as.heap.ptr[0] = '\0';
790
791 return str;
792 }
793
794 VALUE
795 rb_str_buf_new_cstr(const char *ptr)
796 {
797 VALUE str;
798 long len = strlen(ptr);
799
800 str = rb_str_buf_new(len);
801 rb_str_buf_cat(str, ptr, len);
802
803 return str;
804 }
805
806 RUBY_ALIAS_FUNCTION(rb_str_buf_new2(const char *ptr), rb_str_buf_new_cstr, (ptr))
807 #define rb_str_buf_new2 rb_str_buf_new_cstr
808
809 VALUE
810 rb_str_tmp_new(long len)
811 {
812 return str_new(0, 0, len);
813 }
814
815 void *
816 rb_alloc_tmp_buffer(volatile VALUE *store, long len)
817 {
818 VALUE s = rb_str_tmp_new(len);
819 *store = s;
820 return RSTRING_PTR(s);
821 }
822
823 void
824 rb_free_tmp_buffer(volatile VALUE *store)
825 {
826 VALUE s = *store;
827 *store = 0;
828 if (s) rb_str_clear(s);
829 }
830
831 void
832 rb_str_free(VALUE str)
833 {
834 if (!STR_EMBED_P(str) && !STR_SHARED_P(str)) {
835 xfree(RSTRING(str)->as.heap.ptr);
836 }
837 }
838
839 RUBY_FUNC_EXPORTED size_t
840 rb_str_memsize(VALUE str)
841 {
842 if (!STR_EMBED_P(str) && !STR_SHARED_P(str)) {
843 return RSTRING(str)->as.heap.aux.capa;
844 }
845 else {
846 return 0;
847 }
848 }
849
850 VALUE
851 rb_str_to_str(VALUE str)
852 {
853 return rb_convert_type(str, T_STRING, "String", "to_str");
854 }
855
/* Forward declaration: defined further down, used by rb_str_shared_replace(). */
static inline void str_discard(VALUE str);

/*
 * Replaces the contents of str with those of str2, taking over str2's
 * buffer when it is on the heap.  Afterwards str has str2's bytes,
 * encoding, code range and taint; in the heap case str2 is left as an
 * empty embedded string.  A no-op when both are the same object.
 */
void
rb_str_shared_replace(VALUE str, VALUE str2)
{
    rb_encoding *enc;
    int cr;
    if (str == str2) return;
    /* capture encoding/code range before either string is touched */
    enc = STR_ENC_GET(str2);
    cr = ENC_CODERANGE(str2);
    /* checks that str may be modified and releases its own buffer, if it owns one */
    str_discard(str);
    OBJ_INFECT(str, str2);
    if (RSTRING_LEN(str2) <= RSTRING_EMBED_LEN_MAX) {
        /* short contents: copy into the embedded buffer (including the byte after the end) */
        STR_SET_EMBED(str);
        memcpy(RSTRING_PTR(str), RSTRING_PTR(str2), RSTRING_LEN(str2)+1);
        STR_SET_EMBED_LEN(str, RSTRING_LEN(str2));
        rb_enc_associate(str, enc);
        ENC_CODERANGE_SET(str, cr);
        return;
    }
    /* long contents: take over str2's pointer, length and aux data */
    STR_SET_NOEMBED(str);
    STR_UNSET_NOCAPA(str);
    RSTRING(str)->as.heap.ptr = RSTRING_PTR(str2);
    RSTRING(str)->as.heap.len = RSTRING_LEN(str2);
    if (STR_NOCAPA_P(str2)) {
        /* str2 is shared/assoc: copy those flags and the aux VALUE */
        FL_SET(str, RBASIC(str2)->flags & STR_NOCAPA);
        RSTRING(str)->as.heap.aux.shared = RSTRING(str2)->as.heap.aux.shared;
    }
    else {
        RSTRING(str)->as.heap.aux.capa = RSTRING(str2)->as.heap.aux.capa;
    }
    /* abandon str2: it becomes an empty embedded string and no longer refers to the buffer */
    STR_SET_EMBED(str2);
    RSTRING_PTR(str2)[0] = 0;
    STR_SET_EMBED_LEN(str2, 0);
    rb_enc_associate(str, enc);
    ENC_CODERANGE_SET(str, cr);
}
893
894 static ID id_to_s;
895
896 VALUE
897 rb_obj_as_string(VALUE obj)
898 {
899 VALUE str;
900
901 if (RB_TYPE_P(obj, T_STRING)) {
902 return obj;
903 }
904 str = rb_funcall(obj, id_to_s, 0);
905 if (!RB_TYPE_P(str, T_STRING))
906 return rb_any_to_s(obj);
907 if (OBJ_TAINTED(obj)) OBJ_TAINT(str);
908 return str;
909 }
910
911 static VALUE
912 str_replace(VALUE str, VALUE str2)
913 {
914 long len;
915
916 len = RSTRING_LEN(str2);
917 if (STR_ASSOC_P(str2)) {
918 str2 = rb_str_new4(str2);
919 }
920 if (STR_SHARED_P(str2)) {
921 VALUE shared = RSTRING(str2)->as.heap.aux.shared;
922 assert(OBJ_FROZEN(shared));
923 STR_SET_NOEMBED(str);
924 RSTRING(str)->as.heap.len = len;
925 RSTRING(str)->as.heap.ptr = RSTRING_PTR(str2);
926 FL_SET(str, ELTS_SHARED);
927 FL_UNSET(str, STR_ASSOC);
928 RSTRING(str)->as.heap.aux.shared = shared;
929 }
930 else {
931 str_replace_shared(str, str2);
932 }
933
934 OBJ_INFECT(str, str2);
935 rb_enc_cr_str_exact_copy(str, str2);
936 return str;
937 }
938
939 static VALUE
940 str_duplicate(VALUE klass, VALUE str)
941 {
942 VALUE dup = str_alloc(klass);
943 str_replace(dup, str);
944 return dup;
945 }
946
947 VALUE
948 rb_str_dup(VALUE str)
949 {
950 return str_duplicate(rb_obj_class(str), str);
951 }
952
953 VALUE
954 rb_str_resurrect(VALUE str)
955 {
956 if (RUBY_DTRACE_STRING_CREATE_ENABLED()) {
957 RUBY_DTRACE_STRING_CREATE(RSTRING_LEN(str),
958 rb_sourcefile(), rb_sourceline());
959 }
960 return str_replace(str_alloc(rb_cString), str);
961 }
962
963
964
965
966
967
968
969
970 static VALUE
971 rb_str_init(int argc, VALUE *argv, VALUE str)
972 {
973 VALUE orig;
974
975 if (argc > 0 && rb_scan_args(argc, argv, "01", &orig) == 1)
976 rb_str_replace(str, orig);
977 return str;
978 }
979
980 static inline long
981 enc_strlen(const char *p, const char *e, rb_encoding *enc, int cr)
982 {
983 long c;
984 const char *q;
985
986 if (rb_enc_mbmaxlen(enc) == rb_enc_mbminlen(enc)) {
987 return (e - p + rb_enc_mbminlen(enc) - 1) / rb_enc_mbminlen(enc);
988 }
989 else if (rb_enc_asciicompat(enc)) {
990 c = 0;
991 if (cr == ENC_CODERANGE_7BIT || cr == ENC_CODERANGE_VALID) {
992 while (p < e) {
993 if (ISASCII(*p)) {
994 q = search_nonascii(p, e);
995 if (!q)
996 return c + (e - p);
997 c += q - p;
998 p = q;
999 }
1000 p += rb_enc_fast_mbclen(p, e, enc);
1001 c++;
1002 }
1003 }
1004 else {
1005 while (p < e) {
1006 if (ISASCII(*p)) {
1007 q = search_nonascii(p, e);
1008 if (!q)
1009 return c + (e - p);
1010 c += q - p;
1011 p = q;
1012 }
1013 p += rb_enc_mbclen(p, e, enc);
1014 c++;
1015 }
1016 }
1017 return c;
1018 }
1019
1020 for (c=0; p<e; c++) {
1021 p += rb_enc_mbclen(p, e, enc);
1022 }
1023 return c;
1024 }
1025
1026 long
1027 rb_enc_strlen(const char *p, const char *e, rb_encoding *enc)
1028 {
1029 return enc_strlen(p, e, enc, ENC_CODERANGE_UNKNOWN);
1030 }
1031
1032 long
1033 rb_enc_strlen_cr(const char *p, const char *e, rb_encoding *enc, int *cr)
1034 {
1035 long c;
1036 const char *q;
1037 int ret;
1038
1039 *cr = 0;
1040 if (rb_enc_mbmaxlen(enc) == rb_enc_mbminlen(enc)) {
1041 return (e - p + rb_enc_mbminlen(enc) - 1) / rb_enc_mbminlen(enc);
1042 }
1043 else if (rb_enc_asciicompat(enc)) {
1044 c = 0;
1045 while (p < e) {
1046 if (ISASCII(*p)) {
1047 q = search_nonascii(p, e);
1048 if (!q) {
1049 if (!*cr) *cr = ENC_CODERANGE_7BIT;
1050 return c + (e - p);
1051 }
1052 c += q - p;
1053 p = q;
1054 }
1055 ret = rb_enc_precise_mbclen(p, e, enc);
1056 if (MBCLEN_CHARFOUND_P(ret)) {
1057 *cr |= ENC_CODERANGE_VALID;
1058 p += MBCLEN_CHARFOUND_LEN(ret);
1059 }
1060 else {
1061 *cr = ENC_CODERANGE_BROKEN;
1062 p++;
1063 }
1064 c++;
1065 }
1066 if (!*cr) *cr = ENC_CODERANGE_7BIT;
1067 return c;
1068 }
1069
1070 for (c=0; p<e; c++) {
1071 ret = rb_enc_precise_mbclen(p, e, enc);
1072 if (MBCLEN_CHARFOUND_P(ret)) {
1073 *cr |= ENC_CODERANGE_VALID;
1074 p += MBCLEN_CHARFOUND_LEN(ret);
1075 }
1076 else {
1077 *cr = ENC_CODERANGE_BROKEN;
1078 if (p + rb_enc_mbminlen(enc) <= e)
1079 p += rb_enc_mbminlen(enc);
1080 else
1081 p = e;
1082 }
1083 }
1084 if (!*cr) *cr = ENC_CODERANGE_7BIT;
1085 return c;
1086 }
1087
#ifdef NONASCII_MASK
/* True unless the top two bits of c are 10, i.e. unless c is a UTF-8 continuation byte. */
#define is_utf8_lead_byte(c) (((c)&0xC0) != 0x80)

/*
 * Returns how many bytes of the word *s satisfy is_utf8_lead_byte(),
 * processing all bytes of the word in parallel.
 */
static inline VALUE
count_utf8_lead_bytes_with_word(const VALUE *s)
{
    VALUE d = *s;

    /* Per byte: bit 6 becomes (bit6 | ~bit7); after shifting right by 6 and
     * masking, bit 0 of each byte is 1 exactly when that byte is not of the
     * form 10xxxxxx. */
    d |= ~(d>>1);
    d >>= 6;
    d &= NONASCII_MASK >> 7;

    /* Sum the per-byte 0/1 values into the lowest byte. */
    d += (d>>8);
    d += (d>>16);
#if SIZEOF_VALUE == 8
    d += (d>>32);
#endif
    /* the count is at most sizeof(VALUE), so the low nibble suffices */
    return (d&0xF);
}
#endif
1122
/*
 * Returns the length of str in characters.  enc may be NULL, in which case
 * the string's own encoding is used.  May update the cached code range of
 * str as a side effect of the generic counting path.
 */
static long
str_strlen(VALUE str, rb_encoding *enc)
{
    const char *p, *e;
    long n;
    int cr;

    /* one byte per character: the byte length is the answer */
    if (single_byte_optimizable(str)) return RSTRING_LEN(str);
    if (!enc) enc = STR_ENC_GET(str);
    p = RSTRING_PTR(str);
    e = RSTRING_END(str);
    cr = ENC_CODERANGE(str);
#ifdef NONASCII_MASK
    /* Known-valid UTF-8: the character count equals the number of
     * non-continuation bytes, counted a word at a time where possible. */
    if (ENC_CODERANGE(str) == ENC_CODERANGE_VALID &&
        enc == rb_utf8_encoding()) {

        VALUE len = 0;
        if ((int)sizeof(VALUE) * 2 < e - p) {
            const VALUE *s, *t;
            const VALUE lowbits = sizeof(VALUE) - 1;
            /* s: p rounded up, t: e rounded down, both to VALUE alignment */
            s = (const VALUE*)(~lowbits & ((VALUE)p + lowbits));
            t = (const VALUE*)(~lowbits & (VALUE)e);
            /* unaligned head, byte by byte */
            while (p < (const char *)s) {
                if (is_utf8_lead_byte(*p)) len++;
                p++;
            }
            /* aligned middle, word by word */
            while (s < t) {
                len += count_utf8_lead_bytes_with_word(s);
                s++;
            }
            p = (const char *)s;
        }
        /* remaining tail (or the whole string when it is short) */
        while (p < e) {
            if (is_utf8_lead_byte(*p)) len++;
            p++;
        }
        return (long)len;
    }
#endif
    /* generic path: count while determining the code range, then cache it */
    n = rb_enc_strlen_cr(p, e, enc, &cr);
    if (cr) {
        ENC_CODERANGE_SET(str, cr);
    }
    return n;
}
1168
1169 long
1170 rb_str_strlen(VALUE str)
1171 {
1172 return str_strlen(str, STR_ENC_GET(str));
1173 }
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183 VALUE
1184 rb_str_length(VALUE str)
1185 {
1186 long len;
1187
1188 len = str_strlen(str, STR_ENC_GET(str));
1189 return LONG2NUM(len);
1190 }
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202 static VALUE
1203 rb_str_bytesize(VALUE str)
1204 {
1205 return LONG2NUM(RSTRING_LEN(str));
1206 }
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219 static VALUE
1220 rb_str_empty(VALUE str)
1221 {
1222 if (RSTRING_LEN(str) == 0)
1223 return Qtrue;
1224 return Qfalse;
1225 }
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237 VALUE
1238 rb_str_plus(VALUE str1, VALUE str2)
1239 {
1240 VALUE str3;
1241 rb_encoding *enc;
1242
1243 StringValue(str2);
1244 enc = rb_enc_check(str1, str2);
1245 str3 = rb_str_new(0, RSTRING_LEN(str1)+RSTRING_LEN(str2));
1246 memcpy(RSTRING_PTR(str3), RSTRING_PTR(str1), RSTRING_LEN(str1));
1247 memcpy(RSTRING_PTR(str3) + RSTRING_LEN(str1),
1248 RSTRING_PTR(str2), RSTRING_LEN(str2));
1249 RSTRING_PTR(str3)[RSTRING_LEN(str3)] = '\0';
1250
1251 if (OBJ_TAINTED(str1) || OBJ_TAINTED(str2))
1252 OBJ_TAINT(str3);
1253 ENCODING_CODERANGE_SET(str3, rb_enc_to_index(enc),
1254 ENC_CODERANGE_AND(ENC_CODERANGE(str1), ENC_CODERANGE(str2)));
1255 return str3;
1256 }
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269 VALUE
1270 rb_str_times(VALUE str, VALUE times)
1271 {
1272 VALUE str2;
1273 long n, len;
1274 char *ptr2;
1275
1276 len = NUM2LONG(times);
1277 if (len < 0) {
1278 rb_raise(rb_eArgError, "negative argument");
1279 }
1280 if (len && LONG_MAX/len < RSTRING_LEN(str)) {
1281 rb_raise(rb_eArgError, "argument too big");
1282 }
1283
1284 str2 = rb_str_new5(str, 0, len *= RSTRING_LEN(str));
1285 ptr2 = RSTRING_PTR(str2);
1286 if (len) {
1287 n = RSTRING_LEN(str);
1288 memcpy(ptr2, RSTRING_PTR(str), n);
1289 while (n <= len/2) {
1290 memcpy(ptr2 + n, ptr2, n);
1291 n *= 2;
1292 }
1293 memcpy(ptr2 + n, ptr2, len-n);
1294 }
1295 ptr2[RSTRING_LEN(str2)] = '\0';
1296 OBJ_INFECT(str2, str);
1297 rb_enc_cr_str_copy_for_substr(str2, str);
1298
1299 return str2;
1300 }
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317 static VALUE
1318 rb_str_format_m(VALUE str, VALUE arg)
1319 {
1320 volatile VALUE tmp = rb_check_array_type(arg);
1321
1322 if (!NIL_P(tmp)) {
1323 return rb_str_format(RARRAY_LENINT(tmp), RARRAY_PTR(tmp), str);
1324 }
1325 return rb_str_format(1, &arg, str);
1326 }
1327
1328 static inline void
1329 str_modifiable(VALUE str)
1330 {
1331 if (FL_TEST(str, STR_TMPLOCK)) {
1332 rb_raise(rb_eRuntimeError, "can't modify string; temporarily locked");
1333 }
1334 rb_check_frozen(str);
1335 if (!OBJ_UNTRUSTED(str) && rb_safe_level() >= 4)
1336 rb_raise(rb_eSecurityError, "Insecure: can't modify string");
1337 }
1338
1339 static inline int
1340 str_independent(VALUE str)
1341 {
1342 str_modifiable(str);
1343 if (!STR_SHARED_P(str)) return 1;
1344 if (STR_EMBED_P(str)) return 1;
1345 return 0;
1346 }
1347
1348 static void
1349 str_make_independent_expand(VALUE str, long expand)
1350 {
1351 char *ptr;
1352 long len = RSTRING_LEN(str);
1353 long capa = len + expand;
1354
1355 if (len > capa) len = capa;
1356 ptr = ALLOC_N(char, capa + 1);
1357 if (RSTRING_PTR(str)) {
1358 memcpy(ptr, RSTRING_PTR(str), len);
1359 }
1360 STR_SET_NOEMBED(str);
1361 STR_UNSET_NOCAPA(str);
1362 ptr[len] = 0;
1363 RSTRING(str)->as.heap.ptr = ptr;
1364 RSTRING(str)->as.heap.len = len;
1365 RSTRING(str)->as.heap.aux.capa = capa;
1366 }
1367
1368 #define str_make_independent(str) str_make_independent_expand((str), 0L)
1369
1370 void
1371 rb_str_modify(VALUE str)
1372 {
1373 if (!str_independent(str))
1374 str_make_independent(str);
1375 ENC_CODERANGE_CLEAR(str);
1376 }
1377
1378 void
1379 rb_str_modify_expand(VALUE str, long expand)
1380 {
1381 if (expand < 0) {
1382 rb_raise(rb_eArgError, "negative expanding string size");
1383 }
1384 if (!str_independent(str)) {
1385 str_make_independent_expand(str, expand);
1386 }
1387 else if (expand > 0) {
1388 long len = RSTRING_LEN(str);
1389 long capa = len + expand;
1390 if (!STR_EMBED_P(str)) {
1391 REALLOC_N(RSTRING(str)->as.heap.ptr, char, capa+1);
1392 RSTRING(str)->as.heap.aux.capa = capa;
1393 }
1394 else if (capa > RSTRING_EMBED_LEN_MAX) {
1395 str_make_independent_expand(str, expand);
1396 }
1397 }
1398 ENC_CODERANGE_CLEAR(str);
1399 }
1400
1401
1402 static void
1403 str_modify_keep_cr(VALUE str)
1404 {
1405 if (!str_independent(str))
1406 str_make_independent(str);
1407 if (ENC_CODERANGE(str) == ENC_CODERANGE_BROKEN)
1408
1409 ENC_CODERANGE_CLEAR(str);
1410 }
1411
1412 static inline void
1413 str_discard(VALUE str)
1414 {
1415 str_modifiable(str);
1416 if (!STR_SHARED_P(str) && !STR_EMBED_P(str)) {
1417 xfree(RSTRING_PTR(str));
1418 RSTRING(str)->as.heap.ptr = 0;
1419 RSTRING(str)->as.heap.len = 0;
1420 }
1421 }
1422
1423 void
1424 rb_str_associate(VALUE str, VALUE add)
1425 {
1426
1427 rb_check_frozen(str);
1428 if (STR_ASSOC_P(str)) {
1429
1430 rb_ary_concat(RSTRING(str)->as.heap.aux.shared, add);
1431 }
1432 else {
1433 if (STR_SHARED_P(str)) {
1434 VALUE assoc = RSTRING(str)->as.heap.aux.shared;
1435 str_make_independent(str);
1436 if (STR_ASSOC_P(assoc)) {
1437 assoc = RSTRING(assoc)->as.heap.aux.shared;
1438 rb_ary_concat(assoc, add);
1439 add = assoc;
1440 }
1441 }
1442 else if (STR_EMBED_P(str)) {
1443 str_make_independent(str);
1444 }
1445 else if (RSTRING(str)->as.heap.aux.capa != RSTRING_LEN(str)) {
1446 RESIZE_CAPA(str, RSTRING_LEN(str));
1447 }
1448 FL_SET(str, STR_ASSOC);
1449 RBASIC(add)->klass = 0;
1450 RSTRING(str)->as.heap.aux.shared = add;
1451 }
1452 }
1453
1454 VALUE
1455 rb_str_associated(VALUE str)
1456 {
1457 if (STR_SHARED_P(str)) str = RSTRING(str)->as.heap.aux.shared;
1458 if (STR_ASSOC_P(str)) {
1459 return RSTRING(str)->as.heap.aux.shared;
1460 }
1461 return Qfalse;
1462 }
1463
1464 void
1465 rb_must_asciicompat(VALUE str)
1466 {
1467 rb_encoding *enc = rb_enc_get(str);
1468 if (!rb_enc_asciicompat(enc)) {
1469 rb_raise(rb_eEncCompatError, "ASCII incompatible encoding: %s", rb_enc_name(enc));
1470 }
1471 }
1472
1473 VALUE
1474 rb_string_value(volatile VALUE *ptr)
1475 {
1476 VALUE s = *ptr;
1477 if (!RB_TYPE_P(s, T_STRING)) {
1478 s = rb_str_to_str(s);
1479 *ptr = s;
1480 }
1481 return s;
1482 }
1483
1484 char *
1485 rb_string_value_ptr(volatile VALUE *ptr)
1486 {
1487 VALUE str = rb_string_value(ptr);
1488 return RSTRING_PTR(str);
1489 }
1490
1491 char *
1492 rb_string_value_cstr(volatile VALUE *ptr)
1493 {
1494 VALUE str = rb_string_value(ptr);
1495 char *s = RSTRING_PTR(str);
1496 long len = RSTRING_LEN(str);
1497
1498 if (!s || memchr(s, 0, len)) {
1499 rb_raise(rb_eArgError, "string contains null byte");
1500 }
1501 if (s[len]) {
1502 rb_str_modify(str);
1503 s = RSTRING_PTR(str);
1504 s[RSTRING_LEN(str)] = 0;
1505 }
1506 return s;
1507 }
1508
1509 VALUE
1510 rb_check_string_type(VALUE str)
1511 {
1512 str = rb_check_convert_type(str, T_STRING, "String", "to_str");
1513 return str;
1514 }
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527 static VALUE
1528 rb_str_s_try_convert(VALUE dummy, VALUE str)
1529 {
1530 return rb_check_string_type(str);
1531 }
1532
1533 static char*
1534 str_nth_len(const char *p, const char *e, long *nthp, rb_encoding *enc)
1535 {
1536 long nth = *nthp;
1537 if (rb_enc_mbmaxlen(enc) == 1) {
1538 p += nth;
1539 }
1540 else if (rb_enc_mbmaxlen(enc) == rb_enc_mbminlen(enc)) {
1541 p += nth * rb_enc_mbmaxlen(enc);
1542 }
1543 else if (rb_enc_asciicompat(enc)) {
1544 const char *p2, *e2;
1545 int n;
1546
1547 while (p < e && 0 < nth) {
1548 e2 = p + nth;
1549 if (e < e2) {
1550 *nthp = nth;
1551 return (char *)e;
1552 }
1553 if (ISASCII(*p)) {
1554 p2 = search_nonascii(p, e2);
1555 if (!p2) {
1556 nth -= e2 - p;
1557 *nthp = nth;
1558 return (char *)e2;
1559 }
1560 nth -= p2 - p;
1561 p = p2;
1562 }
1563 n = rb_enc_mbclen(p, e, enc);
1564 p += n;
1565 nth--;
1566 }
1567 *nthp = nth;
1568 if (nth != 0) {
1569 return (char *)e;
1570 }
1571 return (char *)p;
1572 }
1573 else {
1574 while (p < e && nth--) {
1575 p += rb_enc_mbclen(p, e, enc);
1576 }
1577 }
1578 if (p > e) p = e;
1579 *nthp = nth;
1580 return (char*)p;
1581 }
1582
1583 char*
1584 rb_enc_nth(const char *p, const char *e, long nth, rb_encoding *enc)
1585 {
1586 return str_nth_len(p, e, &nth, enc);
1587 }
1588
1589 static char*
1590 str_nth(const char *p, const char *e, long nth, rb_encoding *enc, int singlebyte)
1591 {
1592 if (singlebyte)
1593 p += nth;
1594 else {
1595 p = str_nth_len(p, e, &nth, enc);
1596 }
1597 if (!p) return 0;
1598 if (p > e) p = e;
1599 return (char *)p;
1600 }
1601
1602
1603 static long
1604 str_offset(const char *p, const char *e, long nth, rb_encoding *enc, int singlebyte)
1605 {
1606 const char *pp = str_nth(p, e, nth, enc, singlebyte);
1607 if (!pp) return e - p;
1608 return pp - p;
1609 }
1610
1611 long
1612 rb_str_offset(VALUE str, long pos)
1613 {
1614 return str_offset(RSTRING_PTR(str), RSTRING_END(str), pos,
1615 STR_ENC_GET(str), single_byte_optimizable(str));
1616 }
1617
1618 #ifdef NONASCII_MASK
1619 static char *
1620 str_utf8_nth(const char *p, const char *e, long *nthp)
1621 {
1622 long nth = *nthp;
1623 if ((int)SIZEOF_VALUE * 2 < e - p && (int)SIZEOF_VALUE * 2 < nth) {
1624 const VALUE *s, *t;
1625 const VALUE lowbits = sizeof(VALUE) - 1;
1626 s = (const VALUE*)(~lowbits & ((VALUE)p + lowbits));
1627 t = (const VALUE*)(~lowbits & (VALUE)e);
1628 while (p < (const char *)s) {
1629 if (is_utf8_lead_byte(*p)) nth--;
1630 p++;
1631 }
1632 do {
1633 nth -= count_utf8_lead_bytes_with_word(s);
1634 s++;
1635 } while (s < t && (int)sizeof(VALUE) <= nth);
1636 p = (char *)s;
1637 }
1638 while (p < e) {
1639 if (is_utf8_lead_byte(*p)) {
1640 if (nth == 0) break;
1641 nth--;
1642 }
1643 p++;
1644 }
1645 *nthp = nth;
1646 return (char *)p;
1647 }
1648
/*
 * Returns the byte distance from +p+ to the position +nth+ UTF-8 characters
 * later, bounded by +e+.
 */
static long
str_utf8_offset(const char *p, const char *e, long nth)
{
    long remaining = nth;
    const char *target = str_utf8_nth(p, e, &remaining);

    return target - p;
}
1655 #endif
1656
1657
1658 long
1659 rb_str_sublen(VALUE str, long pos)
1660 {
1661 if (single_byte_optimizable(str) || pos < 0)
1662 return pos;
1663 else {
1664 char *p = RSTRING_PTR(str);
1665 return enc_strlen(p, p + pos, STR_ENC_GET(str), ENC_CODERANGE(str));
1666 }
1667 }
1668
1669 VALUE
1670 rb_str_subseq(VALUE str, long beg, long len)
1671 {
1672 VALUE str2;
1673
1674 if (RSTRING_LEN(str) == beg + len &&
1675 RSTRING_EMBED_LEN_MAX < len) {
1676 str2 = rb_str_new_shared(rb_str_new_frozen(str));
1677 rb_str_drop_bytes(str2, beg);
1678 }
1679 else {
1680 str2 = rb_str_new5(str, RSTRING_PTR(str)+beg, len);
1681 RB_GC_GUARD(str);
1682 }
1683
1684 rb_enc_cr_str_copy_for_substr(str2, str);
1685 OBJ_INFECT(str2, str);
1686
1687 return str2;
1688 }
1689
1690 static char *
1691 rb_str_subpos(VALUE str, long beg, long *lenp)
1692 {
1693 long len = *lenp;
1694 long slen = -1L;
1695 long blen = RSTRING_LEN(str);
1696 rb_encoding *enc = STR_ENC_GET(str);
1697 char *p, *s = RSTRING_PTR(str), *e = s + blen;
1698
1699 if (len < 0) return 0;
1700 if (!blen) {
1701 len = 0;
1702 }
1703 if (single_byte_optimizable(str)) {
1704 if (beg > blen) return 0;
1705 if (beg < 0) {
1706 beg += blen;
1707 if (beg < 0) return 0;
1708 }
1709 if (beg + len > blen)
1710 len = blen - beg;
1711 if (len < 0) return 0;
1712 p = s + beg;
1713 goto end;
1714 }
1715 if (beg < 0) {
1716 if (len > -beg) len = -beg;
1717 if (-beg * rb_enc_mbmaxlen(enc) < RSTRING_LEN(str) / 8) {
1718 beg = -beg;
1719 while (beg-- > len && (e = rb_enc_prev_char(s, e, e, enc)) != 0);
1720 p = e;
1721 if (!p) return 0;
1722 while (len-- > 0 && (p = rb_enc_prev_char(s, p, e, enc)) != 0);
1723 if (!p) return 0;
1724 len = e - p;
1725 goto end;
1726 }
1727 else {
1728 slen = str_strlen(str, enc);
1729 beg += slen;
1730 if (beg < 0) return 0;
1731 p = s + beg;
1732 if (len == 0) goto end;
1733 }
1734 }
1735 else if (beg > 0 && beg > RSTRING_LEN(str)) {
1736 return 0;
1737 }
1738 if (len == 0) {
1739 if (beg > str_strlen(str, enc)) return 0;
1740 p = s + beg;
1741 }
1742 #ifdef NONASCII_MASK
1743 else if (ENC_CODERANGE(str) == ENC_CODERANGE_VALID &&
1744 enc == rb_utf8_encoding()) {
1745 p = str_utf8_nth(s, e, &beg);
1746 if (beg > 0) return 0;
1747 len = str_utf8_offset(p, e, len);
1748 }
1749 #endif
1750 else if (rb_enc_mbmaxlen(enc) == rb_enc_mbminlen(enc)) {
1751 int char_sz = rb_enc_mbmaxlen(enc);
1752
1753 p = s + beg * char_sz;
1754 if (p > e) {
1755 return 0;
1756 }
1757 else if (len * char_sz > e - p)
1758 len = e - p;
1759 else
1760 len *= char_sz;
1761 }
1762 else if ((p = str_nth_len(s, e, &beg, enc)) == e) {
1763 if (beg > 0) return 0;
1764 len = 0;
1765 }
1766 else {
1767 len = str_offset(p, e, len, enc, 0);
1768 }
1769 end:
1770 *lenp = len;
1771 RB_GC_GUARD(str);
1772 return p;
1773 }
1774
1775 VALUE
1776 rb_str_substr(VALUE str, long beg, long len)
1777 {
1778 VALUE str2;
1779 char *p = rb_str_subpos(str, beg, &len);
1780
1781 if (!p) return Qnil;
1782 if (len > RSTRING_EMBED_LEN_MAX && p + len == RSTRING_END(str)) {
1783 str2 = rb_str_new4(str);
1784 str2 = str_new3(rb_obj_class(str2), str2);
1785 RSTRING(str2)->as.heap.ptr += RSTRING(str2)->as.heap.len - len;
1786 RSTRING(str2)->as.heap.len = len;
1787 }
1788 else {
1789 str2 = rb_str_new5(str, p, len);
1790 rb_enc_cr_str_copy_for_substr(str2, str);
1791 OBJ_INFECT(str2, str);
1792 RB_GC_GUARD(str);
1793 }
1794
1795 return str2;
1796 }
1797
1798 VALUE
1799 rb_str_freeze(VALUE str)
1800 {
1801 if (STR_ASSOC_P(str)) {
1802 VALUE ary = RSTRING(str)->as.heap.aux.shared;
1803 OBJ_FREEZE(ary);
1804 }
1805 return rb_obj_freeze(str);
1806 }
1807
1808 RUBY_ALIAS_FUNCTION(rb_str_dup_frozen(VALUE str), rb_str_new_frozen, (str))
1809 #define rb_str_dup_frozen rb_str_new_frozen
1810
1811 VALUE
1812 rb_str_locktmp(VALUE str)
1813 {
1814 if (FL_TEST(str, STR_TMPLOCK)) {
1815 rb_raise(rb_eRuntimeError, "temporal locking already locked string");
1816 }
1817 FL_SET(str, STR_TMPLOCK);
1818 return str;
1819 }
1820
1821 VALUE
1822 rb_str_unlocktmp(VALUE str)
1823 {
1824 if (!FL_TEST(str, STR_TMPLOCK)) {
1825 rb_raise(rb_eRuntimeError, "temporal unlocking already unlocked string");
1826 }
1827 FL_UNSET(str, STR_TMPLOCK);
1828 return str;
1829 }
1830
1831 void
1832 rb_str_set_len(VALUE str, long len)
1833 {
1834 long capa;
1835
1836 str_modifiable(str);
1837 if (STR_SHARED_P(str)) {
1838 rb_raise(rb_eRuntimeError, "can't set length of shared string");
1839 }
1840 if (len > (capa = (long)rb_str_capacity(str))) {
1841 rb_bug("probable buffer overflow: %ld for %ld", len, capa);
1842 }
1843 STR_SET_LEN(str, len);
1844 RSTRING_PTR(str)[len] = '\0';
1845 }
1846
1847 VALUE
1848 rb_str_resize(VALUE str, long len)
1849 {
1850 long slen;
1851 int independent;
1852
1853 if (len < 0) {
1854 rb_raise(rb_eArgError, "negative string size (or size too big)");
1855 }
1856
1857 independent = str_independent(str);
1858 ENC_CODERANGE_CLEAR(str);
1859 slen = RSTRING_LEN(str);
1860 if (len != slen) {
1861 if (STR_EMBED_P(str)) {
1862 if (len <= RSTRING_EMBED_LEN_MAX) {
1863 STR_SET_EMBED_LEN(str, len);
1864 RSTRING(str)->as.ary[len] = '\0';
1865 return str;
1866 }
1867 str_make_independent_expand(str, len - slen);
1868 STR_SET_NOEMBED(str);
1869 }
1870 else if (len <= RSTRING_EMBED_LEN_MAX) {
1871 char *ptr = RSTRING(str)->as.heap.ptr;
1872 STR_SET_EMBED(str);
1873 if (slen > len) slen = len;
1874 if (slen > 0) MEMCPY(RSTRING(str)->as.ary, ptr, char, slen);
1875 RSTRING(str)->as.ary[len] = '\0';
1876 STR_SET_EMBED_LEN(str, len);
1877 if (independent) xfree(ptr);
1878 return str;
1879 }
1880 else if (!independent) {
1881 str_make_independent_expand(str, len - slen);
1882 }
1883 else if (slen < len || slen - len > 1024) {
1884 REALLOC_N(RSTRING(str)->as.heap.ptr, char, len+1);
1885 }
1886 if (!STR_NOCAPA_P(str)) {
1887 RSTRING(str)->as.heap.aux.capa = len;
1888 }
1889 RSTRING(str)->as.heap.len = len;
1890 RSTRING(str)->as.heap.ptr[len] = '\0';
1891 }
1892 return str;
1893 }
1894
1895 static VALUE
1896 str_buf_cat(VALUE str, const char *ptr, long len)
1897 {
1898 long capa, total, off = -1;
1899
1900 if (ptr >= RSTRING_PTR(str) && ptr <= RSTRING_END(str)) {
1901 off = ptr - RSTRING_PTR(str);
1902 }
1903 rb_str_modify(str);
1904 if (len == 0) return 0;
1905 if (STR_ASSOC_P(str)) {
1906 FL_UNSET(str, STR_ASSOC);
1907 capa = RSTRING(str)->as.heap.aux.capa = RSTRING_LEN(str);
1908 }
1909 else if (STR_EMBED_P(str)) {
1910 capa = RSTRING_EMBED_LEN_MAX;
1911 }
1912 else {
1913 capa = RSTRING(str)->as.heap.aux.capa;
1914 }
1915 if (RSTRING_LEN(str) >= LONG_MAX - len) {
1916 rb_raise(rb_eArgError, "string sizes too big");
1917 }
1918 total = RSTRING_LEN(str)+len;
1919 if (capa <= total) {
1920 while (total > capa) {
1921 if (capa + 1 >= LONG_MAX / 2) {
1922 capa = (total + 4095) / 4096;
1923 break;
1924 }
1925 capa = (capa + 1) * 2;
1926 }
1927 RESIZE_CAPA(str, capa);
1928 }
1929 if (off != -1) {
1930 ptr = RSTRING_PTR(str) + off;
1931 }
1932 memcpy(RSTRING_PTR(str) + RSTRING_LEN(str), ptr, len);
1933 STR_SET_LEN(str, total);
1934 RSTRING_PTR(str)[total] = '\0';
1935
1936 return str;
1937 }
1938
/* Append the NUL-terminated C string +cstr+ to +s+ via str_buf_cat(). */
#define str_buf_cat2(s, cstr) str_buf_cat((s), (cstr), strlen(cstr))
1940
1941 VALUE
1942 rb_str_buf_cat(VALUE str, const char *ptr, long len)
1943 {
1944 if (len == 0) return str;
1945 if (len < 0) {
1946 rb_raise(rb_eArgError, "negative string size (or size too big)");
1947 }
1948 return str_buf_cat(str, ptr, len);
1949 }
1950
1951 VALUE
1952 rb_str_buf_cat2(VALUE str, const char *ptr)
1953 {
1954 return rb_str_buf_cat(str, ptr, strlen(ptr));
1955 }
1956
1957 VALUE
1958 rb_str_cat(VALUE str, const char *ptr, long len)
1959 {
1960 if (len < 0) {
1961 rb_raise(rb_eArgError, "negative string size (or size too big)");
1962 }
1963 if (STR_ASSOC_P(str)) {
1964 char *p;
1965 rb_str_modify_expand(str, len);
1966 p = RSTRING(str)->as.heap.ptr;
1967 memcpy(p + RSTRING(str)->as.heap.len, ptr, len);
1968 len = RSTRING(str)->as.heap.len += len;
1969 p[len] = '\0';
1970 return str;
1971 }
1972
1973 return rb_str_buf_cat(str, ptr, len);
1974 }
1975
1976 VALUE
1977 rb_str_cat2(VALUE str, const char *ptr)
1978 {
1979 return rb_str_cat(str, ptr, strlen(ptr));
1980 }
1981
1982 static VALUE
1983 rb_enc_cr_str_buf_cat(VALUE str, const char *ptr, long len,
1984 int ptr_encindex, int ptr_cr, int *ptr_cr_ret)
1985 {
1986 int str_encindex = ENCODING_GET(str);
1987 int res_encindex;
1988 int str_cr, res_cr;
1989
1990 str_cr = ENC_CODERANGE(str);
1991
1992 if (str_encindex == ptr_encindex) {
1993 if (str_cr == ENC_CODERANGE_UNKNOWN)
1994 ptr_cr = ENC_CODERANGE_UNKNOWN;
1995 else if (ptr_cr == ENC_CODERANGE_UNKNOWN) {
1996 ptr_cr = coderange_scan(ptr, len, rb_enc_from_index(ptr_encindex));
1997 }
1998 }
1999 else {
2000 rb_encoding *str_enc = rb_enc_from_index(str_encindex);
2001 rb_encoding *ptr_enc = rb_enc_from_index(ptr_encindex);
2002 if (!rb_enc_asciicompat(str_enc) || !rb_enc_asciicompat(ptr_enc)) {
2003 if (len == 0)
2004 return str;
2005 if (RSTRING_LEN(str) == 0) {
2006 rb_str_buf_cat(str, ptr, len);
2007 ENCODING_CODERANGE_SET(str, ptr_encindex, ptr_cr);
2008 return str;
2009 }
2010 goto incompatible;
2011 }
2012 if (ptr_cr == ENC_CODERANGE_UNKNOWN) {
2013 ptr_cr = coderange_scan(ptr, len, ptr_enc);
2014 }
2015 if (str_cr == ENC_CODERANGE_UNKNOWN) {
2016 if (ENCODING_IS_ASCII8BIT(str) || ptr_cr != ENC_CODERANGE_7BIT) {
2017 str_cr = rb_enc_str_coderange(str);
2018 }
2019 }
2020 }
2021 if (ptr_cr_ret)
2022 *ptr_cr_ret = ptr_cr;
2023
2024 if (str_encindex != ptr_encindex &&
2025 str_cr != ENC_CODERANGE_7BIT &&
2026 ptr_cr != ENC_CODERANGE_7BIT) {
2027 incompatible:
2028 rb_raise(rb_eEncCompatError, "incompatible character encodings: %s and %s",
2029 rb_enc_name(rb_enc_from_index(str_encindex)),
2030 rb_enc_name(rb_enc_from_index(ptr_encindex)));
2031 }
2032
2033 if (str_cr == ENC_CODERANGE_UNKNOWN) {
2034 res_encindex = str_encindex;
2035 res_cr = ENC_CODERANGE_UNKNOWN;
2036 }
2037 else if (str_cr == ENC_CODERANGE_7BIT) {
2038 if (ptr_cr == ENC_CODERANGE_7BIT) {
2039 res_encindex = str_encindex;
2040 res_cr = ENC_CODERANGE_7BIT;
2041 }
2042 else {
2043 res_encindex = ptr_encindex;
2044 res_cr = ptr_cr;
2045 }
2046 }
2047 else if (str_cr == ENC_CODERANGE_VALID) {
2048 res_encindex = str_encindex;
2049 if (ptr_cr == ENC_CODERANGE_7BIT || ptr_cr == ENC_CODERANGE_VALID)
2050 res_cr = str_cr;
2051 else
2052 res_cr = ptr_cr;
2053 }
2054 else {
2055 res_encindex = str_encindex;
2056 res_cr = str_cr;
2057 if (0 < len) res_cr = ENC_CODERANGE_UNKNOWN;
2058 }
2059
2060 if (len < 0) {
2061 rb_raise(rb_eArgError, "negative string size (or size too big)");
2062 }
2063 str_buf_cat(str, ptr, len);
2064 ENCODING_CODERANGE_SET(str, res_encindex, res_cr);
2065 return str;
2066 }
2067
2068 VALUE
2069 rb_enc_str_buf_cat(VALUE str, const char *ptr, long len, rb_encoding *ptr_enc)
2070 {
2071 return rb_enc_cr_str_buf_cat(str, ptr, len,
2072 rb_enc_to_index(ptr_enc), ENC_CODERANGE_UNKNOWN, NULL);
2073 }
2074
2075 VALUE
2076 rb_str_buf_cat_ascii(VALUE str, const char *ptr)
2077 {
2078
2079 int encindex = ENCODING_GET(str);
2080 rb_encoding *enc = rb_enc_from_index(encindex);
2081 if (rb_enc_asciicompat(enc)) {
2082 return rb_enc_cr_str_buf_cat(str, ptr, strlen(ptr),
2083 encindex, ENC_CODERANGE_7BIT, 0);
2084 }
2085 else {
2086 char *buf = ALLOCA_N(char, rb_enc_mbmaxlen(enc));
2087 while (*ptr) {
2088 unsigned int c = (unsigned char)*ptr;
2089 int len = rb_enc_codelen(c, enc);
2090 rb_enc_mbcput(c, buf, enc);
2091 rb_enc_cr_str_buf_cat(str, buf, len,
2092 encindex, ENC_CODERANGE_VALID, 0);
2093 ptr++;
2094 }
2095 return str;
2096 }
2097 }
2098
2099 VALUE
2100 rb_str_buf_append(VALUE str, VALUE str2)
2101 {
2102 int str2_cr;
2103
2104 str2_cr = ENC_CODERANGE(str2);
2105
2106 rb_enc_cr_str_buf_cat(str, RSTRING_PTR(str2), RSTRING_LEN(str2),
2107 ENCODING_GET(str2), str2_cr, &str2_cr);
2108
2109 OBJ_INFECT(str, str2);
2110 ENC_CODERANGE_SET(str2, str2_cr);
2111
2112 return str;
2113 }
2114
2115 VALUE
2116 rb_str_append(VALUE str, VALUE str2)
2117 {
2118 rb_encoding *enc;
2119 int cr, cr2;
2120 long len2;
2121
2122 StringValue(str2);
2123 if ((len2 = RSTRING_LEN(str2)) > 0 && STR_ASSOC_P(str)) {
2124 long len = RSTRING_LEN(str) + len2;
2125 enc = rb_enc_check(str, str2);
2126 cr = ENC_CODERANGE(str);
2127 if ((cr2 = ENC_CODERANGE(str2)) > cr) cr = cr2;
2128 rb_str_modify_expand(str, len2);
2129 memcpy(RSTRING(str)->as.heap.ptr + RSTRING(str)->as.heap.len,
2130 RSTRING_PTR(str2), len2+1);
2131 RSTRING(str)->as.heap.len = len;
2132 rb_enc_associate(str, enc);
2133 ENC_CODERANGE_SET(str, cr);
2134 OBJ_INFECT(str, str2);
2135 return str;
2136 }
2137 return rb_str_buf_append(str, str2);
2138 }
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156 VALUE
2157 rb_str_concat(VALUE str1, VALUE str2)
2158 {
2159 unsigned int code;
2160 rb_encoding *enc = STR_ENC_GET(str1);
2161
2162 if (FIXNUM_P(str2) || RB_TYPE_P(str2, T_BIGNUM)) {
2163 if (rb_num_to_uint(str2, &code) == 0) {
2164 }
2165 else if (FIXNUM_P(str2)) {
2166 rb_raise(rb_eRangeError, "%ld out of char range", FIX2LONG(str2));
2167 }
2168 else {
2169 rb_raise(rb_eRangeError, "bignum out of char range");
2170 }
2171 }
2172 else {
2173 return rb_str_append(str1, str2);
2174 }
2175
2176 if (enc == rb_usascii_encoding()) {
2177
2178 char buf[1];
2179 buf[0] = (char)code;
2180 if (code > 0xFF) {
2181 rb_raise(rb_eRangeError, "%u out of char range", code);
2182 }
2183 rb_str_cat(str1, buf, 1);
2184 if (code > 127) {
2185 rb_enc_associate(str1, rb_ascii8bit_encoding());
2186 ENC_CODERANGE_SET(str1, ENC_CODERANGE_VALID);
2187 }
2188 }
2189 else {
2190 long pos = RSTRING_LEN(str1);
2191 int cr = ENC_CODERANGE(str1);
2192 int len;
2193 char *buf;
2194
2195 switch (len = rb_enc_codelen(code, enc)) {
2196 case ONIGERR_INVALID_CODE_POINT_VALUE:
2197 rb_raise(rb_eRangeError, "invalid codepoint 0x%X in %s", code, rb_enc_name(enc));
2198 break;
2199 case ONIGERR_TOO_BIG_WIDE_CHAR_VALUE:
2200 case 0:
2201 rb_raise(rb_eRangeError, "%u out of char range", code);
2202 break;
2203 }
2204 buf = ALLOCA_N(char, len + 1);
2205 rb_enc_mbcput(code, buf, enc);
2206 if (rb_enc_precise_mbclen(buf, buf + len + 1, enc) != len) {
2207 rb_raise(rb_eRangeError, "invalid codepoint 0x%X in %s", code, rb_enc_name(enc));
2208 }
2209 rb_str_resize(str1, pos+len);
2210 memcpy(RSTRING_PTR(str1) + pos, buf, len);
2211 if (cr == ENC_CODERANGE_7BIT && code > 127)
2212 cr = ENC_CODERANGE_VALID;
2213 ENC_CODERANGE_SET(str1, cr);
2214 }
2215 return str1;
2216 }
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229 static VALUE
2230 rb_str_prepend(VALUE str, VALUE str2)
2231 {
2232 StringValue(str2);
2233 StringValue(str);
2234 rb_str_update(str, 0L, 0L, str2);
2235 return str;
2236 }
2237
2238 st_index_t
2239 rb_str_hash(VALUE str)
2240 {
2241 int e = ENCODING_GET(str);
2242 if (e && rb_enc_str_coderange(str) == ENC_CODERANGE_7BIT) {
2243 e = 0;
2244 }
2245 return rb_memhash((const void *)RSTRING_PTR(str), RSTRING_LEN(str)) ^ e;
2246 }
2247
2248 int
2249 rb_str_hash_cmp(VALUE str1, VALUE str2)
2250 {
2251 long len;
2252
2253 if (!rb_str_comparable(str1, str2)) return 1;
2254 if (RSTRING_LEN(str1) == (len = RSTRING_LEN(str2)) &&
2255 memcmp(RSTRING_PTR(str1), RSTRING_PTR(str2), len) == 0) {
2256 return 0;
2257 }
2258 return 1;
2259 }
2260
2261
2262
2263
2264
2265
2266
2267
2268 static VALUE
2269 rb_str_hash_m(VALUE str)
2270 {
2271 st_index_t hval = rb_str_hash(str);
2272 return INT2FIX(hval);
2273 }
2274
/* Smaller of two values.  Each argument may be evaluated twice. */
#define lesser(a, b) (((a) > (b)) ? (b) : (a))
2276
2277 int
2278 rb_str_comparable(VALUE str1, VALUE str2)
2279 {
2280 int idx1, idx2;
2281 int rc1, rc2;
2282
2283 if (RSTRING_LEN(str1) == 0) return TRUE;
2284 if (RSTRING_LEN(str2) == 0) return TRUE;
2285 idx1 = ENCODING_GET(str1);
2286 idx2 = ENCODING_GET(str2);
2287 if (idx1 == idx2) return TRUE;
2288 rc1 = rb_enc_str_coderange(str1);
2289 rc2 = rb_enc_str_coderange(str2);
2290 if (rc1 == ENC_CODERANGE_7BIT) {
2291 if (rc2 == ENC_CODERANGE_7BIT) return TRUE;
2292 if (rb_enc_asciicompat(rb_enc_from_index(idx2)))
2293 return TRUE;
2294 }
2295 if (rc2 == ENC_CODERANGE_7BIT) {
2296 if (rb_enc_asciicompat(rb_enc_from_index(idx1)))
2297 return TRUE;
2298 }
2299 return FALSE;
2300 }
2301
2302 int
2303 rb_str_cmp(VALUE str1, VALUE str2)
2304 {
2305 long len1, len2;
2306 const char *ptr1, *ptr2;
2307 int retval;
2308
2309 if (str1 == str2) return 0;
2310 RSTRING_GETMEM(str1, ptr1, len1);
2311 RSTRING_GETMEM(str2, ptr2, len2);
2312 if (ptr1 == ptr2 || (retval = memcmp(ptr1, ptr2, lesser(len1, len2))) == 0) {
2313 if (len1 == len2) {
2314 if (!rb_str_comparable(str1, str2)) {
2315 if (ENCODING_GET(str1) > ENCODING_GET(str2))
2316 return 1;
2317 return -1;
2318 }
2319 return 0;
2320 }
2321 if (len1 > len2) return 1;
2322 return -1;
2323 }
2324 if (retval > 0) return 1;
2325 return -1;
2326 }
2327
2328
2329 static VALUE
2330 str_eql(const VALUE str1, const VALUE str2)
2331 {
2332 const long len = RSTRING_LEN(str1);
2333 const char *ptr1, *ptr2;
2334
2335 if (len != RSTRING_LEN(str2)) return Qfalse;
2336 if (!rb_str_comparable(str1, str2)) return Qfalse;
2337 if ((ptr1 = RSTRING_PTR(str1)) == (ptr2 = RSTRING_PTR(str2)))
2338 return Qtrue;
2339 if (memcmp(ptr1, ptr2, len) == 0)
2340 return Qtrue;
2341 return Qfalse;
2342 }
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352 VALUE
2353 rb_str_equal(VALUE str1, VALUE str2)
2354 {
2355 if (str1 == str2) return Qtrue;
2356 if (!RB_TYPE_P(str2, T_STRING)) {
2357 if (!rb_respond_to(str2, rb_intern("to_str"))) {
2358 return Qfalse;
2359 }
2360 return rb_equal(str2, str1);
2361 }
2362 return str_eql(str1, str2);
2363 }
2364
2365
2366
2367
2368
2369
2370
2371
2372 static VALUE
2373 rb_str_eql(VALUE str1, VALUE str2)
2374 {
2375 if (str1 == str2) return Qtrue;
2376 if (!RB_TYPE_P(str2, T_STRING)) return Qfalse;
2377 return str_eql(str1, str2);
2378 }
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405 static VALUE
2406 rb_str_cmp_m(VALUE str1, VALUE str2)
2407 {
2408 int result;
2409
2410 if (!RB_TYPE_P(str2, T_STRING)) {
2411 VALUE tmp = rb_check_funcall(str2, rb_intern("to_str"), 0, 0);
2412 if (RB_TYPE_P(tmp, T_STRING)) {
2413 result = rb_str_cmp(str1, tmp);
2414 }
2415 else {
2416 return rb_invcmp(str1, str2);
2417 }
2418 }
2419 else {
2420 result = rb_str_cmp(str1, str2);
2421 }
2422 return INT2FIX(result);
2423 }
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437 static VALUE
2438 rb_str_casecmp(VALUE str1, VALUE str2)
2439 {
2440 long len;
2441 rb_encoding *enc;
2442 char *p1, *p1end, *p2, *p2end;
2443
2444 StringValue(str2);
2445 enc = rb_enc_compatible(str1, str2);
2446 if (!enc) {
2447 return Qnil;
2448 }
2449
2450 p1 = RSTRING_PTR(str1); p1end = RSTRING_END(str1);
2451 p2 = RSTRING_PTR(str2); p2end = RSTRING_END(str2);
2452 if (single_byte_optimizable(str1) && single_byte_optimizable(str2)) {
2453 while (p1 < p1end && p2 < p2end) {
2454 if (*p1 != *p2) {
2455 unsigned int c1 = TOUPPER(*p1 & 0xff);
2456 unsigned int c2 = TOUPPER(*p2 & 0xff);
2457 if (c1 != c2)
2458 return INT2FIX(c1 < c2 ? -1 : 1);
2459 }
2460 p1++;
2461 p2++;
2462 }
2463 }
2464 else {
2465 while (p1 < p1end && p2 < p2end) {
2466 int l1, c1 = rb_enc_ascget(p1, p1end, &l1, enc);
2467 int l2, c2 = rb_enc_ascget(p2, p2end, &l2, enc);
2468
2469 if (0 <= c1 && 0 <= c2) {
2470 c1 = TOUPPER(c1);
2471 c2 = TOUPPER(c2);
2472 if (c1 != c2)
2473 return INT2FIX(c1 < c2 ? -1 : 1);
2474 }
2475 else {
2476 int r;
2477 l1 = rb_enc_mbclen(p1, p1end, enc);
2478 l2 = rb_enc_mbclen(p2, p2end, enc);
2479 len = l1 < l2 ? l1 : l2;
2480 r = memcmp(p1, p2, len);
2481 if (r != 0)
2482 return INT2FIX(r < 0 ? -1 : 1);
2483 if (l1 != l2)
2484 return INT2FIX(l1 < l2 ? -1 : 1);
2485 }
2486 p1 += l1;
2487 p2 += l2;
2488 }
2489 }
2490 if (RSTRING_LEN(str1) == RSTRING_LEN(str2)) return INT2FIX(0);
2491 if (RSTRING_LEN(str1) > RSTRING_LEN(str2)) return INT2FIX(1);
2492 return INT2FIX(-1);
2493 }
2494
2495 static long
2496 rb_str_index(VALUE str, VALUE sub, long offset)
2497 {
2498 long pos;
2499 char *s, *sptr, *e;
2500 long len, slen;
2501 rb_encoding *enc;
2502
2503 enc = rb_enc_check(str, sub);
2504 if (is_broken_string(sub)) {
2505 return -1;
2506 }
2507 len = str_strlen(str, enc);
2508 slen = str_strlen(sub, enc);
2509 if (offset < 0) {
2510 offset += len;
2511 if (offset < 0) return -1;
2512 }
2513 if (len - offset < slen) return -1;
2514 s = RSTRING_PTR(str);
2515 e = s + RSTRING_LEN(str);
2516 if (offset) {
2517 offset = str_offset(s, RSTRING_END(str), offset, enc, single_byte_optimizable(str));
2518 s += offset;
2519 }
2520 if (slen == 0) return offset;
2521
2522 sptr = RSTRING_PTR(sub);
2523 slen = RSTRING_LEN(sub);
2524 len = RSTRING_LEN(str) - offset;
2525 for (;;) {
2526 char *t;
2527 pos = rb_memsearch(sptr, slen, s, len, enc);
2528 if (pos < 0) return pos;
2529 t = rb_enc_right_char_head(s, s+pos, e, enc);
2530 if (t == s + pos) break;
2531 if ((len -= t - s) <= 0) return -1;
2532 offset += t - s;
2533 s = t;
2534 }
2535 return pos + offset;
2536 }
2537
2538
2539
2540
2541
2542
2543
2544
2545
2546
2547
2548
2549
2550
2551
2552
2553
2554
2555
2556 static VALUE
2557 rb_str_index_m(int argc, VALUE *argv, VALUE str)
2558 {
2559 VALUE sub;
2560 VALUE initpos;
2561 long pos;
2562
2563 if (rb_scan_args(argc, argv, "11", &sub, &initpos) == 2) {
2564 pos = NUM2LONG(initpos);
2565 }
2566 else {
2567 pos = 0;
2568 }
2569 if (pos < 0) {
2570 pos += str_strlen(str, STR_ENC_GET(str));
2571 if (pos < 0) {
2572 if (RB_TYPE_P(sub, T_REGEXP)) {
2573 rb_backref_set(Qnil);
2574 }
2575 return Qnil;
2576 }
2577 }
2578
2579 if (SPECIAL_CONST_P(sub)) goto generic;
2580 switch (BUILTIN_TYPE(sub)) {
2581 case T_REGEXP:
2582 if (pos > str_strlen(str, STR_ENC_GET(str)))
2583 return Qnil;
2584 pos = str_offset(RSTRING_PTR(str), RSTRING_END(str), pos,
2585 rb_enc_check(str, sub), single_byte_optimizable(str));
2586
2587 pos = rb_reg_search(sub, str, pos, 0);
2588 pos = rb_str_sublen(str, pos);
2589 break;
2590
2591 generic:
2592 default: {
2593 VALUE tmp;
2594
2595 tmp = rb_check_string_type(sub);
2596 if (NIL_P(tmp)) {
2597 rb_raise(rb_eTypeError, "type mismatch: %s given",
2598 rb_obj_classname(sub));
2599 }
2600 sub = tmp;
2601 }
2602
2603 case T_STRING:
2604 pos = rb_str_index(str, sub, pos);
2605 pos = rb_str_sublen(str, pos);
2606 break;
2607 }
2608
2609 if (pos == -1) return Qnil;
2610 return LONG2NUM(pos);
2611 }
2612
2613 static long
2614 rb_str_rindex(VALUE str, VALUE sub, long pos)
2615 {
2616 long len, slen;
2617 char *s, *sbeg, *e, *t;
2618 rb_encoding *enc;
2619 int singlebyte = single_byte_optimizable(str);
2620
2621 enc = rb_enc_check(str, sub);
2622 if (is_broken_string(sub)) {
2623 return -1;
2624 }
2625 len = str_strlen(str, enc);
2626 slen = str_strlen(sub, enc);
2627
2628 if (len < slen) return -1;
2629 if (len - pos < slen) {
2630 pos = len - slen;
2631 }
2632 if (len == 0) {
2633 return pos;
2634 }
2635 sbeg = RSTRING_PTR(str);
2636 e = RSTRING_END(str);
2637 t = RSTRING_PTR(sub);
2638 slen = RSTRING_LEN(sub);
2639 s = str_nth(sbeg, e, pos, enc, singlebyte);
2640 while (s) {
2641 if (memcmp(s, t, slen) == 0) {
2642 return pos;
2643 }
2644 if (pos == 0) break;
2645 pos--;
2646 s = rb_enc_prev_char(sbeg, s, e, enc);
2647 }
2648 return -1;
2649 }
2650
2651
2652
2653
2654
2655
2656
2657
2658
2659
2660
2661
2662
2663
2664
2665
2666
2667
2668
2669
2670 static VALUE
2671 rb_str_rindex_m(int argc, VALUE *argv, VALUE str)
2672 {
2673 VALUE sub;
2674 VALUE vpos;
2675 rb_encoding *enc = STR_ENC_GET(str);
2676 long pos, len = str_strlen(str, enc);
2677
2678 if (rb_scan_args(argc, argv, "11", &sub, &vpos) == 2) {
2679 pos = NUM2LONG(vpos);
2680 if (pos < 0) {
2681 pos += len;
2682 if (pos < 0) {
2683 if (RB_TYPE_P(sub, T_REGEXP)) {
2684 rb_backref_set(Qnil);
2685 }
2686 return Qnil;
2687 }
2688 }
2689 if (pos > len) pos = len;
2690 }
2691 else {
2692 pos = len;
2693 }
2694
2695 if (SPECIAL_CONST_P(sub)) goto generic;
2696 switch (BUILTIN_TYPE(sub)) {
2697 case T_REGEXP:
2698
2699 pos = str_offset(RSTRING_PTR(str), RSTRING_END(str), pos,
2700 STR_ENC_GET(str), single_byte_optimizable(str));
2701
2702 if (!RREGEXP(sub)->ptr || RREGEXP_SRC_LEN(sub)) {
2703 pos = rb_reg_search(sub, str, pos, 1);
2704 pos = rb_str_sublen(str, pos);
2705 }
2706 if (pos >= 0) return LONG2NUM(pos);
2707 break;
2708
2709 generic:
2710 default: {
2711 VALUE tmp;
2712
2713 tmp = rb_check_string_type(sub);
2714 if (NIL_P(tmp)) {
2715 rb_raise(rb_eTypeError, "type mismatch: %s given",
2716 rb_obj_classname(sub));
2717 }
2718 sub = tmp;
2719 }
2720
2721 case T_STRING:
2722 pos = rb_str_rindex(str, sub, pos);
2723 if (pos >= 0) return LONG2NUM(pos);
2724 break;
2725 }
2726 return Qnil;
2727 }
2728
2729
2730
2731
2732
2733
2734
2735
2736
2737
2738
2739
2740
2741
2742
2743
2744
2745
2746
2747 static VALUE
2748 rb_str_match(VALUE x, VALUE y)
2749 {
2750 if (SPECIAL_CONST_P(y)) goto generic;
2751 switch (BUILTIN_TYPE(y)) {
2752 case T_STRING:
2753 rb_raise(rb_eTypeError, "type mismatch: String given");
2754
2755 case T_REGEXP:
2756 return rb_reg_match(y, x);
2757
2758 generic:
2759 default:
2760 return rb_funcall(y, rb_intern("=~"), 1, x);
2761 }
2762 }
2763
2764
2765 static VALUE get_pat(VALUE, int);
2766
2767
2768
2769
2770
2771
2772
2773
2774
2775
2776
2777
2778
2779
2780
2781
2782
2783
2784
2785
2786
2787
2788
2789
2790
2791
2792
2793
2794
2795
2796
2797 static VALUE
2798 rb_str_match_m(int argc, VALUE *argv, VALUE str)
2799 {
2800 VALUE re, result;
2801 if (argc < 1)
2802 rb_check_arity(argc, 1, 2);
2803 re = argv[0];
2804 argv[0] = str;
2805 result = rb_funcall2(get_pat(re, 0), rb_intern("match"), argc, argv);
2806 if (!NIL_P(result) && rb_block_given_p()) {
2807 return rb_yield(result);
2808 }
2809 return result;
2810 }
2811
/* Result of stepping a character to its successor or predecessor. */
enum neighbor_char {
    NEIGHBOR_NOT_CHAR,  /* the input is not a character the step applies to */
    NEIGHBOR_FOUND,     /* a valid neighbouring character was produced */
    NEIGHBOR_WRAPPED    /* every byte overflowed; the value wrapped around */
};
2817
/*
 * Advance the len-byte sequence at p, in place, to the next byte sequence
 * (counting the bytes as one big-endian number) that
 * rb_enc_precise_mbclen() recognises as one character of exactly len
 * bytes in enc.
 *
 * Returns NEIGHBOR_FOUND once such a character is stored at p, or
 * NEIGHBOR_WRAPPED when every byte was 0xff and rolled over (p is then
 * all zero bytes).
 */
static enum neighbor_char
enc_succ_char(char *p, long len, rb_encoding *enc)
{
    long i;
    int l;
    while (1) {
        /* carry: trailing 0xff bytes roll over to 0x00 */
        for (i = len-1; 0 <= i && (unsigned char)p[i] == 0xff; i--)
            p[i] = '\0';
        if (i < 0)
            return NEIGHBOR_WRAPPED;
        ++((unsigned char*)p)[i];
        l = rb_enc_precise_mbclen(p, p+len, enc);
        if (MBCLEN_CHARFOUND_P(l)) {
            l = MBCLEN_CHARFOUND_LEN(l);
            if (l == len) {
                return NEIGHBOR_FOUND;
            }
            else {
                /* a shorter character was found: saturate the bytes after
                 * it so the next pass carries into its last byte */
                memset(p+l, 0xff, len-l);
            }
        }
        if (MBCLEN_INVALID_P(l) && i < len-1) {
            long len2;
            int l2;
            /* find the longest prefix (at least one byte) that is not
             * reported invalid ... */
            for (len2 = len-1; 0 < len2; len2--) {
                l2 = rb_enc_precise_mbclen(p, p+len2, enc);
                if (!MBCLEN_INVALID_P(l2))
                    break;
            }
            /* ... and saturate everything after p[len2] so the next pass
             * carries into that byte */
            memset(p+len2+1, 0xff, len-(len2+1));
        }
    }
}
2851
/*
 * Mirror image of enc_succ_char(): step the len-byte sequence at p, in
 * place, down to the previous byte sequence that rb_enc_precise_mbclen()
 * recognises as one character of exactly len bytes in enc.
 *
 * Returns NEIGHBOR_FOUND once such a character is stored at p, or
 * NEIGHBOR_WRAPPED when every byte was 0x00 and rolled under (p is then
 * all 0xff bytes).
 */
static enum neighbor_char
enc_pred_char(char *p, long len, rb_encoding *enc)
{
    long i;
    int l;
    while (1) {
        /* borrow: trailing 0x00 bytes roll under to 0xff */
        for (i = len-1; 0 <= i && (unsigned char)p[i] == 0; i--)
            p[i] = '\xff';
        if (i < 0)
            return NEIGHBOR_WRAPPED;
        --((unsigned char*)p)[i];
        l = rb_enc_precise_mbclen(p, p+len, enc);
        if (MBCLEN_CHARFOUND_P(l)) {
            l = MBCLEN_CHARFOUND_LEN(l);
            if (l == len) {
                return NEIGHBOR_FOUND;
            }
            else {
                /* a shorter character was found: zero the bytes after it
                 * so the next pass borrows from its last byte */
                memset(p+l, 0, len-l);
            }
        }
        if (MBCLEN_INVALID_P(l) && i < len-1) {
            long len2;
            int l2;
            /* find the longest prefix (at least one byte) that is not
             * reported invalid ... */
            for (len2 = len-1; 0 < len2; len2--) {
                l2 = rb_enc_precise_mbclen(p, p+len2, enc);
                if (!MBCLEN_INVALID_P(l2))
                    break;
            }
            /* ... and zero everything after p[len2] so the next pass
             * borrows from that byte */
            memset(p+len2+1, 0, len-(len2+1));
        }
    }
}
2885
2886
2887
2888
2889
2890
2891
2892
2893
2894
2895 static enum neighbor_char
2896 enc_succ_alnum_char(char *p, long len, rb_encoding *enc, char *carry)
2897 {
2898 enum neighbor_char ret;
2899 unsigned int c;
2900 int ctype;
2901 int range;
2902 char save[ONIGENC_CODE_TO_MBC_MAXLEN];
2903
2904 c = rb_enc_mbc_to_codepoint(p, p+len, enc);
2905 if (rb_enc_isctype(c, ONIGENC_CTYPE_DIGIT, enc))
2906 ctype = ONIGENC_CTYPE_DIGIT;
2907 else if (rb_enc_isctype(c, ONIGENC_CTYPE_ALPHA, enc))
2908 ctype = ONIGENC_CTYPE_ALPHA;
2909 else
2910 return NEIGHBOR_NOT_CHAR;
2911
2912 MEMCPY(save, p, char, len);
2913 ret = enc_succ_char(p, len, enc);
2914 if (ret == NEIGHBOR_FOUND) {
2915 c = rb_enc_mbc_to_codepoint(p, p+len, enc);
2916 if (rb_enc_isctype(c, ctype, enc))
2917 return NEIGHBOR_FOUND;
2918 }
2919 MEMCPY(p, save, char, len);
2920 range = 1;
2921 while (1) {
2922 MEMCPY(save, p, char, len);
2923 ret = enc_pred_char(p, len, enc);
2924 if (ret == NEIGHBOR_FOUND) {
2925 c = rb_enc_mbc_to_codepoint(p, p+len, enc);
2926 if (!rb_enc_isctype(c, ctype, enc)) {
2927 MEMCPY(p, save, char, len);
2928 break;
2929 }
2930 }
2931 else {
2932 MEMCPY(p, save, char, len);
2933 break;
2934 }
2935 range++;
2936 }
2937 if (range == 1) {
2938 return NEIGHBOR_NOT_CHAR;
2939 }
2940
2941 if (ctype != ONIGENC_CTYPE_DIGIT) {
2942 MEMCPY(carry, p, char, len);
2943 return NEIGHBOR_WRAPPED;
2944 }
2945
2946 MEMCPY(carry, p, char, len);
2947 enc_succ_char(carry, len, enc);
2948 return NEIGHBOR_WRAPPED;
2949 }
2950
2951
2952
2953
2954
2955
2956
2957
2958
2959
2960
2961
2962
2963
2964
2965
2966
2967
2968
2969
2970
2971
2972
2973
2974
2975
2976
/*
 * Return a new string holding the successor of orig; orig itself is only
 * read.  The copy takes orig's encoding/coderange (via
 * rb_enc_cr_str_copy_for_substr) and taint (OBJ_INFECT).  An empty string
 * is returned unchanged.
 *
 * Pass 1 walks the characters right to left and increments the rightmost
 * digit/alphabetic character with enc_succ_alnum_char(), continuing to
 * the left while characters wrap.  If no such character was incremented,
 * pass 2 increments raw characters with enc_succ_char() instead.  When
 * the leftmost incremented character also wrapped, the `carry` bytes are
 * inserted at `carry_pos`.
 */
VALUE
rb_str_succ(VALUE orig)
{
    rb_encoding *enc;
    VALUE str;
    char *sbeg, *s, *e, *last_alnum = 0;
    int c = -1;			/* stays -1 until pass 1 wraps a character */
    long l;
    char carry[ONIGENC_CODE_TO_MBC_MAXLEN] = "\1";
    long carry_pos = 0, carry_len = 1;
    enum neighbor_char neighbor = NEIGHBOR_FOUND;

    str = rb_str_new5(orig, RSTRING_PTR(orig), RSTRING_LEN(orig));
    rb_enc_cr_str_copy_for_substr(str, orig);
    OBJ_INFECT(str, orig);
    if (RSTRING_LEN(str) == 0) return str;

    enc = STR_ENC_GET(orig);
    sbeg = RSTRING_PTR(str);
    s = e = sbeg + RSTRING_LEN(str);

    /* pass 1: digit/alpha characters only, right to left */
    while ((s = rb_enc_prev_char(sbeg, s, e, enc)) != 0) {
        if (neighbor == NEIGHBOR_NOT_CHAR && last_alnum) {
            /* the previous step skipped a non-alnum character; stop when
             * the character now under s is of the other ASCII class
             * (alpha vs. digit) than the last wrapped one */
            if (ISALPHA(*last_alnum) ? ISDIGIT(*s) :
                ISDIGIT(*last_alnum) ? ISALPHA(*s) : 0) {
                s = last_alnum;
                break;
            }
        }
        if ((l = rb_enc_precise_mbclen(s, e, enc)) <= 0) continue;
        neighbor = enc_succ_alnum_char(s, l, enc, carry);
        switch (neighbor) {
          case NEIGHBOR_NOT_CHAR:
            continue;
          case NEIGHBOR_FOUND:
            return str;
          case NEIGHBOR_WRAPPED:
            last_alnum = s;
            break;
        }
        /* wrapped: remember where a carry would have to be inserted */
        c = 1;
        carry_pos = s - sbeg;
        carry_len = l;
    }
    if (c == -1) {
        /* pass 2: nothing was incremented above, so step raw characters */
        s = e;
        while ((s = rb_enc_prev_char(sbeg, s, e, enc)) != 0) {
            enum neighbor_char neighbor;
            if ((l = rb_enc_precise_mbclen(s, e, enc)) <= 0) continue;
            neighbor = enc_succ_char(s, l, enc);
            if (neighbor == NEIGHBOR_FOUND)
                return str;
            if (rb_enc_precise_mbclen(s, s+l, enc) != l) {
                /* the wrapped bytes are not a character of l bytes any
                 * more; step once again */
                enc_succ_char(s, l, enc);
            }
            if (!rb_enc_asciicompat(enc)) {
                MEMCPY(carry, s, char, l);
                carry_len = l;
            }
            carry_pos = s - sbeg;
        }
    }
    /* every candidate wrapped: make room and insert the carry bytes */
    RESIZE_CAPA(str, RSTRING_LEN(str) + carry_len);
    s = RSTRING_PTR(str) + carry_pos;
    memmove(s + carry_len, s, RSTRING_LEN(str) - carry_pos);
    memmove(s, carry, carry_len);
    STR_SET_LEN(str, RSTRING_LEN(str) + carry_len);
    RSTRING_PTR(str)[RSTRING_LEN(str)] = '\0';
    rb_enc_str_coderange(str);
    return str;
}
3049
3050
3051
3052
3053
3054
3055
3056
3057
3058
3059
3060 static VALUE
3061 rb_str_succ_bang(VALUE str)
3062 {
3063 rb_str_shared_replace(str, rb_str_succ(str));
3064
3065 return str;
3066 }
3067
3068
3069
3070
3071
3072
3073
3074
3075
3076
3077
3078
3079
3080
3081
3082
3083
3084
3085
3086
3087
3088
3089
3090
3091
3092
3093
3094
3095
3096
3097
3098
3099
3100
/*
 * Yield successive strings from beg up to end (argv[0]); a truthy second
 * argument makes the range exclusive of end.  Returns an enumerator when
 * no block is given (RETURN_ENUMERATOR), otherwise beg.
 *
 * Three strategies are used:
 *   1. both strings are one byte long and ASCII: iterate over the bytes;
 *   2. both strings are ASCII and consist only of digits: iterate
 *      numerically, formatting with at least as many digits as beg has
 *      bytes;
 *   3. otherwise: repeatedly call the `succ` method.
 */
static VALUE
rb_str_upto(int argc, VALUE *argv, VALUE beg)
{
    VALUE end, exclusive;
    VALUE current, after_end;
    ID succ;
    int n, excl, ascii;
    rb_encoding *enc;

    rb_scan_args(argc, argv, "11", &end, &exclusive);
    RETURN_ENUMERATOR(beg, argc, argv);
    excl = RTEST(exclusive);
    CONST_ID(succ, "succ");
    StringValue(end);
    enc = rb_enc_check(beg, end);
    ascii = (is_ascii_string(beg) && is_ascii_string(end));
    /* single character */
    if (RSTRING_LEN(beg) == 1 && RSTRING_LEN(end) == 1 && ascii) {
        char c = RSTRING_PTR(beg)[0];
        char e = RSTRING_PTR(end)[0];

        if (c > e || (excl && c == e)) return beg;
        for (;;) {
            rb_yield(rb_enc_str_new(&c, 1, enc));
            if (!excl && c == e) break;
            c++;
            if (excl && c == e) break;
        }
        return beg;
    }
    /* both edges are all digits */
    if (ascii && ISDIGIT(RSTRING_PTR(beg)[0]) && ISDIGIT(RSTRING_PTR(end)[0])) {
        char *s, *send;
        VALUE b, e;
        int width;

        s = RSTRING_PTR(beg); send = RSTRING_END(beg);
        width = rb_long2int(send - s);	/* minimum digits when formatting */
        while (s < send) {
            if (!ISDIGIT(*s)) goto no_digits;
            s++;
        }
        s = RSTRING_PTR(end); send = RSTRING_END(end);
        while (s < send) {
            if (!ISDIGIT(*s)) goto no_digits;
            s++;
        }
        b = rb_str_to_inum(beg, 10, FALSE);
        e = rb_str_to_inum(end, 10, FALSE);
        if (FIXNUM_P(b) && FIXNUM_P(e)) {
            /* both bounds fit in a Fixnum: count with C longs */
            long bi = FIX2LONG(b);
            long ei = FIX2LONG(e);
            rb_encoding *usascii = rb_usascii_encoding();

            while (bi <= ei) {
                if (excl && bi == ei) break;
                rb_yield(rb_enc_sprintf(usascii, "%.*ld", width, bi));
                bi++;
            }
        }
        else {
            /* otherwise compare and step through method calls */
            ID op = excl ? '<' : rb_intern("<=");
            VALUE args[2], fmt = rb_obj_freeze(rb_usascii_str_new_cstr("%.*d"));

            args[0] = INT2FIX(width);
            while (rb_funcall(b, op, 1, e)) {
                args[1] = b;
                rb_yield(rb_str_format(numberof(args), args, fmt));
                b = rb_funcall(b, succ, 0, 0);
            }
        }
        return beg;
    }
    /* generic case: step with the `succ` method */
  no_digits:
    n = rb_str_cmp(beg, end);
    if (n > 0 || (excl && n == 0)) return beg;

    after_end = rb_funcall(end, succ, 0, 0);
    current = rb_str_dup(beg);
    while (!rb_str_equal(current, after_end)) {
        VALUE next = Qnil;
        /* compute the successor before yielding current; it is left nil
         * only when current is the inclusive end */
        if (excl || !rb_str_equal(current, end))
            next = rb_funcall(current, succ, 0, 0);
        rb_yield(current);
        if (NIL_P(next)) break;
        current = next;
        StringValue(current);
        if (excl && rb_str_equal(current, end)) break;
        /* stop once the value has outgrown end or has become empty */
        if (RSTRING_LEN(current) > RSTRING_LEN(end) || RSTRING_LEN(current) == 0)
            break;
    }

    return beg;
}
3196
3197 static VALUE
3198 rb_str_subpat(VALUE str, VALUE re, VALUE backref)
3199 {
3200 if (rb_reg_search(re, str, 0, 0) >= 0) {
3201 VALUE match = rb_backref_get();
3202 int nth = rb_reg_backref_number(match, backref);
3203 return rb_reg_nth_match(nth, match);
3204 }
3205 return Qnil;
3206 }
3207
3208 static VALUE
3209 rb_str_aref(VALUE str, VALUE indx)
3210 {
3211 long idx;
3212
3213 if (FIXNUM_P(indx)) {
3214 idx = FIX2LONG(indx);
3215
3216 num_index:
3217 str = rb_str_substr(str, idx, 1);
3218 if (!NIL_P(str) && RSTRING_LEN(str) == 0) return Qnil;
3219 return str;
3220 }
3221
3222 if (SPECIAL_CONST_P(indx)) goto generic;
3223 switch (BUILTIN_TYPE(indx)) {
3224 case T_REGEXP:
3225 return rb_str_subpat(str, indx, INT2FIX(0));
3226
3227 case T_STRING:
3228 if (rb_str_index(str, indx, 0) != -1)
3229 return rb_str_dup(indx);
3230 return Qnil;
3231
3232 generic:
3233 default:
3234
3235 {
3236 long beg, len;
3237 VALUE tmp;
3238
3239 len = str_strlen(str, STR_ENC_GET(str));
3240 switch (rb_range_beg_len(indx, &beg, &len, len, 0)) {
3241 case Qfalse:
3242 break;
3243 case Qnil:
3244 return Qnil;
3245 default:
3246 tmp = rb_str_substr(str, beg, len);
3247 return tmp;
3248 }
3249 }
3250 idx = NUM2LONG(indx);
3251 goto num_index;
3252 }
3253
3254 UNREACHABLE;
3255 }
3256
3257
3258
3259
3260
3261
3262
3263
3264
3265
3266
3267
3268
3269
3270
3271
3272
3273
3274
3275
3276
3277
3278
3279
3280
3281
3282
3283
3284
3285
3286
3287
3288
3289
3290
3291
3292
3293
3294
3295
3296
3297
3298
3299
3300
3301
3302
3303
3304
3305
3306
3307
3308
3309
3310
3311
3312
3313
3314
3315
3316
3317
3318
3319
3320
3321
3322
3323
3324
3325
3326
3327 static VALUE
3328 rb_str_aref_m(int argc, VALUE *argv, VALUE str)
3329 {
3330 if (argc == 2) {
3331 if (RB_TYPE_P(argv[0], T_REGEXP)) {
3332 return rb_str_subpat(str, argv[0], argv[1]);
3333 }
3334 return rb_str_substr(str, NUM2LONG(argv[0]), NUM2LONG(argv[1]));
3335 }
3336 rb_check_arity(argc, 1, 2);
3337 return rb_str_aref(str, argv[0]);
3338 }
3339
/*
 * Remove the first len bytes of str in place and return str.  len is
 * clamped to the string's byte length.  The coderange is cleared.
 * NOTE(review): a negative len is not checked here - confirm callers
 * never pass one.
 */
VALUE
rb_str_drop_bytes(VALUE str, long len)
{
    char *ptr = RSTRING_PTR(str);
    long olen = RSTRING_LEN(str), nlen;

    str_modifiable(str);
    if (len > olen) len = olen;
    nlen = olen - len;
    if (nlen <= RSTRING_EMBED_LEN_MAX) {
        /* the remainder fits into the embedded buffer: move it there */
        char *oldptr = ptr;
        int fl = (int)(RBASIC(str)->flags & (STR_NOEMBED|ELTS_SHARED));
        STR_SET_EMBED(str);
        STR_SET_EMBED_LEN(str, nlen);
        ptr = RSTRING(str)->as.ary;
        memmove(ptr, oldptr + len, nlen);
        /* release the old buffer only if it was a heap buffer that was
         * not marked ELTS_SHARED */
        if (fl == STR_NOEMBED) xfree(oldptr);
    }
    else {
        /* rb_str_new4() is called for its effect on str only (the result
         * is discarded); presumably it leaves str sharing its buffer so
         * that the pointer may simply be advanced - TODO confirm */
        if (!STR_SHARED_P(str)) rb_str_new4(str);
        ptr = RSTRING(str)->as.heap.ptr += len;
        RSTRING(str)->as.heap.len = nlen;
    }
    ptr[nlen] = 0;
    ENC_CODERANGE_CLEAR(str);
    return str;
}
3367
/*
 * Replace the len bytes of str that start at byte offset beg with the
 * bytes of val.  beg and len are byte offsets (they are added directly to
 * RSTRING_PTR).  Taint is propagated from val to str.  Encoding and
 * coderange are not adjusted here.
 */
static void
rb_str_splice_0(VALUE str, long beg, long len, VALUE val)
{
    /* deleting a prefix: no copying of val is needed */
    if (beg == 0 && RSTRING_LEN(val) == 0) {
        rb_str_drop_bytes(str, len);
        OBJ_INFECT(str, val);
        return;
    }

    rb_str_modify(str);
    if (len < RSTRING_LEN(val)) {
        /* expand string */
        RESIZE_CAPA(str, RSTRING_LEN(str) + RSTRING_LEN(val) - len + 1);
    }

    /* shift the tail that follows the replaced span to its new place */
    if (RSTRING_LEN(val) != len) {
        memmove(RSTRING_PTR(str) + beg + RSTRING_LEN(val),
                RSTRING_PTR(str) + beg + len,
                RSTRING_LEN(str) - (beg + len));
    }
    /* only reached with a negative len: zero -len bytes past the end */
    if (RSTRING_LEN(val) < beg && len < 0) {
        MEMZERO(RSTRING_PTR(str) + RSTRING_LEN(str), char, -len);
    }
    if (RSTRING_LEN(val) > 0) {
        memmove(RSTRING_PTR(str)+beg, RSTRING_PTR(val), RSTRING_LEN(val));
    }
    STR_SET_LEN(str, RSTRING_LEN(str) + RSTRING_LEN(val) - len);
    if (RSTRING_PTR(str)) {
        RSTRING_PTR(str)[RSTRING_LEN(str)] = '\0';
    }
    OBJ_INFECT(str, val);
}
3400
3401 static void
3402 rb_str_splice(VALUE str, long beg, long len, VALUE val)
3403 {
3404 long slen;
3405 char *p, *e;
3406 rb_encoding *enc;
3407 int singlebyte = single_byte_optimizable(str);
3408 int cr;
3409
3410 if (len < 0) rb_raise(rb_eIndexError, "negative length %ld", len);
3411
3412 StringValue(val);
3413 enc = rb_enc_check(str, val);
3414 slen = str_strlen(str, enc);
3415
3416 if (slen < beg) {
3417 out_of_range:
3418 rb_raise(rb_eIndexError, "index %ld out of string", beg);
3419 }
3420 if (beg < 0) {
3421 if (-beg > slen) {
3422 goto out_of_range;
3423 }
3424 beg += slen;
3425 }
3426 if (slen < len || slen < beg + len) {
3427 len = slen - beg;
3428 }
3429 str_modify_keep_cr(str);
3430 p = str_nth(RSTRING_PTR(str), RSTRING_END(str), beg, enc, singlebyte);
3431 if (!p) p = RSTRING_END(str);
3432 e = str_nth(p, RSTRING_END(str), len, enc, singlebyte);
3433 if (!e) e = RSTRING_END(str);
3434
3435 beg = p - RSTRING_PTR(str);
3436 len = e - p;
3437 rb_str_splice_0(str, beg, len, val);
3438 rb_enc_associate(str, enc);
3439 cr = ENC_CODERANGE_AND(ENC_CODERANGE(str), ENC_CODERANGE(val));
3440 if (cr != ENC_CODERANGE_BROKEN)
3441 ENC_CODERANGE_SET(str, cr);
3442 }
3443
/*
 * Externally visible entry point that forwards unchanged to the
 * file-local rb_str_splice(): replace len characters of str starting at
 * character index beg with val.
 */
void
rb_str_update(VALUE str, long beg, long len, VALUE val)
{
    rb_str_splice(str, beg, len, val);
}
3449
3450 static void
3451 rb_str_subpat_set(VALUE str, VALUE re, VALUE backref, VALUE val)
3452 {
3453 int nth;
3454 VALUE match;
3455 long start, end, len;
3456 rb_encoding *enc;
3457 struct re_registers *regs;
3458
3459 if (rb_reg_search(re, str, 0, 0) < 0) {
3460 rb_raise(rb_eIndexError, "regexp not matched");
3461 }
3462 match = rb_backref_get();
3463 nth = rb_reg_backref_number(match, backref);
3464 regs = RMATCH_REGS(match);
3465 if (nth >= regs->num_regs) {
3466 out_of_range:
3467 rb_raise(rb_eIndexError, "index %d out of regexp", nth);
3468 }
3469 if (nth < 0) {
3470 if (-nth >= regs->num_regs) {
3471 goto out_of_range;
3472 }
3473 nth += regs->num_regs;
3474 }
3475
3476 start = BEG(nth);
3477 if (start == -1) {
3478 rb_raise(rb_eIndexError, "regexp group %d not matched", nth);
3479 }
3480 end = END(nth);
3481 len = end - start;
3482 StringValue(val);
3483 enc = rb_enc_check(str, val);
3484 rb_str_splice_0(str, start, len, val);
3485 rb_enc_associate(str, enc);
3486 }
3487
3488 static VALUE
3489 rb_str_aset(VALUE str, VALUE indx, VALUE val)
3490 {
3491 long idx, beg;
3492
3493 if (FIXNUM_P(indx)) {
3494 idx = FIX2LONG(indx);
3495 num_index:
3496 rb_str_splice(str, idx, 1, val);
3497 return val;
3498 }
3499
3500 if (SPECIAL_CONST_P(indx)) goto generic;
3501 switch (TYPE(indx)) {
3502 case T_REGEXP:
3503 rb_str_subpat_set(str, indx, INT2FIX(0), val);
3504 return val;
3505
3506 case T_STRING:
3507 beg = rb_str_index(str, indx, 0);
3508 if (beg < 0) {
3509 rb_raise(rb_eIndexError, "string not matched");
3510 }
3511 beg = rb_str_sublen(str, beg);
3512 rb_str_splice(str, beg, str_strlen(indx, 0), val);
3513 return val;
3514
3515 generic:
3516 default:
3517
3518 {
3519 long beg, len;
3520 if (rb_range_beg_len(indx, &beg, &len, str_strlen(str, 0), 2)) {
3521 rb_str_splice(str, beg, len, val);
3522 return val;
3523 }
3524 }
3525 idx = NUM2LONG(indx);
3526 goto num_index;
3527 }
3528 }
3529
3530
3531
3532
3533
3534
3535
3536
3537
3538
3539
3540
3541
3542
3543
3544
3545
3546
3547
3548
3549
3550
3551
3552
3553
3554
3555 static VALUE
3556 rb_str_aset_m(int argc, VALUE *argv, VALUE str)
3557 {
3558 if (argc == 3) {
3559 if (RB_TYPE_P(argv[0], T_REGEXP)) {
3560 rb_str_subpat_set(str, argv[0], argv[1], argv[2]);
3561 }
3562 else {
3563 rb_str_splice(str, NUM2LONG(argv[0]), NUM2LONG(argv[1]), argv[2]);
3564 }
3565 return argv[2];
3566 }
3567 rb_check_arity(argc, 2, 3);
3568 return rb_str_aset(str, argv[0], argv[1]);
3569 }
3570
3571
3572
3573
3574
3575
3576
3577
3578
3579
3580
3581
3582
3583
3584
3585
3586
3587
3588 static VALUE
3589 rb_str_insert(VALUE str, VALUE idx, VALUE str2)
3590 {
3591 long pos = NUM2LONG(idx);
3592
3593 if (pos == -1) {
3594 return rb_str_append(str, str2);
3595 }
3596 else if (pos < 0) {
3597 pos++;
3598 }
3599 rb_str_splice(str, pos, 0, str2);
3600 return str;
3601 }
3602
3603
3604
3605
3606
3607
3608
3609
3610
3611
3612
3613
3614
3615
3616
3617
3618
3619
3620
3621
3622
3623 static VALUE
3624 rb_str_slice_bang(int argc, VALUE *argv, VALUE str)
3625 {
3626 VALUE result;
3627 VALUE buf[3];
3628 int i;
3629
3630 rb_check_arity(argc, 1, 2);
3631 for (i=0; i<argc; i++) {
3632 buf[i] = argv[i];
3633 }
3634 str_modify_keep_cr(str);
3635 result = rb_str_aref_m(argc, buf, str);
3636 if (!NIL_P(result)) {
3637 buf[i] = rb_str_new(0,0);
3638 rb_str_aset_m(argc+1, buf, str);
3639 }
3640 return result;
3641 }
3642
3643 static VALUE
3644 get_pat(VALUE pat, int quote)
3645 {
3646 VALUE val;
3647
3648 switch (TYPE(pat)) {
3649 case T_REGEXP:
3650 return pat;
3651
3652 case T_STRING:
3653 break;
3654
3655 default:
3656 val = rb_check_string_type(pat);
3657 if (NIL_P(val)) {
3658 Check_Type(pat, T_REGEXP);
3659 }
3660 pat = val;
3661 }
3662
3663 if (quote) {
3664 pat = rb_reg_quote(pat);
3665 }
3666
3667 return rb_reg_regcomp(pat);
3668 }
3669
3670
3671
3672
3673
3674
3675
3676
3677
3678
3679
3680
3681
/*
 * Replace the first match of the pattern argv[0] in str, in place.
 *
 * The replacement comes from argv[1] (a hash looked up by the matched
 * text, or a string expanded with rb_reg_regsub) or, when only the
 * pattern is given, from the block's result.  Returns str when a
 * substitution was made and nil when the pattern did not match.
 *
 * Raises Encoding::CompatibilityError when the replacement's encoding is
 * incompatible and the text outside the match is not 7-bit.
 */
static VALUE
rb_str_sub_bang(int argc, VALUE *argv, VALUE str)
{
    VALUE pat, repl, hash = Qnil;
    int iter = 0;
    int tainted = 0;
    int untrusted = 0;
    long plen;
    /* the replacement argument is optional only when a block is given */
    int min_arity = rb_block_given_p() ? 1 : 2;

    rb_check_arity(argc, min_arity, 2);
    if (argc == 1) {
        iter = 1;
    }
    else {
        repl = argv[1];
        hash = rb_check_hash_type(argv[1]);
        if (NIL_P(hash)) {
            StringValue(repl);
        }
        if (OBJ_TAINTED(repl)) tainted = 1;
        if (OBJ_UNTRUSTED(repl)) untrusted = 1;
    }

    pat = get_pat(argv[0], 1);
    str_modifiable(str);
    if (rb_reg_search(pat, str, 0, 0) >= 0) {
        rb_encoding *enc;
        int cr = ENC_CODERANGE(str);
        VALUE match = rb_backref_get();
        struct re_registers *regs = RMATCH_REGS(match);
        long beg0 = BEG(0);
        long end0 = END(0);
        char *p, *rp;
        long len, rlen;

        if (iter || !NIL_P(hash)) {
            /* remember pointer and length so that a modification of str
             * by the block or the hash lookup is detected below */
            p = RSTRING_PTR(str); len = RSTRING_LEN(str);

            if (iter) {
                repl = rb_obj_as_string(rb_yield(rb_reg_nth_match(0, match)));
            }
            else {
                repl = rb_hash_aref(hash, rb_str_subseq(str, beg0, end0 - beg0));
                repl = rb_obj_as_string(repl);
            }
            str_mod_check(str, p, len);
            rb_check_frozen(str);
        }
        else {
            repl = rb_reg_regsub(repl, str, regs, pat);
        }
        enc = rb_enc_compatible(str, repl);
        if (!enc) {
            /* incompatible encodings are tolerated only if everything
             * outside the matched span scans as 7-bit */
            rb_encoding *str_enc = STR_ENC_GET(str);
            p = RSTRING_PTR(str); len = RSTRING_LEN(str);
            if (coderange_scan(p, beg0, str_enc) != ENC_CODERANGE_7BIT ||
                coderange_scan(p+end0, len-end0, str_enc) != ENC_CODERANGE_7BIT) {
                rb_raise(rb_eEncCompatError, "incompatible character encodings: %s and %s",
                         rb_enc_name(str_enc),
                         rb_enc_name(STR_ENC_GET(repl)));
            }
            enc = STR_ENC_GET(repl);
        }
        rb_str_modify(str);
        rb_enc_associate(str, enc);
        if (OBJ_TAINTED(repl)) tainted = 1;
        if (OBJ_UNTRUSTED(repl)) untrusted = 1;
        /* derive the resulting coderange from str's saved one and repl's */
        if (ENC_CODERANGE_UNKNOWN < cr && cr < ENC_CODERANGE_BROKEN) {
            int cr2 = ENC_CODERANGE(repl);
            if (cr2 == ENC_CODERANGE_BROKEN ||
                (cr == ENC_CODERANGE_VALID && cr2 == ENC_CODERANGE_7BIT))
                cr = ENC_CODERANGE_UNKNOWN;
            else
                cr = cr2;
        }
        plen = end0 - beg0;
        rp = RSTRING_PTR(repl); rlen = RSTRING_LEN(repl);
        len = RSTRING_LEN(str);
        if (rlen > plen) {
            RESIZE_CAPA(str, len + rlen - plen);
        }
        p = RSTRING_PTR(str);
        /* move the tail, then drop the replacement into the gap */
        if (rlen != plen) {
            memmove(p + beg0 + rlen, p + beg0 + plen, len - beg0 - plen);
        }
        memcpy(p + beg0, rp, rlen);
        len += rlen - plen;
        STR_SET_LEN(str, len);
        RSTRING_PTR(str)[len] = '\0';
        ENC_CODERANGE_SET(str, cr);
        if (tainted) OBJ_TAINT(str);
        if (untrusted) OBJ_UNTRUST(str);

        return str;
    }
    return Qnil;
}
3780
3781
3782
3783
3784
3785
3786
3787
3788
3789
3790
3791
3792
3793
3794
3795
3796
3797
3798
3799
3800
3801
3802
3803
3804
3805
3806
3807
3808
3809
3810
3811
3812
3813
3814
3815
3816
3817
3818
3819
3820
3821 static VALUE
3822 rb_str_sub(int argc, VALUE *argv, VALUE str)
3823 {
3824 str = rb_str_dup(str);
3825 rb_str_sub_bang(argc, argv, str);
3826 return str;
3827 }
3828
/*
 * Shared implementation of global substitution.
 *
 * Every match of the pattern argv[0] in str is replaced by argv[1] (a
 * hash looked up by the matched text, or a string expanded with
 * rb_reg_regsub) or, with a single argument, by the block's result.
 * Without replacement and without block an enumerator is returned
 * (RETURN_ENUMERATOR).
 *
 * bang != 0: str is replaced in place; returns str, or nil when the
 *            pattern never matched.
 * bang == 0: returns a new string of str's class (a plain duplicate when
 *            the pattern never matched).
 */
static VALUE
str_gsub(int argc, VALUE *argv, VALUE str, int bang)
{
    VALUE pat, val, repl, match, dest, hash = Qnil;
    struct re_registers *regs;
    long beg, n;
    long beg0, end0;
    long offset, blen, slen, len, last;
    int iter = 0;
    char *sp, *cp;
    int tainted = 0;
    rb_encoding *str_enc;

    switch (argc) {
      case 1:
        RETURN_ENUMERATOR(str, argc, argv);
        iter = 1;
        break;
      case 2:
        repl = argv[1];
        hash = rb_check_hash_type(argv[1]);
        if (NIL_P(hash)) {
            StringValue(repl);
        }
        if (OBJ_TAINTED(repl)) tainted = 1;
        break;
      default:
        rb_check_arity(argc, 1, 2);
    }

    pat = get_pat(argv[0], 1);
    beg = rb_reg_search(pat, str, 0, 0);
    if (beg < 0) {
        if (bang) return Qnil;	/* no match, no substitution */
        return rb_str_dup(str);
    }

    offset = 0;
    n = 0;
    blen = RSTRING_LEN(str) + 30; /* len + margin */
    dest = rb_str_buf_new(blen);
    sp = RSTRING_PTR(str);
    slen = RSTRING_LEN(str);
    cp = sp;
    str_enc = STR_ENC_GET(str);
    rb_enc_associate(dest, str_enc);
    ENC_CODERANGE_SET(dest, rb_enc_asciicompat(str_enc) ? ENC_CODERANGE_7BIT : ENC_CODERANGE_VALID);

    do {
        n++;
        match = rb_backref_get();
        regs = RMATCH_REGS(match);
        beg0 = BEG(0);
        end0 = END(0);
        if (iter || !NIL_P(hash)) {
            if (iter) {
                val = rb_obj_as_string(rb_yield(rb_reg_nth_match(0, match)));
            }
            else {
                val = rb_hash_aref(hash, rb_str_subseq(str, BEG(0), END(0) - BEG(0)));
                val = rb_obj_as_string(val);
            }
            /* the block or the hash lookup must not have changed str */
            str_mod_check(str, sp, slen);
            if (val == dest) { 	/* paranoid check [ruby-dev:24827] */
                rb_raise(rb_eRuntimeError, "block should not cheat");
            }
        }
        else {
            val = rb_reg_regsub(repl, str, regs, pat);
        }

        if (OBJ_TAINTED(val)) tainted = 1;

        len = beg - offset;	/* copy pre-match substr */
        if (len) {
            rb_enc_str_buf_cat(dest, cp, len, str_enc);
        }

        rb_str_buf_append(dest, val);

        last = offset;
        offset = end0;
        if (beg0 == end0) {
            /*
             * Empty match: copy one character of str to dest and move the
             * offset past it, so that the next search starts further on.
             */
            if (RSTRING_LEN(str) <= end0) break;
            len = rb_enc_fast_mbclen(RSTRING_PTR(str)+end0, RSTRING_END(str), str_enc);
            rb_enc_str_buf_cat(dest, RSTRING_PTR(str)+end0, len, str_enc);
            offset = end0 + len;
        }
        cp = RSTRING_PTR(str) + offset;
        if (offset > RSTRING_LEN(str)) break;
        beg = rb_reg_search(pat, str, offset, 0);
    } while (beg >= 0);
    /* copy whatever follows the last match */
    if (RSTRING_LEN(str) > offset) {
        rb_enc_str_buf_cat(dest, cp, RSTRING_LEN(str) - offset, str_enc);
    }
    /* search once more from `last`; presumably this re-establishes the
     * backref of the final successful match - TODO confirm */
    rb_reg_search(pat, str, last, 0);
    if (bang) {
        rb_str_shared_replace(str, dest);
    }
    else {
        RBASIC(dest)->klass = rb_obj_class(str);
        OBJ_INFECT(dest, str);
        str = dest;
    }

    if (tainted) OBJ_TAINT(str);
    return str;
}
3941
3942
3943
3944
3945
3946
3947
3948
3949
3950
3951
3952
3953
/*
 * In-place global substitution: check that str may be modified (keeping
 * its coderange) and delegate to str_gsub() with bang = 1.  Returns str,
 * or nil when no substitution was performed.
 */
static VALUE
rb_str_gsub_bang(int argc, VALUE *argv, VALUE str)
{
    str_modify_keep_cr(str);
    return str_gsub(argc, argv, str, 1);
}
3960
3961
3962
3963
3964
3965
3966
3967
3968
3969
3970
3971
3972
3973
3974
3975
3976
3977
3978
3979
3980
3981
3982
3983
3984
3985
3986
3987
3988
3989
3990
3991
3992
3993
3994
3995
3996
3997
3998
3999
4000
4001
4002
4003
4004
/*
 * Non-destructive global substitution: delegate to str_gsub() with
 * bang = 0, which returns a new string and leaves str untouched.
 */
static VALUE
rb_str_gsub(int argc, VALUE *argv, VALUE str)
{
    return str_gsub(argc, argv, str, 0);
}
4010
4011
4012
4013
4014
4015
4016
4017
4018
4019
4020
4021
4022
4023 VALUE
4024 rb_str_replace(VALUE str, VALUE str2)
4025 {
4026 str_modifiable(str);
4027 if (str == str2) return str;
4028
4029 StringValue(str2);
4030 str_discard(str);
4031 return str_replace(str, str2);
4032 }
4033
4034
4035
4036
4037
4038
4039
4040
4041
4042
4043
4044 static VALUE
4045 rb_str_clear(VALUE str)
4046 {
4047 str_discard(str);
4048 STR_SET_EMBED(str);
4049 STR_SET_EMBED_LEN(str, 0);
4050 RSTRING_PTR(str)[0] = 0;
4051 if (rb_enc_asciicompat(STR_ENC_GET(str)))
4052 ENC_CODERANGE_SET(str, ENC_CODERANGE_7BIT);
4053 else
4054 ENC_CODERANGE_SET(str, ENC_CODERANGE_VALID);
4055 return str;
4056 }
4057
4058
4059
4060
4061
4062
4063
4064
4065
4066
4067
/*
 * Return the substring consisting of the first character of str, as
 * produced by rb_str_substr(str, 0, 1).
 */
static VALUE
rb_str_chr(VALUE str)
{
    return rb_str_substr(str, 0, 1);
}
4073
4074
4075
4076
4077
4078
4079
4080 static VALUE
4081 rb_str_getbyte(VALUE str, VALUE index)
4082 {
4083 long pos = NUM2LONG(index);
4084
4085 if (pos < 0)
4086 pos += RSTRING_LEN(str);
4087 if (pos < 0 || RSTRING_LEN(str) <= pos)
4088 return Qnil;
4089
4090 return INT2FIX((unsigned char)RSTRING_PTR(str)[pos]);
4091 }
4092
4093
4094
4095
4096
4097
4098
4099 static VALUE
4100 rb_str_setbyte(VALUE str, VALUE index, VALUE value)
4101 {
4102 long pos = NUM2LONG(index);
4103 int byte = NUM2INT(value);
4104
4105 rb_str_modify(str);
4106
4107 if (pos < -RSTRING_LEN(str) || RSTRING_LEN(str) <= pos)
4108 rb_raise(rb_eIndexError, "index %ld out of string", pos);
4109 if (pos < 0)
4110 pos += RSTRING_LEN(str);
4111
4112 RSTRING_PTR(str)[pos] = byte;
4113
4114 return value;
4115 }
4116
4117 static VALUE
4118 str_byte_substr(VALUE str, long beg, long len)
4119 {
4120 char *p, *s = RSTRING_PTR(str);
4121 long n = RSTRING_LEN(str);
4122 VALUE str2;
4123
4124 if (beg > n || len < 0) return Qnil;
4125 if (beg < 0) {
4126 beg += n;
4127 if (beg < 0) return Qnil;
4128 }
4129 if (beg + len > n)
4130 len = n - beg;
4131 if (len <= 0) {
4132 len = 0;
4133 p = 0;
4134 }
4135 else
4136 p = s + beg;
4137
4138 if (len > RSTRING_EMBED_LEN_MAX && beg + len == n) {
4139 str2 = rb_str_new4(str);
4140 str2 = str_new3(rb_obj_class(str2), str2);
4141 RSTRING(str2)->as.heap.ptr += RSTRING(str2)->as.heap.len - len;
4142 RSTRING(str2)->as.heap.len = len;
4143 }
4144 else {
4145 str2 = rb_str_new5(str, p, len);
4146 }
4147
4148 str_enc_copy(str2, str);
4149
4150 if (RSTRING_LEN(str2) == 0) {
4151 if (!rb_enc_asciicompat(STR_ENC_GET(str)))
4152 ENC_CODERANGE_SET(str2, ENC_CODERANGE_VALID);
4153 else
4154 ENC_CODERANGE_SET(str2, ENC_CODERANGE_7BIT);
4155 }
4156 else {
4157 switch (ENC_CODERANGE(str)) {
4158 case ENC_CODERANGE_7BIT:
4159 ENC_CODERANGE_SET(str2, ENC_CODERANGE_7BIT);
4160 break;
4161 default:
4162 ENC_CODERANGE_SET(str2, ENC_CODERANGE_UNKNOWN);
4163 break;
4164 }
4165 }
4166
4167 OBJ_INFECT(str2, str);
4168
4169 return str2;
4170 }
4171
4172 static VALUE
4173 str_byte_aref(VALUE str, VALUE indx)
4174 {
4175 long idx;
4176 switch (TYPE(indx)) {
4177 case T_FIXNUM:
4178 idx = FIX2LONG(indx);
4179
4180 num_index:
4181 str = str_byte_substr(str, idx, 1);
4182 if (NIL_P(str) || RSTRING_LEN(str) == 0) return Qnil;
4183 return str;
4184
4185 default:
4186
4187 {
4188 long beg, len = RSTRING_LEN(str);
4189
4190 switch (rb_range_beg_len(indx, &beg, &len, len, 0)) {
4191 case Qfalse:
4192 break;
4193 case Qnil:
4194 return Qnil;
4195 default:
4196 return str_byte_substr(str, beg, len);
4197 }
4198 }
4199 idx = NUM2LONG(indx);
4200 goto num_index;
4201 }
4202
4203 UNREACHABLE;
4204 }
4205
4206
4207
4208
4209
4210
4211
4212
4213
4214
4215
4216
4217
4218
4219
4220
4221
4222
4223
4224
4225
4226
4227
4228
4229 static VALUE
4230 rb_str_byteslice(int argc, VALUE *argv, VALUE str)
4231 {
4232 if (argc == 2) {
4233 return str_byte_substr(str, NUM2LONG(argv[0]), NUM2LONG(argv[1]));
4234 }
4235 rb_check_arity(argc, 1, 2);
4236 return str_byte_aref(str, argv[0]);
4237 }
4238
4239
4240
4241
4242
4243
4244
4245
4246
4247
4248 static VALUE
4249 rb_str_reverse(VALUE str)
4250 {
4251 rb_encoding *enc;
4252 VALUE rev;
4253 char *s, *e, *p;
4254 int single = 1;
4255
4256 if (RSTRING_LEN(str) <= 1) return rb_str_dup(str);
4257 enc = STR_ENC_GET(str);
4258 rev = rb_str_new5(str, 0, RSTRING_LEN(str));
4259 s = RSTRING_PTR(str); e = RSTRING_END(str);
4260 p = RSTRING_END(rev);
4261
4262 if (RSTRING_LEN(str) > 1) {
4263 if (single_byte_optimizable(str)) {
4264 while (s < e) {
4265 *--p = *s++;
4266 }
4267 }
4268 else if (ENC_CODERANGE(str) == ENC_CODERANGE_VALID) {
4269 while (s < e) {
4270 int clen = rb_enc_fast_mbclen(s, e, enc);
4271
4272 if (clen > 1 || (*s & 0x80)) single = 0;
4273 p -= clen;
4274 memcpy(p, s, clen);
4275 s += clen;
4276 }
4277 }
4278 else {
4279 while (s < e) {
4280 int clen = rb_enc_mbclen(s, e, enc);
4281
4282 if (clen > 1 || (*s & 0x80)) single = 0;
4283 p -= clen;
4284 memcpy(p, s, clen);
4285 s += clen;
4286 }
4287 }
4288 }
4289 STR_SET_LEN(rev, RSTRING_LEN(str));
4290 OBJ_INFECT(rev, str);
4291 if (ENC_CODERANGE(str) == ENC_CODERANGE_UNKNOWN) {
4292 if (single) {
4293 ENC_CODERANGE_SET(str, ENC_CODERANGE_7BIT);
4294 }
4295 else {
4296 ENC_CODERANGE_SET(str, ENC_CODERANGE_VALID);
4297 }
4298 }
4299 rb_enc_cr_str_copy_for_substr(rev, str);
4300
4301 return rev;
4302 }
4303
4304
4305
4306
4307
4308
4309
4310
4311
4312 static VALUE
4313 rb_str_reverse_bang(VALUE str)
4314 {
4315 if (RSTRING_LEN(str) > 1) {
4316 if (single_byte_optimizable(str)) {
4317 char *s, *e, c;
4318
4319 str_modify_keep_cr(str);
4320 s = RSTRING_PTR(str);
4321 e = RSTRING_END(str) - 1;
4322 while (s < e) {
4323 c = *s;
4324 *s++ = *e;
4325 *e-- = c;
4326 }
4327 }
4328 else {
4329 rb_str_shared_replace(str, rb_str_reverse(str));
4330 }
4331 }
4332 else {
4333 str_modify_keep_cr(str);
4334 }
4335 return str;
4336 }
4337
4338
4339
4340
4341
4342
4343
4344
4345
4346
4347
4348
4349
4350
4351 static VALUE
4352 rb_str_include(VALUE str, VALUE arg)
4353 {
4354 long i;
4355
4356 StringValue(arg);
4357 i = rb_str_index(str, arg, 0);
4358
4359 if (i == -1) return Qfalse;
4360 return Qtrue;
4361 }
4362
4363
4364
4365
4366
4367
4368
4369
4370
4371
4372
4373
4374
4375
4376
4377
4378
4379
4380
4381
4382
4383
4384
4385 static VALUE
4386 rb_str_to_i(int argc, VALUE *argv, VALUE str)
4387 {
4388 int base;
4389
4390 if (argc == 0) base = 10;
4391 else {
4392 VALUE b;
4393
4394 rb_scan_args(argc, argv, "01", &b);
4395 base = NUM2INT(b);
4396 }
4397 if (base < 0) {
4398 rb_raise(rb_eArgError, "invalid radix %d", base);
4399 }
4400 return rb_str_to_inum(str, base, FALSE);
4401 }
4402
4403
4404
4405
4406
4407
4408
4409
4410
4411
4412
4413
4414
4415
4416
4417
/*
 * Convert str to a Float: rb_str_to_dbl() is called in non-strict mode
 * (badcheck = FALSE) and its result is boxed with DBL2NUM().
 */
static VALUE
rb_str_to_f(VALUE str)
{
    return DBL2NUM(rb_str_to_dbl(str, FALSE));
}
4423
4424
4425
4426
4427
4428
4429
4430
4431
4432
4433 static VALUE
4434 rb_str_to_s(VALUE str)
4435 {
4436 if (rb_obj_class(str) != rb_cString) {
4437 return str_duplicate(rb_cString, str);
4438 }
4439 return str;
4440 }
4441
#if 0
/*
 * Compiled out by the surrounding #if 0.
 * Encode code point c in enc into a temporary buffer of
 * RUBY_MAX_CHAR_LEN bytes and append the encoded bytes to str.
 */
static void
str_cat_char(VALUE str, unsigned int c, rb_encoding *enc)
{
    char s[RUBY_MAX_CHAR_LEN];
    int n = rb_enc_codelen(c, enc);

    rb_enc_mbcput(c, s, enc);
    rb_enc_str_buf_cat(str, s, n, enc);
}
#endif
4453
/* Size bound passed to snprintf() when formatting one escaped character;
 * the buffers it is used with are declared as CHAR_ESC_LEN + 1 bytes. */
#define CHAR_ESC_LEN 13
4455
4456 int
4457 rb_str_buf_cat_escaped_char(VALUE result, unsigned int c, int unicode_p)
4458 {
4459 char buf[CHAR_ESC_LEN + 1];
4460 int l;
4461
4462 #if SIZEOF_INT > 4
4463 c &= 0xffffffff;
4464 #endif
4465 if (unicode_p) {
4466 if (c < 0x7F && ISPRINT(c)) {
4467 snprintf(buf, CHAR_ESC_LEN, "%c", c);
4468 }
4469 else if (c < 0x10000) {
4470 snprintf(buf, CHAR_ESC_LEN, "\\u%04X", c);
4471 }
4472 else {
4473 snprintf(buf, CHAR_ESC_LEN, "\\u{%X}", c);
4474 }
4475 }
4476 else {
4477 if (c < 0x100) {
4478 snprintf(buf, CHAR_ESC_LEN, "\\x%02X", c);
4479 }
4480 else {
4481 snprintf(buf, CHAR_ESC_LEN, "\\x{%X}", c);
4482 }
4483 }
4484 l = (int)strlen(buf);
4485 rb_str_buf_cat(result, buf, l);
4486 return l;
4487 }
4488
4489
4490
4491
4492
4493
4494
4495
4496
4497
4498
4499
4500
4501 VALUE
4502 rb_str_inspect(VALUE str)
4503 {
4504 rb_encoding *enc = STR_ENC_GET(str);
4505 const char *p, *pend, *prev;
4506 char buf[CHAR_ESC_LEN + 1];
4507 VALUE result = rb_str_buf_new(0);
4508 rb_encoding *resenc = rb_default_internal_encoding();
4509 int unicode_p = rb_enc_unicode_p(enc);
4510 int asciicompat = rb_enc_asciicompat(enc);
4511 static rb_encoding *utf16, *utf32;
4512
4513 if (!utf16) utf16 = rb_enc_find("UTF-16");
4514 if (!utf32) utf32 = rb_enc_find("UTF-32");
4515 if (resenc == NULL) resenc = rb_default_external_encoding();
4516 if (!rb_enc_asciicompat(resenc)) resenc = rb_usascii_encoding();
4517 rb_enc_associate(result, resenc);
4518 str_buf_cat2(result, "\"");
4519
4520 p = RSTRING_PTR(str); pend = RSTRING_END(str);
4521 prev = p;
4522 if (enc == utf16) {
4523 const unsigned char *q = (const unsigned char *)p;
4524 if (q[0] == 0xFE && q[1] == 0xFF)
4525 enc = rb_enc_find("UTF-16BE");
4526 else if (q[0] == 0xFF && q[1] == 0xFE)
4527 enc = rb_enc_find("UTF-16LE");
4528 else
4529 unicode_p = 0;
4530 }
4531 else if (enc == utf32) {
4532 const unsigned char *q = (const unsigned char *)p;
4533 if (q[0] == 0 && q[1] == 0 && q[2] == 0xFE && q[3] == 0xFF)
4534 enc = rb_enc_find("UTF-32BE");
4535 else if (q[3] == 0 && q[2] == 0 && q[1] == 0xFE && q[0] == 0xFF)
4536 enc = rb_enc_find("UTF-32LE");
4537 else
4538 unicode_p = 0;
4539 }
4540 while (p < pend) {
4541 unsigned int c, cc;
4542 int n;
4543
4544 n = rb_enc_precise_mbclen(p, pend, enc);
4545 if (!MBCLEN_CHARFOUND_P(n)) {
4546 if (p > prev) str_buf_cat(result, prev, p - prev);
4547 n = rb_enc_mbminlen(enc);
4548 if (pend < p + n)
4549 n = (int)(pend - p);
4550 while (n--) {
4551 snprintf(buf, CHAR_ESC_LEN, "\\x%02X", *p & 0377);
4552 str_buf_cat(result, buf, strlen(buf));
4553 prev = ++p;
4554 }
4555 continue;
4556 }
4557 n = MBCLEN_CHARFOUND_LEN(n);
4558 c = rb_enc_mbc_to_codepoint(p, pend, enc);
4559 p += n;
4560 if ((asciicompat || unicode_p) &&
4561 (c == '"'|| c == '\\' ||
4562 (c == '#' &&
4563 p < pend &&
4564 MBCLEN_CHARFOUND_P(rb_enc_precise_mbclen(p,pend,enc)) &&
4565 (cc = rb_enc_codepoint(p,pend,enc),
4566 (cc == '$' || cc == '@' || cc == '{'))))) {
4567 if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
4568 str_buf_cat2(result, "\\");
4569 if (asciicompat || enc == resenc) {
4570 prev = p - n;
4571 continue;
4572 }
4573 }
4574 switch (c) {
4575 case '\0': cc = '0'; break;
4576 case '\n': cc = 'n'; break;
4577 case '\r': cc = 'r'; break;
4578 case '\t': cc = 't'; break;
4579 case '\f': cc = 'f'; break;
4580 case '\013': cc = 'v'; break;
4581 case '\010': cc = 'b'; break;
4582 case '\007': cc = 'a'; break;
4583 case 033: cc = 'e'; break;
4584 default: cc = 0; break;
4585 }
4586 if (cc) {
4587 if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
4588 buf[0] = '\\';
4589 buf[1] = (char)cc;
4590 str_buf_cat(result, buf, 2);
4591 prev = p;
4592 continue;
4593 }
4594 if ((enc == resenc && rb_enc_isprint(c, enc)) ||
4595 (asciicompat && rb_enc_isascii(c, enc) && ISPRINT(c))) {
4596 continue;
4597 }
4598 else {
4599 if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
4600 rb_str_buf_cat_escaped_char(result, c, unicode_p);
4601 prev = p;
4602 continue;
4603 }
4604 }
4605 if (p > prev) str_buf_cat(result, prev, p - prev);
4606 str_buf_cat2(result, "\"");
4607
4608 OBJ_INFECT(result, str);
4609 return result;
4610 }
4611
4612 #define IS_EVSTR(p,e) ((p) < (e) && (*(p) == '$' || *(p) == '@' || *(p) == '{'))
4613
4614
4615
4616
4617
4618
4619
4620
4621
4622
4623
4624 VALUE
4625 rb_str_dump(VALUE str)
4626 {
4627 rb_encoding *enc = rb_enc_get(str);
4628 long len;
4629 const char *p, *pend;
4630 char *q, *qend;
4631 VALUE result;
4632 int u8 = (enc == rb_utf8_encoding());
4633
4634 len = 2;
4635 p = RSTRING_PTR(str); pend = p + RSTRING_LEN(str);
4636 while (p < pend) {
4637 unsigned char c = *p++;
4638 switch (c) {
4639 case '"': case '\\':
4640 case '\n': case '\r':
4641 case '\t': case '\f':
4642 case '\013': case '\010': case '\007': case '\033':
4643 len += 2;
4644 break;
4645
4646 case '#':
4647 len += IS_EVSTR(p, pend) ? 2 : 1;
4648 break;
4649
4650 default:
4651 if (ISPRINT(c)) {
4652 len++;
4653 }
4654 else {
4655 if (u8) {
4656 int n = rb_enc_precise_mbclen(p-1, pend, enc);
4657 if (MBCLEN_CHARFOUND_P(n-1)) {
4658 unsigned int cc = rb_enc_mbc_to_codepoint(p-1, pend, enc);
4659 while (cc >>= 4) len++;
4660 len += 5;
4661 p += MBCLEN_CHARFOUND_LEN(n)-1;
4662 break;
4663 }
4664 }
4665 len += 4;
4666 }
4667 break;
4668 }
4669 }
4670 if (!rb_enc_asciicompat(enc)) {
4671 len += 19;
4672 len += strlen(enc->name);
4673 }
4674
4675 result = rb_str_new5(str, 0, len);
4676 p = RSTRING_PTR(str); pend = p + RSTRING_LEN(str);
4677 q = RSTRING_PTR(result); qend = q + len + 1;
4678
4679 *q++ = '"';
4680 while (p < pend) {
4681 unsigned char c = *p++;
4682
4683 if (c == '"' || c == '\\') {
4684 *q++ = '\\';
4685 *q++ = c;
4686 }
4687 else if (c == '#') {
4688 if (IS_EVSTR(p, pend)) *q++ = '\\';
4689 *q++ = '#';
4690 }
4691 else if (c == '\n') {
4692 *q++ = '\\';
4693 *q++ = 'n';
4694 }
4695 else if (c == '\r') {
4696 *q++ = '\\';
4697 *q++ = 'r';
4698 }
4699 else if (c == '\t') {
4700 *q++ = '\\';
4701 *q++ = 't';
4702 }
4703 else if (c == '\f') {
4704 *q++ = '\\';
4705 *q++ = 'f';
4706 }
4707 else if (c == '\013') {
4708 *q++ = '\\';
4709 *q++ = 'v';
4710 }
4711 else if (c == '\010') {
4712 *q++ = '\\';
4713 *q++ = 'b';
4714 }
4715 else if (c == '\007') {
4716 *q++ = '\\';
4717 *q++ = 'a';
4718 }
4719 else if (c == '\033') {
4720 *q++ = '\\';
4721 *q++ = 'e';
4722 }
4723 else if (ISPRINT(c)) {
4724 *q++ = c;
4725 }
4726 else {
4727 *q++ = '\\';
4728 if (u8) {
4729 int n = rb_enc_precise_mbclen(p-1, pend, enc) - 1;
4730 if (MBCLEN_CHARFOUND_P(n)) {
4731 int cc = rb_enc_mbc_to_codepoint(p-1, pend, enc);
4732 p += n;
4733 snprintf(q, qend-q, "u{%x}", cc);
4734 q += strlen(q);
4735 continue;
4736 }
4737 }
4738 snprintf(q, qend-q, "x%02X", c);
4739 q += 3;
4740 }
4741 }
4742 *q++ = '"';
4743 *q = '\0';
4744 if (!rb_enc_asciicompat(enc)) {
4745 snprintf(q, qend-q, ".force_encoding(\"%s\")", enc->name);
4746 enc = rb_ascii8bit_encoding();
4747 }
4748 OBJ_INFECT(result, str);
4749
4750 rb_enc_associate(result, enc);
4751 ENC_CODERANGE_SET(result, ENC_CODERANGE_7BIT);
4752 return result;
4753 }
4754
4755
4756 static void
4757 rb_str_check_dummy_enc(rb_encoding *enc)
4758 {
4759 if (rb_enc_dummy_p(enc)) {
4760 rb_raise(rb_eEncCompatError, "incompatible encoding with this operation: %s",
4761 rb_enc_name(enc));
4762 }
4763 }
4764
4765
4766
4767
4768
4769
4770
4771
4772
4773
4774 static VALUE
4775 rb_str_upcase_bang(VALUE str)
4776 {
4777 rb_encoding *enc;
4778 char *s, *send;
4779 int modify = 0;
4780 int n;
4781
4782 str_modify_keep_cr(str);
4783 enc = STR_ENC_GET(str);
4784 rb_str_check_dummy_enc(enc);
4785 s = RSTRING_PTR(str); send = RSTRING_END(str);
4786 if (single_byte_optimizable(str)) {
4787 while (s < send) {
4788 unsigned int c = *(unsigned char*)s;
4789
4790 if (rb_enc_isascii(c, enc) && 'a' <= c && c <= 'z') {
4791 *s = 'A' + (c - 'a');
4792 modify = 1;
4793 }
4794 s++;
4795 }
4796 }
4797 else {
4798 int ascompat = rb_enc_asciicompat(enc);
4799
4800 while (s < send) {
4801 unsigned int c;
4802
4803 if (ascompat && (c = *(unsigned char*)s) < 0x80) {
4804 if (rb_enc_isascii(c, enc) && 'a' <= c && c <= 'z') {
4805 *s = 'A' + (c - 'a');
4806 modify = 1;
4807 }
4808 s++;
4809 }
4810 else {
4811 c = rb_enc_codepoint_len(s, send, &n, enc);
4812 if (rb_enc_islower(c, enc)) {
4813
4814 rb_enc_mbcput(rb_enc_toupper(c, enc), s, enc);
4815 modify = 1;
4816 }
4817 s += n;
4818 }
4819 }
4820 }
4821
4822 if (modify) return str;
4823 return Qnil;
4824 }
4825
4826
4827
4828
4829
4830
4831
4832
4833
4834
4835
4836
4837
4838
4839 static VALUE
4840 rb_str_upcase(VALUE str)
4841 {
4842 str = rb_str_dup(str);
4843 rb_str_upcase_bang(str);
4844 return str;
4845 }
4846
4847
4848
4849
4850
4851
4852
4853
4854
4855
4856
4857 static VALUE
4858 rb_str_downcase_bang(VALUE str)
4859 {
4860 rb_encoding *enc;
4861 char *s, *send;
4862 int modify = 0;
4863
4864 str_modify_keep_cr(str);
4865 enc = STR_ENC_GET(str);
4866 rb_str_check_dummy_enc(enc);
4867 s = RSTRING_PTR(str); send = RSTRING_END(str);
4868 if (single_byte_optimizable(str)) {
4869 while (s < send) {
4870 unsigned int c = *(unsigned char*)s;
4871
4872 if (rb_enc_isascii(c, enc) && 'A' <= c && c <= 'Z') {
4873 *s = 'a' + (c - 'A');
4874 modify = 1;
4875 }
4876 s++;
4877 }
4878 }
4879 else {
4880 int ascompat = rb_enc_asciicompat(enc);
4881
4882 while (s < send) {
4883 unsigned int c;
4884 int n;
4885
4886 if (ascompat && (c = *(unsigned char*)s) < 0x80) {
4887 if (rb_enc_isascii(c, enc) && 'A' <= c && c <= 'Z') {
4888 *s = 'a' + (c - 'A');
4889 modify = 1;
4890 }
4891 s++;
4892 }
4893 else {
4894 c = rb_enc_codepoint_len(s, send, &n, enc);
4895 if (rb_enc_isupper(c, enc)) {
4896
4897 rb_enc_mbcput(rb_enc_tolower(c, enc), s, enc);
4898 modify = 1;
4899 }
4900 s += n;
4901 }
4902 }
4903 }
4904
4905 if (modify) return str;
4906 return Qnil;
4907 }
4908
4909
4910
4911
4912
4913
4914
4915
4916
4917
4918
4919
4920
4921
4922 static VALUE
4923 rb_str_downcase(VALUE str)
4924 {
4925 str = rb_str_dup(str);
4926 rb_str_downcase_bang(str);
4927 return str;
4928 }
4929
4930
4931
4932
4933
4934
4935
4936
4937
4938
4939
4940
4941
4942
4943
4944
4945 static VALUE
4946 rb_str_capitalize_bang(VALUE str)
4947 {
4948 rb_encoding *enc;
4949 char *s, *send;
4950 int modify = 0;
4951 unsigned int c;
4952 int n;
4953
4954 str_modify_keep_cr(str);
4955 enc = STR_ENC_GET(str);
4956 rb_str_check_dummy_enc(enc);
4957 if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str)) return Qnil;
4958 s = RSTRING_PTR(str); send = RSTRING_END(str);
4959
4960 c = rb_enc_codepoint_len(s, send, &n, enc);
4961 if (rb_enc_islower(c, enc)) {
4962 rb_enc_mbcput(rb_enc_toupper(c, enc), s, enc);
4963 modify = 1;
4964 }
4965 s += n;
4966 while (s < send) {
4967 c = rb_enc_codepoint_len(s, send, &n, enc);
4968 if (rb_enc_isupper(c, enc)) {
4969 rb_enc_mbcput(rb_enc_tolower(c, enc), s, enc);
4970 modify = 1;
4971 }
4972 s += n;
4973 }
4974
4975 if (modify) return str;
4976 return Qnil;
4977 }
4978
4979
4980
4981
4982
4983
4984
4985
4986
4987
4988
4989
4990
4991
4992
4993 static VALUE
4994 rb_str_capitalize(VALUE str)
4995 {
4996 str = rb_str_dup(str);
4997 rb_str_capitalize_bang(str);
4998 return str;
4999 }
5000
5001
5002
5003
5004
5005
5006
5007
5008
5009
5010
5011 static VALUE
5012 rb_str_swapcase_bang(VALUE str)
5013 {
5014 rb_encoding *enc;
5015 char *s, *send;
5016 int modify = 0;
5017 int n;
5018
5019 str_modify_keep_cr(str);
5020 enc = STR_ENC_GET(str);
5021 rb_str_check_dummy_enc(enc);
5022 s = RSTRING_PTR(str); send = RSTRING_END(str);
5023 while (s < send) {
5024 unsigned int c = rb_enc_codepoint_len(s, send, &n, enc);
5025
5026 if (rb_enc_isupper(c, enc)) {
5027
5028 rb_enc_mbcput(rb_enc_tolower(c, enc), s, enc);
5029 modify = 1;
5030 }
5031 else if (rb_enc_islower(c, enc)) {
5032
5033 rb_enc_mbcput(rb_enc_toupper(c, enc), s, enc);
5034 modify = 1;
5035 }
5036 s += n;
5037 }
5038
5039 if (modify) return str;
5040 return Qnil;
5041 }
5042
5043
5044
5045
5046
5047
5048
5049
5050
5051
5052
5053
5054
5055
5056 static VALUE
5057 rb_str_swapcase(VALUE str)
5058 {
5059 str = rb_str_dup(str);
5060 rb_str_swapcase_bang(str);
5061 return str;
5062 }
5063
typedef unsigned char *USTR;

/*
 * Cursor over a tr-style character specification (e.g. "a-z"), advanced by
 * trnext().
 */
struct tr {
    int gen;            /* nonzero while a range "x-y" is being expanded */
    unsigned int now;   /* code point most recently produced */
    unsigned int max;   /* last code point of the range being expanded */
    char *p;            /* next unread byte of the specification */
    char *pend;         /* end of the specification */
};
5071
5072 static unsigned int
5073 trnext(struct tr *t, rb_encoding *enc)
5074 {
5075 int n;
5076
5077 for (;;) {
5078 if (!t->gen) {
5079 nextpart:
5080 if (t->p == t->pend) return -1;
5081 if (rb_enc_ascget(t->p, t->pend, &n, enc) == '\\' && t->p + n < t->pend) {
5082 t->p += n;
5083 }
5084 t->now = rb_enc_codepoint_len(t->p, t->pend, &n, enc);
5085 t->p += n;
5086 if (rb_enc_ascget(t->p, t->pend, &n, enc) == '-' && t->p + n < t->pend) {
5087 t->p += n;
5088 if (t->p < t->pend) {
5089 unsigned int c = rb_enc_codepoint_len(t->p, t->pend, &n, enc);
5090 t->p += n;
5091 if (t->now > c) {
5092 if (t->now < 0x80 && c < 0x80) {
5093 rb_raise(rb_eArgError,
5094 "invalid range \"%c-%c\" in string transliteration",
5095 t->now, c);
5096 }
5097 else {
5098 rb_raise(rb_eArgError, "invalid range in string transliteration");
5099 }
5100 continue;
5101 }
5102 t->gen = 1;
5103 t->max = c;
5104 }
5105 }
5106 return t->now;
5107 }
5108 else {
5109 while (ONIGENC_CODE_TO_MBCLEN(enc, ++t->now) <= 0) {
5110 if (t->now == t->max) {
5111 t->gen = 0;
5112 goto nextpart;
5113 }
5114 }
5115 if (t->now < t->max) {
5116 return t->now;
5117 }
5118 else {
5119 t->gen = 0;
5120 return t->max;
5121 }
5122 }
5123 }
5124 }
5125
5126 static VALUE rb_str_delete_bang(int,VALUE*,VALUE);
5127
5128 static VALUE
5129 tr_trans(VALUE str, VALUE src, VALUE repl, int sflag)
5130 {
5131 const unsigned int errc = -1;
5132 unsigned int trans[256];
5133 rb_encoding *enc, *e1, *e2;
5134 struct tr trsrc, trrepl;
5135 int cflag = 0;
5136 unsigned int c, c0, last = 0;
5137 int modify = 0, i, l;
5138 char *s, *send;
5139 VALUE hash = 0;
5140 int singlebyte = single_byte_optimizable(str);
5141 int cr;
5142
5143 #define CHECK_IF_ASCII(c) \
5144 (void)((cr == ENC_CODERANGE_7BIT && !rb_isascii(c)) ? \
5145 (cr = ENC_CODERANGE_VALID) : 0)
5146
5147 StringValue(src);
5148 StringValue(repl);
5149 if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str)) return Qnil;
5150 if (RSTRING_LEN(repl) == 0) {
5151 return rb_str_delete_bang(1, &src, str);
5152 }
5153
5154 cr = ENC_CODERANGE(str);
5155 e1 = rb_enc_check(str, src);
5156 e2 = rb_enc_check(str, repl);
5157 if (e1 == e2) {
5158 enc = e1;
5159 }
5160 else {
5161 enc = rb_enc_check(src, repl);
5162 }
5163 trsrc.p = RSTRING_PTR(src); trsrc.pend = trsrc.p + RSTRING_LEN(src);
5164 if (RSTRING_LEN(src) > 1 &&
5165 rb_enc_ascget(trsrc.p, trsrc.pend, &l, enc) == '^' &&
5166 trsrc.p + l < trsrc.pend) {
5167 cflag = 1;
5168 trsrc.p += l;
5169 }
5170 trrepl.p = RSTRING_PTR(repl);
5171 trrepl.pend = trrepl.p + RSTRING_LEN(repl);
5172 trsrc.gen = trrepl.gen = 0;
5173 trsrc.now = trrepl.now = 0;
5174 trsrc.max = trrepl.max = 0;
5175
5176 if (cflag) {
5177 for (i=0; i<256; i++) {
5178 trans[i] = 1;
5179 }
5180 while ((c = trnext(&trsrc, enc)) != errc) {
5181 if (c < 256) {
5182 trans[c] = errc;
5183 }
5184 else {
5185 if (!hash) hash = rb_hash_new();
5186 rb_hash_aset(hash, UINT2NUM(c), Qtrue);
5187 }
5188 }
5189 while ((c = trnext(&trrepl, enc)) != errc)
5190 ;
5191 last = trrepl.now;
5192 for (i=0; i<256; i++) {
5193 if (trans[i] != errc) {
5194 trans[i] = last;
5195 }
5196 }
5197 }
5198 else {
5199 unsigned int r;
5200
5201 for (i=0; i<256; i++) {
5202 trans[i] = errc;
5203 }
5204 while ((c = trnext(&trsrc, enc)) != errc) {
5205 r = trnext(&trrepl, enc);
5206 if (r == errc) r = trrepl.now;
5207 if (c < 256) {
5208 trans[c] = r;
5209 if (rb_enc_codelen(r, enc) != 1) singlebyte = 0;
5210 }
5211 else {
5212 if (!hash) hash = rb_hash_new();
5213 rb_hash_aset(hash, UINT2NUM(c), UINT2NUM(r));
5214 }
5215 }
5216 }
5217
5218 if (cr == ENC_CODERANGE_VALID)
5219 cr = ENC_CODERANGE_7BIT;
5220 str_modify_keep_cr(str);
5221 s = RSTRING_PTR(str); send = RSTRING_END(str);
5222 if (sflag) {
5223 int clen, tlen;
5224 long offset, max = RSTRING_LEN(str);
5225 unsigned int save = -1;
5226 char *buf = ALLOC_N(char, max), *t = buf;
5227
5228 while (s < send) {
5229 int may_modify = 0;
5230
5231 c0 = c = rb_enc_codepoint_len(s, send, &clen, e1);
5232 tlen = enc == e1 ? clen : rb_enc_codelen(c, enc);
5233
5234 s += clen;
5235 if (c < 256) {
5236 c = trans[c];
5237 }
5238 else if (hash) {
5239 VALUE tmp = rb_hash_lookup(hash, UINT2NUM(c));
5240 if (NIL_P(tmp)) {
5241 if (cflag) c = last;
5242 else c = errc;
5243 }
5244 else if (cflag) c = errc;
5245 else c = NUM2INT(tmp);
5246 }
5247 else {
5248 c = errc;
5249 }
5250 if (c != (unsigned int)-1) {
5251 if (save == c) {
5252 CHECK_IF_ASCII(c);
5253 continue;
5254 }
5255 save = c;
5256 tlen = rb_enc_codelen(c, enc);
5257 modify = 1;
5258 }
5259 else {
5260 save = -1;
5261 c = c0;
5262 if (enc != e1) may_modify = 1;
5263 }
5264 while (t - buf + tlen >= max) {
5265 offset = t - buf;
5266 max *= 2;
5267 REALLOC_N(buf, char, max);
5268 t = buf + offset;
5269 }
5270 rb_enc_mbcput(c, t, enc);
5271 if (may_modify && memcmp(s, t, tlen) != 0) {
5272 modify = 1;
5273 }
5274 CHECK_IF_ASCII(c);
5275 t += tlen;
5276 }
5277 if (!STR_EMBED_P(str)) {
5278 xfree(RSTRING(str)->as.heap.ptr);
5279 }
5280 *t = '\0';
5281 RSTRING(str)->as.heap.ptr = buf;
5282 RSTRING(str)->as.heap.len = t - buf;
5283 STR_SET_NOEMBED(str);
5284 RSTRING(str)->as.heap.aux.capa = max;
5285 }
5286 else if (rb_enc_mbmaxlen(enc) == 1 || (singlebyte && !hash)) {
5287 while (s < send) {
5288 c = (unsigned char)*s;
5289 if (trans[c] != errc) {
5290 if (!cflag) {
5291 c = trans[c];
5292 *s = c;
5293 modify = 1;
5294 }
5295 else {
5296 *s = last;
5297 modify = 1;
5298 }
5299 }
5300 CHECK_IF_ASCII(c);
5301 s++;
5302 }
5303 }
5304 else {
5305 int clen, tlen, max = (int)(RSTRING_LEN(str) * 1.2);
5306 long offset;
5307 char *buf = ALLOC_N(char, max), *t = buf;
5308
5309 while (s < send) {
5310 int may_modify = 0;
5311 c0 = c = rb_enc_codepoint_len(s, send, &clen, e1);
5312 tlen = enc == e1 ? clen : rb_enc_codelen(c, enc);
5313
5314 if (c < 256) {
5315 c = trans[c];
5316 }
5317 else if (hash) {
5318 VALUE tmp = rb_hash_lookup(hash, UINT2NUM(c));
5319 if (NIL_P(tmp)) {
5320 if (cflag) c = last;
5321 else c = errc;
5322 }
5323 else if (cflag) c = errc;
5324 else c = NUM2INT(tmp);
5325 }
5326 else {
5327 c = cflag ? last : errc;
5328 }
5329 if (c != errc) {
5330 tlen = rb_enc_codelen(c, enc);
5331 modify = 1;
5332 }
5333 else {
5334 c = c0;
5335 if (enc != e1) may_modify = 1;
5336 }
5337 while (t - buf + tlen >= max) {
5338 offset = t - buf;
5339 max *= 2;
5340 REALLOC_N(buf, char, max);
5341 t = buf + offset;
5342 }
5343 if (s != t) {
5344 rb_enc_mbcput(c, t, enc);
5345 if (may_modify && memcmp(s, t, tlen) != 0) {
5346 modify = 1;
5347 }
5348 }
5349 CHECK_IF_ASCII(c);
5350 s += clen;
5351 t += tlen;
5352 }
5353 if (!STR_EMBED_P(str)) {
5354 xfree(RSTRING(str)->as.heap.ptr);
5355 }
5356 *t = '\0';
5357 RSTRING(str)->as.heap.ptr = buf;
5358 RSTRING(str)->as.heap.len = t - buf;
5359 STR_SET_NOEMBED(str);
5360 RSTRING(str)->as.heap.aux.capa = max;
5361 }
5362
5363 if (modify) {
5364 if (cr != ENC_CODERANGE_BROKEN)
5365 ENC_CODERANGE_SET(str, cr);
5366 rb_enc_associate(str, enc);
5367 return str;
5368 }
5369 return Qnil;
5370 }
5371
5372
5373
5374
5375
5376
5377
5378
5379
5380
5381
5382 static VALUE
5383 rb_str_tr_bang(VALUE str, VALUE src, VALUE repl)
5384 {
5385 return tr_trans(str, src, repl, 0);
5386 }
5387
5388
5389
5390
5391
5392
5393
5394
5395
5396
5397
5398
5399
5400
5401
5402
5403
5404
5405
5406
5407
5408
5409
5410
5411
5412
5413
5414
5415
5416
5417
5418
5419
5420
5421
5422
5423
5424 static VALUE
5425 rb_str_tr(VALUE str, VALUE src, VALUE repl)
5426 {
5427 str = rb_str_dup(str);
5428 tr_trans(str, src, repl, 0);
5429 return str;
5430 }
5431
5432 #define TR_TABLE_SIZE 257
5433 static void
5434 tr_setup_table(VALUE str, char stable[TR_TABLE_SIZE], int first,
5435 VALUE *tablep, VALUE *ctablep, rb_encoding *enc)
5436 {
5437 const unsigned int errc = -1;
5438 char buf[256];
5439 struct tr tr;
5440 unsigned int c;
5441 VALUE table = 0, ptable = 0;
5442 int i, l, cflag = 0;
5443
5444 tr.p = RSTRING_PTR(str); tr.pend = tr.p + RSTRING_LEN(str);
5445 tr.gen = tr.now = tr.max = 0;
5446
5447 if (RSTRING_LEN(str) > 1 && rb_enc_ascget(tr.p, tr.pend, &l, enc) == '^') {
5448 cflag = 1;
5449 tr.p += l;
5450 }
5451 if (first) {
5452 for (i=0; i<256; i++) {
5453 stable[i] = 1;
5454 }
5455 stable[256] = cflag;
5456 }
5457 else if (stable[256] && !cflag) {
5458 stable[256] = 0;
5459 }
5460 for (i=0; i<256; i++) {
5461 buf[i] = cflag;
5462 }
5463
5464 while ((c = trnext(&tr, enc)) != errc) {
5465 if (c < 256) {
5466 buf[c & 0xff] = !cflag;
5467 }
5468 else {
5469 VALUE key = UINT2NUM(c);
5470
5471 if (!table && (first || *tablep || stable[256])) {
5472 if (cflag) {
5473 ptable = *ctablep;
5474 table = ptable ? ptable : rb_hash_new();
5475 *ctablep = table;
5476 }
5477 else {
5478 table = rb_hash_new();
5479 ptable = *tablep;
5480 *tablep = table;
5481 }
5482 }
5483 if (table && (!ptable || (cflag ^ !NIL_P(rb_hash_aref(ptable, key))))) {
5484 rb_hash_aset(table, key, Qtrue);
5485 }
5486 }
5487 }
5488 for (i=0; i<256; i++) {
5489 stable[i] = stable[i] && buf[i];
5490 }
5491 if (!table && !cflag) {
5492 *tablep = 0;
5493 }
5494 }
5495
5496
5497 static int
5498 tr_find(unsigned int c, char table[TR_TABLE_SIZE], VALUE del, VALUE nodel)
5499 {
5500 if (c < 256) {
5501 return table[c] != 0;
5502 }
5503 else {
5504 VALUE v = UINT2NUM(c);
5505
5506 if (del) {
5507 if (!NIL_P(rb_hash_lookup(del, v)) &&
5508 (!nodel || NIL_P(rb_hash_lookup(nodel, v)))) {
5509 return TRUE;
5510 }
5511 }
5512 else if (nodel && !NIL_P(rb_hash_lookup(nodel, v))) {
5513 return FALSE;
5514 }
5515 return table[256] ? TRUE : FALSE;
5516 }
5517 }
5518
5519
5520
5521
5522
5523
5524
5525
5526
5527 static VALUE
5528 rb_str_delete_bang(int argc, VALUE *argv, VALUE str)
5529 {
5530 char squeez[TR_TABLE_SIZE];
5531 rb_encoding *enc = 0;
5532 char *s, *send, *t;
5533 VALUE del = 0, nodel = 0;
5534 int modify = 0;
5535 int i, ascompat, cr;
5536
5537 if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str)) return Qnil;
5538 rb_check_arity(argc, 1, UNLIMITED_ARGUMENTS);
5539 for (i=0; i<argc; i++) {
5540 VALUE s = argv[i];
5541
5542 StringValue(s);
5543 enc = rb_enc_check(str, s);
5544 tr_setup_table(s, squeez, i==0, &del, &nodel, enc);
5545 }
5546
5547 str_modify_keep_cr(str);
5548 ascompat = rb_enc_asciicompat(enc);
5549 s = t = RSTRING_PTR(str);
5550 send = RSTRING_END(str);
5551 cr = ascompat ? ENC_CODERANGE_7BIT : ENC_CODERANGE_VALID;
5552 while (s < send) {
5553 unsigned int c;
5554 int clen;
5555
5556 if (ascompat && (c = *(unsigned char*)s) < 0x80) {
5557 if (squeez[c]) {
5558 modify = 1;
5559 }
5560 else {
5561 if (t != s) *t = c;
5562 t++;
5563 }
5564 s++;
5565 }
5566 else {
5567 c = rb_enc_codepoint_len(s, send, &clen, enc);
5568
5569 if (tr_find(c, squeez, del, nodel)) {
5570 modify = 1;
5571 }
5572 else {
5573 if (t != s) rb_enc_mbcput(c, t, enc);
5574 t += clen;
5575 if (cr == ENC_CODERANGE_7BIT) cr = ENC_CODERANGE_VALID;
5576 }
5577 s += clen;
5578 }
5579 }
5580 *t = '\0';
5581 STR_SET_LEN(str, t - RSTRING_PTR(str));
5582 ENC_CODERANGE_SET(str, cr);
5583
5584 if (modify) return str;
5585 return Qnil;
5586 }
5587
5588
5589
5590
5591
5592
5593
5594
5595
5596
5597
5598
5599
5600
5601
5602
5603 static VALUE
5604 rb_str_delete(int argc, VALUE *argv, VALUE str)
5605 {
5606 str = rb_str_dup(str);
5607 rb_str_delete_bang(argc, argv, str);
5608 return str;
5609 }
5610
5611
5612
5613
5614
5615
5616
5617
5618
5619
5620 static VALUE
5621 rb_str_squeeze_bang(int argc, VALUE *argv, VALUE str)
5622 {
5623 char squeez[TR_TABLE_SIZE];
5624 rb_encoding *enc = 0;
5625 VALUE del = 0, nodel = 0;
5626 char *s, *send, *t;
5627 int i, modify = 0;
5628 int ascompat, singlebyte = single_byte_optimizable(str);
5629 unsigned int save;
5630
5631 if (argc == 0) {
5632 enc = STR_ENC_GET(str);
5633 }
5634 else {
5635 for (i=0; i<argc; i++) {
5636 VALUE s = argv[i];
5637
5638 StringValue(s);
5639 enc = rb_enc_check(str, s);
5640 if (singlebyte && !single_byte_optimizable(s))
5641 singlebyte = 0;
5642 tr_setup_table(s, squeez, i==0, &del, &nodel, enc);
5643 }
5644 }
5645
5646 str_modify_keep_cr(str);
5647 s = t = RSTRING_PTR(str);
5648 if (!s || RSTRING_LEN(str) == 0) return Qnil;
5649 send = RSTRING_END(str);
5650 save = -1;
5651 ascompat = rb_enc_asciicompat(enc);
5652
5653 if (singlebyte) {
5654 while (s < send) {
5655 unsigned int c = *(unsigned char*)s++;
5656 if (c != save || (argc > 0 && !squeez[c])) {
5657 *t++ = save = c;
5658 }
5659 }
5660 } else {
5661 while (s < send) {
5662 unsigned int c;
5663 int clen;
5664
5665 if (ascompat && (c = *(unsigned char*)s) < 0x80) {
5666 if (c != save || (argc > 0 && !squeez[c])) {
5667 *t++ = save = c;
5668 }
5669 s++;
5670 }
5671 else {
5672 c = rb_enc_codepoint_len(s, send, &clen, enc);
5673
5674 if (c != save || (argc > 0 && !tr_find(c, squeez, del, nodel))) {
5675 if (t != s) rb_enc_mbcput(c, t, enc);
5676 save = c;
5677 t += clen;
5678 }
5679 s += clen;
5680 }
5681 }
5682 }
5683
5684 *t = '\0';
5685 if (t - RSTRING_PTR(str) != RSTRING_LEN(str)) {
5686 STR_SET_LEN(str, t - RSTRING_PTR(str));
5687 modify = 1;
5688 }
5689
5690 if (modify) return str;
5691 return Qnil;
5692 }
5693
5694
5695
5696
5697
5698
5699
5700
5701
5702
5703
5704
5705
5706
5707
5708
5709
5710 static VALUE
5711 rb_str_squeeze(int argc, VALUE *argv, VALUE str)
5712 {
5713 str = rb_str_dup(str);
5714 rb_str_squeeze_bang(argc, argv, str);
5715 return str;
5716 }
5717
5718
5719
5720
5721
5722
5723
5724
5725
5726
5727 static VALUE
5728 rb_str_tr_s_bang(VALUE str, VALUE src, VALUE repl)
5729 {
5730 return tr_trans(str, src, repl, 1);
5731 }
5732
5733
5734
5735
5736
5737
5738
5739
5740
5741
5742
5743
5744
5745
5746
5747 static VALUE
5748 rb_str_tr_s(VALUE str, VALUE src, VALUE repl)
5749 {
5750 str = rb_str_dup(str);
5751 tr_trans(str, src, repl, 1);
5752 return str;
5753 }
5754
5755
5756
5757
5758
5759
5760
5761
5762
5763
5764
5765
5766
5767
5768
5769
5770
5771
5772
5773
5774
5775
5776
5777
5778
5779
5780
5781
5782
5783 static VALUE
5784 rb_str_count(int argc, VALUE *argv, VALUE str)
5785 {
5786 char table[TR_TABLE_SIZE];
5787 rb_encoding *enc = 0;
5788 VALUE del = 0, nodel = 0;
5789 char *s, *send;
5790 int i;
5791 int ascompat;
5792
5793 rb_check_arity(argc, 1, UNLIMITED_ARGUMENTS);
5794 for (i=0; i<argc; i++) {
5795 VALUE tstr = argv[i];
5796 unsigned char c;
5797
5798 StringValue(tstr);
5799 enc = rb_enc_check(str, tstr);
5800 if (argc == 1 && RSTRING_LEN(tstr) == 1 && rb_enc_asciicompat(enc) &&
5801 (c = RSTRING_PTR(tstr)[0]) < 0x80 && !is_broken_string(str)) {
5802 int n = 0;
5803
5804 s = RSTRING_PTR(str);
5805 if (!s || RSTRING_LEN(str) == 0) return INT2FIX(0);
5806 send = RSTRING_END(str);
5807 while (s < send) {
5808 if (*(unsigned char*)s++ == c) n++;
5809 }
5810 return INT2NUM(n);
5811 }
5812 tr_setup_table(tstr, table, i==0, &del, &nodel, enc);
5813 }
5814
5815 s = RSTRING_PTR(str);
5816 if (!s || RSTRING_LEN(str) == 0) return INT2FIX(0);
5817 send = RSTRING_END(str);
5818 ascompat = rb_enc_asciicompat(enc);
5819 i = 0;
5820 while (s < send) {
5821 unsigned int c;
5822
5823 if (ascompat && (c = *(unsigned char*)s) < 0x80) {
5824 if (table[c]) {
5825 i++;
5826 }
5827 s++;
5828 }
5829 else {
5830 int clen;
5831 c = rb_enc_codepoint_len(s, send, &clen, enc);
5832 if (tr_find(c, table, del, nodel)) {
5833 i++;
5834 }
5835 s += clen;
5836 }
5837 }
5838
5839 return INT2NUM(i);
5840 }
5841
/*
 * Byte-indexed whitespace table: entries 0x09..0x0D (TAB, LF, VT, FF, CR)
 * and 0x20 (space) are 1, every other entry is 0.
 */
static const char isspacetable[256] = {
    /* 0x00 */ 0,0,0,0,0,0,0,0, 0,1,1,1,1,1,0,0,
    /* 0x10 */ 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
    /* 0x20 */ 1,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
    /* 0x30 */ 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
    /* 0x40 */ 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
    /* 0x50 */ 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
    /* 0x60 */ 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
    /* 0x70 */ 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
    /* 0x80 */ 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
    /* 0x90 */ 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
    /* 0xA0 */ 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
    /* 0xB0 */ 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
    /* 0xC0 */ 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
    /* 0xD0 */ 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
    /* 0xE0 */ 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
    /* 0xF0 */ 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0
};

/* Table lookup; the argument is reduced to an unsigned char first, so any
 * char or int value can be passed. */
#define ascii_isspace(c) isspacetable[(unsigned char)(c)]
5862
5863
5864
5865
5866
5867
5868
5869
5870
5871
5872
5873
5874
5875
5876
5877
5878
5879
5880
5881
5882
5883
5884
5885
5886
5887
5888
5889
5890
5891
5892
5893
5894
5895
5896
5897
5898
5899
5900
5901
5902
5903
5904
5905
5906
5907
5908
5909
/*
 * Splits +str+ into an Array of substrings.
 *
 * Arguments (both optional, parsed with rb_scan_args "02"):
 *   spat   separator: nil (use rb_fs if set, otherwise whitespace
 *          splitting), a String, or anything get_pat() turns into a Regexp
 *   limit  integer; when positive it bounds the number of fields, when
 *          zero or negative no bound is applied
 *
 * Three strategies are used:
 *   awk     split on runs of whitespace, ignoring leading whitespace
 *           (nil separator, or a separator that is a single space)
 *   string  split on every occurrence of a non-empty literal String
 *   regexp  split on every match; the contents of capture groups are
 *           added to the result as well.  An empty String separator is
 *           compiled to a Regexp and handled here.
 *
 * When no limit argument was given (or it was 0), trailing empty strings
 * are removed from the result.
 */
static VALUE
rb_str_split_m(int argc, VALUE *argv, VALUE str)
{
    rb_encoding *enc;
    VALUE spat;
    VALUE limit;
    enum {awk, string, regexp} split_type;
    long beg, end, i = 0;
    int lim = 0;
    VALUE result, tmp;

    if (rb_scan_args(argc, argv, "02", &spat, &limit) == 2) {
        lim = NUM2INT(limit);
        /* A non-positive limit means "unbounded"; lim itself is kept so
         * that a negative value can still be recognized below. */
        if (lim <= 0) limit = Qnil;
        else if (lim == 1) {
            /* One field at most: the whole string, or nothing if empty. */
            if (RSTRING_LEN(str) == 0)
                return rb_ary_new2(0);
            return rb_ary_new3(1, str);
        }
        /* i counts fields towards lim; the remainder counts as one. */
        i = 1;
    }

    enc = STR_ENC_GET(str);
    if (NIL_P(spat)) {
        if (!NIL_P(rb_fs)) {
            spat = rb_fs;
            goto fs_set;
        }
        split_type = awk;
    }
    else {
      fs_set:
        if (RB_TYPE_P(spat, T_STRING)) {
            rb_encoding *enc2 = STR_ENC_GET(spat);

            split_type = string;
            if (RSTRING_LEN(spat) == 0) {
                /* Empty separator: compile it and split by Regexp. */
                spat = rb_reg_regcomp(spat);
                split_type = regexp;
            }
            else if (rb_enc_asciicompat(enc2) == 1) {
                /* A lone space selects whitespace (awk) splitting. */
                if (RSTRING_LEN(spat) == 1 && RSTRING_PTR(spat)[0] == ' '){
                    split_type = awk;
                }
            }
            else {
                /* Same test for encodings that are not ASCII compatible. */
                int l;
                if (rb_enc_ascget(RSTRING_PTR(spat), RSTRING_END(spat), &l, enc2) == ' ' &&
                    RSTRING_LEN(spat) == l) {
                    split_type = awk;
                }
            }
        }
        else {
            spat = get_pat(spat, 1);
            split_type = regexp;
        }
    }

    result = rb_ary_new();
    beg = 0;
    if (split_type == awk) {
        char *ptr = RSTRING_PTR(str);
        char *eptr = RSTRING_END(str);
        char *bptr = ptr;
        int skip = 1;           /* nonzero while skipping whitespace */
        unsigned int c;

        end = beg;
        if (is_ascii_string(str)) {
            /* ASCII-only string: scan bytes with the lookup table. */
            while (ptr < eptr) {
                c = (unsigned char)*ptr++;
                if (skip) {
                    if (ascii_isspace(c)) {
                        beg = ptr - bptr;
                    }
                    else {
                        /* First byte of a field. */
                        end = ptr - bptr;
                        skip = 0;
                        if (!NIL_P(limit) && lim <= i) break;
                    }
                }
                else if (ascii_isspace(c)) {
                    /* Field finished: emit [beg, end). */
                    rb_ary_push(result, rb_str_subseq(str, beg, end-beg));
                    skip = 1;
                    beg = ptr - bptr;
                    if (!NIL_P(limit)) ++i;
                }
                else {
                    end = ptr - bptr;
                }
            }
        }
        else {
            /* Same state machine, decoding one character at a time. */
            while (ptr < eptr) {
                int n;

                c = rb_enc_codepoint_len(ptr, eptr, &n, enc);
                ptr += n;
                if (skip) {
                    if (rb_isspace(c)) {
                        beg = ptr - bptr;
                    }
                    else {
                        end = ptr - bptr;
                        skip = 0;
                        if (!NIL_P(limit) && lim <= i) break;
                    }
                }
                else if (rb_isspace(c)) {
                    rb_ary_push(result, rb_str_subseq(str, beg, end-beg));
                    skip = 1;
                    beg = ptr - bptr;
                    if (!NIL_P(limit)) ++i;
                }
                else {
                    end = ptr - bptr;
                }
            }
        }
    }
    else if (split_type == string) {
        char *ptr = RSTRING_PTR(str);
        char *temp = ptr;
        char *eptr = RSTRING_END(str);
        char *sptr = RSTRING_PTR(spat);
        long slen = RSTRING_LEN(spat);

        if (is_broken_string(str)) {
            rb_raise(rb_eArgError, "invalid byte sequence in %s", rb_enc_name(STR_ENC_GET(str)));
        }
        if (is_broken_string(spat)) {
            rb_raise(rb_eArgError, "invalid byte sequence in %s", rb_enc_name(STR_ENC_GET(spat)));
        }
        enc = rb_enc_check(str, spat);
        while (ptr < eptr &&
               (end = rb_memsearch(sptr, slen, ptr, eptr - ptr, enc)) >= 0) {
            /* Ignore hits that do not start on a character boundary. */
            char *t = rb_enc_right_char_head(ptr, ptr + end, eptr, enc);
            if (t != ptr + end) {
                ptr = t;
                continue;
            }
            rb_ary_push(result, rb_str_subseq(str, ptr - temp, end));
            ptr += end + slen;
            if (!NIL_P(limit) && lim <= ++i) break;
        }
        beg = ptr - temp;
    }
    else {
        char *ptr = RSTRING_PTR(str);
        long len = RSTRING_LEN(str);
        long start = beg;
        long idx;
        int last_null = 0;      /* set after an empty match was stepped over */
        struct re_registers *regs;

        while ((end = rb_reg_search(spat, str, start, 0)) >= 0) {
            regs = RMATCH_REGS(rb_backref_get());
            if (start == end && BEG(0) == END(0)) {
                /* Empty match at the current search position. */
                if (!ptr) {
                    rb_ary_push(result, str_new_empty(str));
                    break;
                }
                else if (last_null == 1) {
                    /* Second empty match in a row: emit one character. */
                    rb_ary_push(result, rb_str_subseq(str, beg,
                                                      rb_enc_fast_mbclen(ptr+beg,
                                                                         ptr+len,
                                                                         enc)));
                    beg = start;
                }
                else {
                    /* Step over one character and search again. */
                    if (ptr+start == ptr+len)
                        start++;
                    else
                        start += rb_enc_fast_mbclen(ptr+start,ptr+len,enc);
                    last_null = 1;
                    continue;
                }
            }
            else {
                rb_ary_push(result, rb_str_subseq(str, beg, end-beg));
                beg = start = END(0);
            }
            last_null = 0;

            /* Append the contents of every capture group that matched. */
            for (idx=1; idx < regs->num_regs; idx++) {
                if (BEG(idx) == -1) continue;
                if (BEG(idx) == END(idx))
                    tmp = str_new_empty(str);
                else
                    tmp = rb_str_subseq(str, BEG(idx), END(idx)-BEG(idx));
                rb_ary_push(result, tmp);
            }
            if (!NIL_P(limit) && lim <= ++i) break;
        }
    }
    /* Append whatever is left after the last separator. */
    if (RSTRING_LEN(str) > 0 && (!NIL_P(limit) || RSTRING_LEN(str) > beg || lim < 0)) {
        if (RSTRING_LEN(str) == beg)
            tmp = str_new_empty(str);
        else
            tmp = rb_str_subseq(str, beg, RSTRING_LEN(str)-beg);
        rb_ary_push(result, tmp);
    }
    /* Without a limit, drop trailing empty fields. */
    if (NIL_P(limit) && lim == 0) {
        long len;
        while ((len = RARRAY_LEN(result)) > 0 &&
               (tmp = RARRAY_PTR(result)[len-1], RSTRING_LEN(tmp) == 0))
            rb_ary_pop(result);
    }

    return result;
}
6124
6125 VALUE
6126 rb_str_split(VALUE str, const char *sep0)
6127 {
6128 VALUE sep;
6129
6130 StringValue(str);
6131 sep = rb_str_new2(sep0);
6132 return rb_str_split_m(1, &sep, str);
6133 }
6134
6135
6136 static VALUE
6137 rb_str_enumerate_lines(int argc, VALUE *argv, VALUE str, int wantarray)
6138 {
6139 rb_encoding *enc;
6140 VALUE rs;
6141 unsigned int newline;
6142 const char *p, *pend, *s, *ptr;
6143 long len, rslen;
6144 VALUE line;
6145 int n;
6146 VALUE orig = str;
6147 VALUE UNINITIALIZED_VAR(ary);
6148
6149 if (argc == 0) {
6150 rs = rb_rs;
6151 }
6152 else {
6153 rb_scan_args(argc, argv, "01", &rs);
6154 }
6155
6156 if (rb_block_given_p()) {
6157 if (wantarray) {
6158 #if STRING_ENUMERATORS_WANTARRAY
6159 rb_warn("given block not used");
6160 ary = rb_ary_new();
6161 #else
6162 rb_warning("passing a block to String#lines is deprecated");
6163 wantarray = 0;
6164 #endif
6165 }
6166 }
6167 else {
6168 if (wantarray)
6169 ary = rb_ary_new();
6170 else
6171 RETURN_ENUMERATOR(str, argc, argv);
6172 }
6173
6174 if (NIL_P(rs)) {
6175 if (wantarray) {
6176 rb_ary_push(ary, str);
6177 return ary;
6178 }
6179 else {
6180 rb_yield(str);
6181 return orig;
6182 }
6183 }
6184 str = rb_str_new4(str);
6185 ptr = p = s = RSTRING_PTR(str);
6186 pend = p + RSTRING_LEN(str);
6187 len = RSTRING_LEN(str);
6188 StringValue(rs);
6189 if (rs == rb_default_rs) {
6190 enc = rb_enc_get(str);
6191 while (p < pend) {
6192 char *p0;
6193
6194 p = memchr(p, '\n', pend - p);
6195 if (!p) break;
6196 p0 = rb_enc_left_char_head(s, p, pend, enc);
6197 if (!rb_enc_is_newline(p0, pend, enc)) {
6198 p++;
6199 continue;
6200 }
6201 p = p0 + rb_enc_mbclen(p0, pend, enc);
6202 line = rb_str_subseq(str, s - ptr, p - s);
6203 if (wantarray)
6204 rb_ary_push(ary, line);
6205 else
6206 rb_yield(line);
6207 str_mod_check(str, ptr, len);
6208 s = p;
6209 }
6210 goto finish;
6211 }
6212
6213 enc = rb_enc_check(str, rs);
6214 rslen = RSTRING_LEN(rs);
6215 if (rslen == 0) {
6216 newline = '\n';
6217 }
6218 else {
6219 newline = rb_enc_codepoint(RSTRING_PTR(rs), RSTRING_END(rs), enc);
6220 }
6221
6222 while (p < pend) {
6223 unsigned int c = rb_enc_codepoint_len(p, pend, &n, enc);
6224
6225 again:
6226 if (rslen == 0 && c == newline) {
6227 p += n;
6228 if (p < pend && (c = rb_enc_codepoint_len(p, pend, &n, enc)) != newline) {
6229 goto again;
6230 }
6231 while (p < pend && rb_enc_codepoint(p, pend, enc) == newline) {
6232 p += n;
6233 }
6234 p -= n;
6235 }
6236 if (c == newline &&
6237 (rslen <= 1 ||
6238 (pend - p >= rslen && memcmp(RSTRING_PTR(rs), p, rslen) == 0))) {
6239 const char *pp = p + (rslen ? rslen : n);
6240 line = rb_str_subseq(str, s - ptr, pp - s);
6241 if (wantarray)
6242 rb_ary_push(ary, line);
6243 else
6244 rb_yield(line);
6245 str_mod_check(str, ptr, len);
6246 s = pp;
6247 }
6248 p += n;
6249 }
6250
6251 finish:
6252 if (s != pend) {
6253 line = rb_str_subseq(str, s - ptr, pend - s);
6254 if (wantarray)
6255 rb_ary_push(ary, line);
6256 else
6257 rb_yield(line);
6258 RB_GC_GUARD(str);
6259 }
6260
6261 if (wantarray)
6262 return ary;
6263 else
6264 return orig;
6265 }
6266
6267
6268
6269
6270
6271
6272
6273
6274
6275
6276
6277
6278
6279
6280
6281
6282
6283
6284
6285
6286
6287
6288
6289
6290
6291
6292
6293
6294
6295
6296
6297
6298
6299
6300
6301
6302 static VALUE
6303 rb_str_each_line(int argc, VALUE *argv, VALUE str)
6304 {
6305 return rb_str_enumerate_lines(argc, argv, str, 0);
6306 }
6307
6308
6309
6310
6311
6312
6313
6314
6315
6316
6317
6318
6319
6320 static VALUE
6321 rb_str_lines(int argc, VALUE *argv, VALUE str)
6322 {
6323 return rb_str_enumerate_lines(argc, argv, str, 1);
6324 }
6325
6326 static VALUE
6327 rb_str_each_byte_size(VALUE str, VALUE args)
6328 {
6329 return LONG2FIX(RSTRING_LEN(str));
6330 }
6331
6332 static VALUE
6333 rb_str_enumerate_bytes(VALUE str, int wantarray)
6334 {
6335 long i;
6336 VALUE UNINITIALIZED_VAR(ary);
6337
6338 if (rb_block_given_p()) {
6339 if (wantarray) {
6340 #if STRING_ENUMERATORS_WANTARRAY
6341 rb_warn("given block not used");
6342 ary = rb_ary_new();
6343 #else
6344 rb_warning("passing a block to String#bytes is deprecated");
6345 wantarray = 0;
6346 #endif
6347 }
6348 }
6349 else {
6350 if (wantarray)
6351 ary = rb_ary_new2(RSTRING_LEN(str));
6352 else
6353 RETURN_SIZED_ENUMERATOR(str, 0, 0, rb_str_each_byte_size);
6354 }
6355
6356 for (i=0; i<RSTRING_LEN(str); i++) {
6357 if (wantarray)
6358 rb_ary_push(ary, INT2FIX(RSTRING_PTR(str)[i] & 0xff));
6359 else
6360 rb_yield(INT2FIX(RSTRING_PTR(str)[i] & 0xff));
6361 }
6362 if (wantarray)
6363 return ary;
6364 else
6365 return str;
6366 }
6367
6368
6369
6370
6371
6372
6373
6374
6375
6376
6377
6378
6379
6380
6381
6382
6383 static VALUE
6384 rb_str_each_byte(VALUE str)
6385 {
6386 return rb_str_enumerate_bytes(str, 0);
6387 }
6388
6389
6390
6391
6392
6393
6394
6395
6396
6397
6398
6399
6400 static VALUE
6401 rb_str_bytes(VALUE str)
6402 {
6403 return rb_str_enumerate_bytes(str, 1);
6404 }
6405
6406 static VALUE
6407 rb_str_each_char_size(VALUE str)
6408 {
6409 long len = RSTRING_LEN(str);
6410 if (!single_byte_optimizable(str)) {
6411 const char *ptr = RSTRING_PTR(str);
6412 rb_encoding *enc = rb_enc_get(str);
6413 const char *end_ptr = ptr + len;
6414 for (len = 0; ptr < end_ptr; ++len) {
6415 ptr += rb_enc_mbclen(ptr, end_ptr, enc);
6416 }
6417 }
6418 return LONG2FIX(len);
6419 }
6420
6421 static VALUE
6422 rb_str_enumerate_chars(VALUE str, int wantarray)
6423 {
6424 VALUE orig = str;
6425 VALUE substr;
6426 long i, len, n;
6427 const char *ptr;
6428 rb_encoding *enc;
6429 VALUE UNINITIALIZED_VAR(ary);
6430
6431 if (rb_block_given_p()) {
6432 if (wantarray) {
6433 #if STRING_ENUMERATORS_WANTARRAY
6434 rb_warn("given block not used");
6435 ary = rb_ary_new();
6436 #else
6437 rb_warning("passing a block to String#chars is deprecated");
6438 wantarray = 0;
6439 #endif
6440 }
6441 }
6442 else {
6443 if (wantarray)
6444 ary = rb_ary_new();
6445 else
6446 RETURN_SIZED_ENUMERATOR(str, 0, 0, rb_str_each_char_size);
6447 }
6448
6449 str = rb_str_new4(str);
6450 ptr = RSTRING_PTR(str);
6451 len = RSTRING_LEN(str);
6452 enc = rb_enc_get(str);
6453 switch (ENC_CODERANGE(str)) {
6454 case ENC_CODERANGE_VALID:
6455 case ENC_CODERANGE_7BIT:
6456 for (i = 0; i < len; i += n) {
6457 n = rb_enc_fast_mbclen(ptr + i, ptr + len, enc);
6458 substr = rb_str_subseq(str, i, n);
6459 if (wantarray)
6460 rb_ary_push(ary, substr);
6461 else
6462 rb_yield(substr);
6463 }
6464 break;
6465 default:
6466 for (i = 0; i < len; i += n) {
6467 n = rb_enc_mbclen(ptr + i, ptr + len, enc);
6468 substr = rb_str_subseq(str, i, n);
6469 if (wantarray)
6470 rb_ary_push(ary, substr);
6471 else
6472 rb_yield(substr);
6473 }
6474 }
6475 RB_GC_GUARD(str);
6476 if (wantarray)
6477 return ary;
6478 else
6479 return orig;
6480 }
6481
6482
6483
6484
6485
6486
6487
6488
6489
6490
6491
6492
6493
6494
6495
6496
6497 static VALUE
6498 rb_str_each_char(VALUE str)
6499 {
6500 return rb_str_enumerate_chars(str, 0);
6501 }
6502
6503
6504
6505
6506
6507
6508
6509
6510
6511
6512
6513
6514 static VALUE
6515 rb_str_chars(VALUE str)
6516 {
6517 return rb_str_enumerate_chars(str, 1);
6518 }
6519
6520
6521 static VALUE
6522 rb_str_enumerate_codepoints(VALUE str, int wantarray)
6523 {
6524 VALUE orig = str;
6525 int n;
6526 unsigned int c;
6527 const char *ptr, *end;
6528 rb_encoding *enc;
6529 VALUE UNINITIALIZED_VAR(ary);
6530
6531 if (single_byte_optimizable(str))
6532 return rb_str_enumerate_bytes(str, wantarray);
6533
6534 if (rb_block_given_p()) {
6535 if (wantarray) {
6536 #if STRING_ENUMERATORS_WANTARRAY
6537 rb_warn("given block not used");
6538 ary = rb_ary_new();
6539 #else
6540 rb_warning("passing a block to String#codepoints is deprecated");
6541 wantarray = 0;
6542 #endif
6543 }
6544 }
6545 else {
6546 if (wantarray)
6547 ary = rb_ary_new();
6548 else
6549 RETURN_SIZED_ENUMERATOR(str, 0, 0, rb_str_each_char_size);
6550 }
6551
6552 str = rb_str_new4(str);
6553 ptr = RSTRING_PTR(str);
6554 end = RSTRING_END(str);
6555 enc = STR_ENC_GET(str);
6556 while (ptr < end) {
6557 c = rb_enc_codepoint_len(ptr, end, &n, enc);
6558 if (wantarray)
6559 rb_ary_push(ary, UINT2NUM(c));
6560 else
6561 rb_yield(UINT2NUM(c));
6562 ptr += n;
6563 }
6564 RB_GC_GUARD(str);
6565 if (wantarray)
6566 return ary;
6567 else
6568 return orig;
6569 }
6570
6571
6572
6573
6574
6575
6576
6577
6578
6579
6580
6581
6582
6583
6584
6585
6586
6587
6588
6589 static VALUE
6590 rb_str_each_codepoint(VALUE str)
6591 {
6592 return rb_str_enumerate_codepoints(str, 0);
6593 }
6594
6595
6596
6597
6598
6599
6600
6601
6602
6603
6604
6605
6606
6607 static VALUE
6608 rb_str_codepoints(VALUE str)
6609 {
6610 return rb_str_enumerate_codepoints(str, 1);
6611 }
6612
6613
6614 static long
6615 chopped_length(VALUE str)
6616 {
6617 rb_encoding *enc = STR_ENC_GET(str);
6618 const char *p, *p2, *beg, *end;
6619
6620 beg = RSTRING_PTR(str);
6621 end = beg + RSTRING_LEN(str);
6622 if (beg > end) return 0;
6623 p = rb_enc_prev_char(beg, end, end, enc);
6624 if (!p) return 0;
6625 if (p > beg && rb_enc_ascget(p, end, 0, enc) == '\n') {
6626 p2 = rb_enc_prev_char(beg, p, end, enc);
6627 if (p2 && rb_enc_ascget(p2, end, 0, enc) == '\r') p = p2;
6628 }
6629 return p - beg;
6630 }
6631
6632
6633
6634
6635
6636
6637
6638
6639
6640
6641 static VALUE
6642 rb_str_chop_bang(VALUE str)
6643 {
6644 str_modify_keep_cr(str);
6645 if (RSTRING_LEN(str) > 0) {
6646 long len;
6647 len = chopped_length(str);
6648 STR_SET_LEN(str, len);
6649 RSTRING_PTR(str)[len] = '\0';
6650 if (ENC_CODERANGE(str) != ENC_CODERANGE_7BIT) {
6651 ENC_CODERANGE_CLEAR(str);
6652 }
6653 return str;
6654 }
6655 return Qnil;
6656 }
6657
6658
6659
6660
6661
6662
6663
6664
6665
6666
6667
6668
6669
6670
6671
6672
6673
6674
6675
6676 static VALUE
6677 rb_str_chop(VALUE str)
6678 {
6679 return rb_str_subseq(str, 0, chopped_length(str));
6680 }
6681
6682
6683
6684
6685
6686
6687
6688
6689
6690
6691 static VALUE
6692 rb_str_chomp_bang(int argc, VALUE *argv, VALUE str)
6693 {
6694 rb_encoding *enc;
6695 VALUE rs;
6696 int newline;
6697 char *p, *pp, *e;
6698 long len, rslen;
6699
6700 str_modify_keep_cr(str);
6701 len = RSTRING_LEN(str);
6702 if (len == 0) return Qnil;
6703 p = RSTRING_PTR(str);
6704 e = p + len;
6705 if (argc == 0) {
6706 rs = rb_rs;
6707 if (rs == rb_default_rs) {
6708 smart_chomp:
6709 enc = rb_enc_get(str);
6710 if (rb_enc_mbminlen(enc) > 1) {
6711 pp = rb_enc_left_char_head(p, e-rb_enc_mbminlen(enc), e, enc);
6712 if (rb_enc_is_newline(pp, e, enc)) {
6713 e = pp;
6714 }
6715 pp = e - rb_enc_mbminlen(enc);
6716 if (pp >= p) {
6717 pp = rb_enc_left_char_head(p, pp, e, enc);
6718 if (rb_enc_ascget(pp, e, 0, enc) == '\r') {
6719 e = pp;
6720 }
6721 }
6722 if (e == RSTRING_END(str)) {
6723 return Qnil;
6724 }
6725 len = e - RSTRING_PTR(str);
6726 STR_SET_LEN(str, len);
6727 }
6728 else {
6729 if (RSTRING_PTR(str)[len-1] == '\n') {
6730 STR_DEC_LEN(str);
6731 if (RSTRING_LEN(str) > 0 &&
6732 RSTRING_PTR(str)[RSTRING_LEN(str)-1] == '\r') {
6733 STR_DEC_LEN(str);
6734 }
6735 }
6736 else if (RSTRING_PTR(str)[len-1] == '\r') {
6737 STR_DEC_LEN(str);
6738 }
6739 else {
6740 return Qnil;
6741 }
6742 }
6743 RSTRING_PTR(str)[RSTRING_LEN(str)] = '\0';
6744 return str;
6745 }
6746 }
6747 else {
6748 rb_scan_args(argc, argv, "01", &rs);
6749 }
6750 if (NIL_P(rs)) return Qnil;
6751 StringValue(rs);
6752 rslen = RSTRING_LEN(rs);
6753 if (rslen == 0) {
6754 while (len>0 && p[len-1] == '\n') {
6755 len--;
6756 if (len>0 && p[len-1] == '\r')
6757 len--;
6758 }
6759 if (len < RSTRING_LEN(str)) {
6760 STR_SET_LEN(str, len);
6761 RSTRING_PTR(str)[len] = '\0';
6762 return str;
6763 }
6764 return Qnil;
6765 }
6766 if (rslen > len) return Qnil;
6767 newline = RSTRING_PTR(rs)[rslen-1];
6768 if (rslen == 1 && newline == '\n')
6769 goto smart_chomp;
6770
6771 enc = rb_enc_check(str, rs);
6772 if (is_broken_string(rs)) {
6773 return Qnil;
6774 }
6775 pp = e - rslen;
6776 if (p[len-1] == newline &&
6777 (rslen <= 1 ||
6778 memcmp(RSTRING_PTR(rs), pp, rslen) == 0)) {
6779 if (rb_enc_left_char_head(p, pp, e, enc) != pp)
6780 return Qnil;
6781 if (ENC_CODERANGE(str) != ENC_CODERANGE_7BIT) {
6782 ENC_CODERANGE_CLEAR(str);
6783 }
6784 STR_SET_LEN(str, RSTRING_LEN(str) - rslen);
6785 RSTRING_PTR(str)[RSTRING_LEN(str)] = '\0';
6786 return str;
6787 }
6788 return Qnil;
6789 }
6790
6791
6792
6793
6794
6795
6796
6797
6798
6799
6800
6801
6802
6803
6804
6805
6806
6807
6808
6809
6810
6811 static VALUE
6812 rb_str_chomp(int argc, VALUE *argv, VALUE str)
6813 {
6814 str = rb_str_dup(str);
6815 rb_str_chomp_bang(argc, argv, str);
6816 return str;
6817 }
6818
6819
6820
6821
6822
6823
6824
6825
6826
6827
6828
6829
6830
6831 static VALUE
6832 rb_str_lstrip_bang(VALUE str)
6833 {
6834 rb_encoding *enc;
6835 char *s, *t, *e;
6836
6837 str_modify_keep_cr(str);
6838 enc = STR_ENC_GET(str);
6839 s = RSTRING_PTR(str);
6840 if (!s || RSTRING_LEN(str) == 0) return Qnil;
6841 e = t = RSTRING_END(str);
6842
6843 while (s < e) {
6844 int n;
6845 unsigned int cc = rb_enc_codepoint_len(s, e, &n, enc);
6846
6847 if (!rb_isspace(cc)) break;
6848 s += n;
6849 }
6850
6851 if (s > RSTRING_PTR(str)) {
6852 STR_SET_LEN(str, t-s);
6853 memmove(RSTRING_PTR(str), s, RSTRING_LEN(str));
6854 RSTRING_PTR(str)[RSTRING_LEN(str)] = '\0';
6855 return str;
6856 }
6857 return Qnil;
6858 }
6859
6860
6861
6862
6863
6864
6865
6866
6867
6868
6869
6870
6871
6872 static VALUE
6873 rb_str_lstrip(VALUE str)
6874 {
6875 str = rb_str_dup(str);
6876 rb_str_lstrip_bang(str);
6877 return str;
6878 }
6879
6880
6881
6882
6883
6884
6885
6886
6887
6888
6889
6890
6891
6892
6893 static VALUE
6894 rb_str_rstrip_bang(VALUE str)
6895 {
6896 rb_encoding *enc;
6897 char *s, *t, *e;
6898
6899 str_modify_keep_cr(str);