Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
34 commits
Select commit Hold shift + click to select a range
c20f4a7
Add StringScanner#integer_at to convert capture group to Integer
jinroq Mar 5, 2026
efe8d0f
Add fast path for base-10 pure digit captures to avoid String allocation
jinroq Mar 18, 2026
d74afd1
Add Ruby fallback for integer_at on platforms without C extension
jinroq Mar 22, 2026
485f27c
Use bool instead of int for boolean variables in integer_at fast path
jinroq Mar 23, 2026
ef6281d
Move integer_at fallback after scan_integer to keep docs together
jinroq Mar 23, 2026
6052143
Optimize fast path to handle leading zeros and near-LONG_MAX values
jinroq Mar 23, 2026
f64fdd8
Remove unnecessary capture group in test_integer_at_index_zero
jinroq Mar 23, 2026
6a55781
Rename test_integer_at to test_integer_at_date_parts for consistency
jinroq Mar 23, 2026
67a10ae
Simplify regex in test_integer_at_optional_group_not_matched
jinroq Mar 23, 2026
9ab1172
Rename variable huge to large in test_integer_at_large_number
jinroq Mar 23, 2026
960130b
Replace magic numbers with INT64/INT32_DECIMAL_SAFE_DIGITS constants
jinroq Mar 23, 2026
0065ecf
Remove unnecessary capture groups in test_integer_at_full_match_with_…
jinroq Mar 23, 2026
d89e54b
Remove redundant test_integer_at_full_match_with_non_digits
jinroq Mar 23, 2026
4d3583f
Rename test_integer_at_named_capture_undefined to use "unknown" consi…
jinroq Mar 23, 2026
e686711
Optimize fast path to handle underscored digit strings
jinroq Mar 24, 2026
0f2ad2a
Use proper boundary value pairs in fixnum_bignum_boundary test
jinroq Mar 24, 2026
433fd87
Use specifier instead of index for integer_at parameter name
jinroq Mar 24, 2026
72c0426
Document IndexError for undefined named capture in integer_at RDoc
jinroq Mar 25, 2026
0c5c88e
Clarify that 18/19-digit boundary tests apply to 64-bit longs
jinroq Mar 25, 2026
674e32c
Fix undefined behavior when negating LONG_MIN in overflow-checked path
jinroq Mar 25, 2026
6b44dbd
Add test for integer_at when scanner position is not at the beginning
jinroq Mar 25, 2026
b6fe693
Rename parameter idx to specifier in resolve_capture_index
jinroq Mar 31, 2026
a078c0d
Extract base-10 fast path into parse_decimal_fast static inline function
jinroq Mar 31, 2026
b1b135a
Add else for readability in overflow-checked path of parse_decimal_fast
jinroq Mar 31, 2026
db383e9
Simplify overflow check in parse_decimal_fast by computing before com…
jinroq Mar 31, 2026
a421b12
Use else if and == for boundary check in parse_decimal_fast overflow …
jinroq Mar 31, 2026
a81aa3e
Rename test_integer_at_date_parts to test_integer_at_positive_index
jinroq Mar 31, 2026
d07b086
Fix boundary test to use smallest 19-digit number in test_integer_at_…
jinroq Mar 31, 2026
1ae5772
Use "0" * 19 + "1" for readability in leading zeros test
jinroq Mar 31, 2026
04a7f84
Fix negative boundary test to use smallest 19-digit absolute value in…
jinroq Mar 31, 2026
21c6be8
Split test_integer_at_fixnum_bignum_boundary into digit_count and lon…
jinroq Mar 31, 2026
e6e5c27
Revert "Simplify overflow check in parse_decimal_fast by computing be…
jinroq Apr 1, 2026
d5b3651
Add comment explaining pre-check guards against 32-bit unsigned long …
jinroq Apr 1, 2026
ba15508
Raise TypeError for explicit nil base argument in integer_at
jinroq Apr 1, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
217 changes: 197 additions & 20 deletions ext/strscan/strscan.c
Original file line number Diff line number Diff line change
Expand Up @@ -1621,6 +1621,37 @@ name_to_backref_number(struct re_registers *regs, VALUE regexp, const char* name
rb_long2int(name_end - name), name);
}

/* Resolve capture group index from Integer, Symbol, or String.
 * Returns the resolved register index, or -1 if unmatched/out of range.
 * May raise IndexError when given an undefined named capture group. */
Copy link

Copilot AI Mar 25, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The comment for resolve_capture_index says it returns -1 for unmatched/out-of-range cases, but name_to_backref_number raises IndexError for unknown named captures. Please update the comment to reflect that the function may raise for an undefined group name.

Suggested change
* Returns the resolved register index, or -1 if unmatched/out of range. */
* Returns the resolved register index, or -1 if unmatched/out of range.
* May raise IndexError when given an undefined named capture group. */

Copilot uses AI. Check for mistakes.
static long
resolve_capture_index(struct strscanner *p, VALUE specifier)
{
    long index;
    int kind;

    /* Without a successful last match there is no register to resolve. */
    if (! MATCHED_P(p)) {
        return -1;
    }

    /* A Symbol is handled exactly like its String name. */
    kind = TYPE(specifier);
    if (kind == T_SYMBOL) {
        specifier = rb_sym2str(specifier);
        kind = T_STRING;
    }

    if (kind == T_STRING) {
        const char *name;
        long name_len;

        /* Named group: translate the name into a register number.
         * NOTE(review): name_to_backref_number is expected to raise
         * IndexError for an unknown name, so this may not return. */
        RSTRING_GETMEM(specifier, name, name_len);
        index = name_to_backref_number(&(p->regs), p->regex,
                                       name, name + name_len,
                                       rb_enc_get(specifier));
    }
    else {
        /* Anything else is converted to a numeric group index. */
        index = NUM2LONG(specifier);
    }

    /* Negative indexes count back from the number of registers. */
    if (index < 0) {
        index += p->regs.num_regs;
    }

    if (index < 0 || index >= p->regs.num_regs) {
        return -1;
    }

    /* beg == -1 marks a group that did not take part in the match. */
    if (p->regs.beg[index] == -1) {
        return -1;
    }

    return index;
}

/*
*
* :markup: markdown
Expand Down Expand Up @@ -1695,30 +1726,12 @@ name_to_backref_number(struct re_registers *regs, VALUE regexp, const char* name
static VALUE
strscan_aref(VALUE self, VALUE idx)
{
const char *name;
struct strscanner *p;
long i;

GET_SCANNER(self, p);
if (! MATCHED_P(p)) return Qnil;

switch (TYPE(idx)) {
case T_SYMBOL:
idx = rb_sym2str(idx);
/* fall through */
case T_STRING:
RSTRING_GETMEM(idx, name, i);
i = name_to_backref_number(&(p->regs), p->regex, name, name + i, rb_enc_get(idx));
break;
default:
i = NUM2LONG(idx);
}

if (i < 0)
i += p->regs.num_regs;
if (i < 0) return Qnil;
if (i >= p->regs.num_regs) return Qnil;
if (p->regs.beg[i] == -1) return Qnil;
i = resolve_capture_index(p, idx);
if (i < 0) return Qnil;

return extract_range(p,
adjust_register_position(p, p->regs.beg[i]),
Expand Down Expand Up @@ -1852,6 +1865,169 @@ strscan_values_at(int argc, VALUE *argv, VALUE self)
return new_ary;
}

/*
* call-seq:
* integer_at(specifier, base = 10) -> integer or nil
*
* Returns the captured substring at the given +specifier+ as an Integer,
* following the behavior of <tt>String#to_i(base)</tt>.
*
* +specifier+ can be an Integer (positive, negative, or zero), a Symbol,
* or a String for named capture groups.
*
* Returns +nil+ if:
* - No match has been performed or the last match failed
* - The +specifier+ is an Integer and is out of range
* - The group at +specifier+ did not participate in the match
*
* Raises IndexError if +specifier+ is a Symbol or String that does not
* correspond to a named capture group, consistent with
* <tt>StringScanner#[]</tt>.
*
* This is semantically equivalent to <tt>self[specifier].to_i(base)</tt>
* but avoids the allocation of a temporary String when possible.
*
* scanner = StringScanner.new("2024-06-15")
* scanner.scan(/(\d{4})-(\d{2})-(\d{2})/)
* scanner.integer_at(1) # => 2024
* scanner.integer_at(1, 16) # => 8228
*
*/
Comment on lines +1868 to +1895
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Could you move this just before static VALUE strscan_integer_at()?

/* Max decimal digits guaranteed to fit in long without overflow check.
 * floor(log10(INT64_MAX)) = 18, floor(log10(INT32_MAX)) = 9
 *
 * parse_decimal_fast selects one of the two from sizeof(long): values with
 * at most that many significant digits are accumulated without any check,
 * and values with exactly one more digit go through an overflow-checked
 * loop. */
#define INT64_DECIMAL_SAFE_DIGITS 18
#define INT32_DECIMAL_SAFE_DIGITS 9

/* Fast path for base-10 integer parsing without temporary String allocation.
 *
 * ptr/len describe the captured bytes; no NUL terminator is relied upon.
 * Accepted input: an optional '+' or '-' sign followed by decimal digits
 * that may be separated by single underscores (Ruby String#to_i semantics);
 * leading, trailing, and consecutive underscores are not accepted here.
 *
 * Returns the parsed Integer VALUE when the input has that shape and the
 * value fits in a long.  Returns Qundef to signal fall-through to the
 * general path in every other case: empty input, any other byte, no digits
 * at all, or a magnitude that does not fit in a long (bignum). */
static inline VALUE
parse_decimal_fast(const char *ptr, long len)
{
    /* Digit count that can be accumulated in a long with no overflow check. */
    const long safe_digits = (sizeof(long) >= 8)
        ? INT64_DECIMAL_SAFE_DIGITS
        : INT32_DECIMAL_SAFE_DIGITS;
    long digits_start = 0;      /* offset of the first byte after the sign */
    long first_nonzero;         /* offset of the first significant digit */
    long effective_digits = 0;  /* digits left once leading zeros are skipped */
    long digit_count = 0;
    long i;
    bool negative = false;
    bool prev_underscore = true; /* treat start as underscore to reject leading _ */

    /* Never read ptr[0] of an empty range, whatever the caller guarantees. */
    if (len <= 0) return Qundef;

    if (ptr[0] == '-') {
        negative = true;
        digits_start = 1;
    }
    else if (ptr[0] == '+') {
        digits_start = 1;
    }

    /* Validate: only digits and underscores (not leading/trailing/consecutive) */
    for (i = digits_start; i < len; i++) {
        if (ptr[i] >= '0' && ptr[i] <= '9') {
            digit_count++;
            prev_underscore = false;
        }
        else if (ptr[i] == '_' && !prev_underscore) {
            prev_underscore = true;
        }
        else {
            return Qundef;
        }
    }
    /* No digit at all (e.g. a lone sign), or a trailing underscore. */
    if (digit_count == 0 || prev_underscore) return Qundef;

    /* Skip leading zeros (and the underscores between them) so that long
     * zero-padded inputs still qualify for the unchecked loop. */
    first_nonzero = digits_start;
    while (first_nonzero < len &&
           (ptr[first_nonzero] == '0' || ptr[first_nonzero] == '_')) {
        first_nonzero++;
    }
    for (i = first_nonzero; i < len; i++) {
        if (ptr[i] != '_') effective_digits++;
    }

    if (effective_digits <= safe_digits) {
        /* Cannot overflow: at most safe_digits significant digits. */
        long result = 0;

        for (i = first_nonzero; i < len; i++) {
            if (ptr[i] != '_')
                result = result * 10 + (ptr[i] - '0');
        }
        return LONG2NUM(negative ? -result : result);
    }

    /* One more digit than safe: may still fit in long with overflow check */
    if (effective_digits == safe_digits + 1) {
        /* Largest magnitude representable for the requested sign. */
        const unsigned long limit = negative
            ? (unsigned long)LONG_MAX + 1
            : (unsigned long)LONG_MAX;
        unsigned long result = 0;

        for (i = first_nonzero; i < len; i++) {
            if (ptr[i] != '_') {
                unsigned long d = ptr[i] - '0';

                /* Pre-check before multiply to avoid unsigned long wraparound on
                 * 32-bit platforms, where 10-digit values can exceed ULONG_MAX. */
                if (result > (limit - d) / 10) return Qundef;
                result = result * 10 + d;
            }
        }

        if (!negative) return LONG2NUM((long)result);
        /* -(long)result would negate LONG_MIN's magnitude, which overflows;
         * return LONG_MIN directly instead. */
        if (result == limit) return LONG2NUM(LONG_MIN);
        return LONG2NUM(-(long)result);
    }

    /* Bignum: signal fall-through to rb_str_to_inum */
    return Qundef;
}

static VALUE
strscan_integer_at(int argc, VALUE *argv, VALUE self)
{
    /* Implements StringScanner#integer_at(specifier, base = 10).
     * Returns nil when the capture cannot be resolved, otherwise the
     * captured bytes converted to an Integer in the given base.
     * Raises ArgumentError for a radix that String#to_i would reject. */
    struct strscanner *p;
    const char *ptr;
    long i;
    long beg, end, len;
    int base = 10;

    rb_check_arity(argc, 1, 2);
    /* The base is converted eagerly, so a non-Integer base (including an
     * explicit nil) is rejected by NUM2INT before anything else happens. */
    if (argc > 1) base = NUM2INT(argv[1]);

    GET_SCANNER(self, p);
    i = resolve_capture_index(p, argv[0]);
    if (i < 0) return Qnil;

    /* Validate the radix the way String#to_i does (0 and 2..36 are valid).
     * rb_str_to_inum alone does not reject a negative base, and the
     * empty-capture shortcut below never reaches it, so check here to keep
     * integer_at equivalent to self[specifier].to_i(base).  The check sits
     * after the nil return, matching the order of the Ruby fallback. */
    if (base < 0 || base == 1 || base > 36) {
        rb_raise(rb_eArgError, "invalid radix %d", base);
    }

    beg = adjust_register_position(p, p->regs.beg[i]);
    end = adjust_register_position(p, p->regs.end[i]);
    len = end - beg;

    /* An empty capture converts to 0, like "".to_i. */
    if (len <= 0) return INT2FIX(0);

    ptr = S_PBEG(p) + beg;

    /* Fast path for base 10: parse directly from source bytes without
     * temporary String allocation. This covers the Date._strptime use case. */
    if (base == 10) {
        VALUE result = parse_decimal_fast(ptr, len);
        if (result != Qundef) return result;
    }

    /* General path: follow String#to_i(base) semantics via rb_str_to_inum.
     * badcheck=0 returns 0 for non-numeric input instead of raising. */
    return rb_str_to_inum(rb_str_new(ptr, len), base, 0);
}

/*
* :markup: markdown
* :include: strscan/link_refs.txt
Expand Down Expand Up @@ -2290,6 +2466,7 @@ Init_strscan(void)
rb_define_method(StringScanner, "size", strscan_size, 0);
rb_define_method(StringScanner, "captures", strscan_captures, 0);
rb_define_method(StringScanner, "values_at", strscan_values_at, -1);
rb_define_method(StringScanner, "integer_at", strscan_integer_at, -1);

rb_define_method(StringScanner, "rest", strscan_rest, 0);
rb_define_method(StringScanner, "rest_size", strscan_rest_size, 0);
Expand Down
10 changes: 10 additions & 0 deletions lib/strscan/strscan.rb
Original file line number Diff line number Diff line change
Expand Up @@ -22,4 +22,14 @@ def scan_integer(base: 10)
raise ArgumentError, "Unsupported integer base: #{base.inspect}, expected 10 or 16"
end
end

unless method_defined?(:integer_at)
  # Fallback used when the C extension does not provide integer_at
  # (e.g. JRuby).
  #
  # Returns nil when self[specifier] is nil; otherwise returns the
  # captured String converted with String#to_i(base).
  def integer_at(specifier, base = 10)
    self[specifier]&.to_i(base)
  end
end
end
Loading