From a6ee7ae4c453b590b9fefd6a46bf69cd8e321694 Mon Sep 17 00:00:00 2001
From: st0012
Date: Sun, 22 Mar 2026 14:29:27 +0000
Subject: [PATCH] Fix encoding error when C parser reads external source files
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

When a C file references another source file via `/* in file.c */`, the
parser read it with bare `File.read` which uses
`Encoding.default_external`. On systems where this is US-ASCII (e.g.
Debian CI), non-ASCII bytes in the source file cause
`ArgumentError: invalid byte sequence in US-ASCII` in String#scan.

Use `RDoc::Encoding.read_file` instead, which reads in binary mode and
properly handles encoding detection and transcoding.

This was triggered by Ruby commit a2531ba293 which added UTF-8 right
arrows (→) in comments in class.c, which is referenced from object.c via
`/* in class.c */`.
---
 lib/rdoc/parser/c.rb       |  2 +-
 test/rdoc/parser/c_test.rb | 36 ++++++++++++++++++++++++++++++++++++
 2 files changed, 37 insertions(+), 1 deletion(-)

diff --git a/lib/rdoc/parser/c.rb b/lib/rdoc/parser/c.rb
index b13e7cad8a..a31b36404d 100644
--- a/lib/rdoc/parser/c.rb
+++ b/lib/rdoc/parser/c.rb
@@ -1016,7 +1016,7 @@ def handle_method(type, var_name, meth_name, function, param_count,
         file_name = File.join @file_dir, source_file
 
         if File.exist? file_name then
-          file_content = File.read file_name
+          file_content = RDoc::Encoding.read_file file_name, @options.encoding
         else
           @options.warn "unknown source #{source_file} for #{meth_name} in #{@file_name}"
         end
diff --git a/test/rdoc/parser/c_test.rb b/test/rdoc/parser/c_test.rb
index 17237c2400..00b57d5d7d 100644
--- a/test/rdoc/parser/c_test.rb
+++ b/test/rdoc/parser/c_test.rb
@@ -2292,6 +2292,42 @@ def test_reparse_c_file_no_duplicates
     assert_include method_names, 'baz'
   end
 
+  def test_handle_method_source_file_with_non_ascii
+    # Regression test: when the C parser reads an external source file
+    # (via "/* in file.c */"), it must use RDoc::Encoding.read_file instead
+    # of File.read. On systems where Encoding.default_external is US-ASCII,
+    # bare File.read produces a US-ASCII string that raises ArgumentError
+    # on String#scan when the file contains non-ASCII bytes.
+    source_path = File.join(File.dirname(@fn), 'greet.c')
+    File.binwrite source_path, <<~C.encode('UTF-8')
+      /*
+       * Returns a greeting \u2014 "h\u00e9llo w\u00f6rld"
+       */
+      VALUE
+      rb_greet(VALUE obj) {
+        return rb_str_new2("hello");
+      }
+    C
+
+    parser = util_parser <<~C
+      void Init_Foo(void) {
+        VALUE cFoo = rb_define_class("Foo", rb_cObject);
+        rb_define_method(cFoo, "greet", rb_greet, 0); /* in greet.c */
+      }
+    C
+
+    parser.scan
+
+    foo = @top_level.find_module_named 'Foo'
+    assert foo, 'Foo class should be found'
+
+    greet = foo.method_list.first
+    assert greet, 'greet method should be found'
+    assert_equal 'greet', greet.name
+  ensure
+    File.delete source_path if source_path && File.exist?(source_path)
+  end
+
   def util_get_class(content, name = nil)
     @parser = util_parser content
     @parser.scan