diff -rupN binutils.orig/gas/NEWS binutils-2.37/gas/NEWS
--- binutils.orig/gas/NEWS	2021-11-18 16:50:39.104088534 +0000
+++ binutils-2.37/gas/NEWS	2021-11-18 16:51:16.340948280 +0000
@@ -1,5 +1,13 @@
 -*- text -*-
 
+* The --multibyte-handling=[allow|warn|warn-sym-only] option tells the
+  assembler what to do when it encounters multibyte characters in the input.
+  The default is to allow them.  Setting the option to "warn" will generate a
+  warning message whenever any multibyte character is encountered.  Setting
+  the option to "warn-sym-only" will make the assembler generate a warning
+  whenever a symbol is defined containing multibyte characters.  (References
+  to undefined symbols will not generate warnings).
+
 Changes in 2.37:
 
 * arm-symbianelf support removed.
diff -rupN binutils.orig/gas/app.c binutils-2.37/gas/app.c
--- binutils.orig/gas/app.c	2021-11-18 16:50:39.104088534 +0000
+++ binutils-2.37/gas/app.c	2021-11-18 16:50:42.530075630 +0000
@@ -345,6 +345,71 @@ process_escape (int ch)
     }
 }
 
+/* The maximum number of multibyte character warnings that will be issued
+   before any further ones are suppressed.  */
+#define MULTIBYTE_WARN_COUNT_LIMIT 10
+/* The number of multibyte character warnings issued so far.  */
+static unsigned int multibyte_warn_count = 0;
+
+/* Scan the bytes from START up to, but not including, END looking for
+   any with a value greater than 0x7f.
+
+   If WARN is false then true is returned as soon as such a byte is
+   found, and no messages are generated.
+
+   If WARN is true then a warning is issued for each such byte until a
+   total of MULTIBYTE_WARN_COUNT_LIMIT warnings have been generated.  At
+   that point a final message is issued and the scan stops.  Any later
+   call with WARN set then returns false without scanning.
+
+   Returns true if a byte greater than 0x7f was found, false otherwise.  */
+
+bool
+scan_for_multibyte_characters (const unsigned char *  start,
+			       const unsigned char *  end,
+			       bool                   warn)
+{
+  if (end <= start)
+    return false;
+
+  if (warn && multibyte_warn_count >= MULTIBYTE_WARN_COUNT_LIMIT)
+    return false;
+
+  bool found = false;
+
+  while (start < end)
+    {
+      unsigned char c;
+
+      if ((c = * start++) <= 0x7f)
+	continue;
+
+      if (!warn)
+	return true;
+
+      found = true;
+
+      const char * filename;
+      unsigned int lineno;
+
+      filename = as_where (& lineno);
+      if (filename == NULL)
+	as_warn (_("multibyte character (%#x) encountered in input"), c);
+      else if (lineno == 0)
+	as_warn (_("multibyte character (%#x) encountered in %s"), c, filename);
+      else
+	as_warn (_("multibyte character (%#x) encountered in %s at or near line %u"), c, filename, lineno);
+
+      if (++ multibyte_warn_count == MULTIBYTE_WARN_COUNT_LIMIT)
+	{
+	  as_warn (_("further multibyte character warnings suppressed"));
+	  break;
+	}
+    }
+
+  return found;
+}
+
 /* This function is called to process input characters.  The GET
    parameter is used to retrieve more input characters.  GET should
    set its parameter to point to a buffer, and return the length of
@@ -463,6 +528,11 @@ do_scrub_chars (size_t (*get) (char *, s
 	return 0;
       from = input_buffer;
       fromend = from + fromlen;
+
+      if (multibyte_handling == multibyte_warn)
+	(void) scan_for_multibyte_characters ((const unsigned char *) from,
+					      (const unsigned char *) fromend,
+					      true /* Generate warnings.  */);
     }
 
   while (1)
diff -rupN binutils.orig/gas/as.c binutils-2.37/gas/as.c
--- binutils.orig/gas/as.c	2021-11-18 16:50:39.104088534 +0000
+++ binutils-2.37/gas/as.c	2021-11-18 16:50:42.531075627 +0000
@@ -474,7 +474,7 @@ parse_args (int * pargc, char *** pargv)
       OPTION_DEBUG_PREFIX_MAP,
       OPTION_DEFSYM,
       OPTION_LISTING_LHS_WIDTH,
-      OPTION_LISTING_LHS_WIDTH2,
+      OPTION_LISTING_LHS_WIDTH2, /* = STD_BASE + 10 */
       OPTION_LISTING_RHS_WIDTH,
       OPTION_LISTING_CONT_LINES,
       OPTION_DEPFILE,
@@ -484,7 +484,7 @@ parse_args (int * pargc, char *** pargv)
       OPTION_GDWARF_3,
       OPTION_GDWARF_4,
       OPTION_GDWARF_5,
-      OPTION_GDWARF_SECTIONS,
+      OPTION_GDWARF_SECTIONS, /* = STD_BASE + 20 */
       OPTION_GDWARF_CIE_VERSION,
       OPTION_STRIP_LOCAL_ABSOLUTE,
       OPTION_TRADITIONAL_FORMAT,
@@ -494,7 +494,7 @@ parse_args (int * pargc, char *** pargv)
       OPTION_NOEXECSTACK,
       OPTION_SIZE_CHECK,
       OPTION_ELF_STT_COMMON,
-      OPTION_ELF_BUILD_NOTES,
+      OPTION_ELF_BUILD_NOTES, /* = STD_BASE + 30 */
       OPTION_SECTNAME_SUBST,
       OPTION_ALTERNATE,
       OPTION_AL,
@@ -503,7 +503,8 @@ parse_args (int * pargc, char *** pargv)
       OPTION_WARN_FATAL,
       OPTION_COMPRESS_DEBUG,
       OPTION_NOCOMPRESS_DEBUG,
-      OPTION_NO_PAD_SECTIONS /* = STD_BASE + 40 */
+      OPTION_NO_PAD_SECTIONS,
+      OPTION_MULTIBYTE_HANDLING  /* = STD_BASE + 40 */
     /* When you add options here, check that they do not collide with
        OPTION_MD_BASE.  See as.h.  */
     };
@@ -581,6 +582,7 @@ parse_args (int * pargc, char *** pargv)
     ,{"target-help", no_argument, NULL, OPTION_TARGET_HELP}
     ,{"traditional-format", no_argument, NULL, OPTION_TRADITIONAL_FORMAT}
     ,{"warn", no_argument, NULL, OPTION_WARN}
+    ,{"multibyte-handling", required_argument, NULL, OPTION_MULTIBYTE_HANDLING}
   };
 
   /* Construct the option lists from the standard list and the target
@@ -683,6 +685,19 @@ parse_args (int * pargc, char *** pargv)
 	  flag_traditional_format = 1;
 	  break;
 
+	case OPTION_MULTIBYTE_HANDLING:
+	  if (strcmp (optarg, "allow") == 0)
+	    multibyte_handling = multibyte_allow;
+	  else if (strcmp (optarg, "warn") == 0)
+	    multibyte_handling = multibyte_warn;
+	  else if (strcmp (optarg, "warn-sym-only") == 0)
+	    multibyte_handling = multibyte_warn_syms;
+	  else if (strcmp (optarg, "warn_sym_only") == 0)
+	    multibyte_handling = multibyte_warn_syms;
+	  else
+	    as_fatal (_("unexpected argument to --multibyte-handling: '%s'"), optarg);
+	  break;
+
 	case OPTION_VERSION:
 	  /* This output is intended to follow the GNU standards document.  */
 	  printf (_("GNU assembler %s\n"), BFD_VERSION_STRING);
diff -rupN binutils.orig/gas/as.h binutils-2.37/gas/as.h
--- binutils.orig/gas/as.h	2021-11-18 16:50:38.834089551 +0000
+++ binutils-2.37/gas/as.h	2021-11-18 16:50:42.531075627 +0000
@@ -344,6 +344,16 @@ COMMON int linkrelax;
 
 COMMON int do_not_pad_sections_to_alignment;
 
+/* How multibyte characters in the input should be treated.  Selected
+   with the --multibyte-handling command line option.  */
+enum multibyte_input_handling
+{
+  multibyte_allow = 0,
+  multibyte_warn,
+  multibyte_warn_syms
+};
+COMMON enum multibyte_input_handling multibyte_handling;
+
 /* TRUE if we should produce a listing.  */
 extern int listing;
 
@@ -450,6 +460,7 @@ void   input_scrub_insert_file (char *);
 char * input_scrub_new_file (const char *);
 char * input_scrub_next_buffer (char **bufp);
 size_t do_scrub_chars (size_t (*get) (char *, size_t), char *, size_t);
+bool   scan_for_multibyte_characters (const unsigned char *, const unsigned char *, bool);
 int    gen_to_words (LITTLENUM_TYPE *, int, long);
 int    had_err (void);
 int    ignore_input (void);
diff -rupN binutils.orig/gas/doc/as.texi binutils-2.37/gas/doc/as.texi
--- binutils.orig/gas/doc/as.texi	2021-11-18 16:50:38.838089536 +0000
+++ binutils-2.37/gas/doc/as.texi	2021-11-18 16:50:42.535075612 +0000
@@ -245,6 +245,7 @@ gcc(1), ld(1), and the Info entries for
  [@b{--sectname-subst}] [@b{--size-check=[error|warning]}]
  [@b{--elf-stt-common=[no|yes]}]
  [@b{--generate-missing-build-notes=[no|yes]}]
+ [@b{--multibyte-handling=[allow|warn|warn-sym-only]}]
  [@b{--target-help}] [@var{target-options}]
  [@b{--}|@var{files} @dots{}]
 @c
@@ -866,6 +867,18 @@ Set the maximum width of an input source
 Set the maximum number of lines printed in a listing for a single line of input
 to @var{number} + 1.
 
+@item --multibyte-handling=allow
+@itemx --multibyte-handling=warn
+@itemx --multibyte-handling=warn-sym-only
+Controls how the assembler handles multibyte characters in the input.  The
+default (which can be restored by using the @option{allow} argument) is to
+allow such characters without complaint.  Using the @option{warn} argument will
+make the assembler generate a warning message whenever any multibyte character
+is encountered.  Using the @option{warn-sym-only} argument will only cause a
+warning to be generated when a symbol is defined with a name that contains
+multibyte characters.  (References to undefined symbols will not generate a
+warning).
+
 @item --no-pad-sections
 Stop the assembler for padding the ends of output sections to the alignment
 of that section.  The default is to pad the sections, but this can waste space
@@ -2942,9 +2955,11 @@ are noted in @ref{Machine Dependencies}.
 @end ifset
 No symbol may begin with a digit.  Case is significant.
 There is no length limit; all characters are significant.  Multibyte characters
-are supported.  Symbols are delimited by characters not in that set, or by the
-beginning of a file (since the source program must end with a newline, the end
-of a file is not a possible symbol delimiter).  @xref{Symbols}.
+are supported, but note that the setting of the
+@option{--multibyte-handling} option might prevent their use.  Symbols
+are delimited by characters not in that set, or by the beginning of a file
+(since the source program must end with a newline, the end of a file is not a
+possible symbol delimiter).  @xref{Symbols}.
 
 Symbol names may also be enclosed in double quote @code{"} characters.  In such
 cases any characters are allowed, except for the NUL character.  If a double
@@ -3834,11 +3849,18 @@ than @code{Foo}.
 Symbol names do not start with a digit.  An exception to this rule is made for
 Local Labels.  See below.
 
-Multibyte characters are supported.  To generate a symbol name containing
+Multibyte characters are supported, but note that the setting of the
+@option{--multibyte-handling} option might prevent their use.
+To generate a symbol name containing
 multibyte characters enclose it within double quotes and use escape codes.  cf
 @xref{Strings}.  Generating a multibyte symbol name from a label is not
 currently supported.
 
+Since multibyte symbol names are unusual, and could possibly be used
+maliciously, @command{@value{AS}} provides a command line option
+(@option{--multibyte-handling=warn-sym-only}) which can be used to generate a
+warning message whenever a symbol name containing multibyte characters is defined.
+
 Each symbol has exactly one name.  Each name in an assembly language program
 refers to exactly one symbol.  You may use that symbol name any number of times
 in a program.
diff -rupN binutils.orig/gas/input-scrub.c binutils-2.37/gas/input-scrub.c
--- binutils.orig/gas/input-scrub.c	2021-11-18 16:50:38.835089547 +0000
+++ binutils-2.37/gas/input-scrub.c	2021-11-18 16:50:42.535075612 +0000
@@ -377,6 +377,11 @@ input_scrub_next_buffer (char **bufp)
 	  ++p;
 	}
 
+      if (multibyte_handling == multibyte_warn)
+	(void) scan_for_multibyte_characters ((const unsigned char *) p,
+					      (const unsigned char *) limit,
+					      true /* Generate warnings */);
+
       /* We found a newline in the newly read chars.  */
       partial_where = p;
       partial_size = limit - p;
diff -rupN binutils.orig/gas/symbols.c binutils-2.37/gas/symbols.c
--- binutils.orig/gas/symbols.c	2021-11-18 16:50:39.105088530 +0000
+++ binutils-2.37/gas/symbols.c	2021-11-18 16:52:17.980716107 +0000
@@ -78,6 +78,10 @@ struct symbol_flags
      before.  It is cleared as soon as any direct reference to the
      symbol is present.  */
   unsigned int weakrefd : 1;
+
+  /* Set when a warning about the symbol containing multibyte characters
+     is generated.  */
+  unsigned int multibyte_warned : 1;
 };
 
 /* A pointer in the symbol may point to either a complete symbol
@@ -194,7 +198,7 @@ static void *
 symbol_entry_find (htab_t table, const char *name)
 {
   hashval_t hash = htab_hash_string (name);
-  symbol_entry_t needle = { { { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+  symbol_entry_t needle = { { { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
 			      hash, name, 0, 0, 0 } };
   return htab_find_with_hash (table, &needle, hash);
 }
@@ -305,6 +309,18 @@ symbol_init (symbolS *symbolP, const cha
   symbolP->bsym->name = name;
   symbolP->bsym->section = sec;
 
+  if (multibyte_handling == multibyte_warn_syms
+      && ! symbolP->flags.local_symbol
+      && sec != undefined_section
+      && ! symbolP->flags.multibyte_warned
+      && scan_for_multibyte_characters ((const unsigned char *) name,
+					(const unsigned char *) name + strlen (name),
+					false /* Do not warn.  */))
+    {
+      as_warn (_("symbol '%s' contains multibyte characters"), name);
+      symbolP->flags.multibyte_warned = 1;
+    }
+
   S_SET_VALUE (symbolP, valu);
 
   symbol_clear_list_pointers (symbolP);
@@ -2413,7 +2429,21 @@ S_SET_SEGMENT (symbolS *s, segT seg)
 	abort ();
     }
   else
-    s->bsym->section = seg;
+    {
+      if (multibyte_handling == multibyte_warn_syms
+	  && ! s->flags.local_symbol
+	  && seg != undefined_section
+	  && ! s->flags.multibyte_warned
+	  && scan_for_multibyte_characters ((const unsigned char *) s->name,
+					    (const unsigned char *) s->name + strlen (s->name),
+					    false))
+	{
+	  as_warn (_("symbol '%s' contains multibyte characters"), s->name);
+	  s->flags.multibyte_warned = 1;
+	}
+
+      s->bsym->section = seg;
+    }
 }
 
 void
diff -rupN binutils.orig/gas/testsuite/gas/all/gas.exp binutils-2.37/gas/testsuite/gas/all/gas.exp
--- binutils.orig/gas/testsuite/gas/all/gas.exp	2021-11-18 16:50:39.101088545 +0000
+++ binutils-2.37/gas/testsuite/gas/all/gas.exp	2021-11-18 16:50:42.538075600 +0000
@@ -494,3 +494,5 @@ run_dump_test "nop"
 run_dump_test "asciz"
 run_dump_test "pr27384"
 run_dump_test "pr27381"
+run_dump_test "multibyte1"
+run_dump_test "multibyte2"
diff -rupN binutils.orig/gas/testsuite/gas/all/multibyte.s binutils-2.37/gas/testsuite/gas/all/multibyte.s
--- binutils.orig/gas/testsuite/gas/all/multibyte.s	1970-01-01 01:00:00.000000000 +0100
+++ binutils-2.37/gas/testsuite/gas/all/multibyte.s	2021-11-18 16:50:42.541075589 +0000
@@ -0,0 +1,8 @@
+	.text
+	.globl he‮oll‬
+he‮oll‬:
+	.nop
+
+	.globl hello
+hello:
+	.nop
diff -rupN binutils.orig/gas/testsuite/gas/all/multibyte1.d binutils-2.37/gas/testsuite/gas/all/multibyte1.d
--- binutils.orig/gas/testsuite/gas/all/multibyte1.d	1970-01-01 01:00:00.000000000 +0100
+++ binutils-2.37/gas/testsuite/gas/all/multibyte1.d	2021-11-18 16:50:42.541075589 +0000
@@ -0,0 +1,3 @@
+#source: multibyte.s
+#as: --multibyte-handling=warn
+#warning_output: multibyte1.l
diff -rupN binutils.orig/gas/testsuite/gas/all/multibyte1.l binutils-2.37/gas/testsuite/gas/all/multibyte1.l
--- binutils.orig/gas/testsuite/gas/all/multibyte1.l	1970-01-01 01:00:00.000000000 +0100
+++ binutils-2.37/gas/testsuite/gas/all/multibyte1.l	2021-11-18 16:50:42.541075589 +0000
@@ -0,0 +1,12 @@
+[^:]*: Assembler messages:
+[^:]*: Warning: multibyte character \(0xe2\) encountered in .*multibyte.s
+[^:]*: Warning: multibyte character \(0x80\) encountered in .*multibyte.s
+[^:]*: Warning: multibyte character \(0xae\) encountered in .*multibyte.s
+[^:]*: Warning: multibyte character \(0xe2\) encountered in .*multibyte.s
+[^:]*: Warning: multibyte character \(0x80\) encountered in .*multibyte.s
+[^:]*: Warning: multibyte character \(0xac\) encountered in .*multibyte.s
+[^:]*: Warning: multibyte character \(0xe2\) encountered in .*multibyte.s
+[^:]*: Warning: multibyte character \(0x80\) encountered in .*multibyte.s
+[^:]*: Warning: multibyte character \(0xae\) encountered in .*multibyte.s
+[^:]*: Warning: multibyte character \(0xe2\) encountered in .*multibyte.s
+[^:]*: Warning: further multibyte character warnings suppressed
diff -rupN binutils.orig/gas/testsuite/gas/all/multibyte2.d binutils-2.37/gas/testsuite/gas/all/multibyte2.d
--- binutils.orig/gas/testsuite/gas/all/multibyte2.d	1970-01-01 01:00:00.000000000 +0100
+++ binutils-2.37/gas/testsuite/gas/all/multibyte2.d	2021-11-18 16:50:42.542075585 +0000
@@ -0,0 +1,3 @@
+#source: multibyte.s
+#as: --multibyte-handling=warn-sym-only
+#warning_output: multibyte2.l
diff -rupN binutils.orig/gas/testsuite/gas/all/multibyte2.l binutils-2.37/gas/testsuite/gas/all/multibyte2.l
--- binutils.orig/gas/testsuite/gas/all/multibyte2.l	1970-01-01 01:00:00.000000000 +0100
+++ binutils-2.37/gas/testsuite/gas/all/multibyte2.l	2021-11-18 16:50:42.541075589 +0000
@@ -0,0 +1,2 @@
+[^:]*: Assembler messages:
+[^:]*:3: Warning: symbol '.*' contains multibyte characters