From 332e9adf944e4ea232a855b1bf75ea4ddfd7e794 Mon Sep 17 00:00:00 2001 From: Ondrej Oprala Date: Wed, 5 Aug 2015 09:15:09 +0200 Subject: [PATCH] expand,unexpand: add multibyte support MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * NEWS: Mention the changes. * bootstrap.conf: Add mbfile to the list of modules. * configure.ac: Properly initialize mbfile. * po/POTFILES.in: Add new source file. * src/expand-core.c: Move functions common to both expand and unexpand to this file. * src/expand-core.h: Add function prototypes from expand-core.c. * src/expand.c (expand): Iterate over multibyte characters properly. * src/local.mk: Add expand-core.c to the lists of source codes for expand and unexpand * src/unexpand.c (unexpand): Iterate over multibyte characters properly. * tests/local.mk: Add new tests. * tests/{expand,unexpand}/mb.sh: New tests. Co-authored-by: Pádraig Brady --- bootstrap.conf | 1 + configure.ac | 2 + lib/mbfile.c | 3 + lib/mbfile.h | 255 +++++++++++++++++++++++++++++++++++++++++++++++++++ m4/mbfile.m4 | 14 +++ po/POTFILES.in | 1 + src/expand-core.c | 150 ++++++++++++++++++++++++++++++ src/expand-core.h | 41 +++++++++ src/expand.c | 186 ++++++++----------------------------- src/local.mk | 2 + src/unexpand.c | 195 ++++++++++----------------------------- tests/expand/mb.sh | 98 ++++++++++++++++++++ tests/local.mk | 2 + tests/unexpand/mb.sh | 97 ++++++++++++++++++++ 14 files changed, 750 insertions(+), 297 deletions(-) create mode 100644 lib/mbfile.c create mode 100644 lib/mbfile.h create mode 100644 m4/mbfile.m4 create mode 100644 src/expand-core.c create mode 100644 src/expand-core.h create mode 100755 tests/expand/mb.sh create mode 100755 tests/unexpand/mb.sh diff --git a/bootstrap.conf b/bootstrap.conf index ef1c078..ea8cebc 100644 --- a/bootstrap.conf +++ b/bootstrap.conf @@ -151,6 +151,7 @@ gnulib_modules=" maintainer-makefile malloc-gnu manywarnings + mbfile mbrlen mbrtowc mbsalign diff --git a/configure.ac 
b/configure.ac index 8dc2192..b8b5114 100644 --- a/configure.ac +++ b/configure.ac @@ -425,6 +425,8 @@ fi # I'm leaving it here for now. This whole thing needs to be modernized... gl_WINSIZE_IN_PTEM +gl_MBFILE + gl_HEADER_TIOCGWINSZ_IN_TERMIOS_H if test $gl_cv_sys_tiocgwinsz_needs_termios_h = no && \ diff --git a/po/POTFILES.in b/po/POTFILES.in index b3fe668..c594d20 100644 --- a/po/POTFILES.in +++ b/po/POTFILES.in @@ -57,6 +57,7 @@ src/dirname.c src/du.c src/echo.c src/env.c +src/expand-core.c src/expand.c src/expr.c src/factor.c diff --git a/src/expand-core.c b/src/expand-core.c new file mode 100644 index 0000000..c8445db --- /dev/null +++ b/src/expand-core.c @@ -0,0 +1,150 @@ +/* expand-core.c - elementary functions for the expand and unexpand utilities + Copyright (C) 1989-2015 Free Software Foundation, Inc. + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. */ + +#include <config.h> + +#include <stdio.h> +#include <sys/types.h> + +#include "system.h" +#include "error.h" +#include "fadvise.h" +#include "quote.h" +#include "xstrndup.h" + +#include "expand-core.h" + +/* Add the comma or blank separated list of tab stops STOPS + to the list of tab stops. 
*/ + +extern void +parse_tab_stops (char const *stops, void (*add_tab_stop)(uintmax_t)) +{ + bool have_tabval = false; + uintmax_t tabval IF_LINT ( = 0); + char const *num_start IF_LINT ( = NULL); + bool ok = true; + + for (; *stops; stops++) + { + if (*stops == ',' || isblank (to_uchar (*stops))) + { + if (have_tabval) + add_tab_stop (tabval); + have_tabval = false; + } + else if (ISDIGIT (*stops)) + { + if (!have_tabval) + { + tabval = 0; + have_tabval = true; + num_start = stops; + } + + /* Detect overflow. */ + if (!DECIMAL_DIGIT_ACCUMULATE (tabval, *stops - '0', uintmax_t)) + { + size_t len = strspn (num_start, "0123456789"); + char *bad_num = xstrndup (num_start, len); + error (0, 0, _("tab stop is too large %s"), quote (bad_num)); + free (bad_num); + ok = false; + stops = num_start + len - 1; + } + } + else + { + error (0, 0, _("tab size contains invalid character(s): %s"), + quote (stops)); + ok = false; + break; + } + } + + if (!ok) + exit (EXIT_FAILURE); + + if (have_tabval) + add_tab_stop (tabval); +} + +/* Check that the list of tab stops TABS, with ENTRIES entries, + contains only nonzero, ascending values. */ + +extern void +validate_tab_stops (uintmax_t const *tabs, size_t entries) +{ + uintmax_t prev_tab = 0; + size_t i; + + for (i = 0; i < entries; i++) + { + if (tabs[i] == 0) + error (EXIT_FAILURE, 0, _("tab size cannot be 0")); + if (tabs[i] <= prev_tab) + error (EXIT_FAILURE, 0, _("tab sizes must be ascending")); + prev_tab = tabs[i]; + } +} + +/* Close the old stream pointer FP if it is non-NULL, + and return a new one opened to read the next input file. + Open a filename of '-' as the standard input. + Return NULL if there are no more input files. */ + +extern FILE * +next_file (FILE *fp) +{ + static char *prev_file; + char *file; + + if (fp) + { + if (ferror (fp)) + { + error (0, errno, "%s", quotef (prev_file)); + exit_status = EXIT_FAILURE; + } + if (STREQ (prev_file, "-")) + clearerr (fp); /* Also clear EOF. 
*/ + else if (fclose (fp) != 0) + { + error (0, errno, "%s", quotef (prev_file)); + exit_status = EXIT_FAILURE; + } + } + + while ((file = *file_list++) != NULL) + { + if (STREQ (file, "-")) + { + have_read_stdin = true; + fp = stdin; + } + else + fp = fopen (file, "r"); + if (fp) + { + prev_file = file; + fadvise (fp, FADVISE_SEQUENTIAL); + return fp; + } + error (0, errno, "%s", quotef (file)); + exit_status = EXIT_FAILURE; + } + return NULL; +} diff --git a/src/expand-core.h b/src/expand-core.h new file mode 100644 index 0000000..2419407 --- /dev/null +++ b/src/expand-core.h @@ -0,0 +1,41 @@ +/* expand-core.h - function prototypes for the expand and unexpand utilities + Copyright (C) 1989-2015 Free Software Foundation, Inc. + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. 
*/ + +#ifndef EXPAND_CORE_H_ +# define EXPAND_CORE_H_ + +extern size_t first_free_tab; + +extern size_t n_tabs_allocated; + +extern uintmax_t *tab_list; + +extern int exit_status; + +extern char **file_list; + +extern bool have_read_stdin; + +void +parse_tab_stops (char const *stops, void (*add_tab_stop)(uintmax_t)); + +void +validate_tab_stops (uintmax_t const *tabs, size_t entries); + +FILE * +next_file (FILE *fp); + +#endif /* EXPAND_CORE_H_ */ diff --git a/src/expand.c b/src/expand.c index 0a40a1a..ed97fd4 100644 --- a/src/expand.c +++ b/src/expand.c @@ -37,12 +37,16 @@ #include <stdio.h> #include <getopt.h> #include <sys/types.h> + +#include <mbfile.h> + #include "system.h" #include "error.h" #include "fadvise.h" -#include "quote.h" #include "xstrndup.h" +#include "expand-core.h" + /* The official name of this program (e.g., no 'g' prefix). */ #define PROGRAM_NAME "expand" @@ -58,17 +62,17 @@ static uintmax_t tab_size; /* Array of the explicit column numbers of the tab stops; after 'tab_list' is exhausted, each additional tab is replaced by a space. The first column is column 0. */ -static uintmax_t *tab_list; +uintmax_t *tab_list; /* The number of allocated entries in 'tab_list'. */ -static size_t n_tabs_allocated; +size_t n_tabs_allocated; /* The index of the first invalid element of 'tab_list', where the next element can be added. */ -static size_t first_free_tab; +size_t first_free_tab; /* Null-terminated array of input filenames. */ -static char **file_list; +char **file_list; /* Default for 'file_list' if no files are given on the command line. */ static char *stdin_argv[] = @@ -77,10 +81,10 @@ static char *stdin_argv[] = }; /* True if we have ever read standard input. */ -static bool have_read_stdin; +bool have_read_stdin; /* The desired exit status. 
*/ -static int exit_status; +int exit_status; static char const shortopts[] = "it:0::1::2::3::4::5::6::7::8::9::"; @@ -135,128 +139,6 @@ add_tab_stop (uintmax_t tabval) tab_list[first_free_tab++] = tabval; } -/* Add the comma or blank separated list of tab stops STOPS - to the list of tab stops. */ - -static void -parse_tab_stops (char const *stops) -{ - bool have_tabval = false; - uintmax_t tabval IF_LINT ( = 0); - char const *num_start IF_LINT ( = NULL); - bool ok = true; - - for (; *stops; stops++) - { - if (*stops == ',' || isblank (to_uchar (*stops))) - { - if (have_tabval) - add_tab_stop (tabval); - have_tabval = false; - } - else if (ISDIGIT (*stops)) - { - if (!have_tabval) - { - tabval = 0; - have_tabval = true; - num_start = stops; - } - - /* Detect overflow. */ - if (!DECIMAL_DIGIT_ACCUMULATE (tabval, *stops - '0', uintmax_t)) - { - size_t len = strspn (num_start, "0123456789"); - char *bad_num = xstrndup (num_start, len); - error (0, 0, _("tab stop is too large %s"), quote (bad_num)); - free (bad_num); - ok = false; - stops = num_start + len - 1; - } - } - else - { - error (0, 0, _("tab size contains invalid character(s): %s"), - quote (stops)); - ok = false; - break; - } - } - - if (!ok) - exit (EXIT_FAILURE); - - if (have_tabval) - add_tab_stop (tabval); -} - -/* Check that the list of tab stops TABS, with ENTRIES entries, - contains only nonzero, ascending values. */ - -static void -validate_tab_stops (uintmax_t const *tabs, size_t entries) -{ - uintmax_t prev_tab = 0; - size_t i; - - for (i = 0; i < entries; i++) - { - if (tabs[i] == 0) - error (EXIT_FAILURE, 0, _("tab size cannot be 0")); - if (tabs[i] <= prev_tab) - error (EXIT_FAILURE, 0, _("tab sizes must be ascending")); - prev_tab = tabs[i]; - } -} - -/* Close the old stream pointer FP if it is non-NULL, - and return a new one opened to read the next input file. - Open a filename of '-' as the standard input. - Return NULL if there are no more input files. 
*/ - -static FILE * -next_file (FILE *fp) -{ - static char *prev_file; - char *file; - - if (fp) - { - if (ferror (fp)) - { - error (0, errno, "%s", quotef (prev_file)); - exit_status = EXIT_FAILURE; - } - if (STREQ (prev_file, "-")) - clearerr (fp); /* Also clear EOF. */ - else if (fclose (fp) != 0) - { - error (0, errno, "%s", quotef (prev_file)); - exit_status = EXIT_FAILURE; - } - } - - while ((file = *file_list++) != NULL) - { - if (STREQ (file, "-")) - { - have_read_stdin = true; - fp = stdin; - } - else - fp = fopen (file, "r"); - if (fp) - { - prev_file = file; - fadvise (fp, FADVISE_SEQUENTIAL); - return fp; - } - error (0, errno, "%s", quotef (file)); - exit_status = EXIT_FAILURE; - } - return NULL; -} - /* Change tabs to spaces, writing to stdout. Read each file in 'file_list', in order. */ @@ -265,19 +147,19 @@ expand (void) { /* Input stream. */ FILE *fp = next_file (NULL); + mb_file_t mbf; + mbf_char_t c; if (!fp) return; + mbf_init (mbf, fp); + while (true) { - /* Input character, or EOF. */ - int c; - /* If true, perform translations. */ bool convert = true; - /* The following variables have valid values only when CONVERT is true: */ @@ -287,17 +169,23 @@ expand (void) /* Index in TAB_LIST of next tab stop to examine. */ size_t tab_index = 0; - /* Convert a line of text. */ do { - while ((c = getc (fp)) < 0 && (fp = next_file (fp))) - continue; + while (true) + { + mbf_getc (c, mbf); + if (!mb_iseof (c)) + break; + if (!(fp = next_file (fp))) + break; + mbf_init (mbf, fp); + } if (convert) { - if (c == '\t') + if (mb_iseq (c, '\t')) { /* Column the next input tab stop is on. */ uintmax_t next_tab_column; @@ -328,32 +216,34 @@ expand (void) if (putchar (' ') < 0) error (EXIT_FAILURE, errno, _("write error")); - c = ' '; + mb_setascii (&c, ' '); } - else if (c == '\b') + else if (mb_iseq (c, '\b')) { /* Go back one column, and force recalculation of the next tab stop. 
*/ column -= !!column; tab_index -= !!tab_index; } - else + /* A leading control character could make us trip over. */ + else if (!mb_iscntrl (c)) { - column++; + column += mb_width (c); if (!column) error (EXIT_FAILURE, 0, _("input line is too long")); } - convert &= convert_entire_line || !! isblank (c); + convert &= convert_entire_line || mb_isblank (c); } - if (c < 0) + if (mb_iseof (c)) return; - if (putchar (c) < 0) + mb_putc (c, stdout); + if (ferror (stdout)) error (EXIT_FAILURE, errno, _("write error")); } - while (c != '\n'); + while (!mb_iseq (c, '\n')); } } @@ -385,19 +275,19 @@ main (int argc, char **argv) break; case 't': - parse_tab_stops (optarg); + parse_tab_stops (optarg, add_tab_stop); break; case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9': if (optarg) - parse_tab_stops (optarg - 1); + parse_tab_stops (optarg - 1, add_tab_stop); else { char tab_stop[2]; tab_stop[0] = c; tab_stop[1] = '\0'; - parse_tab_stops (tab_stop); + parse_tab_stops (tab_stop, add_tab_stop); } break; diff --git a/src/local.mk b/src/local.mk index 536b7cc..bfede88 100644 --- a/src/local.mk +++ b/src/local.mk @@ -361,6 +361,8 @@ src_coreutils_SOURCES = src/coreutils.c src_cp_SOURCES = src/cp.c $(copy_sources) $(selinux_sources) src_dir_SOURCES = src/ls.c src/ls-dir.c +src_expand_SOURCES = src/expand.c src/expand-core.c +src_unexpand_SOURCES = src/unexpand.c src/expand-core.c src_vdir_SOURCES = src/ls.c src/ls-vdir.c src_id_SOURCES = src/id.c src/group-list.c src_groups_SOURCES = src/groups.c src/group-list.c diff --git a/src/unexpand.c b/src/unexpand.c index e0f7c22..48fbb32 100644 --- a/src/unexpand.c +++ b/src/unexpand.c @@ -38,12 +38,16 @@ #include <stdio.h> #include <getopt.h> #include <sys/types.h> + +#include <mbfile.h> + #include "system.h" #include "error.h" #include "fadvise.h" -#include "quote.h" #include "xstrndup.h" +#include "expand-core.h" + /* The official name of this program (e.g., no 'g' prefix). 
*/ #define PROGRAM_NAME "unexpand" @@ -62,17 +66,17 @@ static size_t max_column_width; /* Array of the explicit column numbers of the tab stops; after 'tab_list' is exhausted, the rest of the line is printed unchanged. The first column is column 0. */ -static uintmax_t *tab_list; +uintmax_t *tab_list; /* The number of allocated entries in 'tab_list'. */ -static size_t n_tabs_allocated; +size_t n_tabs_allocated; /* The index of the first invalid element of 'tab_list', where the next element can be added. */ -static size_t first_free_tab; +size_t first_free_tab; /* Null-terminated array of input filenames. */ -static char **file_list; +char **file_list; /* Default for 'file_list' if no files are given on the command line. */ static char *stdin_argv[] = @@ -81,10 +85,10 @@ static char *stdin_argv[] = }; /* True if we have ever read standard input. */ -static bool have_read_stdin; +bool have_read_stdin; /* The desired exit status. */ -static int exit_status; +int exit_status; /* For long options that have no equivalent short option, use a non-character as a pseudo short option, starting with CHAR_MAX + 1. */ @@ -154,128 +158,6 @@ add_tab_stop (uintmax_t tabval) } } -/* Add the comma or blank separated list of tab stops STOPS - to the list of tab stops. */ - -static void -parse_tab_stops (char const *stops) -{ - bool have_tabval = false; - uintmax_t tabval IF_LINT ( = 0); - char const *num_start IF_LINT ( = NULL); - bool ok = true; - - for (; *stops; stops++) - { - if (*stops == ',' || isblank (to_uchar (*stops))) - { - if (have_tabval) - add_tab_stop (tabval); - have_tabval = false; - } - else if (ISDIGIT (*stops)) - { - if (!have_tabval) - { - tabval = 0; - have_tabval = true; - num_start = stops; - } - - /* Detect overflow. 
*/ - if (!DECIMAL_DIGIT_ACCUMULATE (tabval, *stops - '0', uintmax_t)) - { - size_t len = strspn (num_start, "0123456789"); - char *bad_num = xstrndup (num_start, len); - error (0, 0, _("tab stop is too large %s"), quote (bad_num)); - free (bad_num); - ok = false; - stops = num_start + len - 1; - } - } - else - { - error (0, 0, _("tab size contains invalid character(s): %s"), - quote (stops)); - ok = false; - break; - } - } - - if (!ok) - exit (EXIT_FAILURE); - - if (have_tabval) - add_tab_stop (tabval); -} - -/* Check that the list of tab stops TABS, with ENTRIES entries, - contains only nonzero, ascending values. */ - -static void -validate_tab_stops (uintmax_t const *tabs, size_t entries) -{ - uintmax_t prev_tab = 0; - size_t i; - - for (i = 0; i < entries; i++) - { - if (tabs[i] == 0) - error (EXIT_FAILURE, 0, _("tab size cannot be 0")); - if (tabs[i] <= prev_tab) - error (EXIT_FAILURE, 0, _("tab sizes must be ascending")); - prev_tab = tabs[i]; - } -} - -/* Close the old stream pointer FP if it is non-NULL, - and return a new one opened to read the next input file. - Open a filename of '-' as the standard input. - Return NULL if there are no more input files. */ - -static FILE * -next_file (FILE *fp) -{ - static char *prev_file; - char *file; - - if (fp) - { - if (ferror (fp)) - { - error (0, errno, "%s", quotef (prev_file)); - exit_status = EXIT_FAILURE; - } - if (STREQ (prev_file, "-")) - clearerr (fp); /* Also clear EOF. */ - else if (fclose (fp) != 0) - { - error (0, errno, "%s", quotef (prev_file)); - exit_status = EXIT_FAILURE; - } - } - - while ((file = *file_list++) != NULL) - { - if (STREQ (file, "-")) - { - have_read_stdin = true; - fp = stdin; - } - else - fp = fopen (file, "r"); - if (fp) - { - prev_file = file; - fadvise (fp, FADVISE_SEQUENTIAL); - return fp; - } - error (0, errno, "%s", quotef (file)); - exit_status = EXIT_FAILURE; - } - return NULL; -} - /* Change blanks to tabs, writing to stdout. Read each file in 'file_list', in order. 
*/ @@ -284,11 +166,12 @@ unexpand (void) { /* Input stream. */ FILE *fp = next_file (NULL); + mb_file_t mbf; /* The array of pending blanks. In non-POSIX locales, blanks can include characters other than spaces, so the blanks must be stored, not merely counted. */ - char *pending_blank; + mbf_char_t *pending_blank; if (!fp) return; @@ -296,12 +179,14 @@ unexpand (void) /* The worst case is a non-blank character, then one blank, then a tab stop, then MAX_COLUMN_WIDTH - 1 blanks, then a non-blank; so allocate MAX_COLUMN_WIDTH bytes to store the blanks. */ - pending_blank = xmalloc (max_column_width); + pending_blank = xnmalloc (max_column_width, sizeof (mbf_char_t)); + + mbf_init (mbf, fp); while (true) { /* Input character, or EOF. */ - int c; + mbf_char_t c; /* If true, perform translations. */ bool convert = true; @@ -335,12 +220,19 @@ unexpand (void) do { - while ((c = getc (fp)) < 0 && (fp = next_file (fp))) - continue; + while (true) + { + mbf_getc (c, mbf); + if (!mb_iseof (c)) + break; + if (!(fp = next_file (fp))) + break; + mbf_init (mbf, fp); + } if (convert) { - bool blank = !! isblank (c); + bool blank = mb_isblank (c); if (blank) { @@ -372,16 +264,16 @@ unexpand (void) if (next_tab_column < column) error (EXIT_FAILURE, 0, _("input line is too long")); - if (c == '\t') + if (mb_iseq (c, '\t')) { column = next_tab_column; if (pending) - pending_blank[0] = '\t'; + mb_setascii (&pending_blank[0], '\t'); } else { - column++; + column += mb_width (c); if (! (prev_blank && column == next_tab_column)) { @@ -389,13 +281,14 @@ unexpand (void) will be replaced by tabs. */ if (column == next_tab_column) one_blank_before_tab_stop = true; - pending_blank[pending++] = c; + mb_copy (&pending_blank[pending++], &c); prev_blank = true; continue; } /* Replace the pending blanks by a tab or two. 
*/ - pending_blank[0] = c = '\t'; + mb_setascii (&c, '\t'); + mb_setascii (&pending_blank[0], '\t'); } /* Discard pending blanks, unless it was a single @@ -403,7 +296,7 @@ unexpand (void) pending = one_blank_before_tab_stop; } } - else if (c == '\b') + else if (mb_iseq (c, '\b')) { /* Go back one column, and force recalculation of the next tab stop. */ @@ -413,7 +306,7 @@ unexpand (void) } else { - column++; + column += mb_width (c); if (!column) error (EXIT_FAILURE, 0, _("input line is too long")); } @@ -421,9 +314,13 @@ unexpand (void) if (pending) { if (pending > 1 && one_blank_before_tab_stop) - pending_blank[0] = '\t'; - if (fwrite (pending_blank, 1, pending, stdout) != pending) + mb_setascii (&pending_blank[0], '\t'); + + for (int n = 0; n < pending; ++n) + mb_putc (pending_blank[n], stdout); + if (ferror (stdout)) error (EXIT_FAILURE, errno, _("write error")); + pending = 0; one_blank_before_tab_stop = false; } @@ -432,16 +329,16 @@ unexpand (void) convert &= convert_entire_line || blank; } - if (c < 0) + if (mb_iseof (c)) { free (pending_blank); return; } - - if (putchar (c) < 0) + mb_putc (c, stdout); + if (ferror (stdout)) error (EXIT_FAILURE, errno, _("write error")); } - while (c != '\n'); + while (!mb_iseq (c, '\n')); } } @@ -482,7 +379,7 @@ main (int argc, char **argv) break; case 't': convert_entire_line = true; - parse_tab_stops (optarg); + parse_tab_stops (optarg, add_tab_stop); break; case CONVERT_FIRST_ONLY_OPTION: convert_first_only = true; diff --git a/tests/expand/mb.sh b/tests/expand/mb.sh new file mode 100755 index 0000000..7971e18 --- /dev/null +++ b/tests/expand/mb.sh @@ -0,0 +1,98 @@ +#!/bin/sh + +# Copyright (C) 2012-2015 Free Software Foundation, Inc. + +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. 
+ +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. + +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . + +. "${srcdir=.}/tests/init.sh"; path_prepend_ ./src +print_ver_ expand + +export LC_ALL=en_US.UTF-8 + +#input containing multibyte characters +cat <<\EOF > in || framework_failure_ +1234567812345678123456781 +. . . . +a b c d +. . . . +ä ö ü ß +. . . . +EOF +env printf ' äöü\t. öüä. \tä xx\n' >> in || framework_failure_ + +cat <<\EOF > exp || framework_failure_ +1234567812345678123456781 +. . . . +a b c d +. . . . +ä ö ü ß +. . . . + äöü . öüä. ä xx +EOF + +expand < in > out || fail=1 +compare exp out > /dev/null 2>&1 || fail=1 + +#test characters with display widths != 1 +env printf '12345678 +e\t|ascii(1) +\u00E9\t|composed(1) +e\u0301\t|decomposed(1) +\u3000\t|ideo-space(2) +\uFF0D\t|full-hypen(2) +' > in || framework_failure_ + +env printf '12345678 +e |ascii(1) +\u00E9 |composed(1) +e\u0301 |decomposed(1) +\u3000 |ideo-space(2) +\uFF0D |full-hypen(2) +' > exp || framework_failure_ + +expand < in > out || fail=1 +compare exp out > /dev/null 2>&1 || fail=1 + +#shouldn't fail with "input line too long" +#when a line starts with a control character +env printf '\n' > in || framework_failure_ + +expand < in > out || fail=1 +compare in out > /dev/null 2>&1 || fail=1 + +#non-Unicode characters interspersed between Unicode ones +env printf '12345678 +\t\xFF| +\xFF\t| +\t\xFFä| +ä\xFF\t| +\tä\xFF| +\xFF\tä| +äbcdef\xFF\t| +' > in || framework_failure_ + +env printf '12345678 + \xFF| +\xFF | + \xFFä| +ä\xFF | + ä\xFF| +\xFF ä| +äbcdef\xFF | +' > exp || framework_failure_ + +expand < in > out || fail=1 +compare exp out > /dev/null 2>&1 || fail=1 + +exit $fail diff --git a/tests/local.mk b/tests/local.mk index 
7df04da..d3462be 100644 --- a/tests/local.mk +++ b/tests/local.mk @@ -536,6 +536,7 @@ all_tests = \ tests/du/threshold.sh \ tests/du/trailing-slash.sh \ tests/du/two-args.sh \ + tests/expand/mb.sh \ tests/id/gnu-zero-uids.sh \ tests/id/no-context.sh \ tests/id/context.sh \ @@ -674,6 +675,7 @@ all_tests = \ tests/touch/read-only.sh \ tests/touch/relative.sh \ tests/touch/trailing-slash.sh \ + tests/unexpand/mb.sh \ $(all_root_tests) # See tests/factor/create-test.sh. diff --git a/tests/unexpand/mb.sh b/tests/unexpand/mb.sh new file mode 100755 index 0000000..60d4c1a --- /dev/null +++ b/tests/unexpand/mb.sh @@ -0,0 +1,97 @@ +#!/bin/sh + +# Copyright (C) 2012-2015 Free Software Foundation, Inc. + +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. + +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. + +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . + +. "${srcdir=.}/tests/init.sh"; path_prepend_ ./src +print_ver_ unexpand + +export LC_ALL=en_US.UTF-8 + +#input containing multibyte characters +cat > in <<\EOF +1234567812345678123456781 +. . . . +a b c d +. . . . +ä ö ü ß +. . . . + äöü . öüä. ä xx +EOF + +cat > exp <<\EOF +1234567812345678123456781 +. . . . +a b c d +. . . . +ä ö ü ß +. . . . + äöü . öüä. 
ä xx +EOF + +unexpand -a < in > out || fail=1 +compare exp out > /dev/null 2>&1 || fail=1 + +#test characters with a display width larger than 1 + +env printf '12345678 +e |ascii(1) +\u00E9 |composed(1) +e\u0301 |decomposed(1) +\u3000 |ideo-space(2) +\uFF0D |full-hypen(2) +' > in || framework_failure_ + +env printf '12345678 +e\t|ascii(1) +\u00E9\t|composed(1) +e\u0301\t|decomposed(1) +\u3000\t|ideo-space(2) +\uFF0D\t|full-hypen(2) +' > exp || framework_failure_ + +unexpand -a < in > out || fail=1 +compare exp out > /dev/null 2>&1 || fail=1 + +#test input where a blank of width > 1 is not being substituted +in="$(LC_ALL=en_US.UTF-8 printf ' \u3000 ö ü ß')" +exp='   ö ü ß' + +unexpand -a < in > out || fail=1 +compare exp out > /dev/null 2>&1 || fail=1 + +#non-Unicode characters interspersed between Unicode ones +env printf '12345678 + \xFF| +\xFF | + \xFFä| +ä\xFF | + ä\xFF| +\xFF ä| +äbcdef\xFF | +' > in || framework_failure_ + +env printf '12345678 +\t\xFF| +\xFF\t| +\t\xFFä| +ä\xFF\t| +\tä\xFF| +\xFF\tä| +äbcdef\xFF\t| +' > exp || framework_failure_ + +unexpand -a < in > out || fail=1 +compare exp out > /dev/null 2>&1 || fail=1 diff --git a/m4/mbfile.m4 b/m4/mbfile.m4 new file mode 100644 index 0000000..8589902 --- /dev/null +++ b/m4/mbfile.m4 @@ -0,0 +1,14 @@ +# mbfile.m4 serial 7 +dnl Copyright (C) 2005, 2008-2015 Free Software Foundation, Inc. +dnl This file is free software; the Free Software Foundation +dnl gives unlimited permission to copy and/or distribute it, +dnl with or without modifications, as long as this notice is preserved. + +dnl autoconf tests required for use of mbfile.h +dnl From Bruno Haible. 
+ +AC_DEFUN([gl_MBFILE], +[ + AC_REQUIRE([AC_TYPE_MBSTATE_T]) + : +]) diff --git a/lib/mbfile.c b/lib/mbfile.c new file mode 100644 index 0000000..b0a468e --- /dev/null +++ b/lib/mbfile.c @@ -0,0 +1,3 @@ +#include <config.h> +#define MBFILE_INLINE _GL_EXTERN_INLINE +#include "mbfile.h" diff --git a/lib/mbfile.h b/lib/mbfile.h new file mode 100644 index 0000000..11f1b12 --- /dev/null +++ b/lib/mbfile.h @@ -0,0 +1,255 @@ +/* Multibyte character I/O: macros for multi-byte encodings. + Copyright (C) 2001, 2005, 2009-2015 Free Software Foundation, Inc. + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. */ + +/* Written by Mitsuru Chinen <mchinen@yamato.ibm.com> + and Bruno Haible <bruno@clisp.org>. */ + +/* The macros in this file implement multi-byte character input from a + stream. + + mb_file_t + is the type for multibyte character input stream, usable for variable + declarations. + + mbf_char_t + is the type for multibyte character or EOF, usable for variable + declarations. + + mbf_init (mbf, stream) + initializes the MB_FILE for reading from stream. + + mbf_getc (mbc, mbf) + reads the next multibyte character from mbf and stores it in mbc. + + mb_iseof (mbc) + returns true if mbc represents the EOF value. + + Here are the function prototypes of the macros. 
+ + extern void mbf_init (mb_file_t mbf, FILE *stream); + extern void mbf_getc (mbf_char_t mbc, mb_file_t mbf); + extern bool mb_iseof (const mbf_char_t mbc); + */ + +#ifndef _MBFILE_H +#define _MBFILE_H 1 + +#include <assert.h> +#include <stdbool.h> +#include <stdio.h> +#include <string.h> + +/* Tru64 with Desktop Toolkit C has a bug: <stdio.h> must be included before + <wchar.h>. + BSD/OS 4.1 has a bug: <stdio.h> and <time.h> must be included before + <wchar.h>. */ +#include <stdio.h> +#include <time.h> +#include <wchar.h> + +#include "mbchar.h" + +#ifndef _GL_INLINE_HEADER_BEGIN + #error "Please include config.h first." +#endif +_GL_INLINE_HEADER_BEGIN +#ifndef MBFILE_INLINE +# define MBFILE_INLINE _GL_INLINE +#endif + +struct mbfile_multi { + FILE *fp; + bool eof_seen; + bool have_pushback; + mbstate_t state; + unsigned int bufcount; + char buf[MBCHAR_BUF_SIZE]; + struct mbchar pushback; +}; + +MBFILE_INLINE void +mbfile_multi_getc (struct mbchar *mbc, struct mbfile_multi *mbf) +{ + size_t bytes; + + /* If EOF has already been seen, don't use getc. This matters if + mbf->fp is connected to an interactive tty. */ + if (mbf->eof_seen) + goto eof; + + /* Return character pushed back, if there is one. */ + if (mbf->have_pushback) + { + mb_copy (mbc, &mbf->pushback); + mbf->have_pushback = false; + return; + } + + /* Before using mbrtowc, we need at least one byte. */ + if (mbf->bufcount == 0) + { + int c = getc (mbf->fp); + if (c == EOF) + { + mbf->eof_seen = true; + goto eof; + } + mbf->buf[0] = (unsigned char) c; + mbf->bufcount++; + } + + /* Handle most ASCII characters quickly, without calling mbrtowc(). */ + if (mbf->bufcount == 1 && mbsinit (&mbf->state) && is_basic (mbf->buf[0])) + { + /* These characters are part of the basic character set. ISO C 99 + guarantees that their wide character code is identical to their + char code. 
*/ + mbc->wc = mbc->buf[0] = mbf->buf[0]; + mbc->wc_valid = true; + mbc->ptr = &mbc->buf[0]; + mbc->bytes = 1; + mbf->bufcount = 0; + return; + } + + /* Use mbrtowc on an increasing number of bytes. Read only as many bytes + from mbf->fp as needed. 
This is needed to give reasonable interactive + behaviour when mbf->fp is connected to an interactive tty. */ + for (;;) + { + /* We don't know whether the 'mbrtowc' function updates the state when + it returns -2, - this is the ISO C 99 and glibc-2.2 behaviour - or + not - amended ANSI C, glibc-2.1 and Solaris 2.7 behaviour. We + don't have an autoconf test for this, yet. + The new behaviour would allow us to feed the bytes one by one into + mbrtowc. But the old behaviour forces us to feed all bytes since + the end of the last character into mbrtowc. Since we want to retry + with more bytes when mbrtowc returns -2, we must backup the state + before calling mbrtowc, because implementations with the new + behaviour will clobber it. */ + mbstate_t backup_state = mbf->state; + + bytes = mbrtowc (&mbc->wc, &mbf->buf[0], mbf->bufcount, &mbf->state); + + if (bytes == (size_t) -1) + { + /* An invalid multibyte sequence was encountered. */ + /* Return a single byte. */ + bytes = 1; + mbc->wc_valid = false; + break; + } + else if (bytes == (size_t) -2) + { + /* An incomplete multibyte character. */ + mbf->state = backup_state; + if (mbf->bufcount == MBCHAR_BUF_SIZE) + { + /* An overlong incomplete multibyte sequence was encountered. */ + /* Return a single byte. */ + bytes = 1; + mbc->wc_valid = false; + break; + } + else + { + /* Read one more byte and retry mbrtowc. */ + int c = getc (mbf->fp); + if (c == EOF) + { + /* An incomplete multibyte character at the end. */ + mbf->eof_seen = true; + bytes = mbf->bufcount; + mbc->wc_valid = false; + break; + } + mbf->buf[mbf->bufcount] = (unsigned char) c; + mbf->bufcount++; + } + } + else + { + if (bytes == 0) + { + /* A null wide character was encountered. */ + bytes = 1; + assert (mbf->buf[0] == '\0'); + assert (mbc->wc == 0); + } + mbc->wc_valid = true; + break; + } + } + + /* Return the multibyte sequence mbf->buf[0..bytes-1]. 
*/ + mbc->ptr = &mbc->buf[0]; + memcpy (&mbc->buf[0], &mbf->buf[0], bytes); + mbc->bytes = bytes; + + mbf->bufcount -= bytes; + if (mbf->bufcount > 0) + { + /* It's not worth calling memmove() for so few bytes. */ + unsigned int count = mbf->bufcount; + char *p = &mbf->buf[0]; + + do + { + *p = *(p + bytes); + p++; + } + while (--count > 0); + } + return; + +eof: + /* An mbchar_t with bytes == 0 is used to indicate EOF. */ + mbc->ptr = NULL; + mbc->bytes = 0; + mbc->wc_valid = false; + return; +} + +MBFILE_INLINE void +mbfile_multi_ungetc (const struct mbchar *mbc, struct mbfile_multi *mbf) +{ + mb_copy (&mbf->pushback, mbc); + mbf->have_pushback = true; +} + +typedef struct mbfile_multi mb_file_t; + +typedef mbchar_t mbf_char_t; + +#define mbf_init(mbf, stream) \ + ((mbf).fp = (stream), \ + (mbf).eof_seen = false, \ + (mbf).have_pushback = false, \ + memset (&(mbf).state, '\0', sizeof (mbstate_t)), \ + (mbf).bufcount = 0) + +#define mbf_getc(mbc, mbf) mbfile_multi_getc (&(mbc), &(mbf)) + +#define mbf_ungetc(mbc, mbf) mbfile_multi_ungetc (&(mbc), &(mbf)) + +#define mb_iseof(mbc) ((mbc).bytes == 0) + +#ifndef _GL_INLINE_HEADER_BEGIN + #error "Please include config.h first." +#endif +_GL_INLINE_HEADER_END + +#endif /* _MBFILE_H */ -- 2.5.5