Use the new i18n implementation for the cut utility

This commit is contained in:
Ondrej Oprala 2015-12-07 09:59:13 +01:00
parent 452aa4a7e3
commit 01067b2813
3 changed files with 590 additions and 574 deletions

583
coreutils-i18n-cut.patch Normal file
View File

@ -0,0 +1,583 @@
--- coreutils-8.24/src/cut.c 2015-06-26 19:05:22.000000000 +0200
+++ cut.c 2016-01-15 10:15:04.863804121 +0100
@@ -28,6 +28,11 @@
#include <assert.h>
#include <getopt.h>
#include <sys/types.h>
+
+#include <mbfile.h>
+#include <mbiter.h>
+#include <string.h>
+
#include "system.h"
#include "error.h"
@@ -90,25 +95,16 @@ add_range_pair (size_t lo, size_t hi)
++n_rp;
}
-/* This buffer is used to support the semantics of the -s option
- (or lack of same) when the specified field list includes (does
- not include) the first field. In both of those cases, the entire
- first field must be read into this buffer to determine whether it
- is followed by a delimiter or a newline before any of it may be
- output. Otherwise, cut_fields can do the job without using this
- buffer. */
-static char *field_1_buffer;
-
-/* The number of bytes allocated for FIELD_1_BUFFER. */
-static size_t field_1_bufsize;
-
enum operating_mode
{
undefined_mode,
- /* Output characters that are in the given bytes. */
+ /* Output the given bytes. */
byte_mode,
+ /* Output characters that are in the given positions. */
+ char_mode,
+
/* Output the given delimiter-separated fields. */
field_mode
};
@@ -120,12 +116,16 @@ static enum operating_mode operating_mod
with field mode. */
static bool suppress_non_delimited;
+/* Unless true, we do not recognize multibyte characters in byte-splitting
+ mode. */
+static bool no_break_mb_chars;
+
/* If true, print all bytes, characters, or fields _except_
those that were specified. */
static bool complement;
/* The delimiter character for field mode. */
-static unsigned char delim;
+static mbf_char_t delim;
/* True if the --output-delimiter=STRING option was specified. */
static bool output_delimiter_specified;
@@ -135,7 +135,7 @@ static size_t output_delimiter_length;
/* The output field separator string. Defaults to the 1-character
string consisting of the input delimiter. */
-static char *output_delimiter_string;
+static char const *output_delimiter_string;
/* True if we have ever read standard input. */
static bool have_read_stdin;
@@ -189,7 +189,7 @@ Print selected parts of lines from each
-f, --fields=LIST select only these fields; also print any line\n\
that contains no delimiter character, unless\n\
the -s option is specified\n\
- -n (ignored)\n\
+ -n with -b, don't split multibyte characters\n\
"), stdout);
fputs (_("\
--complement complement the set of selected bytes, characters\n\
@@ -435,6 +435,12 @@ next_item (size_t *item_idx)
current_rp++;
}
+/* Advance *ITEM_IDX by N positions, performing one next_item step
+   per position.  */
+static inline void
+next_item_n (size_t *item_idx, size_t n)
+{
+  for (size_t remaining = n; remaining != 0; remaining--)
+    next_item (item_idx);
+}
/* Return nonzero if the K'th field or byte is printable. */
static inline bool
@@ -443,6 +449,15 @@ print_kth (size_t k)
return current_rp->lo <= k;
}
+/* Report whether the current range pair selects an item that occupies
+   positions LO through HI, both inclusive (the caller passes the first
+   and the last position taken up by the current character).  The result
+   is true when the current range contains LO or contains HI; a range
+   lying strictly between LO and HI is not reported.  */
+static inline bool
+rp_intersect (size_t lo, size_t hi)
+{
+  bool const covers_lo = current_rp->lo <= lo && lo <= current_rp->hi;
+  bool const covers_hi = current_rp->lo <= hi && hi <= current_rp->hi;
+
+  return covers_lo || covers_hi;
+}
+
/* Return nonzero if K'th byte is the beginning of a range. */
static inline bool
@@ -505,23 +520,216 @@ cut_bytes (FILE *stream)
}
/* Read from stream STREAM, printing to standard output any selected fields. */
+/* Read characters from STREAM into the array *LINEPTR, which holds
+   *LINESIZE mbf_char_t elements, until DELIM1 or DELIM2 has been read or
+   end of input is reached.  If *LINEPTR is null, a buffer is obtained
+   with malloc.  The buffer is enlarged with realloc as needed, but never
+   beyond NMAX elements; characters that no longer fit are discarded.
+   On return *LINEPTR and *LINESIZE describe the (possibly moved) buffer.
+   No terminator is appended.
+
+   Return the number of characters stored (the delimiter included when it
+   was stored), or -1 if end of input is reached before any character is
+   stored, if the buffer cannot hold anything, or if memory allocation
+   fails.  */
+extern ssize_t
+mb_getndelim2 (mbf_char_t **lineptr, size_t *linesize, size_t nmax,
+               mbf_char_t delim1, mbf_char_t delim2, mb_file_t *stream)
+{
+/* The maximum value that mb_getndelim2 can return without suffering from
+   overflow problems, either internally (because of pointer
+   subtraction overflow) or due to the API (because of ssize_t).  */
+#define GETNDELIM2_MAXIMUM (PTRDIFF_MAX < SSIZE_MAX ? PTRDIFF_MAX : SSIZE_MAX)
+
+/* Try to add at least this many elements when extending the buffer.
+   MIN_CHUNK must be no greater than GETNDELIM2_MAXIMUM.  */
+#define MIN_CHUNK 64
+  size_t nchars_avail;          /* Allocated but unused elements in *LINEPTR.  */
+  mbf_char_t *read_pos;         /* Where we're reading into *LINEPTR.  */
+  ssize_t chars_stored = -1;
+  mbf_char_t *ptr = *lineptr;
+  size_t size = *linesize;
+  bool found_delimiter;
+
+  if (!ptr)
+    {
+      size = nmax < MIN_CHUNK ? nmax : MIN_CHUNK;
+      ptr = malloc (size * sizeof *ptr);
+      if (!ptr)
+        return -1;
+    }
+
+  nchars_avail = size;
+  read_pos = ptr;
+
+  if (nchars_avail == 0 && nmax <= size)
+    goto done;
+
+  /* Normalize the delimiters so that an EOF placeholder never takes part
+     in the comparisons below.  */
+  if (mb_iseof (delim1))
+    mb_copy (&delim1, &delim2);
+  else if (mb_iseof (delim2))
+    mb_copy (&delim2, &delim1);
+
+  /* An mb_file_t is not a FILE: lock the stdio stream it wraps.  */
+  flockfile (stream->fp);
+
+  found_delimiter = false;
+  do
+    {
+      /* Here always ptr + size == read_pos + nchars_avail.  */
+      mbf_char_t c;
+
+      mbf_getc (c, *stream);
+      if (mb_iseof (c))
+        {
+          /* Return partial line, if any.  */
+          if (read_pos == ptr)
+            goto unlock_done;
+          else
+            break;
+        }
+      if (mb_equal (c, delim1) || mb_equal (c, delim2))
+        found_delimiter = true;
+
+      if (!nchars_avail)
+        {
+          /* Grow size proportionally, not linearly, to avoid O(n^2)
+             running time.  */
+          size_t newsize = size < MIN_CHUNK ? size + MIN_CHUNK : 2 * size;
+          mbf_char_t *newptr;
+
+          /* Respect nmax.  This handles possible integer overflow.  */
+          if (! (size < newsize && newsize <= nmax))
+            newsize = nmax;
+
+          if (GETNDELIM2_MAXIMUM < newsize)
+            {
+              /* Convert before adding 1 so the sum is computed unsigned.  */
+              size_t newsizemax = (size_t) GETNDELIM2_MAXIMUM + 1;
+              if (size == newsizemax)
+                goto unlock_done;
+              newsize = newsizemax;
+            }
+
+          /* The byte count handed to realloc must not wrap around.  */
+          if (SIZE_MAX / sizeof *ptr < newsize)
+            goto unlock_done;
+
+          nchars_avail = newsize - (read_pos - ptr);
+          newptr = realloc (ptr, newsize * sizeof *ptr);
+          if (!newptr)
+            goto unlock_done;
+          ptr = newptr;
+          size = newsize;
+          read_pos = size - nchars_avail + ptr;
+        }
+
+      /* Store C whenever there is room.  No slot is held back for a
+         terminator, since none is written; holding one back would keep
+         nchars_avail from ever reaching zero and so prevent the buffer
+         from growing.  Once the buffer has reached NMAX elements, further
+         characters are dropped.  */
+      if (nchars_avail)
+        {
+          mb_copy (read_pos++, &c);
+          --nchars_avail;
+        }
+    }
+  while (!found_delimiter);
+
+  chars_stored = (read_pos - ptr);
+
+ unlock_done:
+  funlockfile (stream->fp);
+
+ done:
+  *lineptr = ptr;
+  *linesize = size;
+  return chars_stored;
+}
+
+/* Read from stream STREAM one multibyte character at a time, printing to
+   standard output every character selected by the range list.  With -c
+   the list positions count characters.  With -b -n they count bytes, and
+   a character is printed as a whole when the current range contains its
+   first or its last byte.  */
+static void
+cut_chars (FILE *stream)
+{
+  /* True when list positions are byte offsets (-b -n); false when they
+     are character positions (-c, where -n has no effect).  */
+  bool const byte_positions = (operating_mode == byte_mode
+                               && no_break_mb_chars);
+  size_t char_idx = 0;          /* Position reached in the line so far.  */
+  bool print_delimiter = false;
+  mbf_char_t c;
+  mb_file_t mbf;
+
+  current_rp = rp;
+
+  mbf_init (mbf, stream);
+  while (true)
+    {
+      mbf_getc (c, mbf);
+
+      if (mb_iseq (c, '\n'))
+        {
+          putc ('\n', stdout);
+          char_idx = 0;
+          print_delimiter = false;
+          current_rp = rp;
+        }
+      else if (mb_iseof (c))
+        {
+          /* Terminate a final line that lacks its newline.  */
+          if (char_idx > 0)
+            putc ('\n', stdout);
+          break;
+        }
+      else
+        {
+          /* Number of list positions this character occupies: its byte
+             length when positions are bytes, otherwise exactly one.  */
+          size_t const width = byte_positions ? mb_len (c) : 1;
+
+          /* Step onto the first position of this character.  */
+          next_item (&char_idx);
+
+          /* Check whether the positions occupied by this character are
+             selected by the current range.  */
+          if (rp_intersect (char_idx, char_idx + width - 1))
+            {
+              if (output_delimiter_specified)
+                {
+                  if (print_delimiter && is_range_start_index (char_idx))
+                    {
+                      fwrite (output_delimiter_string, sizeof (char),
+                              output_delimiter_length, stdout);
+                    }
+                  print_delimiter = true;
+                }
+              mb_putc (c, stdout);
+            }
+
+          /* Step over the remaining positions of this character
+             (none when positions count characters).  */
+          next_item_n (&char_idx, width - 1);
+        }
+    }
+}
static void
cut_fields (FILE *stream)
{
- int c;
+
+ /* This buffer is used to support the semantics of the -s option
+ (or lack of same) when the specified field list includes (does
+ not include) the first field. In both of those cases, the entire
+ first field must be read into this buffer to determine whether it
+ is followed by a delimiter or a newline before any of it may be
+ output. Otherwise, cut_fields can do the job without using this
+ buffer. */
+ mbf_char_t *field_1_buffer = 0;
+ /* The number of mbf_char_t elements allocated for FIELD_1_BUFFER. */
+ size_t field_1_bufsize;
+
+
+ mbf_char_t c, d;
+ mb_file_t mbf;
size_t field_idx = 1;
bool found_any_selected_field = false;
bool buffer_first_field;
current_rp = rp;
- c = getc (stream);
- if (c == EOF)
+ mbf_init (mbf, stream);
+ mbf_getc (c, mbf);
+ if (mb_iseof (c))
return;
- ungetc (c, stream);
- c = 0;
+ mbf_ungetc (c, mbf);
+ mb_setascii (&c, 0);
+ mb_copy (&d, &delim);
/* To support the semantics of the -s flag, we may have to buffer
all of the first field to determine whether it is 'delimited.'
@@ -536,10 +744,14 @@ cut_fields (FILE *stream)
if (field_idx == 1 && buffer_first_field)
{
ssize_t len;
- size_t n_bytes;
+ size_t n_chars;
+ mbf_char_t nl;
+ mb_setascii (&nl, '\n');
+
+ len = mb_getndelim2 (&field_1_buffer, &field_1_bufsize,
+ GETNLINE_NO_LIMIT, d, nl, &mbf);
+
- len = getndelim2 (&field_1_buffer, &field_1_bufsize, 0,
- GETNLINE_NO_LIMIT, delim, '\n', stream);
if (len < 0)
{
free (field_1_buffer);
@@ -549,15 +761,15 @@ cut_fields (FILE *stream)
xalloc_die ();
}
- n_bytes = len;
- assert (n_bytes != 0);
+ n_chars = len;
+ //assert (n_chars != 0);
- c = 0;
+ mb_setascii (&c, 0);
/* If the first field extends to the end of line (it is not
delimited) and we are printing all non-delimited lines,
print this one. */
- if (to_uchar (field_1_buffer[n_bytes - 1]) != delim)
+ if (!mb_equal (field_1_buffer[n_chars - 1], d))
{
if (suppress_non_delimited)
{
@@ -565,26 +777,30 @@ cut_fields (FILE *stream)
}
else
{
- fwrite (field_1_buffer, sizeof (char), n_bytes, stdout);
+ for (int i = 0; i < n_chars; ++i)
+ mb_putc (field_1_buffer[i], stdout);
+
/* Make sure the output line is newline terminated. */
- if (field_1_buffer[n_bytes - 1] != '\n')
+ if (!mb_iseq (field_1_buffer[n_chars - 1], '\n'))
putchar ('\n');
- c = '\n';
+ mb_setascii (&c,'\n');
}
continue;
}
if (print_kth (1))
{
/* Print the field, but not the trailing delimiter. */
- fwrite (field_1_buffer, sizeof (char), n_bytes - 1, stdout);
+ for (int i = 0; i < n_chars - 1; ++i)
+ mb_putc (field_1_buffer[i], stdout);
/* With -d$'\n' don't treat the last '\n' as a delimiter. */
- if (delim == '\n')
+ if (mb_iseq (d, '\n'))
{
- int last_c = getc (stream);
- if (last_c != EOF)
+ mbf_char_t last_c;
+ mbf_getc (last_c, mbf);
+ if (!mb_iseof (last_c))
{
- ungetc (last_c, stream);
+ mbf_ungetc (last_c, mbf);
found_any_selected_field = true;
}
}
@@ -594,7 +810,8 @@ cut_fields (FILE *stream)
next_item (&field_idx);
}
- int prev_c = c;
+ mbf_char_t prev_c;
+ mb_copy (&prev_c, &c);
if (print_kth (field_idx))
{
@@ -605,41 +822,46 @@ cut_fields (FILE *stream)
}
found_any_selected_field = true;
- while ((c = getc (stream)) != delim && c != '\n' && c != EOF)
+ mbf_getc (c, mbf);
+ while (!mb_equal (c, d) && !mb_iseq (c, '\n') && !mb_iseof (c))
{
- putchar (c);
- prev_c = c;
+ mb_putc (c, stdout);
+ mb_copy (&prev_c, &c);
+ mbf_getc (c, mbf);
}
}
else
{
- while ((c = getc (stream)) != delim && c != '\n' && c != EOF)
+ mbf_getc (c, mbf);
+ while (!mb_equal (c, d) && !mb_iseq (c, '\n') && !mb_iseof (c))
{
- prev_c = c;
+ mb_copy (&prev_c, &c);
+ mbf_getc (c, mbf);
}
}
/* With -d$'\n' don't treat the last '\n' as a delimiter. */
- if (delim == '\n' && c == delim)
+ if (mb_iseq (d, '\n') && mb_equal (c, d))
{
- int last_c = getc (stream);
- if (last_c != EOF)
- ungetc (last_c, stream);
+ mbf_char_t last_c;
+ mbf_getc (last_c, mbf);
+ if (!mb_iseof (last_c))
+ mbf_ungetc (last_c, mbf);
else
- c = last_c;
+ mb_copy (&c, &last_c);
}
- if (c == delim)
+ if (mb_equal (c, d))
next_item (&field_idx);
- else if (c == '\n' || c == EOF)
+ else if (mb_iseq (c, '\n') || mb_iseof (c))
{
if (found_any_selected_field
|| !(suppress_non_delimited && field_idx == 1))
{
- if (c == '\n' || prev_c != '\n' || delim == '\n')
+ if (mb_iseq (c, '\n') || !mb_iseq (prev_c, '\n') || mb_iseq (d, '\n'))
putchar ('\n');
}
- if (c == EOF)
+ if (mb_iseof (c))
break;
field_idx = 1;
current_rp = rp;
@@ -652,7 +874,14 @@ static void
cut_stream (FILE *stream)
{
if (operating_mode == byte_mode)
- cut_bytes (stream);
+ {
+ if (no_break_mb_chars)
+ cut_chars (stream);
+ else
+ cut_bytes (stream);
+ }
+ else if (operating_mode == char_mode)
+ cut_chars (stream);
else
cut_fields (stream);
}
@@ -706,6 +935,7 @@ main (int argc, char **argv)
bool ok;
bool delim_specified = false;
char *spec_list_string IF_LINT ( = NULL);
+ mbi_iterator_t iter;
initialize_main (&argc, &argv);
set_program_name (argv[0]);
@@ -719,8 +949,10 @@ main (int argc, char **argv)
/* By default, all non-delimited lines are printed. */
suppress_non_delimited = false;
+ /* Default behaviour for -b, unless -n is also specified. */
+ no_break_mb_chars = false;
- delim = '\0';
+ mb_setascii (&delim, '\0');
have_read_stdin = false;
while ((optc = getopt_long (argc, argv, "b:c:d:f:ns", longopts, NULL)) != -1)
@@ -728,7 +960,6 @@ main (int argc, char **argv)
switch (optc)
{
case 'b':
- case 'c':
/* Build the byte list. */
if (operating_mode != undefined_mode)
FATAL_ERROR (_("only one type of list may be specified"));
@@ -736,6 +967,14 @@ main (int argc, char **argv)
spec_list_string = optarg;
break;
+ case 'c':
+ /* Build the char list. */
+ if (operating_mode != undefined_mode)
+ FATAL_ERROR (_("only one type of list may be specified"));
+ operating_mode = char_mode;
+ spec_list_string = optarg;
+ break;
+
case 'f':
/* Build the field list. */
if (operating_mode != undefined_mode)
@@ -747,9 +986,15 @@ main (int argc, char **argv)
case 'd':
/* New delimiter. */
/* Interpret -d '' to mean 'use the NUL byte as the delimiter.' */
- if (optarg[0] != '\0' && optarg[1] != '\0')
+ mbi_init (iter, optarg, strlen (optarg));
+ if (!mbi_avail (iter))
+ mb_setascii (&delim, '\0');
+ else
+ mb_copy (&delim, &mbi_cur (iter));
+
+ mbi_advance (iter);
+ if (mbi_avail (iter))
FATAL_ERROR (_("the delimiter must be a single character"));
- delim = optarg[0];
delim_specified = true;
break;
@@ -763,6 +1008,7 @@ main (int argc, char **argv)
break;
case 'n':
+ no_break_mb_chars = true;
break;
case 's':
@@ -802,15 +1048,12 @@ main (int argc, char **argv)
}
if (!delim_specified)
- delim = '\t';
+ mb_setascii (&delim, '\t');
if (output_delimiter_string == NULL)
{
- static char dummy[2];
- dummy[0] = delim;
- dummy[1] = '\0';
- output_delimiter_string = dummy;
- output_delimiter_length = 1;
+ output_delimiter_string = mb_ptr (delim);
+ output_delimiter_length = mb_len (delim);
}
if (optind == argc)

View File

@ -23,579 +23,6 @@ diff -urNp coreutils-8.24-orig/lib/linebuffer.h coreutils-8.24/lib/linebuffer.h
};
/* Initialize linebuffer LINEBUFFER for use. */
diff -urNp coreutils-8.24-orig/src/cut.c coreutils-8.24/src/cut.c
--- coreutils-8.24-orig/src/cut.c 2015-06-26 19:05:22.000000000 +0200
+++ coreutils-8.24/src/cut.c 2015-07-05 09:04:33.028546950 +0200
@@ -28,6 +28,11 @@
#include <assert.h>
#include <getopt.h>
#include <sys/types.h>
+
+/* Get mbstate_t, mbrtowc(). */
+#if HAVE_WCHAR_H
+# include <wchar.h>
+#endif
#include "system.h"
#include "error.h"
@@ -37,6 +42,18 @@
#include "quote.h"
#include "xstrndup.h"
+/* MB_LEN_MAX is incorrectly defined to be 1 in at least one GCC
+ installation; work around this configuration error. */
+#if !defined MB_LEN_MAX || MB_LEN_MAX < 2
+# undef MB_LEN_MAX
+# define MB_LEN_MAX 16
+#endif
+
+/* Some systems, like BeOS, have multibyte encodings but lack mbstate_t. */
+#if HAVE_MBRTOWC && defined mbstate_t
+# define mbrtowc(pwc, s, n, ps) (mbrtowc) (pwc, s, n, 0)
+#endif
+
/* The official name of this program (e.g., no 'g' prefix). */
#define PROGRAM_NAME "cut"
@@ -53,6 +70,52 @@
} \
while (0)
+/* Refill the buffer BUF so that a whole multibyte character can be decoded.
+ When fewer than MB_LEN_MAX unread bytes remain and STREAM has reported
+ neither end-of-file nor an error, move the BUFLEN unread bytes at BUFPOS
+ to the start of BUF, append up to BUFSIZ bytes read from STREAM, and
+ reset BUFPOS to BUF. BUF therefore needs room for MB_LEN_MAX + BUFSIZ
+ bytes. The arguments are evaluated more than once. */
+#define REFILL_BUFFER(BUF, BUFPOS, BUFLEN, STREAM) \
+ do \
+ { \
+ if (BUFLEN < MB_LEN_MAX && !feof (STREAM) && !ferror (STREAM)) \
+ { \
+ memmove (BUF, BUFPOS, BUFLEN); \
+ BUFLEN += fread (BUF + BUFLEN, sizeof(char), BUFSIZ, STREAM); \
+ BUFPOS = BUF; \
+ } \
+ } \
+ while (0)
+
+/* Decode the multibyte character at BUFPOS into WC and store its length
+ in bytes in MBLENGTH. BUFPOS and BUFLEN are not modified; the caller
+ consumes the character by advancing them by MBLENGTH.
+ If BUFLEN is less than 1, WC is set to WEOF and nothing else is changed.
+ Otherwise CONVFAIL is set to false, or to true when the bytes do not
+ form a valid character; in the failing case STATE is restored and
+ MBLENGTH is 1. A decoded null character also yields MBLENGTH 1. */
+#define GET_NEXT_WC_FROM_BUFFER(WC, BUFPOS, BUFLEN, MBLENGTH, STATE, CONVFAIL) \
+ do \
+ { \
+ mbstate_t state_bak; \
+ \
+ if (BUFLEN < 1) \
+ { \
+ WC = WEOF; \
+ break; \
+ } \
+ \
+ /* Get a wide character. */ \
+ CONVFAIL = false; \
+ state_bak = STATE; \
+ MBLENGTH = mbrtowc ((wchar_t *)&WC, BUFPOS, BUFLEN, &STATE); \
+ \
+ switch (MBLENGTH) \
+ { \
+ case (size_t)-1: \
+ case (size_t)-2: \
+ CONVFAIL = true; \
+ STATE = state_bak; \
+ /* Fall through. */ \
+ \
+ case 0: \
+ MBLENGTH = 1; \
+ break; \
+ } \
+ } \
+ while (0)
+
struct range_pair
{
@@ -75,6 +138,8 @@ static size_t n_rp;
/* Number of `struct range_pair's allocated. */
static size_t n_rp_allocated;
+/* Length of the delimiter given as argument to -d. */
+size_t delimlen;
/* Append LOW, HIGH to the list RP of range pairs, allocating additional
space if necessary. Update global variable N_RP. When allocating,
@@ -106,15 +171,25 @@ enum operating_mode
{
undefined_mode,
- /* Output characters that are in the given bytes. */
+ /* Output bytes that are at the given positions. */
byte_mode,
+ /* Output characters that are at the given positions. */
+ character_mode,
+
/* Output the given delimiter-separated fields. */
field_mode
};
static enum operating_mode operating_mode;
+/* If nonzero, when in byte mode, don't split multibyte characters. */
+static int byte_mode_character_aware;
+
+/* If nonzero, use the functions written for single-byte locales
+ even when this program runs in a multibyte locale. */
+static int force_singlebyte_mode;
+
/* If true do not output lines containing no delimiter characters.
Otherwise, all such lines are printed. This option is valid only
with field mode. */
@@ -126,6 +201,9 @@ static bool complement;
/* The delimiter character for field mode. */
static unsigned char delim;
+#if HAVE_WCHAR_H
+static wchar_t wcdelim;
+#endif
/* True if the --output-delimiter=STRING option was specified. */
static bool output_delimiter_specified;
@@ -189,7 +267,7 @@ Print selected parts of lines from each
-f, --fields=LIST select only these fields; also print any line\n\
that contains no delimiter character, unless\n\
the -s option is specified\n\
- -n (ignored)\n\
+ -n with -b: don't split multibyte characters\n\
"), stdout);
fputs (_("\
--complement complement the set of selected bytes, characters\n\
@@ -380,6 +458,9 @@ set_fields (const char *fieldstr)
if (operating_mode == byte_mode)
error (0, 0,
_("byte offset %s is too large"), quote (bad_num));
+ else if (operating_mode == character_mode)
+ error (0, 0,
+ _("character offset %s is too large"), quote (bad_num));
else
error (0, 0,
_("field number %s is too large"), quote (bad_num));
@@ -504,6 +585,82 @@ cut_bytes (FILE *stream)
}
}
+#if HAVE_MBRTOWC
+/* This function serves the following two cases.
+
+ 1. Read from the stream STREAM, printing to standard output any selected
+ characters.
+
+ 2. Read from stream STREAM, printing to standard output any selected bytes,
+ without splitting multibyte characters.
+
+ In both cases the position counter advances once per decoded character
+ and each selected character is written with all of its bytes. */
+
+static void
+cut_characters_or_cut_bytes_no_split (FILE *stream)
+{
+ size_t idx; /* Number of characters decoded on the current line so far. */
+ char buf[MB_LEN_MAX + BUFSIZ]; /* For spooling a read byte sequence. */
+ char *bufpos; /* Next read position of BUF. */
+ size_t buflen; /* The length of the byte sequence in buf. */
+ wint_t wc; /* The wide character decoded most recently. */
+ size_t mblength; /* The length in bytes of the multibyte character
+ that was decoded into WC. */
+ mbstate_t state; /* Conversion state handed to mbrtowc. */
+ bool convfail = false; /* true, when conversion failed. Otherwise false. */
+ /* Whether to begin printing delimiters between ranges for the current line.
+ Set after we've begun printing data corresponding to the first range. */
+ bool print_delimiter = false;
+
+ idx = 0;
+ buflen = 0;
+ bufpos = buf;
+ memset (&state, '\0', sizeof(mbstate_t));
+
+ current_rp = rp;
+
+ while (1)
+ {
+ /* Top up BUF, then decode (without consuming) the next character. */
+ REFILL_BUFFER (buf, bufpos, buflen, stream);
+
+ GET_NEXT_WC_FROM_BUFFER (wc, bufpos, buflen, mblength, state, convfail);
+ (void) convfail; /* ignore unused */
+
+ if (wc == WEOF)
+ {
+ /* Terminate a final line that lacks its newline. */
+ if (idx > 0)
+ putchar ('\n');
+ break;
+ }
+ else if (wc == L'\n')
+ {
+ /* End of line: reset the per-line state. */
+ putchar ('\n');
+ idx = 0;
+ print_delimiter = false;
+ current_rp = rp;
+ }
+ else
+ {
+ next_item (&idx);
+ if (print_kth (idx))
+ {
+ if (output_delimiter_specified)
+ {
+ if (print_delimiter && is_range_start_index (idx))
+ {
+ fwrite (output_delimiter_string, sizeof (char),
+ output_delimiter_length, stdout);
+ }
+ print_delimiter = true;
+ }
+ /* Write the character's MBLENGTH bytes as a single item. */
+ fwrite (bufpos, mblength, sizeof(char), stdout);
+ }
+ }
+
+ /* Consume the character that was just examined. */
+ buflen -= mblength;
+ bufpos += mblength;
+ }
+}
+#endif
+
/* Read from stream STREAM, printing to standard output any selected fields. */
static void
@@ -648,13 +805,211 @@ cut_fields (FILE *stream)
}
}
+#if HAVE_MBRTOWC
+/* Read from stream STREAM, printing to standard output any selected
+   fields.  Input is decoded one multibyte character at a time, and
+   fields are separated by the wide character WCDELIM.  Bytes that do not
+   form a valid character are never treated as a delimiter or a newline.  */
+static void
+cut_fields_mb (FILE *stream)
+{
+  int c;
+  size_t field_idx;
+  int found_any_selected_field;
+  int buffer_first_field;
+  int empty_input;
+  char buf[MB_LEN_MAX + BUFSIZ]; /* For spooling a read byte sequence.  */
+  char *bufpos;         /* Next read position of BUF.  */
+  size_t buflen;        /* The length of the byte sequence in buf.  */
+  wint_t wc = 0;        /* The wide character decoded most recently.  */
+  size_t mblength;      /* The length in bytes of the multibyte character
+                           that was decoded into WC.  */
+  mbstate_t state;      /* Conversion state handed to mbrtowc.  */
+  bool convfail = false; /* true, when conversion failed.  Otherwise false.  */
+
+  current_rp = rp;
+
+  found_any_selected_field = 0;
+  field_idx = 1;
+  bufpos = buf;
+  buflen = 0;
+  memset (&state, '\0', sizeof(mbstate_t));
+
+  /* Peek at the first byte to find out whether the input is empty.  */
+  c = getc (stream);
+  empty_input = (c == EOF);
+  if (c != EOF)
+    {
+      ungetc (c, stream);
+      wc = 0;
+    }
+  else
+    wc = WEOF;
+
+  /* To support the semantics of the -s flag, we may have to buffer
+     all of the first field to determine whether it is `delimited.'
+     But that is unnecessary if all non-delimited lines must be printed
+     and the first field has been selected, or if non-delimited lines
+     must be suppressed and the first field has *not* been selected.
+     That is because a non-delimited line has exactly one field.  */
+  buffer_first_field = (suppress_non_delimited ^ !print_kth (1));
+
+  while (1)
+    {
+      if (field_idx == 1 && buffer_first_field)
+        {
+          int len = 0;
+
+          /* Collect the first field, including the delimiter or newline
+             that ends it, in FIELD_1_BUFFER.  */
+          while (1)
+            {
+              REFILL_BUFFER (buf, bufpos, buflen, stream);
+
+              GET_NEXT_WC_FROM_BUFFER
+                (wc, bufpos, buflen, mblength, state, convfail);
+
+              if (wc == WEOF)
+                break;
+
+              field_1_buffer = xrealloc (field_1_buffer, len + mblength);
+              memcpy (field_1_buffer + len, bufpos, mblength);
+              len += mblength;
+              buflen -= mblength;
+              bufpos += mblength;
+
+              if (!convfail && (wc == L'\n' || wc == wcdelim))
+                break;
+            }
+
+          if (len <= 0 && wc == WEOF)
+            break;
+
+          /* If the first field extends to the end of line (it is not
+             delimited) and we are printing all non-delimited lines,
+             print this one.  */
+          if (convfail || (!convfail && wc != wcdelim))
+            {
+              if (suppress_non_delimited)
+                {
+                  /* Empty.  */
+                }
+              else
+                {
+                  fwrite (field_1_buffer, sizeof (char), len, stdout);
+                  /* Make sure the output line is newline terminated.  */
+                  if (convfail || (!convfail && wc != L'\n'))
+                    putchar ('\n');
+                }
+              continue;
+            }
+
+          if (print_kth (1))
+            {
+              /* Print the field, but not the trailing delimiter.  The
+                 delimiter was the last character stored, so it occupies
+                 the final MBLENGTH bytes, which may be more than one.  */
+              fwrite (field_1_buffer, sizeof (char), len - mblength, stdout);
+              found_any_selected_field = 1;
+            }
+          next_item (&field_idx);
+        }
+
+      if (wc != WEOF)
+        {
+          if (print_kth (field_idx))
+            {
+              if (found_any_selected_field)
+                {
+                  fwrite (output_delimiter_string, sizeof (char),
+                          output_delimiter_length, stdout);
+                }
+              found_any_selected_field = 1;
+            }
+
+          /* Copy or skip the current field up to the next delimiter,
+             newline or end of input.  */
+          while (1)
+            {
+              REFILL_BUFFER (buf, bufpos, buflen, stream);
+
+              GET_NEXT_WC_FROM_BUFFER
+                (wc, bufpos, buflen, mblength, state, convfail);
+
+              if (wc == WEOF)
+                break;
+              else if (!convfail && (wc == wcdelim || wc == L'\n'))
+                {
+                  /* Consume the terminator without printing it.  */
+                  buflen -= mblength;
+                  bufpos += mblength;
+                  break;
+                }
+
+              if (print_kth (field_idx))
+                fwrite (bufpos, mblength, sizeof(char), stdout);
+
+              buflen -= mblength;
+              bufpos += mblength;
+            }
+        }
+
+      /* With nothing left in BUF, treat the position as end of input.  */
+      if ((!convfail || wc == L'\n') && buflen < 1)
+        wc = WEOF;
+
+      if (!convfail && wc == wcdelim)
+        next_item (&field_idx);
+      else if (wc == WEOF || (!convfail && wc == L'\n'))
+        {
+          if (found_any_selected_field
+              || (!empty_input && !(suppress_non_delimited && field_idx == 1)))
+            putchar ('\n');
+          if (wc == WEOF)
+            break;
+          /* Start over for the next line.  */
+          field_idx = 1;
+          current_rp = rp;
+          found_any_selected_field = 0;
+        }
+    }
+}
+#endif
+
static void
cut_stream (FILE *stream)
{
- if (operating_mode == byte_mode)
- cut_bytes (stream);
+#if HAVE_MBRTOWC
+ if (MB_CUR_MAX > 1 && !force_singlebyte_mode)
+ {
+ switch (operating_mode)
+ {
+ case byte_mode:
+ if (byte_mode_character_aware)
+ cut_characters_or_cut_bytes_no_split (stream);
+ else
+ cut_bytes (stream);
+ break;
+
+ case character_mode:
+ cut_characters_or_cut_bytes_no_split (stream);
+ break;
+
+ case field_mode:
+ if (delimlen == 1)
+ {
+ /* Check if we have utf8 multibyte locale, so we can use this
+ optimization because of uniqueness of characters, which is
+ not true for e.g. SJIS */
+ char * loc = setlocale(LC_CTYPE, NULL);
+ if (loc && (strstr (loc, "UTF-8") || strstr (loc, "utf-8") ||
+ strstr (loc, "UTF8") || strstr (loc, "utf8")))
+ {
+ cut_fields (stream);
+ break;
+ }
+ }
+ cut_fields_mb (stream);
+ break;
+
+ default:
+ abort ();
+ }
+ }
else
- cut_fields (stream);
+#endif
+ {
+ if (operating_mode == field_mode)
+ cut_fields (stream);
+ else
+ cut_bytes (stream);
+ }
}
/* Process file FILE to standard output.
@@ -706,6 +1061,7 @@ main (int argc, char **argv)
bool ok;
bool delim_specified = false;
char *spec_list_string IF_LINT ( = NULL);
+ char mbdelim[MB_LEN_MAX + 1];
initialize_main (&argc, &argv);
set_program_name (argv[0]);
@@ -728,7 +1084,6 @@ main (int argc, char **argv)
switch (optc)
{
case 'b':
- case 'c':
/* Build the byte list. */
if (operating_mode != undefined_mode)
FATAL_ERROR (_("only one type of list may be specified"));
@@ -736,6 +1091,14 @@ main (int argc, char **argv)
spec_list_string = optarg;
break;
+ case 'c':
+ /* Build the character list. */
+ if (operating_mode != undefined_mode)
+ FATAL_ERROR (_("only one type of list may be specified"));
+ operating_mode = character_mode;
+ spec_list_string = optarg;
+ break;
+
case 'f':
/* Build the field list. */
if (operating_mode != undefined_mode)
@@ -747,10 +1110,38 @@ main (int argc, char **argv)
case 'd':
/* New delimiter. */
/* Interpret -d '' to mean 'use the NUL byte as the delimiter.' */
- if (optarg[0] != '\0' && optarg[1] != '\0')
- FATAL_ERROR (_("the delimiter must be a single character"));
- delim = optarg[0];
- delim_specified = true;
+ {
+#if HAVE_MBRTOWC
+ if(MB_CUR_MAX > 1)
+ {
+ mbstate_t state;
+
+ memset (&state, '\0', sizeof(mbstate_t));
+ delimlen = mbrtowc (&wcdelim, optarg, strnlen(optarg, MB_LEN_MAX), &state);
+
+ if (delimlen == (size_t)-1 || delimlen == (size_t)-2)
+ ++force_singlebyte_mode;
+ else
+ {
+ delimlen = (delimlen < 1) ? 1 : delimlen;
+ if (wcdelim != L'\0' && *(optarg + delimlen) != '\0')
+ FATAL_ERROR (_("the delimiter must be a single character"));
+ memcpy (mbdelim, optarg, delimlen);
+ mbdelim[delimlen] = '\0';
+ if (delimlen == 1)
+ delim = *optarg;
+ }
+ }
+
+ if (MB_CUR_MAX <= 1 || force_singlebyte_mode)
+#endif
+ {
+ if (optarg[0] != '\0' && optarg[1] != '\0')
+ FATAL_ERROR (_("the delimiter must be a single character"));
+ delim = (unsigned char) optarg[0];
+ }
+ delim_specified = true;
+ }
break;
case OUTPUT_DELIMITER_OPTION:
@@ -763,6 +1154,7 @@ main (int argc, char **argv)
break;
case 'n':
+ byte_mode_character_aware = 1;
break;
case 's':
@@ -802,15 +1194,34 @@ main (int argc, char **argv)
}
if (!delim_specified)
- delim = '\t';
+ {
+ delim = '\t';
+#ifdef HAVE_MBRTOWC
+ wcdelim = L'\t';
+ mbdelim[0] = '\t';
+ mbdelim[1] = '\0';
+ delimlen = 1;
+#endif
+ }
if (output_delimiter_string == NULL)
{
- static char dummy[2];
- dummy[0] = delim;
- dummy[1] = '\0';
- output_delimiter_string = dummy;
- output_delimiter_length = 1;
+#ifdef HAVE_MBRTOWC
+ if (MB_CUR_MAX > 1 && !force_singlebyte_mode)
+ {
+ output_delimiter_string = xstrdup(mbdelim);
+ output_delimiter_length = delimlen;
+ }
+
+ if (MB_CUR_MAX <= 1 || force_singlebyte_mode)
+#endif
+ {
+ static char dummy[2];
+ dummy[0] = delim;
+ dummy[1] = '\0';
+ output_delimiter_string = dummy;
+ output_delimiter_length = 1;
+ }
}
if (optind == argc)
diff -urNp coreutils-8.24-orig/src/fold.c coreutils-8.24/src/fold.c
--- coreutils-8.24-orig/src/fold.c 2015-06-26 19:05:22.000000000 +0200
+++ coreutils-8.24/src/fold.c 2015-07-05 09:04:33.029546958 +0200

View File

@ -1,7 +1,7 @@
Summary: A set of basic GNU tools commonly used in shell scripts
Name: coreutils
Version: 8.24
Release: 106%{?dist}
Release: 107%{?dist}
License: GPLv3+
Group: System Environment/Base
Url: http://www.gnu.org/software/coreutils/
@ -52,6 +52,8 @@ Patch713: coreutils-4.5.3-langinfo.patch
Patch800: coreutils-i18n.patch
# (sb) lin18nux/lsb compliance - expand/unexpand
Patch801: coreutils-i18n-expand-unexpand.patch
# (sb) lin18nux/lsb compliance - cut
Patch802: coreutils-i18n-cut.patch
#getgrouplist() patch from Ulrich Drepper.
Patch908: coreutils-getgrouplist.patch
@ -184,6 +186,7 @@ including documentation and translations.
# li18nux/lsb
%patch800 -p1 -b .i18n
%patch801 -p1 -b .i18n-expand
%patch802 -p1 -b .i18n-cut
# Coreutils
%patch908 -p1 -b .getgrouplist
@ -356,6 +359,9 @@ fi
%license COPYING
%changelog
* Fri Jan 15 2016 Ondrej Oprala <ooprala@redhat.com> - 8.24-107
- Use the new i18n implementation for the cut utility
* Wed Jan 13 2016 Ondrej Vasik <ovasik@redhat.com> - 8.24-106
- mv: prevent dataloss when source dir is specified multiple
times (#1297464, by P.Brady)