coreutils/coreutils-8.25-sort-thousands-sep.patch

From c479153d77b419a6cae4551b63d2b73096c1130e Mon Sep 17 00:00:00 2001
From: Kamil Dudka <kdudka@redhat.com>
Date: Mon, 18 Jul 2016 19:04:43 +0200
Subject: [PATCH 1/3] maint: sort.c: deduplicate code for traversing numbers

* src/sort.c (traverse_raw_number): New function for traversing numbers.
(find_unit_order): Use traverse_raw_number() instead of open-coding it.
(debug_key): Likewise.
---
 src/sort.c | 63 ++++++++++++++++++++++++++++++++++----------------------------
 1 file changed, 35 insertions(+), 28 deletions(-)

diff --git a/src/sort.c b/src/sort.c
index 5b02343..e28bb6c 100644
--- a/src/sort.c
+++ b/src/sort.c
@@ -2231,18 +2231,16 @@ static char const unit_order[UCHAR_LIM] =
 #endif
   };

-/* Return an integer that represents the order of magnitude of the
-   unit following the number.  The number may contain thousands
-   separators and a decimal point, but it may not contain leading blanks.
-   Negative numbers get negative orders; zero numbers have a zero order.  */
-
-static int _GL_ATTRIBUTE_PURE
-find_unit_order (char const *number)
+/* Traverse number given as *number consisting of digits, thousands_sep, and
+   decimal_point chars only.  Returns the highest digit found in the number,
+   or '\0' if no digit has been found.  Upon return *number points at the
+   character that immediately follows after the given number.  */
+static unsigned char
+traverse_raw_number (char const **number)
 {
-  bool minus_sign = (*number == '-');
-  char const *p = number + minus_sign;
-  int nonzero = 0;
+  char const *p = *number;
   unsigned char ch;
+  unsigned char max_digit = '\0';

   /* Scan to end of number.
      Decimals or separators not followed by digits stop the scan.
@@ -2253,16 +2251,34 @@ find_unit_order (char const *number)
   do
     {
       while (ISDIGIT (ch = *p++))
-        nonzero |= ch - '0';
+        if (max_digit < ch)
+          max_digit = ch;
     }
   while (ch == thousands_sep);

   if (ch == decimal_point)
     while (ISDIGIT (ch = *p++))
-      nonzero |= ch - '0';
+      if (max_digit < ch)
+        max_digit = ch;
+
+  *number = p - 1;
+  return max_digit;
+}
+
+/* Return an integer that represents the order of magnitude of the
+   unit following the number.  The number may contain thousands
+   separators and a decimal point, but it may not contain leading blanks.
+   Negative numbers get negative orders; zero numbers have a zero order.  */

-  if (nonzero)
+static int _GL_ATTRIBUTE_PURE
+find_unit_order (char const *number)
+{
+  bool minus_sign = (*number == '-');
+  char const *p = number + minus_sign;
+  unsigned char max_digit = traverse_raw_number (&p);
+  if ('0' < max_digit)
     {
+      unsigned char ch = *p;
       int order = unit_order[ch];
       return (minus_sign ? -order : order);
     }
@@ -2655,23 +2671,14 @@ debug_key (struct line const *line, struct keyfield const *key)
             ignore_value (strtold (beg, &tighter_lim));
           else if (key->numeric || key->human_numeric)
             {
-              char *p = beg + (beg < lim && *beg == '-');
-              bool found_digit = false;
-              unsigned char ch;
-
-              do
+              char const *p = beg + (beg < lim && *beg == '-');
+              unsigned char max_digit = traverse_raw_number (&p);
+              if ('0' <= max_digit)
                 {
-                  while (ISDIGIT (ch = *p++))
-                    found_digit = true;
+                  unsigned char ch = *p;
+                  tighter_lim = (char *) p
+                    + (key->human_numeric && unit_order[ch]);
                 }
-              while (ch == thousands_sep);
-
-              if (ch == decimal_point)
-                while (ISDIGIT (ch = *p++))
-                  found_digit = true;
-
-              if (found_digit)
-                tighter_lim = p - ! (key->human_numeric && unit_order[ch]);
             }
           else
             tighter_lim = lim;
--
2.5.5


From 8c39465a5b0343ff7a21286dd69ed5430685d2f7 Mon Sep 17 00:00:00 2001
From: Kamil Dudka <kdudka@redhat.com>
Date: Mon, 18 Jul 2016 19:04:44 +0200
Subject: [PATCH 2/3] sort: make -h work with -k and blank used as thousands
 separator

* src/sort.c (traverse_raw_number): Allow to skip only one occurrence
of thousands_sep to avoid finding the unit in the next column in case
thousands_sep matches as blank and is used as column delimiter.
* tests/misc/sort-h-thousands-sep.sh: Add regression test for this bug.
* tests/local.mk: Reference the test.
* NEWS: Mention the bug fix.
Reported at https://bugzilla.redhat.com/1355780
Fixes http://bugs.gnu.org/24015
---
 src/sort.c                         | 14 ++++++++----
 tests/local.mk                     |  1 +
 tests/misc/sort-h-thousands-sep.sh | 47 ++++++++++++++++++++++++++++++++++++++
 3 files changed, 57 insertions(+), 5 deletions(-)
 create mode 100755 tests/misc/sort-h-thousands-sep.sh

diff --git a/src/sort.c b/src/sort.c
index e28bb6c..dd3ba58 100644
--- a/src/sort.c
+++ b/src/sort.c
@@ -2248,13 +2248,17 @@ traverse_raw_number (char const **number)
      to be lacking in units.
      FIXME: add support for multibyte thousands_sep and decimal_point.  */

-  do
+  while (ISDIGIT (ch = *p++))
     {
-      while (ISDIGIT (ch = *p++))
-        if (max_digit < ch)
-          max_digit = ch;
+      if (max_digit < ch)
+        max_digit = ch;
+
+      /* Allow to skip only one occurrence of thousands_sep to avoid finding
+         the unit in the next column in case thousands_sep matches as blank
+         and is used as column delimiter.  */
+      if (*p == thousands_sep)
+        ++p;
     }
-  while (ch == thousands_sep);

   if (ch == decimal_point)
     while (ISDIGIT (ch = *p++))
diff --git a/tests/local.mk b/tests/local.mk
index 42d39f2..dccff8d 100644
--- a/tests/local.mk
+++ b/tests/local.mk
@@ -344,6 +344,7 @@ all_tests =					\
   tests/misc/sort-discrim.sh			\
   tests/misc/sort-files0-from.pl		\
   tests/misc/sort-float.sh			\
+  tests/misc/sort-h-thousands-sep.sh		\
   tests/misc/sort-mb-tests.sh			\
   tests/i18n/sort.sh				\
   tests/misc/sort-merge.pl			\
diff --git a/tests/misc/sort-h-thousands-sep.sh b/tests/misc/sort-h-thousands-sep.sh
new file mode 100755
index 0000000..17f1b6c
--- /dev/null
+++ b/tests/misc/sort-h-thousands-sep.sh
@@ -0,0 +1,47 @@
+#!/bin/sh
+# exercise 'sort -h' in locales where thousands separator is blank
+
+# Copyright (C) 2016 Free Software Foundation, Inc.
+
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+
+# You should have received a copy of the GNU General Public License
+# along with this program.  If not, see <http://www.gnu.org/licenses/>.
+
+. "${srcdir=.}/tests/init.sh"; path_prepend_ ./src
+print_ver_ sort
+test "$(LC_ALL=sv_SE locale thousands_sep)" = ' ' \
+  || skip_ 'The Swedish locale with blank thousands separator is unavailable.'
+
+tee exp1 > in << _EOF_
+1       1k      4 003   1M
+2k      2M      4 002   2
+3M      3       4 001   3k
+_EOF_
+
+cat > exp2 << _EOF_
+3M      3       4 001   3k
+1       1k      4 003   1M
+2k      2M      4 002   2
+_EOF_
+
+cat > exp3 << _EOF_
+3M      3       4 001   3k
+2k      2M      4 002   2
+1       1k      4 003   1M
+_EOF_
+
+for i in 1 2 3; do
+  LC_ALL="sv_SE.utf8" sort -h -k $i "in" > "out${i}" || fail=1
+  compare "exp${i}" "out${i}" || fail=1
+done
+
+Exit $fail
--
2.5.5


From 46ef53f558e7bc1c0bc0abd62a86b40b4141e058 Mon Sep 17 00:00:00 2001
From: Kamil Dudka <kdudka@redhat.com>
Date: Mon, 18 Jul 2016 19:04:45 +0200
Subject: [PATCH 3/3] sort: with -h, disallow thousands separator between
 number and unit
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* src/sort.c (traverse_raw_number): Accept thousands separator only
if it is immediately followed by a digit.
* tests/misc/sort-h-thousands-sep.sh: Cover the fix for this bug.

Suggested by Pádraig Brady in http://bugs.gnu.org/24015
---
 src/sort.c                         | 11 ++++++++++-
 tests/misc/sort-h-thousands-sep.sh | 25 +++++++++++++------------
 2 files changed, 23 insertions(+), 13 deletions(-)

diff --git a/src/sort.c b/src/sort.c
index dd3ba58..69ef75f 100644
--- a/src/sort.c
+++ b/src/sort.c
@@ -2241,6 +2241,7 @@ traverse_raw_number (char const **number)
   char const *p = *number;
   unsigned char ch;
   unsigned char max_digit = '\0';
+  bool ends_with_thousands_sep = false;

   /* Scan to end of number.
      Decimals or separators not followed by digits stop the scan.
@@ -2256,10 +2257,18 @@ traverse_raw_number (char const **number)
       /* Allow to skip only one occurrence of thousands_sep to avoid finding
          the unit in the next column in case thousands_sep matches as blank
          and is used as column delimiter.  */
-      if (*p == thousands_sep)
+      ends_with_thousands_sep = (*p == thousands_sep);
+      if (ends_with_thousands_sep)
         ++p;
     }

+  if (ends_with_thousands_sep)
+    {
+      /* thousands_sep not followed by digit is not allowed.  */
+      *number = p - 2;
+      return max_digit;
+    }
+
   if (ch == decimal_point)
     while (ISDIGIT (ch = *p++))
       if (max_digit < ch)
diff --git a/tests/misc/sort-h-thousands-sep.sh b/tests/misc/sort-h-thousands-sep.sh
index 17f1b6c..3ffa89e 100755
--- a/tests/misc/sort-h-thousands-sep.sh
+++ b/tests/misc/sort-h-thousands-sep.sh
@@ -18,28 +18,29 @@

 . "${srcdir=.}/tests/init.sh"; path_prepend_ ./src
 print_ver_ sort
+
 test "$(LC_ALL=sv_SE locale thousands_sep)" = ' ' \
   || skip_ 'The Swedish locale with blank thousands separator is unavailable.'

-tee exp1 > in << _EOF_
-1       1k      4 003   1M
-2k      2M      4 002   2
-3M      3       4 001   3k
+tee exp1 exp3 > in << _EOF_
+1       1k      1 M     4 003   1M
+2k      2M      2 k     4 002   2
+3M      3       3 G     4 001   3k
 _EOF_

 cat > exp2 << _EOF_
-3M      3       4 001   3k
-1       1k      4 003   1M
-2k      2M      4 002   2
+3M      3       3 G     4 001   3k
+1       1k      1 M     4 003   1M
+2k      2M      2 k     4 002   2
 _EOF_

-cat > exp3 << _EOF_
-3M      3       4 001   3k
-2k      2M      4 002   2
-1       1k      4 003   1M
+cat > exp5 << _EOF_
+3M      3       3 G     4 001   3k
+2k      2M      2 k     4 002   2
+1       1k      1 M     4 003   1M
 _EOF_

-for i in 1 2 3; do
+for i in 1 2 3 5; do
   LC_ALL="sv_SE.utf8" sort -h -k $i "in" > "out${i}" || fail=1
   compare "exp${i}" "out${i}" || fail=1
 done
--
2.5.5