Fix INITCAP() word boundaries for PG_UNICODE_FAST.

author Jeff Davis <jdavis@postgresql.org>

Mon, 21 Apr 2025 19:34:58 +0000 (12:34 -0700)

committer Jeff Davis <jdavis@postgresql.org>

Mon, 21 Apr 2025 19:34:58 +0000 (12:34 -0700)
author Jeff Davis <jdavis@postgresql.org>
Mon, 21 Apr 2025 19:34:58 +0000 (12:34 -0700)
committer Jeff Davis <jdavis@postgresql.org>
Mon, 21 Apr 2025 19:34:58 +0000 (12:34 -0700)
diff --git a/src/backend/utils/adt/pg_locale_builtin.c b/src/backend/utils/adt/pg_locale_builtin.c

index 125b10ff7ab75a66263cf651ff7ce7ebb44e5d05..f51768830cd7b6ff425749f46c5e85f1f1446791 100644 (file)
--- a/src/backend/utils/adt/pg_locale_builtin.c
+++ b/src/backend/utils/adt/pg_locale_builtin.c
@@ -40,6 +40,7 @@ struct WordBoundaryState
     const char *str;
     size_t      len;
     size_t      offset;
+   bool        posix;
     bool        init;
     bool        prev_alnum;
  };
@@ -58,7 +59,7 @@ initcap_wbnext(void *state)
     {
         pg_wchar    u = utf8_to_unicode((unsigned char *) wbstate->str +
                                         wbstate->offset);
-       bool        curr_alnum = pg_u_isalnum(u, true);
+       bool        curr_alnum = pg_u_isalnum(u, wbstate->posix);
  
         if (!wbstate->init || curr_alnum != wbstate->prev_alnum)
         {
@@ -92,6 +93,7 @@ strtitle_builtin(char *dest, size_t destsize, const char *src, ssize_t srclen,
         .str = src,
         .len = srclen,
         .offset = 0,
+       .posix = !locale->info.builtin.casemap_full,
         .init = false,
         .prev_alnum = false,
     };
diff --git a/src/common/unicode/case_test.c b/src/common/unicode/case_test.c

index f0b38b3bdd75fd8eadc76a13e0f4367df502b413..fdfb62e855286fb1d0d2c74e8cc26b1b3ad4e4bd 100644 (file)
--- a/src/common/unicode/case_test.c
+++ b/src/common/unicode/case_test.c
@@ -41,6 +41,7 @@ struct WordBoundaryState
     const char *str;
     size_t      len;
     size_t      offset;
+   bool        posix;
     bool        init;
     bool        prev_alnum;
  };
@@ -55,7 +56,7 @@ initcap_wbnext(void *state)
     {
         pg_wchar    u = utf8_to_unicode((unsigned char *) wbstate->str +
                                         wbstate->offset);
-       bool        curr_alnum = pg_u_isalnum(u, true);
+       bool        curr_alnum = pg_u_isalnum(u, wbstate->posix);
  
         if (!wbstate->init || curr_alnum != wbstate->prev_alnum)
         {
@@ -112,10 +113,13 @@ icu_test_full(char *str)
     char        icu_upper[BUFSZ];
     char        icu_fold[BUFSZ];
     UErrorCode  status;
+
+   /* full case mapping doesn't use posix semantics */
     struct WordBoundaryState wbstate = {
         .str = str,
         .len = strlen(str),
         .offset = 0,
+       .posix = false,
         .init = false,
         .prev_alnum = false,
     };
@@ -344,6 +348,12 @@ test_convert_case()
     test_convert(tfunc_lower, "σς'Σ' ΣΣ'Σ'", "σς'ς' σσ'ς'");
     test_convert(tfunc_title, "σςΣ ΣΣΣ", "Σςς Σσς");
     test_convert(tfunc_fold, "σςΣ ΣΣΣ", "σσσ σσσ");
+   /* test that alphanumerics are word characters */
+   test_convert(tfunc_title, "λλ", "Λλ");
+   test_convert(tfunc_title, "1a", "1a");
+   /* U+FF11 FULLWIDTH ONE is alphanumeric for full case mapping */
+   test_convert(tfunc_title, "\uFF11a", "\uFF11a");
+
  
  #ifdef USE_ICU
     icu_test_full("");
@@ -354,6 +364,7 @@ test_convert_case()
     icu_test_full("abc 123xyz");
     icu_test_full("σςΣ ΣΣΣ");
     icu_test_full("ıiIİ");
+   icu_test_full("\uFF11a");
     /* test <alpha><iota_subscript><acute> */
     icu_test_full("\u0391\u0345\u0301");
  #endif
diff --git a/src/test/regress/expected/collate.utf8.out b/src/test/regress/expected/collate.utf8.out

index 5508622b16d0113480c8f92f59aebee2422ef57f..0c3ab5c89b28432f64f6195233bf93440f6e8c52 100644 (file)
--- a/src/test/regress/expected/collate.utf8.out
+++ b/src/test/regress/expected/collate.utf8.out
@@ -52,6 +52,7 @@ INSERT INTO test_pg_c_utf8 VALUES
    ('abc DEF 123abc'),
    ('ábc sßs ßss DÉF'),
    ('ǄxxǄ ǆxxǅ ǅxxǆ'),
+  (U&'Λλ 1a \FF11a'),
    ('ȺȺȺ'),
    ('ⱥⱥⱥ'),
    ('ⱥȺ');
@@ -67,10 +68,11 @@ SELECT
   abc DEF 123abc  | abc def 123abc  | Abc Def 123abc  | ABC DEF 123ABC  |      14 |            14 |              14 |            14
   ábc sßs ßss DÉF | ábc sßs ßss déf | Ábc Sßs ßss Déf | ÁBC SßS ßSS DÉF |      19 |            19 |              19 |            19
   ǄxxǄ ǆxxǅ ǅxxǆ  | ǆxxǆ ǆxxǆ ǆxxǆ  | Ǆxxǆ Ǆxxǆ Ǆxxǆ  | ǄXXǄ ǄXXǄ ǄXXǄ  |      20 |            20 |              20 |            20
+ Λλ 1a １a       | λλ 1a １a       | Λλ 1a １A       | ΛΛ 1A １A       |      12 |            12 |              12 |            12
   ȺȺȺ             | ⱥⱥⱥ             | Ⱥⱥⱥ             | ȺȺȺ             |       6 |             9 |               8 |             6
   ⱥⱥⱥ             | ⱥⱥⱥ             | Ⱥⱥⱥ             | ȺȺȺ             |       9 |             9 |               8 |             6
   ⱥȺ              | ⱥⱥ              | Ⱥⱥ              | ȺȺ              |       5 |             6 |               5 |             4
-(6 rows)
+(7 rows)
  
  DROP TABLE test_pg_c_utf8;
  -- negative test: Final_Sigma not used for builtin locale C.UTF-8
@@ -182,6 +184,7 @@ INSERT INTO test_pg_unicode_fast VALUES
    ('abc DEF 123abc'),
    ('ábc sßs ßss DÉF'),
    ('ǄxxǄ ǆxxǅ ǅxxǆ'),
+  (U&'Λλ 1a \FF11a'),
    ('ȺȺȺ'),
    ('ⱥⱥⱥ'),
    ('ⱥȺ');
@@ -197,10 +200,11 @@ SELECT
   abc DEF 123abc  | abc def 123abc  | Abc Def 123abc   | ABC DEF 123ABC    |      14 |            14 |              14 |            14
   ábc sßs ßss DÉF | ábc sßs ßss déf | Ábc Sßs Ssss Déf | ÁBC SSSS SSSS DÉF |      19 |            19 |              19 |            19
   ǄxxǄ ǆxxǅ ǅxxǆ  | ǆxxǆ ǆxxǆ ǆxxǆ  | ǅxxǆ ǅxxǆ ǅxxǆ   | ǄXXǄ ǄXXǄ ǄXXǄ    |      20 |            20 |              20 |            20
+ Λλ 1a １a       | λλ 1a １a       | Λλ 1a １a        | ΛΛ 1A １A         |      12 |            12 |              12 |            12
   ȺȺȺ             | ⱥⱥⱥ             | Ⱥⱥⱥ              | ȺȺȺ               |       6 |             9 |               8 |             6
   ⱥⱥⱥ             | ⱥⱥⱥ             | Ⱥⱥⱥ              | ȺȺȺ               |       9 |             9 |               8 |             6
   ⱥȺ              | ⱥⱥ              | Ⱥⱥ               | ȺȺ                |       5 |             6 |               5 |             4
-(6 rows)
+(7 rows)
  
  DROP TABLE test_pg_unicode_fast;
  -- test Final_Sigma
diff --git a/src/test/regress/sql/collate.utf8.sql b/src/test/regress/sql/collate.utf8.sql

index 6c7c7aec9ec32bac7ed0a3676d8761a70c397336..d6d14220ab34234306e241cec5f8cf37d67aa938 100644 (file)
--- a/src/test/regress/sql/collate.utf8.sql
+++ b/src/test/regress/sql/collate.utf8.sql
@@ -45,6 +45,7 @@ INSERT INTO test_pg_c_utf8 VALUES
    ('abc DEF 123abc'),
    ('ábc sßs ßss DÉF'),
    ('ǄxxǄ ǆxxǅ ǅxxǆ'),
+  (U&'Λλ 1a \FF11a'),
    ('ȺȺȺ'),
    ('ⱥⱥⱥ'),
    ('ⱥȺ');
@@ -100,6 +101,7 @@ INSERT INTO test_pg_unicode_fast VALUES
    ('abc DEF 123abc'),
    ('ábc sßs ßss DÉF'),
    ('ǄxxǄ ǆxxǅ ǅxxǆ'),
+  (U&'Λλ 1a \FF11a'),
    ('ȺȺȺ'),
    ('ⱥⱥⱥ'),
    ('ⱥȺ');
author	Jeff Davis <jdavis@postgresql.org>
	Mon, 21 Apr 2025 19:34:58 +0000 (12:34 -0700)
committer	Jeff Davis <jdavis@postgresql.org>
	Mon, 21 Apr 2025 19:34:58 +0000 (12:34 -0700)
src/backend/utils/adt/pg_locale_builtin.c		patch | blob | blame | history
src/common/unicode/case_test.c		patch | blob | blame | history
src/test/regress/expected/collate.utf8.out		patch | blob | blame | history
src/test/regress/sql/collate.utf8.sql		patch | blob | blame | history