Fix INITCAP() word boundaries for PG_UNICODE_FAST.
author Jeff Davis <jdavis@postgresql.org>
Mon, 21 Apr 2025 19:34:58 +0000 (12:34 -0700)
committer Jeff Davis <jdavis@postgresql.org>
Mon, 21 Apr 2025 19:34:58 +0000 (12:34 -0700)
Word boundaries are based on whether a character is alphanumeric or
not. For the PG_UNICODE_FAST collation, alphanumeric includes
non-ASCII digits; whereas for the PG_C_UTF8 collation, it only
includes digits 0-9. Pass down the right information from the
pg_locale_t into initcap_wbnext to differentiate the behavior.

Reported-by: Noah Misch <noah@leadboat.com>
Reviewed-by: Noah Misch <noah@leadboat.com>
Discussion: https://postgr.es/m/20250417135841.33.nmisch@google.com

src/backend/utils/adt/pg_locale_builtin.c
src/common/unicode/case_test.c
src/test/regress/expected/collate.utf8.out
src/test/regress/sql/collate.utf8.sql

index 125b10ff7ab75a66263cf651ff7ce7ebb44e5d05..f51768830cd7b6ff425749f46c5e85f1f1446791 100644 (file)
@@ -40,6 +40,7 @@ struct WordBoundaryState
    const char *str;
    size_t      len;
    size_t      offset;
+   bool        posix;
    bool        init;
    bool        prev_alnum;
 };
@@ -58,7 +59,7 @@ initcap_wbnext(void *state)
    {
        pg_wchar    u = utf8_to_unicode((unsigned char *) wbstate->str +
                                        wbstate->offset);
-       bool        curr_alnum = pg_u_isalnum(u, true);
+       bool        curr_alnum = pg_u_isalnum(u, wbstate->posix);
 
        if (!wbstate->init || curr_alnum != wbstate->prev_alnum)
        {
@@ -92,6 +93,7 @@ strtitle_builtin(char *dest, size_t destsize, const char *src, ssize_t srclen,
        .str = src,
        .len = srclen,
        .offset = 0,
+       .posix = !locale->info.builtin.casemap_full,
        .init = false,
        .prev_alnum = false,
    };
index f0b38b3bdd75fd8eadc76a13e0f4367df502b413..fdfb62e855286fb1d0d2c74e8cc26b1b3ad4e4bd 100644 (file)
@@ -41,6 +41,7 @@ struct WordBoundaryState
    const char *str;
    size_t      len;
    size_t      offset;
+   bool        posix;
    bool        init;
    bool        prev_alnum;
 };
@@ -55,7 +56,7 @@ initcap_wbnext(void *state)
    {
        pg_wchar    u = utf8_to_unicode((unsigned char *) wbstate->str +
                                        wbstate->offset);
-       bool        curr_alnum = pg_u_isalnum(u, true);
+       bool        curr_alnum = pg_u_isalnum(u, wbstate->posix);
 
        if (!wbstate->init || curr_alnum != wbstate->prev_alnum)
        {
@@ -112,10 +113,13 @@ icu_test_full(char *str)
    char        icu_upper[BUFSZ];
    char        icu_fold[BUFSZ];
    UErrorCode  status;
+
+   /* full case mapping doesn't use posix semantics */
    struct WordBoundaryState wbstate = {
        .str = str,
        .len = strlen(str),
        .offset = 0,
+       .posix = false,
        .init = false,
        .prev_alnum = false,
    };
@@ -344,6 +348,12 @@ test_convert_case()
    test_convert(tfunc_lower, "σς'Σ' ΣΣ'Σ'", "σς'ς' σσ'ς'");
    test_convert(tfunc_title, "σςΣ ΣΣΣ", "Σςς Σσς");
    test_convert(tfunc_fold, "σςΣ ΣΣΣ", "σσσ σσσ");
+   /* test that alphanumerics are word characters */
+   test_convert(tfunc_title, "λλ", "Λλ");
+   test_convert(tfunc_title, "1a", "1a");
+   /* U+FF11 FULLWIDTH ONE is alphanumeric for full case mapping */
+   test_convert(tfunc_title, "\uFF11a", "\uFF11a");
+
 
 #ifdef USE_ICU
    icu_test_full("");
@@ -354,6 +364,7 @@ test_convert_case()
    icu_test_full("abc 123xyz");
    icu_test_full("σςΣ ΣΣΣ");
    icu_test_full("ıiIİ");
+   icu_test_full("\uFF11a");
    /* test <alpha><iota_subscript><acute> */
    icu_test_full("\u0391\u0345\u0301");
 #endif
index 5508622b16d0113480c8f92f59aebee2422ef57f..0c3ab5c89b28432f64f6195233bf93440f6e8c52 100644 (file)
@@ -52,6 +52,7 @@ INSERT INTO test_pg_c_utf8 VALUES
   ('abc DEF 123abc'),
   ('ábc sßs ßss DÉF'),
   ('DŽxxDŽ džxxDž Džxxdž'),
+  (U&'Λλ 1a \FF11a'),
   ('ȺȺȺ'),
   ('ⱥⱥⱥ'),
   ('ⱥȺ');
@@ -67,10 +68,11 @@ SELECT
  abc DEF 123abc  | abc def 123abc  | Abc Def 123abc  | ABC DEF 123ABC  |      14 |            14 |              14 |            14
  ábc sßs ßss DÉF | ábc sßs ßss déf | Ábc Sßs ßss Déf | ÁBC SßS ßSS DÉF |      19 |            19 |              19 |            19
  DŽxxDŽ džxxDž Džxxdž  | džxxdž džxxdž džxxdž  | DŽxxdž DŽxxdž DŽxxdž  | DŽXXDŽ DŽXXDŽ DŽXXDŽ  |      20 |            20 |              20 |            20
+ Λλ 1a １a       | λλ 1a １a       | Λλ 1a １A       | ΛΛ 1A １A       |      12 |            12 |              12 |            12
  ȺȺȺ             | ⱥⱥⱥ             | Ⱥⱥⱥ             | ȺȺȺ             |       6 |             9 |               8 |             6
  ⱥⱥⱥ             | ⱥⱥⱥ             | Ⱥⱥⱥ             | ȺȺȺ             |       9 |             9 |               8 |             6
  ⱥȺ              | ⱥⱥ              | Ⱥⱥ              | ȺȺ              |       5 |             6 |               5 |             4
-(6 rows)
+(7 rows)
 
 DROP TABLE test_pg_c_utf8;
 -- negative test: Final_Sigma not used for builtin locale C.UTF-8
@@ -182,6 +184,7 @@ INSERT INTO test_pg_unicode_fast VALUES
   ('abc DEF 123abc'),
   ('ábc sßs ßss DÉF'),
   ('DŽxxDŽ džxxDž Džxxdž'),
+  (U&'Λλ 1a \FF11a'),
   ('ȺȺȺ'),
   ('ⱥⱥⱥ'),
   ('ⱥȺ');
@@ -197,10 +200,11 @@ SELECT
  abc DEF 123abc  | abc def 123abc  | Abc Def 123abc   | ABC DEF 123ABC    |      14 |            14 |              14 |            14
  ábc sßs ßss DÉF | ábc sßs ßss déf | Ábc Sßs Ssss Déf | ÁBC SSSS SSSS DÉF |      19 |            19 |              19 |            19
  DŽxxDŽ džxxDž Džxxdž  | džxxdž džxxdž džxxdž  | Džxxdž Džxxdž Džxxdž   | DŽXXDŽ DŽXXDŽ DŽXXDŽ    |      20 |            20 |              20 |            20
+ Λλ 1a １a       | λλ 1a １a       | Λλ 1a １a        | ΛΛ 1A １A         |      12 |            12 |              12 |            12
  ȺȺȺ             | ⱥⱥⱥ             | Ⱥⱥⱥ              | ȺȺȺ               |       6 |             9 |               8 |             6
  ⱥⱥⱥ             | ⱥⱥⱥ             | Ⱥⱥⱥ              | ȺȺȺ               |       9 |             9 |               8 |             6
  ⱥȺ              | ⱥⱥ              | Ⱥⱥ               | ȺȺ                |       5 |             6 |               5 |             4
-(6 rows)
+(7 rows)
 
 DROP TABLE test_pg_unicode_fast;
 -- test Final_Sigma
index 6c7c7aec9ec32bac7ed0a3676d8761a70c397336..d6d14220ab34234306e241cec5f8cf37d67aa938 100644 (file)
@@ -45,6 +45,7 @@ INSERT INTO test_pg_c_utf8 VALUES
   ('abc DEF 123abc'),
   ('ábc sßs ßss DÉF'),
   ('DŽxxDŽ džxxDž Džxxdž'),
+  (U&'Λλ 1a \FF11a'),
   ('ȺȺȺ'),
   ('ⱥⱥⱥ'),
   ('ⱥȺ');
@@ -100,6 +101,7 @@ INSERT INTO test_pg_unicode_fast VALUES
   ('abc DEF 123abc'),
   ('ábc sßs ßss DÉF'),
   ('DŽxxDŽ džxxDž Džxxdž'),
+  (U&'Λλ 1a \FF11a'),
   ('ȺȺȺ'),
   ('ⱥⱥⱥ'),
   ('ⱥȺ');