summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorDaiki Ueno <ueno@gnu.org>2016-10-12 17:40:37 +0200
committerDaiki Ueno <dueno@redhat.com>2017-10-30 11:13:20 +0100
commited0bf5d5284b787af71b1b95fca1fdb6389fee3c (patch)
tree2825a6954b97ec150986e07b8b2ae32c45ea52b0
parentf466816e06ec3516567c3edcd0219bd1f9b736eb (diff)
downloadgnulib-ueno/unicode-9.0.0.tar.gz
gnulib-ueno/unicode-9.0.0.tar.bz2
libunistring: update to Unicode 9.0.0ueno/unicode-9.0.0
* lib/gen-uni-tables.c (fill_properties): Recognize Sentence_Terminal and Prepended_Concatenation_Mark. (is_property_default_ignorable_code_point): Exclude U+08E2. (fill_arabicshaping): Allow missing whitespace when parsing; recognize "AFRICAN FEH", "AFRICAN QAF", and "AFRICAN MOON". (output_blocks): Increase the element size of the level1 table to accommodate more blocks. (get_lbp): Recognize ZWJ, E_Base, and E_Modifier characters; Update each class according to the standard. (get_wbp): Recognize ZWJ, E_Base, E_Modifier, Glue_After_Zwj, and E_Base_GAZ characters. (output_gbp_table): Recognize ZWJ, E_Base, E_Modifier, Glue_After_Zwj, and E_Base_GAZ characters. * lib/unictype.in.h (UC_JOINING_GROUP_AFRICAN_FEH, UC_JOINING_GROUP_AFRICAN_QAF, UC_JOINING_GROUP_AFRICAN_MOON): New enum value. * lib/unilbrk/lbrktables.h (LBP_ZWJ, LBP_EB, LBP_EM): New enum value. * lib/unilbrk/lbrktables.c (unilbrk_table): Extend the table with LBP_ZWJ, LBP_EB, and LBP_EM. * lib/uniwbrk.in.h (WBP_ZWJ, WBP_EB, WBP_EM, WBP_GAZ, WBP_EBG): New enum value. * lib/uniwbrk/u-wordbreaks.h: Implement WB3c, WB15, and WB16. * lib/uniwbrk/wbrktable.h (uniwbrk_prop_index): New variable declaration. * lib/uniwbrk/wbrktable.c (uniwbrk_prop_index): New variable. (uniwbrk_table): Implement WB14. * tests/uniwbrk/test-uc-wordbreaks.c (wordbreakproperty_to_string): Check WBP_ZWJ, WBP_EB, WBP_EM, WBP_GAZ, and WBP_EBG. * modules/unigbrk/u{32,16,8}-grapheme-breaks: No longer depend on uc-is-grapheme-break. * modules/unigbrk/uc-grapheme-breaks: New module. * modules/unigbrk/uc-grapheme-breaks-tests: New module. * lib/unigbrk.in.h (GBP_ZWJ, GBP_EB, GBP_EM, GBP_GAZ, GBP_EBG): New enum value. (uc_grapheme_breaks): New function, replacing uc_is_grapheme_break. * lib/unigbrk/u-grapheme-breaks.h: New file. * lib/unigbrk/u{32,16,8}-grapheme-breaks.c: Rewrite using u-grapheme-breaks.h instead of uc_is_grapheme_break. * lib/unigbrk/uc-grapheme-breaks.c: New file. * lib/unigbrk/uc-is-grapheme-break.c: Partially update to TR29 rev 29. * tests/unigbrk/test-uc-gbrk-prop.c (graphemebreakproperty_to_string): Check GBP_ZWJ, GBP_EB, GBP_EM, GBP_GAZ, and GBP_EBG. * tests/unigbrk/test-uc-grapheme-breaks.c: New test. * tests/unigbrk/test-uc-is-grapheme-break.c (graphemebreakproperty_to_string): Check GBP_ZWJ, GBP_EB, GBP_EM, GBP_GAZ, and GBP_EBG. (main): Skip unsupported rules involving 3 or more characters, namely GB10, GB12, and GB13. * lib/uniwidth/width.c (nonspacing_table_data): Update.
-rw-r--r--lib/gen-uni-tables.c316
-rw-r--r--lib/unictype.in.h5
-rw-r--r--lib/unigbrk.in.h9
-rw-r--r--lib/unigbrk/u-grapheme-breaks.h122
-rw-r--r--lib/unigbrk/u16-grapheme-breaks.c26
-rw-r--r--lib/unigbrk/u32-grapheme-breaks.c24
-rw-r--r--lib/unigbrk/u8-grapheme-breaks.c27
-rw-r--r--lib/unigbrk/uc-gbrk-prop.c3
-rw-r--r--lib/unigbrk/uc-grapheme-breaks.c39
-rw-r--r--lib/unigbrk/uc-is-grapheme-break.c29
-rw-r--r--lib/unilbrk/lbrktables.c61
-rw-r--r--lib/unilbrk/lbrktables.h23
-rwxr-xr-xlib/uniname/gen-uninames.lisp2
-rw-r--r--lib/uniwbrk.in.h7
-rw-r--r--lib/uniwbrk/u-wordbreaks.h66
-rw-r--r--lib/uniwbrk/wbrktable.c66
-rw-r--r--lib/uniwbrk/wbrktable.h3
-rw-r--r--lib/uniwidth/width.c46
-rw-r--r--modules/unigbrk/u16-grapheme-breaks5
-rw-r--r--modules/unigbrk/u32-grapheme-breaks5
-rw-r--r--modules/unigbrk/u8-grapheme-breaks5
-rw-r--r--modules/unigbrk/uc-grapheme-breaks28
-rw-r--r--modules/unigbrk/uc-grapheme-breaks-tests14
-rw-r--r--tests/unigbrk/test-uc-gbrk-prop.c5
-rw-r--r--tests/unigbrk/test-uc-grapheme-breaks.c191
-rwxr-xr-xtests/unigbrk/test-uc-grapheme-breaks.sh3
-rw-r--r--tests/unigbrk/test-uc-is-grapheme-break.c44
-rw-r--r--tests/uniwbrk/test-uc-wordbreaks.c5
-rwxr-xr-xtests/uniwidth/test-uc_width2.sh58
29 files changed, 989 insertions, 248 deletions
diff --git a/lib/gen-uni-tables.c b/lib/gen-uni-tables.c
index acdafe25b9..14c71ee28e 100644
--- a/lib/gen-uni-tables.c
+++ b/lib/gen-uni-tables.c
@@ -32,7 +32,7 @@
/usr/local/share/Unidata/CompositionExclusions.txt \
/usr/local/share/Unidata/SpecialCasing.txt \
/usr/local/share/Unidata/CaseFolding.txt \
- 8.0.0
+ 9.0.0
*/
#include <assert.h>
@@ -2591,6 +2591,7 @@ enum
PROP_VARIATION_SELECTOR,
PROP_PATTERN_WHITE_SPACE,
PROP_PATTERN_SYNTAX,
+ PROP_PREPENDED_CONCATENATION_MARK,
/* DerivedCoreProperties.txt */
PROP_MATH,
PROP_ALPHABETIC,
@@ -2692,10 +2693,11 @@ fill_properties (const char *proplist_filename)
PROP ("Logical_Order_Exception", PROP_LOGICAL_ORDER_EXCEPTION)
PROP ("Other_ID_Start", PROP_OTHER_ID_START)
PROP ("Other_ID_Continue", PROP_OTHER_ID_CONTINUE)
- PROP ("STerm", PROP_STERM)
+ PROP ("Sentence_Terminal", PROP_STERM)
PROP ("Variation_Selector", PROP_VARIATION_SELECTOR)
PROP ("Pattern_White_Space", PROP_PATTERN_WHITE_SPACE)
PROP ("Pattern_Syntax", PROP_PATTERN_SYNTAX)
+ PROP ("Prepended_Concatenation_Mark", PROP_PREPENDED_CONCATENATION_MARK)
/* DerivedCoreProperties.txt */
PROP ("Math", PROP_MATH)
PROP ("Alphabetic", PROP_ALPHABETIC)
@@ -2890,7 +2892,8 @@ is_property_default_ignorable_code_point (unsigned int ch)
&& !((ch >= 0x0600 && ch <= 0x0605) || ch == 0x06DD || ch == 0x070F)
/* For some reason, the following are not listed as having property
Default_Ignorable_Code_Point. */
- && !(ch == 0x110BD))
+ && !(ch == 0x110BD)
+ && !(ch == 0x8E2))
|| ((unicode_properties[ch] & (1ULL << PROP_OTHER_DEFAULT_IGNORABLE_CODE_POINT)) != 0)
|| ((unicode_properties[ch] & (1ULL << PROP_VARIATION_SELECTOR)) != 0);
bool result2 =
@@ -3787,7 +3790,10 @@ enum
UC_JOINING_GROUP_MANICHAEAN_FIVE, /* Manichaean_Five */
UC_JOINING_GROUP_MANICHAEAN_TEN, /* Manichaean_Ten */
UC_JOINING_GROUP_MANICHAEAN_TWENTY, /* Manichaean_Twenty */
- UC_JOINING_GROUP_MANICHAEAN_HUNDRED /* Manichaean_Hundred */
+ UC_JOINING_GROUP_MANICHAEAN_HUNDRED, /* Manichaean_Hundred */
+ UC_JOINING_GROUP_AFRICAN_FEH, /* African_Feh */
+ UC_JOINING_GROUP_AFRICAN_QAF, /* African_Qaf */
+ UC_JOINING_GROUP_AFRICAN_NOON /* African_Noon */
};
static uint8_t unicode_joining_group[0x110000];
@@ -3815,30 +3821,26 @@ fill_arabicshaping (const char *arabicshaping_filename)
lineno = 0;
for (;;)
{
- char buf[100+1];
- char separator1[100+1];
- char padding1[100+1];
- char schematic_name[100+1];
- char separator2[100+1];
- char padding2[100+1];
- char joining_type_name[100+1];
- char separator3[100+1];
- char padding3[100+1];
- char joining_group_name[100+1];
+ char buf[200+1];
+ char separator1[200+1];
+ char schematic_name[200+1];
+ char separator2[200+1];
+ char joining_type_name[200+1];
+ char separator3[200+1];
+ char joining_group_name[200+1];
int joining_type;
int joining_group;
lineno++;
- if (fscanf (stream, "%100[^\n]\n", buf) < 1)
+ if (fscanf (stream, "%200[^\n]\n", buf) < 1)
break;
if (buf[0] == '\0' || buf[0] == '#')
continue;
- if (sscanf (buf, "%X%[;]%[ ]%[^;]%[;]%[ ]%[^;]%[;]%[ ]%100[^\n]",
- &i, separator1, padding1, schematic_name, separator2,
- padding2, joining_type_name, separator3, padding3,
- joining_group_name) != 10)
+ if (sscanf (buf, "%X%[; ]%[^;]%[; ]%[^;]%[; ]%100[^\n]",
+ &i, separator1, schematic_name, separator2, joining_type_name,
+ separator3, joining_group_name) != 7)
{
fprintf (stderr, "parse error in '%s':%d\n",
arabicshaping_filename, lineno);
@@ -3955,6 +3957,9 @@ fill_arabicshaping (const char *arabicshaping_filename)
TRY(UC_JOINING_GROUP_MANICHAEAN_TEN, "MANICHAEAN TEN")
TRY(UC_JOINING_GROUP_MANICHAEAN_TWENTY, "MANICHAEAN TWENTY")
TRY(UC_JOINING_GROUP_MANICHAEAN_HUNDRED, "MANICHAEAN HUNDRED")
+ TRY(UC_JOINING_GROUP_AFRICAN_FEH, "AFRICAN FEH")
+ TRY(UC_JOINING_GROUP_AFRICAN_QAF, "AFRICAN QAF")
+ TRY(UC_JOINING_GROUP_AFRICAN_NOON, "AFRICAN NOON")
#undef TRY
else
{
@@ -4264,6 +4269,9 @@ joining_group_as_c_identifier (int joining_group)
TRY(UC_JOINING_GROUP_MANICHAEAN_TEN)
TRY(UC_JOINING_GROUP_MANICHAEAN_TWENTY)
TRY(UC_JOINING_GROUP_MANICHAEAN_HUNDRED)
+ TRY(UC_JOINING_GROUP_AFRICAN_FEH)
+ TRY(UC_JOINING_GROUP_AFRICAN_QAF)
+ TRY(UC_JOINING_GROUP_AFRICAN_NOON)
#undef TRY
abort ();
}
@@ -4901,7 +4909,7 @@ output_blocks (const char *version)
fprintf (stream, "};\n");
fprintf (stream, "#define blocks_level1_shift %d\n", shift);
fprintf (stream, "#define blocks_level1_threshold 0x%04X\n", threshold);
- fprintf (stream, "static const uint8_t blocks_level1[%d * 2] =\n",
+ fprintf (stream, "static const uint16_t blocks_level1[%d * 2] =\n",
threshold >> shift);
fprintf (stream, "{\n");
for (i1 = 0; i1 < (threshold >> shift); i1++)
@@ -6292,22 +6300,22 @@ output_width_property_test (const char *filename)
enum
{
- /* Values >= 27 are resolved at run time. */
- LBP_BK = 27, /* mandatory break */
+ /* Values >= 30 are resolved at run time. */
+ LBP_BK = 30, /* mandatory break */
/*LBP_CR, carriage return - not used here because it's a DOSism */
/*LBP_LF, line feed - not used here because it's a DOSism */
- LBP_CM = 28, /* attached characters and combining marks */
+ LBP_CM = 31, /* attached characters and combining marks */
/*LBP_NL, next line - not used here because it's equivalent to LBP_BK */
/*LBP_SG, surrogates - not used here because they are not characters */
LBP_WJ = 0, /* word joiner */
- LBP_ZW = 29, /* zero width space */
+ LBP_ZW = 32, /* zero width space */
LBP_GL = 1, /* non-breaking (glue) */
- LBP_SP = 30, /* space */
+ LBP_SP = 33, /* space */
LBP_B2 = 2, /* break opportunity before and after */
LBP_BA = 3, /* break opportunity after */
LBP_BB = 4, /* break opportunity before */
LBP_HY = 5, /* hyphen */
- LBP_CB = 31, /* contingent break opportunity */
+ LBP_CB = 34, /* contingent break opportunity */
LBP_CL = 6, /* closing punctuation */
LBP_CP = 7, /* closing parenthesis */
LBP_EX = 8, /* exclamation/interrogation */
@@ -6320,7 +6328,7 @@ enum
LBP_PO = 15, /* postfix (numeric) */
LBP_PR = 16, /* prefix (numeric) */
LBP_SY = 17, /* symbols allowing breaks */
- LBP_AI = 32, /* ambiguous (alphabetic or ideograph) */
+ LBP_AI = 35, /* ambiguous (alphabetic or ideograph) */
LBP_AL = 18, /* ordinary alphabetic and symbol characters */
/*LBP_CJ, conditional Japanese starter, resolved to NS */
LBP_H2 = 19, /* Hangul LV syllable */
@@ -6331,8 +6339,11 @@ enum
LBP_JV = 23, /* Hangul V Jamo */
LBP_JT = 24, /* Hangul T Jamo */
LBP_RI = 26, /* regional indicator */
- LBP_SA = 33, /* complex context (South East Asian) */
- LBP_XX = 34 /* unknown */
+ LBP_SA = 36, /* complex context (South East Asian) */
+ LBP_ZWJ = 27, /* zero width joiner */
+ LBP_EB = 28, /* emoji base */
+ LBP_EM = 29, /* emoji modifier */
+ LBP_XX = 37 /* unknown */
};
/* Returns the line breaking classification for ch, as a bit mask. */
@@ -6363,6 +6374,45 @@ get_lbp (unsigned int ch)
if (ch == 0x200B /* ZERO WIDTH SPACE */)
attr |= (int64_t) 1 << LBP_ZW;
+ /* zero width joiner */
+ if (ch == 0x200D /* ZERO WIDTH JOINER */)
+ attr |= (int64_t) 1 << LBP_ZWJ;
+
+ /* emoji base */
+ if (ch == 0x261D /* WHITE UP POINTING INDEX */
+ || ch == 0x26F9 /* PERSON WITH BALL */
+ || (ch >= 0x270A && ch <= 0x270D) /* RAISED FIST..WRITING HAND */
+ || ch == 0x1F385 /* FATHER CHRISTMAS */
+ || (ch >= 0x1F3C3 && ch <= 0x1F3C4) /* RUNNER..SURFER */
+ || (ch >= 0x1F3CA && ch <= 0x1F3CB) /* SWIMMER..WEIGHT LIFTER */
+ || (ch >= 0x1F442 && ch <= 0x1F443) /* EAR..NOSE */
+ || (ch >= 0x1F446 && ch <= 0x1F450) /* WHITE UP POINTING BACKHAND INDEX..OPEN HANDS SIGN */
+ || (ch >= 0x1F466 && ch <= 0x1F469) /* BOY..WOMAN */
+ || ch == 0x1F46E /* POLICE OFFICER */
+ || (ch >= 0x1F470 && ch <= 0x1F478) /* BRIDE WITH VEIL..PRINCESS */
+ || ch == 0x1F47C /* BABY ANGEL */
+ || (ch >= 0x1F481 && ch <= 0x1F483) /* INFORMATION DESK PERSON..DANCER */
+ || (ch >= 0x1F485 && ch <= 0x1F487) /* NAIL POLISH..HAIRCUT */
+ || ch == 0x1F4AA /* FLEXED BICEPS */
+ || ch == 0x1F575 /* SLEUTH OR SPY */
+ || ch == 0x1F57A /* MAN DANCING */
+ || ch == 0x1F590 /* RAISED HAND WITH FINGERS SPLAYED */
+ || (ch >= 0x1F595 && ch <= 0x1F596) /* REVERSED HAND WITH MIDDLE FINGER EXTENDED..RAISED HAND WITH PART BETWEEN MIDDLE AND RING FINGERS */
+ || (ch >= 0x1F645 && ch <= 0x1F647) /* FACE WITH NO GOOD GESTURE..PERSON BOWING DEEPLY */
+ || (ch >= 0x1F64B && ch <= 0x1F64F) /* HAPPY PERSON RAISING ONE HAND..PERSON WITH FOLDED HANDS */
+ || ch == 0x1F6A3 /* ROWBOAT */
+ || (ch >= 0x1F6B4 && ch <= 0x1F6B6) /* BICYCLIST..PEDESTRIAN */
+ || ch == 0x1F6C0 /* BATH */
+ || (ch >= 0x1F918 && ch <= 0x1F91E) /* SIGN OF THE HORNS..HAND WITH INDEX AND MIDDLE FINGERS CROSSED */
+ || ch == 0x1F926 /* FACE PALM */
+ || ch == 0x1F930 /* PREGNANT WOMAN */
+ || (ch >= 0x1F933 && ch <= 0x1F939) /* SELFIE..JUGGLING */
+ || (ch >= 0x1F93C && ch <= 0x1F93E) /* WRESTLERS..HANDBALL */)
+ attr |= (int64_t) 1 << LBP_EB;
+
+ if ((ch >= 0x1F3FB && ch <= 0x1F3FF) /* EMOJI MODIFIER FITZPATRICK TYPE-1-2..EMOJI MODIFIER FITZPATRICK TYPE-6 */)
+ attr |= (int64_t) 1 << LBP_EM;
+
/* non-breaking (glue) */
if (ch == 0x00A0 /* NO-BREAK SPACE */
|| ch == 0x202F /* NARROW NO-BREAK SPACE */
@@ -6496,6 +6546,8 @@ get_lbp (unsigned int ch)
|| ch == 0x2CFF /* COPTIC MORPHOLOGICAL DIVIDER */
|| (ch >= 0x2E0E && ch <= 0x2E15) /* EDITORIAL CORONIS .. UPWARDS ANCORA */
|| ch == 0x2E17 /* DOUBLE OBLIQUE HYPHEN */
+ || ch == 0x2E43 /* DASH WITH LEFT UPTURN */
+ || ch == 0x2E44 /* DOUBLE SUSPENSION MARK */
|| ch == 0x2E3C /* STENOGRAPHIC FULL STOP */
|| ch == 0x2E3D /* VERTICAL SIX DOTS */
|| ch == 0x2E3E /* WIGGLY VERTICAL LINE */
@@ -6554,12 +6606,15 @@ get_lbp (unsigned int ch)
|| ch == 0x1123B /* KHOJKI SECTION MARK */
|| ch == 0x1123C /* KHOJKI DOUBLE SECTION MARK */
|| ch == 0x112A9 /* MULTANI SECTION MARK */
+ || (ch >= 0x1144B && ch <= 0x1144E) /* NEWA DANDA..NEWA GAP FILLER */
+ || ch == 0x1145B /* NEWA PLACEHOLDER MARK */
|| ch == 0x115C2 /* SIDDHAM DANDA */
|| ch == 0x115C3 /* SIDDHAM DOUBLE DANDA */
|| (ch >= 0x115C9 && ch <= 0x115D7) /* SIDDHAM END OF TEXT MARK..SIDDHAM SECTION MARK WITH CIRCLES AND FOUR ENCLOSURES */
|| ch == 0x11641 /* MODI DANDA */
|| ch == 0x11642 /* MODI DOUBLE DANDA */
|| (ch >= 0x1173C && ch <= 0x1173E) /* AHOM SIGN SMALL SECTION..AHOM SIGN RULAI */
+ || (ch >= 0x11C41 && ch <= 0x11C45) /* BHAIKSUKI DANDA..BHAIKSUKI GAP FILLER-2 */
|| ch == 0x12471 /* CUNEIFORM PUNCTUATION SIGN VERTICAL COLON */
|| ch == 0x12472 /* CUNEIFORM PUNCTUATION SIGN DIAGONAL COLON */
|| ch == 0x12473 /* CUNEIFORM PUNCTUATION SIGN DIAGONAL TRICOLON */
@@ -6598,7 +6653,9 @@ get_lbp (unsigned int ch)
|| ch == 0x1806 /* MONGOLIAN TODO SOFT HYPHEN */
|| ch == 0x11175 /* MAHAJANI SECTION MARK */
|| ch == 0x111DB /* SHARADA SIGN SIDDHAM */
- || ch == 0x115C1 /* SIDDHAM SIGN SIDDHAM */)
+ || ch == 0x115C1 /* SIDDHAM SIGN SIDDHAM */
+ || (ch >= 0x11660 && ch <= 0x1166C) /* MONGOLIAN BIRGA WITH ORNAMENT..MONGOLIAN TURNED SWIRL BIRGA WITH DOUBLE ORNAMENT */
+ || ch == 0x11C70 /* MARCHEN HEAD MARK */)
attr |= (int64_t) 1 << LBP_BB;
/* hyphen */
@@ -6676,7 +6733,8 @@ get_lbp (unsigned int ch)
|| ch == 0xFF01 /* FULLWIDTH EXCLAMATION MARK */
|| ch == 0xFF1F /* FULLWIDTH QUESTION MARK */
|| ch == 0x115C4 /* SIDDHAM SEPARATOR DOT */
- || ch == 0x115C5 /* SIDDHAM SEPARATOR BAR */)
+ || ch == 0x115C5 /* SIDDHAM SEPARATOR BAR */
+ || ch == 0x11C71 /* MARCHEN MARK SHAD */)
attr |= (int64_t) 1 << LBP_EX;
/* inseparable */
@@ -6717,6 +6775,7 @@ get_lbp (unsigned int ch)
|| ch == 0xFF70 /* HALFWIDTH KATAKANA-HIRAGANA PROLONGED SOUND MARK */
|| ch == 0xFF9E /* HALFWIDTH KATAKANA VOICED SOUND MARK */
|| ch == 0xFF9F /* HALFWIDTH KATAKANA SEMI-VOICED SOUND MARK */
+ || ch == 0x16FE0 /* TANGUT ITERATION MARK */
|| ch == 0x1F679 /* HEAVY INTERROBANG ORNAMENT */
|| ch == 0x1F67A /* SANS-SERIF INTERROBANG ORNAMENT */
|| ch == 0x1F67B /* HEAVY SANS-SERIF INTERROBANG ORNAMENT */
@@ -6737,7 +6796,8 @@ get_lbp (unsigned int ch)
|| ch == 0x13286 /* EGYPTIAN HIEROGLYPH O036A */
|| ch == 0x13288 /* EGYPTIAN HIEROGLYPH O036C */
|| ch == 0x13379 /* EGYPTIAN HIEROGLYPH V011A */
- || ch == 0x145CE /* ANATOLIAN HIEROGLYPH A410 BEGIN LOGOGRAM MARK */)
+ || ch == 0x145CE /* ANATOLIAN HIEROGLYPH A410 BEGIN LOGOGRAM MARK */
+ || (ch >= 0x1E95E && ch <= 0x1E95F) /* ADLAM INITIAL EXCLAMATION MARK..ADLAM INITIAL QUESTION MARK */)
attr |= (int64_t) 1 << LBP_OP;
/* ambiguous quotation */
@@ -6905,9 +6965,10 @@ get_lbp (unsigned int ch)
|| (unicode_attributes[ch].category[0] == 'C'
&& (unicode_attributes[ch].category[1] == 'c'
|| unicode_attributes[ch].category[1] == 'f')
- && ch != 0x110BD /* KAITHI NUMBER SIGN */)
+ && ch != 0x110BD /* KAITHI NUMBER SIGN */
+ && ch != 0x08E2 /* ARABIC DISPUTED END OF AYAH */)
|| ch == 0x3035 /* VERTICAL KANA REPEAT MARK LOWER HALF */)
- if (!(attr & (((int64_t) 1 << LBP_BK) | ((int64_t) 1 << LBP_BA) | ((int64_t) 1 << LBP_GL) | ((int64_t) 1 << LBP_SA) | ((int64_t) 1 << LBP_WJ) | ((int64_t) 1 << LBP_ZW))))
+ if (!(attr & (((int64_t) 1 << LBP_BK) | ((int64_t) 1 << LBP_BA) | ((int64_t) 1 << LBP_GL) | ((int64_t) 1 << LBP_SA) | ((int64_t) 1 << LBP_WJ) | ((int64_t) 1 << LBP_ZW) | ((int64_t) 1 << LBP_ZWJ))))
attr |= (int64_t) 1 << LBP_CM;
/* ideographic */
@@ -6983,6 +7044,7 @@ get_lbp (unsigned int ch)
|| ch == 0x270B /* RAISED HAND */
|| ch == 0x270C /* VICTORY HAND */
|| ch == 0x270D /* WRITING HAND */
+ || ch == 0x2764 /* HEAVY BLACK HEART */
|| (ch >= 0x2E80 && ch <= 0x2FFF) /* CJK RADICAL, KANGXI RADICAL, IDEOGRAPHIC DESCRIPTION */
|| (ch >= 0x3040 && ch <= 0x309F) /* HIRAGANA */
|| (ch >= 0x30A0 && ch <= 0x30FF) /* KATAKANA */
@@ -7046,6 +7108,15 @@ get_lbp (unsigned int ch)
|| ch == 0xFFE3 /* FULLWIDTH MACRON */
|| ch == 0xFFE4 /* FULLWIDTH BROKEN BAR */
/* Extra characters for compatibility with Unicode LineBreak.txt. */
+ || ch == 0xFF66 /* Halfwidth Katakana */
+ || (ch >= 0xFF71 && ch <= 0xFF9D) /* Halfwidth Katakana */
+ || (ch >= 0xFFA0 && ch <= 0xFFBE) /* Halfwidth Hangul */
+ || (ch >= 0xFFC2 && ch <= 0xFFC7) /* Halfwidth Hangul */
+ || (ch >= 0xFFCA && ch <= 0xFFCF) /* Halfwidth Hangul */
+ || (ch >= 0xFFD2 && ch <= 0xFFD7) /* Halfwidth Hangul */
+ || (ch >= 0xFFDA && ch <= 0xFFDC) /* Halfwidth Hangul */
+ || (ch >= 0x17000 && ch <= 0x187EC) /* Tangut Ideograph */
+ || (ch >= 0x18800 && ch <= 0x18AF2) /* Tangut Ideograph */
|| (ch >= 0x1B000 && ch <= 0x1B001) /* Kana Supplement */
|| (ch >= 0x1F000 && ch <= 0x1F02B) /* Mahjong Tiles */
|| (ch >= 0x1F030 && ch <= 0x1F093) /* Domino Tiles */
@@ -7064,14 +7135,14 @@ get_lbp (unsigned int ch)
&& !(ch >= 0x1F5D4 && ch <= 0x1F5DB)
&& !(ch >= 0x1F5F4 && ch <= 0x1F5F9))
|| (ch >= 0x1F600 && ch <= 0x1F64F) /* Emoticons */
- || (ch >= 0x1F680 && ch <= 0x1F6D0) /* Transport and Map Symbols */
+ || (ch >= 0x1F680 && ch <= 0x1F6DF) /* Transport and Map Symbols */
|| (ch >= 0x1F6E0 && ch <= 0x1F6EC) /* Transport and Map Symbols */
- || (ch >= 0x1F6F0 && ch <= 0x1F6F3) /* Transport and Map Symbols */
+ || (ch >= 0x1F6F0 && ch <= 0x1F6F6) /* Transport and Map Symbols */
|| (ch >= 0x1F900 && ch <= 0x1F9FF) /* Supplemental Symbols and Pictographs */
|| (ch >= 0x2A700 && ch <= 0x2B734) /* CJK Ideograph Extension C */
|| (ch >= 0x2B740 && ch <= 0x2B81D) /* CJK Ideograph Extension D */
|| (ch >= 0x2B820 && ch <= 0x2CEAF) /* CJK Ideograph Extension E */)
- if (!(attr & (((int64_t) 1 << LBP_NS) | ((int64_t) 1 << LBP_CM))))
+ if (!(attr & (((int64_t) 1 << LBP_NS) | ((int64_t) 1 << LBP_CM) | ((int64_t) 1 << LBP_EB))))
{
/* ambiguous (ideograph) ? */
if ((unicode_width[ch] != NULL
@@ -7134,13 +7205,14 @@ get_lbp (unsigned int ch)
|| ch == 0x0605 /* ARABIC NUMBER MARK ABOVE */
|| ch == 0x06DD /* ARABIC END OF AYAH */
|| ch == 0x070F /* SYRIAC ABBREVIATION MARK */
+ || ch == 0x08E2 /* ARABIC DISPUTED END OF AYAH */
|| ch == 0x2061 /* FUNCTION APPLICATION */
|| ch == 0x2062 /* INVISIBLE TIMES */
|| ch == 0x2063 /* INVISIBLE SEPARATOR */
|| ch == 0x2064 /* INVISIBLE PLUS */
/* Extra characters for compatibility with Unicode LineBreak.txt. */
|| ch == 0x110BD /* KAITHI NUMBER SIGN */)
- if (!(attr & (((int64_t) 1 << LBP_GL) | ((int64_t) 1 << LBP_B2) | ((int64_t) 1 << LBP_BA) | ((int64_t) 1 << LBP_BB) | ((int64_t) 1 << LBP_HY) | ((int64_t) 1 << LBP_CB) | ((int64_t) 1 << LBP_CL) | ((int64_t) 1 << LBP_CP) | ((int64_t) 1 << LBP_EX) | ((int64_t) 1 << LBP_IN) | ((int64_t) 1 << LBP_NS) | ((int64_t) 1 << LBP_OP) | ((int64_t) 1 << LBP_QU) | ((int64_t) 1 << LBP_IS) | ((int64_t) 1 << LBP_NU) | ((int64_t) 1 << LBP_PO) | ((int64_t) 1 << LBP_PR) | ((int64_t) 1 << LBP_SY) | ((int64_t) 1 << LBP_H2) | ((int64_t) 1 << LBP_H3) | ((int64_t) 1 << LBP_HL) | ((int64_t) 1 << LBP_JL) | ((int64_t) 1 << LBP_JV) | ((int64_t) 1 << LBP_JT) | ((int64_t) 1 << LBP_RI) | ((int64_t) 1 << LBP_SA) | ((int64_t) 1 << LBP_ID)))
+ if (!(attr & (((int64_t) 1 << LBP_GL) | ((int64_t) 1 << LBP_B2) | ((int64_t) 1 << LBP_BA) | ((int64_t) 1 << LBP_BB) | ((int64_t) 1 << LBP_HY) | ((int64_t) 1 << LBP_CB) | ((int64_t) 1 << LBP_CL) | ((int64_t) 1 << LBP_CP) | ((int64_t) 1 << LBP_EX) | ((int64_t) 1 << LBP_IN) | ((int64_t) 1 << LBP_NS) | ((int64_t) 1 << LBP_OP) | ((int64_t) 1 << LBP_QU) | ((int64_t) 1 << LBP_IS) | ((int64_t) 1 << LBP_NU) | ((int64_t) 1 << LBP_PO) | ((int64_t) 1 << LBP_PR) | ((int64_t) 1 << LBP_SY) | ((int64_t) 1 << LBP_H2) | ((int64_t) 1 << LBP_H3) | ((int64_t) 1 << LBP_HL) | ((int64_t) 1 << LBP_JL) | ((int64_t) 1 << LBP_JV) | ((int64_t) 1 << LBP_JT) | ((int64_t) 1 << LBP_RI) | ((int64_t) 1 << LBP_SA) | ((int64_t) 1 << LBP_ID) | ((int64_t) 1 << LBP_EB) | ((int64_t) 1 << LBP_EM)))
&& ch != 0x3035 /* VERTICAL KANA REPEAT MARK LOWER HALF */)
{
/* ambiguous (alphabetic) ? */
@@ -7192,7 +7264,11 @@ get_lbp (unsigned int ch)
|| ch == 0x2574 /* BOX DRAWINGS LIGHT LEFT */
|| ch == 0x2616 /* WHITE SHOGI PIECE */
|| ch == 0x2617 /* BLACK SHOGI PIECE */
+ || ch == 0x2757 /* HEAVY EXCLAMATION MARK SYMBOL */
+ || ch == 0x2B55 /* HEAVY LARGE CIRCLE */
|| ch == 0x1F10B /* DINGBAT CIRCLED SANS-SERIF DIGIT ZERO */
+ || ch == 0x1F18E /* NEGATIVE SQUARED AB */
+ || (ch >= 0x1F191 && ch <= 0x1F19A) /* SQUARED CL..SQUARED VS */
|| ch == 0x1F10C /* DINGBAT NEGATIVE CIRCLED SANS-SERIF DIGIT ZERO */)
attr |= (int64_t) 1 << LBP_AI;
else
@@ -7206,6 +7282,38 @@ get_lbp (unsigned int ch)
if ((ch >= 0x3400 && ch <= 0x4DBF) /* CJK Unified Ideographs Extension A */
|| (ch >= 0x4E00 && ch <= 0x9FFF) /* CJK Unified Ideographs */
|| (ch >= 0xF900 && ch <= 0xFAFF) /* CJK Compatibility Ideographs */
+ || (ch >= 0x1F02C && ch <= 0x1F02F) /* reserved */
+ || (ch >= 0x1F094 && ch <= 0x1F09F) /* reserved */
+ || (ch >= 0x1F0AF && ch <= 0x1F0B0) /* reserved */
+ || ch == 0x1F0C0 /* reserved */
+ || ch == 0x1F0D0 /* reserved */
+ || (ch >= 0x1F0F6 && ch <= 0x1F0FF) /* reserved */
+ || (ch >= 0x1F10D && ch <= 0x1F10F) /* reserved */
+ || ch == 0x1F12F /* reserved */
+ || (ch >= 0x1F16C && ch <= 0x1F16F) /* reserved */
+ || (ch >= 0x1F1AD && ch <= 0x1F1E5) /* reserved */
+ || (ch >= 0x1F203 && ch <= 0x1F20F) /* reserved */
+ || (ch >= 0x1F23C && ch <= 0x1F23F) /* reserved */
+ || (ch >= 0x1F249 && ch <= 0x1F24F) /* reserved */
+ || (ch >= 0x1F252 && ch <= 0x1F2FF) /* reserved */
+ || (ch >= 0x1F6D3 && ch <= 0x1F6DF) /* reserved */
+ || (ch >= 0x1F6ED && ch <= 0x1F6EF) /* reserved */
+ || (ch >= 0x1F6F7 && ch <= 0x1F6FF) /* reserved */
+ || (ch >= 0x1F774 && ch <= 0x1F77F) /* reserved */
+ || (ch >= 0x1F7D5 && ch <= 0x1F7FF) /* reserved */
+ || (ch >= 0x1F80C && ch <= 0x1F80F) /* reserved */
+ || (ch >= 0x1F848 && ch <= 0x1F84F) /* reserved */
+ || (ch >= 0x1F85A && ch <= 0x1F85F) /* reserved */
+ || (ch >= 0x1F888 && ch <= 0x1F88F) /* reserved */
+ || (ch >= 0x1F8AE && ch <= 0x1F90F) /* reserved */
+ || ch == 0x1F91F /* reserved */
+ || ch == 0x1F93F /* reserved */
+ || (ch >= 0x1F928 && ch <= 0x1F92F) /* reserved */
+ || (ch >= 0x1F931 && ch <= 0x1F932) /* reserved */
+ || (ch >= 0x1F94C && ch <= 0x1F94F) /* reserved */
+ || (ch >= 0x1F95F && ch <= 0x1F97F) /* reserved */
+ || (ch >= 0x1F992 && ch <= 0x1F9BF) /* reserved */
+ || (ch >= 0x1F9C1 && ch <= 0x1FFFD) /* reserved */
|| (ch >= 0x20000 && ch <= 0x2A6FF) /* CJK Unified Ideographs Extension B */
|| (ch >= 0x2A700 && ch <= 0x2F7FF) /* CJK Unified Ideographs Extension C,
Supplementary Ideographic Plane (Plane 2) outside of blocks */
@@ -7270,6 +7378,9 @@ debug_output_lbp (FILE *stream)
PRINT_BIT(attr,LBP_JT);
PRINT_BIT(attr,LBP_RI);
PRINT_BIT(attr,LBP_SA);
+ PRINT_BIT(attr,LBP_ZWJ);
+ PRINT_BIT(attr,LBP_EB);
+ PRINT_BIT(attr,LBP_EM);
PRINT_BIT(attr,LBP_XX);
#undef PRINT_BIT
fprintf (stream, "\n");
@@ -7386,6 +7497,9 @@ fill_org_lbp (const char *linebreak_filename)
TRY(LBP_JT)
TRY(LBP_RI)
TRY(LBP_SA)
+ TRY(LBP_ZWJ)
+ TRY(LBP_EB)
+ TRY(LBP_EM)
TRY(LBP_XX)
#undef TRY
else if (strcmp (field1, "LF") == 0) value = LBP_BK;
@@ -7469,6 +7583,9 @@ debug_output_org_lbp (FILE *stream)
PRINT_BIT(attr,LBP_JT);
PRINT_BIT(attr,LBP_RI);
PRINT_BIT(attr,LBP_SA);
+ PRINT_BIT(attr,LBP_ZWJ);
+ PRINT_BIT(attr,LBP_EB);
+ PRINT_BIT(attr,LBP_EM);
PRINT_BIT(attr,LBP_XX);
#undef PRINT_BIT
fprintf (stream, "\n");
@@ -7643,6 +7760,9 @@ output_lbp (FILE *stream1, FILE *stream2)
CASE(LBP_JT);
CASE(LBP_RI);
CASE(LBP_SA);
+ CASE(LBP_ZWJ);
+ CASE(LBP_EB);
+ CASE(LBP_EM);
CASE(LBP_XX);
#undef CASE
default:
@@ -7745,7 +7865,12 @@ enum
WBP_RI = 13,
WBP_DQ = 14,
WBP_SQ = 15,
- WBP_HL = 16
+ WBP_HL = 16,
+ WBP_ZWJ = 17,
+ WBP_EB = 18,
+ WBP_EM = 19,
+ WBP_GAZ = 20,
+ WBP_EBG = 21
};
/* Returns the word breaking property for ch, as a bit mask. */
@@ -7768,13 +7893,15 @@ get_wbp (unsigned int ch)
attr |= 1 << WBP_NEWLINE;
if (((unicode_properties[ch] >> PROP_GRAPHEME_EXTEND) & 1) != 0
+ || ((unicode_properties[ch] >> PROP_OTHER_GRAPHEME_EXTEND) & 1) != 0
|| (unicode_attributes[ch].category != NULL
&& strcmp (unicode_attributes[ch].category, "Mc") == 0))
attr |= 1 << WBP_EXTEND;
if (unicode_attributes[ch].category != NULL
&& strcmp (unicode_attributes[ch].category, "Cf") == 0
- && ch != 0x200B && ch != 0x200C && ch != 0x200D)
+ && ch != 0x200B && ch != 0x200C && ch != 0x200D
+ && !(ch >= 0xe0020 && ch <= 0xe007f))
attr |= 1 << WBP_FORMAT;
if ((unicode_scripts[ch] < numscripts
@@ -7816,8 +7943,9 @@ get_wbp (unsigned int ch)
&& ch != 0x066C)
attr |= 1 << WBP_NUMERIC;
- if (unicode_attributes[ch].category != NULL
- && strcmp (unicode_attributes[ch].category, "Pc") == 0)
+ if ((unicode_attributes[ch].category != NULL
+ && strcmp (unicode_attributes[ch].category, "Pc") == 0)
+ || ch == 0x202F /* NARROW NO-BREAK SPACE */)
attr |= 1 << WBP_EXTENDNUMLET;
if (((get_lbp (ch) >> LBP_RI) & 1) != 0)
@@ -7828,6 +7956,20 @@ get_wbp (unsigned int ch)
if (ch == 0x0027)
attr |= 1 << WBP_SQ;
+
+ if (ch == 0x200D)
+ attr |= 1 << WBP_ZWJ;
+
+ if (ch >= 0x1F466 && ch <= 0x1F469)
+ attr |= 1 << WBP_EBG;
+ else if (((get_lbp (ch) >> LBP_EB) & 1) != 0)
+ attr |= 1 << WBP_EB;
+
+ if (((get_lbp (ch) >> LBP_EM) & 1) != 0)
+ attr |= 1 << WBP_EM;
+
+ if (ch == 0x2764 || ch == 0x1F48B || ch == 0x1F5E8)
+ attr |= 1 << WBP_GAZ;
}
if (attr == 0)
@@ -7881,6 +8023,16 @@ debug_output_wbp (FILE *stream)
fprintf (stream, " Single_Quote");
if (attr & (1 << WBP_HL))
fprintf (stream, " Hebrew_Letter");
+ if (attr & (1 << WBP_ZWJ))
+ fprintf (stream, " ZWJ");
+ if (attr & (1 << WBP_EB))
+ fprintf (stream, " E_Base");
+ if (attr & (1 << WBP_EM))
+ fprintf (stream, " E_Modifier");
+ if (attr & (1 << WBP_GAZ))
+ fprintf (stream, " Glue_After_Zwj");
+ if (attr & (1 << WBP_EBG))
+ fprintf (stream, " E_Base_GAZ");
fprintf (stream, "\n");
}
}
@@ -7970,6 +8122,11 @@ fill_org_wbp (const char *wordbreakproperty_filename)
PROP ("Double_Quote", WBP_DQ)
PROP ("Single_Quote", WBP_SQ)
PROP ("Hebrew_Letter", WBP_HL)
+ PROP ("ZWJ", WBP_ZWJ)
+ PROP ("E_Base", WBP_EB)
+ PROP ("E_Modifier", WBP_EM)
+ PROP ("Glue_After_Zwj", WBP_GAZ)
+ PROP ("E_Base_GAZ", WBP_EBG)
#undef PROP
{
fprintf (stderr, "unknown property value '%s' in '%s'\n", propname,
@@ -8019,6 +8176,11 @@ debug_output_org_wbp (FILE *stream)
PROP ("Double_Quote", WBP_DQ)
PROP ("Single_Quote", WBP_SQ)
PROP ("Hebrew_Letter", WBP_HL)
+ PROP ("ZWJ", WBP_ZWJ)
+ PROP ("E_Base", WBP_EB)
+ PROP ("E_Modifier", WBP_EM)
+ PROP ("Glue_After_Zwj", WBP_GAZ)
+ PROP ("E_Base_GAZ", WBP_EBG)
#undef PROP
fprintf (stream, " ??");
fprintf (stream, "\n");
@@ -8174,6 +8336,11 @@ output_wbp (FILE *stream)
CASE(WBP_DQ);
CASE(WBP_SQ);
CASE(WBP_HL);
+ CASE(WBP_ZWJ);
+ CASE(WBP_EB);
+ CASE(WBP_EM);
+ CASE(WBP_GAZ);
+ CASE(WBP_EBG);
#undef CASE
default:
abort ();
@@ -8238,7 +8405,7 @@ output_wbrk_tables (const char *filename, const char *version)
/* ========================================================================= */
/* Grapheme break property.
- Updated for Unicode TR #29 revision 17. */
+ Updated for Unicode TR #29 revision 29. */
/* Possible values of the Grapheme_Cluster_Break property. */
enum
@@ -8255,7 +8422,12 @@ enum
GBP_T = 9,
GBP_LV = 10,
GBP_LVT = 11,
- GBP_RI = 12
+ GBP_RI = 12,
+ GBP_ZWJ = 13,
+ GBP_EB = 14,
+ GBP_EM = 15,
+ GBP_GAZ = 16,
+ GBP_EBG = 17
};
/* Construction of sparse 3-level tables. */
@@ -8327,6 +8499,11 @@ output_gbp_test (const char *filename)
CASE (GBP_LV)
CASE (GBP_LVT)
CASE (GBP_RI)
+ CASE (GBP_ZWJ)
+ CASE (GBP_EB)
+ CASE (GBP_EM)
+ CASE (GBP_GAZ)
+ CASE (GBP_EBG)