Skip to content
Snippets Groups Projects
Commit b63b368c authored by Michael Paquier's avatar Michael Paquier
Browse files

pg_sasl_prepare: set of fixes

The following things are adjusted:
- Correctly ignore characters from the exclusion tables.
- Improve variable name consistency.
- Improve readability of the Hangul character calculation.
parent 57f2b99e
No related branches found
No related tags found
No related merge requests found
Loading
Loading
@@ -38,6 +38,93 @@ die "Usage: $0 INPUT_FILE OUTPUT_PUT\n" if @ARGV != 2;
my $input_file = $ARGV[0];
my $output_file = $ARGV[1];
 
# Code points that must be excluded from the generated tables: the
# script-specific exclusions followed by the post-composition exclusions,
# per http://www.unicode.org/reports/tr15/.
#
# Each entry is compared as a string against the first field of every input
# line, so it must be spelled exactly as the code point appears in the input
# file.
my @no_recomp_codes = (
'0958', # DEVANAGARI LETTER QA
'0959', # DEVANAGARI LETTER KHHA
'095A', # DEVANAGARI LETTER GHHA
'095B', # DEVANAGARI LETTER ZA
'095C', # DEVANAGARI LETTER DDDHA
'095D', # DEVANAGARI LETTER RHA
'095E', # DEVANAGARI LETTER FA
'095F', # DEVANAGARI LETTER YYA
'09DC', # BENGALI LETTER RRA
'09DD', # BENGALI LETTER RHA
'09DF', # BENGALI LETTER YYA
'0A33', # GURMUKHI LETTER LLA
'0A36', # GURMUKHI LETTER SHA
'0A59', # GURMUKHI LETTER KHHA
'0A5A', # GURMUKHI LETTER GHHA
'0A5B', # GURMUKHI LETTER ZA
'0A5E', # GURMUKHI LETTER FA
'0B5C', # ORIYA LETTER RRA
'0B5D', # ORIYA LETTER RHA
'0F43', # TIBETAN LETTER GHA
'0F4D', # TIBETAN LETTER DDHA
'0F52', # TIBETAN LETTER DHA
'0F57', # TIBETAN LETTER BHA
'0F5C', # TIBETAN LETTER DZHA
'0F69', # TIBETAN LETTER KSSA
'0F76', # TIBETAN VOWEL SIGN VOCALIC R
'0F78', # TIBETAN VOWEL SIGN VOCALIC L
'0F93', # TIBETAN SUBJOINED LETTER GHA
'0F9D', # TIBETAN SUBJOINED LETTER DDHA
'0FA2', # TIBETAN SUBJOINED LETTER DHA
'0FA7', # TIBETAN SUBJOINED LETTER BHA
'0FAC', # TIBETAN SUBJOINED LETTER DZHA
'0FB9', # TIBETAN SUBJOINED LETTER KSSA
'FB1D', # HEBREW LETTER YOD WITH HIRIQ
'FB1F', # HEBREW LIGATURE YIDDISH YOD YOD PATAH
'FB2A', # HEBREW LETTER SHIN WITH SHIN DOT
'FB2B', # HEBREW LETTER SHIN WITH SIN DOT
'FB2C', # HEBREW LETTER SHIN WITH DAGESH AND SHIN DOT
'FB2D', # HEBREW LETTER SHIN WITH DAGESH AND SIN DOT
'FB2E', # HEBREW LETTER ALEF WITH PATAH
'FB2F', # HEBREW LETTER ALEF WITH QAMATS
'FB30', # HEBREW LETTER ALEF WITH MAPIQ
'FB31', # HEBREW LETTER BET WITH DAGESH
'FB32', # HEBREW LETTER GIMEL WITH DAGESH
'FB33', # HEBREW LETTER DALET WITH DAGESH
'FB34', # HEBREW LETTER HE WITH MAPIQ
'FB35', # HEBREW LETTER VAV WITH DAGESH
'FB36', # HEBREW LETTER ZAYIN WITH DAGESH
'FB38', # HEBREW LETTER TET WITH DAGESH
'FB39', # HEBREW LETTER YOD WITH DAGESH
'FB3A', # HEBREW LETTER FINAL KAF WITH DAGESH
'FB3B', # HEBREW LETTER KAF WITH DAGESH
'FB3C', # HEBREW LETTER LAMED WITH DAGESH
'FB3E', # HEBREW LETTER MEM WITH DAGESH
'FB40', # HEBREW LETTER NUN WITH DAGESH
'FB41', # HEBREW LETTER SAMEKH WITH DAGESH
'FB43', # HEBREW LETTER FINAL PE WITH DAGESH
'FB44', # HEBREW LETTER PE WITH DAGESH
'FB46', # HEBREW LETTER TSADI WITH DAGESH
'FB47', # HEBREW LETTER QOF WITH DAGESH
'FB48', # HEBREW LETTER RESH WITH DAGESH
'FB49', # HEBREW LETTER SHIN WITH DAGESH
'FB4A', # HEBREW LETTER TAV WITH DAGESH
'FB4B', # HEBREW LETTER VAV WITH HOLAM
'FB4C', # HEBREW LETTER BET WITH RAFE
'FB4D', # HEBREW LETTER KAF WITH RAFE
'FB4E', # HEBREW LETTER PE WITH RAFE
# Post-composition exclusions
'2ADC', # FORKING
'1D15E', # MUSICAL SYMBOL HALF NOTE
'1D15F', # MUSICAL SYMBOL QUARTER NOTE
'1D160', # MUSICAL SYMBOL EIGHTH NOTE
'1D161', # MUSICAL SYMBOL SIXTEENTH NOTE
'1D162', # MUSICAL SYMBOL THIRTY-SECOND NOTE
'1D163', # MUSICAL SYMBOL SIXTY-FOURTH NOTE
'1D164', # MUSICAL SYMBOL ONE HUNDRED TWENTY-EIGHTH NOTE
'1D1BB', # MUSICAL SYMBOL MINIMA
'1D1BC', # MUSICAL SYMBOL MINIMA BLACK
'1D1BD', # MUSICAL SYMBOL SEMIMINIMA WHITE
'1D1BE', # MUSICAL SYMBOL SEMIMINIMA BLACK
'1D1BF', # MUSICAL SYMBOL FUSA WHITE
'1D1C0' # MUSICAL SYMBOL FUSA BLACK
);
# Count number of lines in input file to get size of table.
my $input_lines = 0;
open(my $FH, $input_file) or die "Could not open input file $input_file: $!.";
Loading
Loading
@@ -49,6 +136,18 @@ while (my $line = <$FH>)
# Skip codes longer than 4 bytes, or 8 characters.
next if length($code) > 8;
 
# Skip codes that cannot be composed, i.e. those whose code point (first
# field of the line) appears in the exclusion table.  grep in scalar
# context yields the number of matching entries, which is zero when the
# code is not excluded.
my $found_no_recomp = grep { $_ eq $elts[0] } @no_recomp_codes;
next if $found_no_recomp;
# Skip characters with no decompositions and a class of 0.
next if $elts[3] eq '0' && $elts[5] eq '';
 
Loading
Loading
@@ -101,6 +200,18 @@ while ( my $line = <$INPUT> )
# to reduce the table size.
next if $elts[3] eq '0' && $elts[5] eq '';
 
# Skip codes that cannot be composed, i.e. those whose code point (first
# field of the line) appears in the exclusion table.  grep in scalar
# context yields the number of matching entries, which is zero when the
# code is not excluded.
my $found_no_recomp = grep { $_ eq $elts[0] } @no_recomp_codes;
next if $found_no_recomp;
# Print a comma for all items except the first one.
if ($first_item)
{
Loading
Loading
Loading
Loading
@@ -33,6 +33,17 @@ PG_MODULE_MAGIC;
#define ARRPTR(x) ((int32 *) ARR_DATA_PTR(x))
#define ARRNELEMS(x) ArrayGetNItems(ARR_NDIM(x), ARR_DIMS(x))
 
/*
 * Constants for calculations with Hangul characters.
 *
 * SBASE is the first code of the precomposed syllable range, which holds
 * SCOUNT codes.  LBASE and VBASE are the first leading and vowel codes, with
 * LCOUNT and VCOUNT codes each.  A trailing code is TBASE plus a non-zero
 * index lower than TCOUNT; index 0 means that the syllable has no trailing
 * part.
 *
 * The derived counts are parenthesized so that they expand safely next to
 * operators of equal or higher precedence, like "x / NCOUNT" or
 * "x % NCOUNT".
 */
#define SBASE 0xAC00
#define LBASE 0x1100
#define VBASE 0x1161
#define TBASE 0x11A7
#define LCOUNT 19
#define VCOUNT 21
#define TCOUNT 28
#define NCOUNT (VCOUNT * TCOUNT)
#define SCOUNT (LCOUNT * NCOUNT)
/*
* Create a new int array with room for "num" elements.
* Taken from contrib/intarray/.
Loading
Loading
@@ -186,16 +197,14 @@ get_decomposed_size(uint32 code)
* See http://unicode.org/reports/tr15/tr15-18.html, annex 10 for details
* on the matter.
*/
if (code >= 0xAC00 && code < 0xD7A4)
if (code >= SBASE && code < SBASE + SCOUNT)
{
uint32 l, v, t, hindex;
uint32 tindex, sindex;
 
hindex = code - 0xAC00;
l = 0x1100 + hindex / (21 * 28);
v = 0x1161 + (hindex % (21 * 28)) / 28;
t = hindex % 28;
sindex = code - SBASE;
tindex = sindex % TCOUNT;
 
if (t != 0)
if (tindex != 0)
return 3;
return 2;
}
Loading
Loading
@@ -234,21 +243,21 @@ static bool
recompose_code(uint32 start, uint32 code, uint32 *result)
{
/* No need to care about ascii characters */
if (start <= 0xef || code <= 0xef)
if (start <= 0x7f || code <= 0x7f)
return false;
 
/* Hangul characters go here */
if (start >= 0x1100 && start < 0x1113 &&
code >= 0x1161 && code < 0x1176)
if (start >= LBASE && start < LBASE + LCOUNT &&
code >= VBASE && code < VBASE + VCOUNT)
{
*result = ((start - 0x1100) * 21 + code - 0x1161) * 28 + 0xAC00;
*result = ((start - LBASE) * VCOUNT + code - VBASE) * TCOUNT + SBASE;
return true;
}
else if (start >= 0xAC00 && start < 0xD7A4 &&
!((start - 0xAC00) % 28) &&
code >= 0x11A8 && code < 0x11C3)
else if (start >= SBASE && start < (SBASE + SCOUNT) &&
((start - SBASE) % TCOUNT) == 0 &&
code >= TBASE && code < (TBASE + TCOUNT))
{
*result = start + code - 0x11A7;
*result = start + code - TBASE;
return true;
}
else
Loading
Loading
@@ -296,29 +305,29 @@ decompose_code(uint32 code, int **result, int *current)
* See http://unicode.org/reports/tr15/tr15-18.html, annex 10 for details
* on the matter.
*/
if (code >= 0xAC00 && code < 0xD7A4)
if (code >= SBASE && code < SBASE + SCOUNT)
{
uint32 l, v, t, hindex;
int *res = *result;
uint32 l, v, tindex, sindex;
int *res = *result;
 
hindex = code - 0xAC00;
l = 0x1100 + hindex / (21 * 28);
v = 0x1161 + (hindex % (21 * 28)) / 28;
t = hindex % 28;
sindex = code - SBASE;
l = LBASE + sindex / (VCOUNT * TCOUNT);
v = VBASE + (sindex % (VCOUNT * TCOUNT)) / TCOUNT;
tindex = sindex % TCOUNT;
 
res[*current] = l;
(*current)++;
res[*current] = v;
(*current)++;
 
if (t != 0)
if (tindex != 0)
{
res[*current] = 0x11A7 + t;
res[*current] = TBASE + tindex;
(*current)++;
}
 
return;
}
}
 
entry = get_code_entry(code);
 
Loading
Loading
@@ -371,13 +380,11 @@ pg_sasl_prepare(PG_FUNCTION_ARGS)
int size = 0;
int decomp_size = 0;
int recomp_size = 0;
/* variables for recomposition */
int lastClass;
int starterPos;
int sourceLength;
int targetPos;
uint32 starterCh;
int last_class;
int starter_pos;
int target_pos;
uint32 starter_ch;
 
/* First do the compatibility decomposition */
 
Loading
Loading
@@ -464,40 +471,39 @@ pg_sasl_prepare(PG_FUNCTION_ARGS)
* make the allocation of the recomposed string based on that assumption.
*/
recomp_ptr = (int *) palloc(decomp_size * sizeof(int));
lastClass = -1; /* this eliminates a special check */
starterPos = 0;
sourceLength = decomp_size;
targetPos = 1;
starterCh = recomp_ptr[0] = decomp_ptr[0];
last_class = -1; /* this eliminates a special check */
starter_pos = 0;
target_pos = 1;
starter_ch = recomp_ptr[0] = decomp_ptr[0];
 
for (count = 1; count < decomp_size; count++)
{
uint32 ch = (uint32) decomp_ptr[count];
pg_utf_decomposition *chEntry = get_code_entry(ch);
int chClass = chEntry == NULL ? 0 : chEntry->class;
uint32 composite;
bool found_match = recompose_code(starterCh, ch, &composite);
pg_utf_decomposition *ch_entry = get_code_entry(ch);
int ch_class = ch_entry == NULL ? 0 : ch_entry->class;
pg_wchar composite;
 
if (found_match && lastClass < chClass)
if (last_class < ch_class &&
recompose_code(starter_ch, ch, &composite))
{
recomp_ptr[starterPos] = (int) composite;
starterCh = composite;
recomp_ptr[starter_pos] = composite;
starter_ch = composite;
}
else if (chClass == 0)
else if (ch_class == 0)
{
starterPos = targetPos;
starterCh = ch;
lastClass = -1;
recomp_ptr[targetPos++] = (int) ch;
starter_pos = target_pos;
starter_ch = ch;
last_class = -1;
recomp_ptr[target_pos++] = ch;
}
else
{
lastClass = chClass;
recomp_ptr[targetPos++] = (int) ch;
last_class = ch_class;
recomp_ptr[target_pos++] = ch;
}
}
 
recomp_size = targetPos;
recomp_size = target_pos;
 
/* And finally fill-in the result */
result = new_intArrayType(recomp_size);
Loading
Loading
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment