pg_sasl_prepare: set of fixes

The following things are adjusted: - Correctly ignore characters from the composition exclusion tables. - Improve variable name consistency. - Improve readability of the Hangul character calculations.

pg_sasl_prepare: set of fixes
b63b368c · Michael Paquier · 57f2b99e · b63b368c · b63b368c
Commit b63b368c authored 8 years ago by Michael Paquier
--- a/pg_sasl_prepare/generate_conv.pl
+++ b/pg_sasl_prepare/generate_conv.pl
@@ -38,6 +38,93 @@ die "Usage: $0 INPUT_FILE OUTPUT_PUT\n" if @ARGV != 2;
 my $input_file = $ARGV[0];
 my $output_file = $ARGV[1];
  
+# Script-specific and post-composition exclusions: code points that must be
+# left out of the generated tables, per http://www.unicode.org/reports/tr15/.
+my @no_recomp_codes = (
+	'0958',  # DEVANAGARI LETTER QA
+	'0959',  # DEVANAGARI LETTER KHHA
+	'095A',  # DEVANAGARI LETTER GHHA
+	'095B',  # DEVANAGARI LETTER ZA
+	'095C',  # DEVANAGARI LETTER DDDHA
+	'095D',  # DEVANAGARI LETTER RHA
+	'095E',  # DEVANAGARI LETTER FA
+	'095F',  # DEVANAGARI LETTER YYA
+	'09DC',  # BENGALI LETTER RRA
+	'09DD',  # BENGALI LETTER RHA
+	'09DF',  # BENGALI LETTER YYA
+	'0A33',  # GURMUKHI LETTER LLA
+	'0A36',  # GURMUKHI LETTER SHA
+	'0A59',  # GURMUKHI LETTER KHHA
+	'0A5A',  # GURMUKHI LETTER GHHA
+	'0A5B',  # GURMUKHI LETTER ZA
+	'0A5E',  # GURMUKHI LETTER FA
+	'0B5C',  # ORIYA LETTER RRA
+	'0B5D',  # ORIYA LETTER RHA
+	'0F43',  # TIBETAN LETTER GHA
+	'0F4D',  # TIBETAN LETTER DDHA
+	'0F52',  # TIBETAN LETTER DHA
+	'0F57',  # TIBETAN LETTER BHA
+	'0F5C',  # TIBETAN LETTER DZHA
+	'0F69',  # TIBETAN LETTER KSSA
+	'0F76',  # TIBETAN VOWEL SIGN VOCALIC R
+	'0F78',  # TIBETAN VOWEL SIGN VOCALIC L
+	'0F93',  # TIBETAN SUBJOINED LETTER GHA
+	'0F9D',  # TIBETAN SUBJOINED LETTER DDHA
+	'0FA2',  # TIBETAN SUBJOINED LETTER DHA
+	'0FA7',  # TIBETAN SUBJOINED LETTER BHA
+	'0FAC',  # TIBETAN SUBJOINED LETTER DZHA
+	'0FB9',  # TIBETAN SUBJOINED LETTER KSSA
+	'FB1D',  # HEBREW LETTER YOD WITH HIRIQ
+	'FB1F',  # HEBREW LIGATURE YIDDISH YOD YOD PATAH
+	'FB2A',  # HEBREW LETTER SHIN WITH SHIN DOT
+	'FB2B',  # HEBREW LETTER SHIN WITH SIN DOT
+	'FB2C',  # HEBREW LETTER SHIN WITH DAGESH AND SHIN DOT
+	'FB2D',  # HEBREW LETTER SHIN WITH DAGESH AND SIN DOT
+	'FB2E',  # HEBREW LETTER ALEF WITH PATAH
+	'FB2F',  # HEBREW LETTER ALEF WITH QAMATS
+	'FB30',  # HEBREW LETTER ALEF WITH MAPIQ
+	'FB31',  # HEBREW LETTER BET WITH DAGESH
+	'FB32',  # HEBREW LETTER GIMEL WITH DAGESH
+	'FB33',  # HEBREW LETTER DALET WITH DAGESH
+	'FB34',  # HEBREW LETTER HE WITH MAPIQ
+	'FB35',  # HEBREW LETTER VAV WITH DAGESH
+	'FB36',  # HEBREW LETTER ZAYIN WITH DAGESH
+	'FB38',  # HEBREW LETTER TET WITH DAGESH
+	'FB39',  # HEBREW LETTER YOD WITH DAGESH
+	'FB3A',  # HEBREW LETTER FINAL KAF WITH DAGESH
+	'FB3B',  # HEBREW LETTER KAF WITH DAGESH
+	'FB3C',  # HEBREW LETTER LAMED WITH DAGESH
+	'FB3E',  # HEBREW LETTER MEM WITH DAGESH
+	'FB40',  # HEBREW LETTER NUN WITH DAGESH
+	'FB41',  # HEBREW LETTER SAMEKH WITH DAGESH
+	'FB43',  # HEBREW LETTER FINAL PE WITH DAGESH
+	'FB44',  # HEBREW LETTER PE WITH DAGESH
+	'FB46',  # HEBREW LETTER TSADI WITH DAGESH
+	'FB47',  # HEBREW LETTER QOF WITH DAGESH
+	'FB48',  # HEBREW LETTER RESH WITH DAGESH
+	'FB49',  # HEBREW LETTER SHIN WITH DAGESH
+	'FB4A',  # HEBREW LETTER TAV WITH DAGESH
+	'FB4B',  # HEBREW LETTER VAV WITH HOLAM
+	'FB4C',  # HEBREW LETTER BET WITH RAFE
+	'FB4D',  # HEBREW LETTER KAF WITH RAFE
+	'FB4E',  # HEBREW LETTER PE WITH RAFE
+	# post composition exclusion
+	'2ADC',  # FORKING
+	'1D15E', # MUSICAL SYMBOL HALF NOTE
+	'1D15F', # MUSICAL SYMBOL QUARTER NOTE
+	'1D160', # MUSICAL SYMBOL EIGHTH NOTE
+	'1D161', # MUSICAL SYMBOL SIXTEENTH NOTE
+	'1D162', # MUSICAL SYMBOL THIRTY-SECOND NOTE
+	'1D163', # MUSICAL SYMBOL SIXTY-FOURTH NOTE
+	'1D164', # MUSICAL SYMBOL ONE HUNDRED TWENTY-EIGHTH NOTE
+	'1D1BB', # MUSICAL SYMBOL MINIMA
+	'1D1BC', # MUSICAL SYMBOL MINIMA BLACK
+	'1D1BD', # MUSICAL SYMBOL SEMIMINIMA WHITE
+	'1D1BE', # MUSICAL SYMBOL SEMIMINIMA BLACK
+	'1D1BF', # MUSICAL SYMBOL FUSA WHITE
+	'1D1C0'  # MUSICAL SYMBOL FUSA BLACK
+    );
+
 # Count number of lines in input file to get size of table.
 my $input_lines = 0;
 open(my $FH, $input_file) or die "Could not open input file $input_file: $!.";
@@ -49,6 +136,18 @@ while (my $line = <$FH>)
 	# Skip codes longer than 4 bytes, or 8 characters.
 	next if length($code) > 8;
  
+	# Skip codes that cannot be composed
+	my $found_no_recomp = 0;
+	foreach my $lcode  (@no_recomp_codes)
+	{
+		if ($lcode eq $elts[0])
+		{
+			$found_no_recomp = 1;
+			last;
+		}
+	}
+	next if $found_no_recomp;
+
 	# Skip characters with no decompositions and a class of 0.
 	next if $elts[3] eq '0' && $elts[5] eq '';
  
@@ -101,6 +200,18 @@ while ( my $line = <$INPUT> )
 	# to reduce the table size.
 	next if $elts[3] eq '0' && $elts[5] eq '';
  
+	# Skip codes that cannot be composed
+	my $found_no_recomp = 0;
+	foreach my $lcode  (@no_recomp_codes)
+	{
+		if ($lcode eq $elts[0])
+		{
+			$found_no_recomp = 1;
+			last;
+		}
+	}
+	next if $found_no_recomp;
+
 	# Print a comma for all items except the first one.
 	if ($first_item)
 	{

--- a/pg_sasl_prepare/pg_sasl_prepare.c
+++ b/pg_sasl_prepare/pg_sasl_prepare.c
@@ -33,6 +33,17 @@ PG_MODULE_MAGIC;
 #define ARRPTR(x)  ((int32 *) ARR_DATA_PTR(x))
 #define ARRNELEMS(x)  ArrayGetNItems(ARR_NDIM(x), ARR_DIMS(x))
  
+/* Constants for calculations with Hangul characters; products are parenthesized so the macros are safe in any expression */
+#define SBASE		0xAC00		/* start of the precomposed syllable range */
+#define LBASE		0x1100		/* base of the first decomposed component (l) */
+#define VBASE		0x1161		/* base of the second decomposed component (v) */
+#define TBASE		0x11A7		/* base of the optional third component; offset 0 means none */
+#define LCOUNT		19
+#define VCOUNT		21
+#define TCOUNT		28
+#define NCOUNT		(VCOUNT * TCOUNT)
+#define SCOUNT		(LCOUNT * NCOUNT)
+
 /*
 * Create a new int array with room for "num" elements.
 * Taken from contrib/intarray/.
@@ -186,16 +197,14 @@ get_decomposed_size(uint32 code)
 	 * See http://unicode.org/reports/tr15/tr15-18.html, annex 10 for details
 	 * on the matter.
 	 */
-	if (code >= 0xAC00 && code < 0xD7A4)
+	if (code >= SBASE && code < SBASE + SCOUNT)
 	{
-		uint32	l, v, t, hindex;
+		uint32	tindex, sindex;
  
-		hindex = code - 0xAC00;
-		l = 0x1100 + hindex / (21 * 28);
-		v = 0x1161 + (hindex % (21 * 28)) / 28;
-		t = hindex % 28;
+		sindex = code - SBASE;
+		tindex = sindex % TCOUNT;
  
-		if (t != 0)
+		if (tindex != 0)
 			return 3;
 		return 2;
 	}
@@ -234,21 +243,21 @@ static bool
 recompose_code(uint32 start, uint32 code, uint32 *result)
 {
 	/* No need to care about ascii characters */
-	if (start <= 0xef || code <= 0xef)
+	if (start <= 0x7f || code <= 0x7f)
 		return false;
  
 	/* Hangul characters go here */
-	if (start >= 0x1100 && start < 0x1113 &&
-		code >= 0x1161 && code < 0x1176)
+	if (start >= LBASE && start < LBASE + LCOUNT &&
+		code >= VBASE && code < VBASE + VCOUNT)
 	{
-		*result = ((start - 0x1100) * 21 + code - 0x1161) * 28 + 0xAC00;
+		*result = ((start - LBASE) * VCOUNT + code - VBASE) * TCOUNT + SBASE;
 		return true;
 	}
-	else if (start >= 0xAC00 && start < 0xD7A4 &&
-			 !((start - 0xAC00) % 28) &&
-			 code >= 0x11A8 && code < 0x11C3)
+	else if (start >= SBASE && start < (SBASE + SCOUNT) &&
+			 ((start - SBASE) % TCOUNT) == 0 &&
+			 code > TBASE && code < (TBASE + TCOUNT))	/* strictly above TBASE: offset 0 is "no third component" */
 	{
-		*result = start + code - 0x11A7;
+		*result = start + code - TBASE;
 		return true;
 	}
 	else
@@ -296,29 +305,29 @@ decompose_code(uint32 code, int **result, int *current)
 	 * See http://unicode.org/reports/tr15/tr15-18.html, annex 10 for details
 	 * on the matter.
 	 */
-	if (code >= 0xAC00 && code < 0xD7A4)
+	if (code >= SBASE && code < SBASE + SCOUNT)
 	{
-		uint32	l, v, t, hindex;
-		int	   *res = *result;
+		uint32	l, v, tindex, sindex;
+		int   *res = *result;
  
-		hindex = code - 0xAC00;
-		l = 0x1100 + hindex / (21 * 28);
-		v = 0x1161 + (hindex % (21 * 28)) / 28;
-		t = hindex % 28;
+		sindex = code - SBASE;
+		l = LBASE + sindex / (VCOUNT * TCOUNT);
+		v = VBASE + (sindex % (VCOUNT * TCOUNT)) / TCOUNT;
+		tindex = sindex % TCOUNT;
  
 		res[*current] = l;
 		(*current)++;
 		res[*current] = v;
 		(*current)++;
  
-		if (t != 0)
+		if (tindex != 0)
 		{
-			res[*current] = 0x11A7 + t;
+			res[*current] = TBASE + tindex;
 			(*current)++;
 		}
  
 		return;
-    }
+	}
  
 	entry = get_code_entry(code);
  
@@ -371,13 +380,11 @@ pg_sasl_prepare(PG_FUNCTION_ARGS)
 	int			size = 0;
 	int			decomp_size = 0;
 	int			recomp_size = 0;
-
 	/* variables for recomposition */
-	int		lastClass;
-	int		starterPos;
-	int		sourceLength;
-	int		targetPos;
-	uint32	starterCh;
+	int			last_class;
+	int			starter_pos;
+	int			target_pos;
+	uint32		starter_ch;
  
 	/* First do the compatibility decomposition */
  
@@ -464,40 +471,39 @@ pg_sasl_prepare(PG_FUNCTION_ARGS)
 	 * make the allocation of the recomposed string based on that assumption.
 	 */
 	recomp_ptr = (int *) palloc(decomp_size * sizeof(int));
-	lastClass = -1;		/* this eliminates a special check */
-	starterPos = 0;
-	sourceLength = decomp_size;
-	targetPos = 1;
-	starterCh = recomp_ptr[0] = decomp_ptr[0];
+	last_class = -1;	 /* this eliminates a special check */
+	starter_pos = 0;
+	target_pos = 1;
+	starter_ch = recomp_ptr[0] = decomp_ptr[0];
  
 	for (count = 1; count < decomp_size; count++)
 	{
 		uint32 ch = (uint32) decomp_ptr[count];
-		pg_utf_decomposition *chEntry = get_code_entry(ch);
-		int chClass = chEntry == NULL ? 0 : chEntry->class;
-		uint32 composite;
-		bool	found_match = recompose_code(starterCh, ch, &composite);
+		pg_utf_decomposition *ch_entry = get_code_entry(ch);
+		int			ch_class = ch_entry == NULL ? 0 : ch_entry->class;
+		pg_wchar	composite;
  
-		if (found_match && lastClass < chClass)
+		if (last_class < ch_class &&
+			recompose_code(starter_ch, ch, &composite))
 		{
-			recomp_ptr[starterPos] = (int) composite;
-			starterCh = composite;
+			recomp_ptr[starter_pos] = composite;
+			starter_ch = composite;
 		}
-		else if (chClass == 0)
+		else if (ch_class == 0)
 		{
-			starterPos = targetPos;
-			starterCh  = ch;
-			lastClass  = -1;
-			recomp_ptr[targetPos++] = (int) ch;
+			starter_pos = target_pos;
+			starter_ch  = ch;
+			last_class  = -1;
+			recomp_ptr[target_pos++] = ch;
 		}
 		else
 		{
-			lastClass = chClass;
-			recomp_ptr[targetPos++] = (int) ch;
+			last_class = ch_class;
+			recomp_ptr[target_pos++] = ch;
 		}
 	}
  
-	recomp_size = targetPos;
+	recomp_size = target_pos;
  
 	/* And finally fill-in the result */
 	result = new_intArrayType(recomp_size);