pg_sasl_prepare: Implement SASLprep on UTF-8 strings

A couple of utilities are added at the same time to easily manipulate UTF-8 strings as arrays of integers: - UTF-8 string to integer array conversion. - Integer array to UTF-8 string conversion. - Generation of a header file from UnicodeData.txt that Postgres can refer to. - Set-returning function to view the conversion table at SQL level. - SASLprep function.

pg_sasl_prepare: Implement SASLprep on UTF-8 strings
df3f18a8 · Michael Paquier · 0ace538a · df3f18a8 · df3f18a8 · df3f18a8
Commit df3f18a8 authored 8 years ago by Michael Paquier
--- a/Makefile
+++ b/Makefile
@@ -16,6 +16,7 @@ SUBDIRS = blackhole	\
 	kill_idle	\
 	mcxtalloc_test	\
 	pg_rep_state	\
+	pg_sasl_prepare	\
 	pg_wal_blocks	\
 	pgmpc		\
 	receiver_raw

--- a/pg_sasl_prepare/.gitignore
+++ b/pg_sasl_prepare/.gitignore
+/utf8_table.h
+/UnicodeData.txt
--- a/pg_sasl_prepare/Makefile
+++ b/pg_sasl_prepare/Makefile
+# Build rules for the pg_sasl_prepare extension (PGXS based).
+MODULES = pg_sasl_prepare
+
+EXTENSION = pg_sasl_prepare
+DATA = pg_sasl_prepare--1.0.sql
+PGFILEDESC = "pg_sasl_prepare - SASLprepare for UTF-8 strings"
+
+# The generated header is removed by "make clean"; the downloaded
+# UnicodeData.txt is kept to avoid fetching it again.
+EXTRA_CLEAN = utf8_table.h
+
+DOWNLOAD = wget -O $@ --no-use-server-timestamps
+
+all: utf8_table pg_sasl_prepare.so
+
+# "utf8_table" is kept as a convenience alias for the generated header.
+.PHONY: utf8_table
+utf8_table: utf8_table.h
+
+UnicodeData.txt:
+	$(DOWNLOAD) http://unicode.org/Public/UNIDATA/UnicodeData.txt
+
+# Use the real file as target so that the header is only regenerated when
+# the Unicode data or the generator script changes.
+utf8_table.h: UnicodeData.txt generate_conv.pl
+	$(PERL) generate_conv.pl UnicodeData.txt $@
+
+# The C file includes the generated header, so the object file must be
+# rebuilt after it changes (and must wait for it in parallel builds).
+pg_sasl_prepare.o: utf8_table.h
+
+PG_CONFIG = pg_config
+PGXS := $(shell $(PG_CONFIG) --pgxs)
+include $(PGXS)
--- a/pg_sasl_prepare/README
+++ b/pg_sasl_prepare/README
+pg_sasl_prepare
+===============
+
+Implementation of SASLprep (RFC 4013) for the strings used by the SCRAM
+authentication protocol. A couple of utilities are included as well
+to manipulate UTF-8 strings as arrays of integers. UnicodeData.txt
+is also used as the base to generate a conversion table that Postgres
+can refer to directly for the decomposition and combining class of
+each character.
--- a/pg_sasl_prepare/generate_conv.pl
+++ b/pg_sasl_prepare/generate_conv.pl
+# Generate a conversion table using a Unicode data file as input, saving
+# in the output as a header file in the location specified by the caller
+# of this script.
+
+use strict;
+use warnings;
+
+use utf8;
+use open ':std', ':encoding(UTF-8)';
+
+# Convert a single Unicode code point, given by the caller as a string
+# of hexadecimal digits (as found in the first or sixth column of
+# UnicodeData.txt), into the hexadecimal representation of its UTF-8
+# byte sequence, and return that string to the caller.
+#
+# The whole argument is taken as one code point, so values above U+FFFF
+# (five or six hexadecimal digits) are encoded correctly instead of
+# being cut after their first four digits.
+sub get_hexa_code
+{
+	my $code = shift;
+
+	# Build the one-character string matching this code point.
+	my $s = chr(hex($code));
+	# Encode it to get the set of UTF-8 bytes wanted.
+	utf8::encode($s);
+
+	# Print each byte with two hexadecimal digits.  Only bytes below
+	# 0x10 get a leading zero, which leaves the value unchanged once
+	# prefixed with "0x" in the generated C code.
+	return unpack("H*", $s);
+}
+
+# Both the input data file and the output header file are mandatory.
+die "Usage: $0 INPUT_FILE OUTPUT_FILE\n" if @ARGV != 2;
+my ($input_file, $output_file) = @ARGV;
+
+# Count the number of entries of the input file that will be written,
+# to get the size of the table.  The filter applied here has to match
+# the one used when the entries are printed.
+my $input_lines = 0;
+open(my $FH, '<', $input_file)
+  or die "Could not open input file $input_file: $!.";
+while (my $line = <$FH>)
+{
+	my @elts = split(';', $line);
+	my $code = get_hexa_code($elts[0]);
+
+	# Skip codes longer than 4 bytes, or 8 characters.
+	next if length($code) > 8;
+	$input_lines++;
+}
+close $FH;
+
+# Open the input file, which is treated line by line with one line per
+# Unicode character, and the output file.  The three-argument form of
+# open() is used so that the file names are never interpreted as an
+# open mode or a command.
+open(my $INPUT, '<', $input_file)
+  or die "Could not open input file $input_file: $!";
+open(my $OUTPUT, '>', $output_file)
+  or die "Could not open output file $output_file: $!\n";
+
+# Print header of output file: the definition of one table entry,
+# followed by the beginning of the table declaration whose size has
+# been computed from the input file.
+print $OUTPUT <<HEADER;
+/*
+ * File auto-generated from generate_conv.pl, do not edit, There is
+ * deliberately not an #ifndef PG_UTF8_TABLE_H here.
+ */
+typedef struct
+{
+    uint32      utf;        /* UTF-8 */
+    uint8       class;      /* combining class of character */
+    uint32      codes[18];   /* decomposition codes */
+} pg_utf_decomposition;
+
+/* conversion table */
+HEADER
+print $OUTPUT "static const pg_utf_decomposition SASLPrepConv[ $input_lines ] = {\n";
+
+# Print one table entry per line of the input file, separated by commas.
+my $first_item = 1;
+while (my $line = <$INPUT>)
+{
+	# Split the line wanted and get the fields needed:
+	# - Unicode number (first column)
+	# - Combining class (fourth column)
+	# - Decomposition table (sixth column)
+	my @elts = split(';', $line);
+	my $code = get_hexa_code($elts[0]);
+
+	# Skip codes longer than 4 bytes, or 8 characters.
+	next if length($code) > 8;
+
+	my $class = sprintf("0x%02x", $elts[3]);
+	my $decom = $elts[5];
+
+	# Print a comma for all items except the first one.
+	print $OUTPUT ",\n" unless $first_item;
+	$first_item = 0;
+
+	# Remove the decomposition type if any (like "<compat>"), keeping
+	# only the character codes, each one being converted to the same
+	# format as the code of the entry itself.
+	$decom =~ s/<[^>]*>//g;
+	my @decom_codes =
+	  map { sprintf("0x%s", get_hexa_code($_)) } split(" ", $decom);
+
+	# Now print a single entry in the conversion table: code number,
+	# combining class and list of decomposition codes.
+	print $OUTPUT "\t{0x$code, $class, {" . join(", ", @decom_codes) . "}}";
+}
+
+print $OUTPUT "\n};\n";
+
+# Buffered write errors only show up at close time, so check it.
+close $OUTPUT or die "Could not close output file $output_file: $!\n";
+close $INPUT;
--- a/pg_sasl_prepare/pg_sasl_prepare--1.0.sql
+++ b/pg_sasl_prepare/pg_sasl_prepare--1.0.sql
+/* pg_sasl_prepare/pg_sasl_prepare--1.0.sql */
+
+-- complain if script is sourced in psql, rather than via CREATE EXTENSION
+\echo Use "CREATE EXTENSION pg_sasl_prepare" to load this file. \quit
+
+-- SASLprep processing of a string given as an array of integers, as
+-- generated by utf8_to_array().  The result only depends on the input
+-- and on the conversion table compiled in the module, so the function
+-- is marked in the same way as the conversion functions.
+CREATE FUNCTION pg_sasl_prepare(_int4)
+RETURNS _int4
+AS 'MODULE_PATHNAME'
+LANGUAGE C STRICT IMMUTABLE PARALLEL SAFE;
+
+-- Conversion functions
+-- utf8_to_array() converts a text value into an array of integers, with
+-- one element per character holding the bytes of its UTF-8 sequence.
+CREATE FUNCTION utf8_to_array(text)
+RETURNS _int4
+AS 'MODULE_PATHNAME'
+LANGUAGE C STRICT IMMUTABLE PARALLEL SAFE;
+
+-- array_to_utf8() is the reverse operation, building a text value from
+-- the bytes stored in each element of the array.
+CREATE FUNCTION array_to_utf8(_int4)
+RETURNS text
+AS 'MODULE_PATHNAME'
+LANGUAGE C STRICT IMMUTABLE PARALLEL SAFE;
+
+-- Conversion table fetch
+-- Returns one row per entry of the conversion table compiled in the
+-- module: the code of the character, its combining class and its
+-- decomposition codes (NULL if the character has no decomposition).
+CREATE OR REPLACE FUNCTION utf8_conv_table(
+    OUT code int,
+    OUT class smallint,
+    OUT decomposition _int4)
+RETURNS SETOF record
+AS 'MODULE_PATHNAME'
+LANGUAGE C STRICT IMMUTABLE PARALLEL SAFE;
--- a/pg_sasl_prepare/pg_sasl_prepare.c
+++ b/pg_sasl_prepare/pg_sasl_prepare.c
+/*-------------------------------------------------------------------------
+ *
+ * pg_sasl_prepare.c
+ *		SASLprep processing and conversion utilities for UTF-8 strings
+ *
+ * Copyright (c) 1996-2017, PostgreSQL Global Development Group
+ *
+ * IDENTIFICATION
+ *		  pg_sasl_prepare/pg_sasl_prepare.c
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#include "postgres.h"
+
+#include "access/tupdesc.h"
+#include "catalog/pg_type.h"
+#include "fmgr.h"
+#include "funcapi.h"
+#include "mb/pg_wchar.h"
+#include "miscadmin.h"
+#include "nodes/execnodes.h"
+#include "utils/array.h"
+#include "utils/builtins.h"
+#include "utils/tuplestore.h"
+
+/* local includes */
+#include "utf8_table.h"
+
+PG_MODULE_MAGIC;
+
+/* Utilities for array manipulation */
+#define ARRPTR(x)  ((int32 *) ARR_DATA_PTR(x))
+#define ARRNELEMS(x)  ArrayGetNItems(ARR_NDIM(x), ARR_DIMS(x))
+
+/*
+ * Allocate a zero-filled, one-dimensional int4 array with room for "num"
+ * elements, without null bitmap and with a lower bound of 1.
+ * Borrowed from contrib/intarray/.
+ */
+static ArrayType *
+new_intArrayType(int num)
+{
+	int			nbytes;
+	ArrayType  *arr;
+
+	/* array header for one dimension, followed by the elements */
+	nbytes = ARR_OVERHEAD_NONULLS(1) + sizeof(int) * num;
+	arr = (ArrayType *) palloc0(nbytes);
+	SET_VARSIZE(arr, nbytes);
+
+	/* the number of dimensions is set first, other fields depend on it */
+	ARR_NDIM(arr) = 1;
+	arr->dataoffset = 0;		/* marker for no null bitmap */
+	ARR_ELEMTYPE(arr) = INT4OID;
+	ARR_LBOUND(arr)[0] = 1;
+	ARR_DIMS(arr)[0] = num;
+
+	return arr;
+}
+
+/*
+ * Comparison routine for bsearch(), used to look up the entry of the
+ * conversion table matching a UTF-8 code.  "p1" points to the uint32
+ * code searched for and "p2" to a table entry.  Returns 1, 0 or -1 if
+ * the code is respectively greater than, equal to or lower than the
+ * code of the entry.
+ */
+static int
+conv_compare(const void *p1, const void *p2)
+{
+	const uint32 key = *(const uint32 *) p1;
+	const uint32 utf = ((const pg_utf_decomposition *) p2)->utf;
+
+	if (key > utf)
+		return 1;
+	if (key < utf)
+		return -1;
+	return 0;
+}
+
+/*
+ * Get the entry corresponding to code in the conversion table. This
+ * is useful to avoid repeating the calls to bsearch everywhere.
+ *
+ * The table does not list all the codes that can be given in input, so
+ * a code that cannot be found is not an internal error.  In this case a
+ * pointer to a zero-filled entry is returned (utf set to 0, combining
+ * class 0 and no decomposition codes), so callers treat such a code as
+ * a character that is kept as-is instead of dereferencing NULL.  This
+ * function never returns NULL.
+ */
+static pg_utf_decomposition *
+get_code_entry(uint32 code)
+{
+	/* zero-filled by static initialization, and never modified */
+	static pg_utf_decomposition no_entry;
+	pg_utf_decomposition *entry;
+
+	/*
+	 * bsearch works as follows:
+	 * - a key to check for matches.
+	 * - a pointer pointing to the base of the conversion table.
+	 * - number of elements in the array to look for,
+	 * - size of an array element.
+	 * - comparison function.
+	 * If a match cannot be found, NULL is returned.
+	 */
+	entry = bsearch(&code,
+					(void *) SASLPrepConv,
+					lengthof(SASLPrepConv),
+					sizeof(pg_utf_decomposition),
+					conv_compare);
+
+	if (entry == NULL)
+		return &no_entry;
+	return entry;
+}
+
+/*
+ * Calculate into how many characters the given code is decomposed, by
+ * looking recursively at the decomposition codes of its entry in the
+ * conversion table.
+ */
+static int
+get_decomposed_size(uint32 code)
+{
+	pg_utf_decomposition *entry = get_code_entry(code);
+	int			total = 0;
+	int			pos;
+
+	/* A code without decomposition only counts for itself */
+	if (entry->codes[0] == 0x0)
+		return 1;
+
+	/*
+	 * Add up the size of each decomposition code, the list being ended
+	 * by a zero when it does not use all the slots available.
+	 */
+	for (pos = 0;
+		 pos < lengthof(entry->codes) && entry->codes[pos] != 0x0;
+		 pos++)
+		total += get_decomposed_size(entry->codes[pos]);
+
+	return total;
+}
+
+/*
+ * Decompose the given code recursively, writing the result into the
+ * array given by caller from the position "current".  This position is
+ * moved forward for each code written, to let the caller know from
+ * where to continue filling in the array.  The array has to be large
+ * enough, see get_decomposed_size().
+ */
+static void
+decompose_code(uint32 code, int **result, int *current)
+{
+	pg_utf_decomposition *entry = get_code_entry(code);
+	int			pos = 0;
+
+	/* No decomposition codes to recurse to, so save the code itself */
+	if (entry->codes[0] == 0x0)
+	{
+		(*result)[(*current)++] = (int) code;
+		return;
+	}
+
+	/*
+	 * Or decompose each code listed in the entry, the list being ended
+	 * by a zero when it does not use all the slots available.
+	 */
+	while (pos < lengthof(entry->codes) && entry->codes[pos] != 0x0)
+	{
+		decompose_code(entry->codes[pos], result, current);
+		pos++;
+	}
+}
+
+
+/*
+ * pg_sasl_prepare
+ *
+ * Perform SASLprepare on an integer array identifying individual
+ * multibyte UTF-8 characters, in the format generated by utf8_to_array.
+ *
+ * The processing done here is made of the decomposition of each
+ * character using the conversion table, followed by the canonical
+ * ordering of the result based on the combining classes.  No
+ * recomposition is done, so the result is not in NFKC form.
+ *
+ * The input array cannot contain NULL elements.  Returns a new
+ * one-dimensional integer array.
+ */
+PG_FUNCTION_INFO_V1(pg_sasl_prepare);
+Datum
+pg_sasl_prepare(PG_FUNCTION_ARGS)
+{
+	ArrayType  *input = PG_GETARG_ARRAYTYPE_P(0);
+	int		   *input_ptr;
+	int			input_size;
+	ArrayType  *result;
+	int		   *result_ptr;
+	int			result_size;
+	int			count;
+	int			size = 0;
+
+	/*
+	 * NULL elements are not stored in the data area of an array, so
+	 * reading it element by element is only correct without them.
+	 */
+	if (array_contains_nulls(input))
+		ereport(ERROR,
+				(errcode(ERRCODE_NULL_VALUE_NOT_ALLOWED),
+				 errmsg("array must not contain nulls")));
+
+	input_ptr = ARRPTR(input);
+	input_size = ARRNELEMS(input);
+
+	/* First do the character decomposition */
+
+	/*
+	 * Look recursively at the conversion table to determine into how
+	 * many characters each code needs to be decomposed, giving the
+	 * number of elements of the result.
+	 */
+	for (count = 0; count < input_size; count++)
+		size += get_decomposed_size((uint32) input_ptr[count]);
+
+	/*
+	 * Now fill in each entry recursively. This needs a second pass on
+	 * the conversion table.
+	 */
+	result = new_intArrayType(size);
+	result_ptr = ARRPTR(result);
+	result_size = size;
+	size = 0;
+	for (count = 0; count < input_size; count++)
+		decompose_code((uint32) input_ptr[count], &result_ptr, &size);
+
+	Assert(size == result_size);
+
+	/*
+	 * Now that the decomposition is done, apply the canonical ordering
+	 * using the combining class of each character of the result.
+	 */
+	for (count = 1; count < result_size; count++)
+	{
+		uint32	prev = result_ptr[count - 1];
+		uint32	next = result_ptr[count];
+		int		tmp;
+		pg_utf_decomposition *prevEntry = get_code_entry(prev);
+		pg_utf_decomposition *nextEntry = get_code_entry(next);
+
+		/*
+		 * Per Unicode (http://unicode.org/reports/tr15/tr15-18.html) annex 4,
+		 * a sequence of two adjacent characters in a string is an exchangeable
+		 * pair if the combining class (from the Unicode Character Database)
+		 * for the first character is greater than the combining class for the
+		 * second, and the second is not a starter.  A character is a starter
+		 * if its combining class is 0.
+		 */
+		if (nextEntry->class == 0x0 || prevEntry->class == 0x0)
+			continue;
+
+		if (prevEntry->class <= nextEntry->class)
+			continue;
+
+		/* exchange can happen */
+		tmp = result_ptr[count - 1];
+		result_ptr[count - 1] = result_ptr[count];
+		result_ptr[count] = tmp;
+
+		/*
+		 * Backtrack to check again the character moved backwards with
+		 * the one preceding it; with the increment of the loop the next
+		 * pair checked ends at the position count - 1.
+		 */
+		if (count > 1)
+			count -= 2;
+	}
+
+	PG_RETURN_POINTER(result);
+}
+
+/*
+ * utf8_to_array
+ * Convert a UTF-8 string into an integer array.
+ *
+ * Each element of the result holds one character of the input, made of
+ * the bytes of its UTF-8 sequence with the first byte stored in the
+ * most significant position used.  This is the format used for the
+ * codes of the conversion table.  An error is raised if the database
+ * encoding is not UTF-8.
+ */
+PG_FUNCTION_INFO_V1(utf8_to_array);
+Datum
+utf8_to_array(PG_FUNCTION_ARGS)
+{
+	char	   *input;
+	ArrayType  *result;
+	int		   *result_ptr;
+	int			size = 0;
+	int			count;
+	const unsigned char *utf;
+
+	/* Check the encoding before doing any work on the input */
+	if (GetDatabaseEncoding() != PG_UTF8)
+		ereport(ERROR,
+				(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+				 errmsg("Database encoding is not UTF-8")));
+
+	input = text_to_cstring(PG_GETARG_TEXT_PP(0));
+
+	/*
+	 * Calculate the array size first by doing a first pass on the UTF-8 string
+	 */
+	utf = (unsigned char *) input;
+	while (*utf)
+	{
+		int l;
+
+		l = pg_utf_mblen(utf);
+
+		if (!pg_utf8_islegal(utf, l))
+			elog(ERROR, "incorrect utf-8 input");
+
+		size++;
+		utf += l;
+	}
+
+	/*
+	 * And now fill in the array with all the data from each character by
+	 * doing a second pass.
+	 */
+	result = new_intArrayType(size);
+	result_ptr = ARRPTR(result);
+	utf = (unsigned char *) input;
+	count = 0;
+	while (*utf)
+	{
+		uint32	iutf = 0;
+		int		l;
+		int		i;
+
+		l = pg_utf_mblen(utf);
+
+		/* An element can only store sequences of up to four bytes */
+		if (l < 1 || l > 4)
+			elog(ERROR, "incorrect multibyte length %d", l);
+
+		/*
+		 * Calculate entry for character input for conversion table lookup.
+		 * The bytes are accumulated in an unsigned integer, as shifting the
+		 * first byte of a four-byte sequence by 24 bits would overflow a
+		 * signed integer.
+		 */
+		for (i = 0; i < l; i++)
+			iutf = (iutf << 8) | (uint32) *utf++;
+
+		/* Let's not care about any signing */
+		result_ptr[count++] = (int32) iutf;
+	}
+
+	Assert(count == ARRNELEMS(result));
+
+	PG_RETURN_POINTER(result);
+}
+
+/*
+ * array_to_utf8
+ * Convert an integer array into a UTF-8 string.
+ *
+ * This is the reverse operation of utf8_to_array: each element holds the
+ * bytes of one UTF-8 sequence, which are appended to the result from the
+ * most significant byte to the least significant one.  Bytes equal to
+ * zero are skipped, as they are only padding for sequences shorter than
+ * four bytes.
+ *
+ * An error is raised if the database encoding is not UTF-8, if the
+ * array contains NULL elements, or if the bytes generated are not a
+ * valid UTF-8 string.
+ */
+PG_FUNCTION_INFO_V1(array_to_utf8);
+Datum
+array_to_utf8(PG_FUNCTION_ARGS)
+{
+	ArrayType	   *input = PG_GETARG_ARRAYTYPE_P(0);
+	int			   *input_ptr;
+	int				nelems;
+	char		   *result;
+	int				count = 0;
+	int				i;
+
+	/* Same restriction as utf8_to_array */
+	if (GetDatabaseEncoding() != PG_UTF8)
+		ereport(ERROR,
+				(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+				 errmsg("Database encoding is not UTF-8")));
+
+	/*
+	 * NULL elements are not stored in the data area of an array, so
+	 * reading it element by element is only correct without them.
+	 */
+	if (array_contains_nulls(input))
+		ereport(ERROR,
+				(errcode(ERRCODE_NULL_VALUE_NOT_ALLOWED),
+				 errmsg("array must not contain nulls")));
+
+	input_ptr = ARRPTR(input);
+	nelems = ARRNELEMS(input);
+
+	/*
+	 * Each element generates at most four bytes, so allocate the string
+	 * for the worst case, with room for the terminating zero.
+	 */
+	result = palloc0((Size) nelems * 4 + 1);
+
+	/* Now fill in the string */
+	for (i = 0; i < nelems; i++)
+	{
+		uint32 code = input_ptr[i];
+
+		if (code & 0xff000000)
+			result[count++] = code >> 24;
+		if (code & 0x00ff0000)
+			result[count++] = code >> 16;
+		if (code & 0x0000ff00)
+			result[count++] = code >> 8;
+		if (code & 0x000000ff)
+			result[count++] = code;
+	}
+	result[count] = '\0';
+
+	/*
+	 * The array can contain anything, so make sure that the string built
+	 * is valid before returning it as text.  This raises an error if it
+	 * is not.
+	 */
+	(void) pg_verify_mbstr(PG_UTF8, result, count, false);
+
+	PG_RETURN_TEXT_P(cstring_to_text(result));
+}
+
+/*
+ * utf8_conv_table
+ * Return a full copy of the UTF-8 conversion table, as a set of rows
+ * made of the code of the character, its combining class and the array
+ * of its decomposition codes (NULL if there are none).
+ */
+PG_FUNCTION_INFO_V1(utf8_conv_table);
+Datum
+utf8_conv_table(PG_FUNCTION_ARGS)
+{
+	ReturnSetInfo  *rsinfo = (ReturnSetInfo *) fcinfo->resultinfo;
+	TupleDesc		tupdesc;
+	Tuplestorestate *tupstore;
+	MemoryContext	oldcontext;
+	int				idx;
+
+	/* the caller has to be able to receive a materialized set */
+	if (rsinfo == NULL || !IsA(rsinfo, ReturnSetInfo))
+		ereport(ERROR,
+				(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+				 errmsg("set-valued function called in context that cannot accept a set")));
+	if ((rsinfo->allowedModes & SFRM_Materialize) == 0)
+		ereport(ERROR,
+				(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+				 errmsg("materialize mode required, but it is not " \
+						"allowed in this context")));
+
+	/*
+	 * The tuple descriptor and the tuplestore are created in the
+	 * per-query memory context.
+	 */
+	oldcontext = MemoryContextSwitchTo(rsinfo->econtext->ecxt_per_query_memory);
+
+	if (get_call_result_type(fcinfo, NULL, &tupdesc) != TYPEFUNC_COMPOSITE)
+		elog(ERROR, "return type must be a row type");
+
+	tupstore = tuplestore_begin_heap(true, false, work_mem);
+	rsinfo->returnMode = SFRM_Materialize;
+	rsinfo->setResult = tupstore;
+	rsinfo->setDesc = tupdesc;
+
+	MemoryContextSwitchTo(oldcontext);
+
+	/* Store one tuple for each entry of the table */
+	for (idx = 0; idx < lengthof(SASLPrepConv); idx++)
+	{
+		const pg_utf_decomposition *entry = &SASLPrepConv[idx];
+		Datum		values[3];
+		bool		nulls[3];
+		ArrayType  *decomp = NULL;
+		int			ncodes = 0;
+		int			pos;
+
+		MemSet(values, 0, sizeof(values));
+		MemSet(nulls, 0, sizeof(nulls));
+
+		/* code of the character and its combining class */
+		values[0] = Int32GetDatum(entry->utf);
+		values[1] = Int16GetDatum((int16) entry->class);
+
+		/* count the decomposition codes, the list being ended by a zero */
+		while (ncodes < lengthof(entry->codes) &&
+			   entry->codes[ncodes] != 0x0)
+			ncodes++;
+
+		/* decomposition array, NULL if there is no decomposition */
+		if (ncodes > 0)
+		{
+			int		   *decomp_ptr;
+
+			decomp = new_intArrayType(ncodes);
+			decomp_ptr = ARRPTR(decomp);
+			for (pos = 0; pos < ncodes; pos++)
+				decomp_ptr[pos] = (int) entry->codes[pos];
+			values[2] = PointerGetDatum(decomp);
+		}
+		else
+			nulls[2] = true;
+
+		/* Save the tuple, the array is not needed once this is done */
+		tuplestore_putvalues(tupstore, tupdesc, values, nulls);
+		if (decomp != NULL)
+			pfree(decomp);
+	}
+
+	/* clean up and return the tuplestore */
+	tuplestore_donestoring(tupstore);
+
+	return (Datum) 0;
+}
--- a/pg_sasl_prepare/pg_sasl_prepare.control
+++ b/pg_sasl_prepare/pg_sasl_prepare.control
+# pg_sasl_prepare extension
+comment = 'SASLprepare for UTF-8 strings'
+default_version = '1.0'
+module_pathname = '$libdir/pg_sasl_prepare'
+relocatable = true