Skip to content
Snippets Groups Projects
Commit a4111375 authored by Michael Paquier's avatar Michael Paquier
Browse files

pg_sasl_prepare: Use upstream implementation of SASLprep

No need to maintain twice the same code.
parent bed5a92c
No related branches found
No related tags found
No related merge requests found
Loading
Loading
@@ -4,16 +4,6 @@ EXTENSION = pg_sasl_prepare
DATA = pg_sasl_prepare--1.0.sql
PGFILEDESC = "pg_sasl_prepare - SASLprepare for UTF-8 strings"
 
DOWNLOAD = wget -O $@ --no-use-server-timestamps
all: utf8_table pg_sasl_prepare.so
UnicodeData.txt:
$(DOWNLOAD) http://unicode.org/Public/UNIDATA/UnicodeData.txt
utf8_table: UnicodeData.txt
$(PERL) generate_conv.pl UnicodeData.txt utf8_table.h
PG_CONFIG = pg_config
PGXS := $(shell $(PG_CONFIG) --pgxs)
include $(PGXS)
pg_sasl_prepare
===============
 
Implementation of RFC4013 (SASLprep), used to prepare strings for the SCRAM
authentication protocol. A couple of utilities are included as well
to manipulate UTF-8 strings as arrays of integers. UnicodeData.txt
is also used as a base to generate a conversion table that Postgres
can refer to directly for decomposition and combining class.
Simple utility wrapper on top of the PostgreSQL implementation of SASLprep,
as described in RFC4013.
# Generate a conversion table from a Unicode data file given as input, and
# save it as a C header file at the location specified by the caller of
# this script.
use strict;
use warnings;
use utf8;
use open ':std', ':encoding(UTF-8)';
# Convert a Unicode code point, given as a hexadecimal string, into the
# hexadecimal representation of its UTF-8 byte sequence.  This is used for
# the first and sixth columns of UnicodeData.txt.
#
# Parameter: hexadecimal code point without any prefix, e.g. '00E9' or
#            '1D15E'.
# Returns:   the UTF-8 bytes of that character as lowercase hexadecimal
#            digits, e.g. 'c3a9'.
sub get_hexa_code
{
    my $code = shift;

    # Build the character directly from its numeric value.  Matching a
    # fixed run of four hexadecimal digits would truncate the five- and
    # six-digit codes found outside the Basic Multilingual Plane.
    my $s = chr(hex($code));

    # Encode it to get the set of bytes wanted.
    utf8::encode($s);

    # Bytes are printed without zero padding.  Only a lone ASCII byte can
    # be lower than 0x10, so multi-byte results remain unambiguous.
    return join('', map { sprintf("%x", ord($_)) } split(//, $s));
}
# The script takes exactly two arguments: the Unicode data file to read and
# the path of the header file to write.
die "Usage: $0 INPUT_FILE OUTPUT_FILE\n" if @ARGV != 2;
my ($input_file, $output_file) = @ARGV;
# Code points that must be left out of the generated tables: the
# script-specific and post composition exclusions described in
# http://www.unicode.org/reports/tr15/.  Entries are compared as strings
# against the first column of the input file, so they must use the same
# uppercase hexadecimal spelling.
my @no_recomp_codes = (
'0958', # DEVANAGARI LETTER QA
'0959', # DEVANAGARI LETTER KHHA
'095A', # DEVANAGARI LETTER GHHA
'095B', # DEVANAGARI LETTER ZA
'095C', # DEVANAGARI LETTER DDDHA
'095D', # DEVANAGARI LETTER RHA
'095E', # DEVANAGARI LETTER FA
'095F', # DEVANAGARI LETTER YYA
'09DC', # BENGALI LETTER RRA
'09DD', # BENGALI LETTER RHA
'09DF', # BENGALI LETTER YYA
'0A33', # GURMUKHI LETTER LLA
'0A36', # GURMUKHI LETTER SHA
'0A59', # GURMUKHI LETTER KHHA
'0A5A', # GURMUKHI LETTER GHHA
'0A5B', # GURMUKHI LETTER ZA
'0A5E', # GURMUKHI LETTER FA
'0B5C', # ORIYA LETTER RRA
'0B5D', # ORIYA LETTER RHA
'0F43', # TIBETAN LETTER GHA
'0F4D', # TIBETAN LETTER DDHA
'0F52', # TIBETAN LETTER DHA
'0F57', # TIBETAN LETTER BHA
'0F5C', # TIBETAN LETTER DZHA
'0F69', # TIBETAN LETTER KSSA
'0F76', # TIBETAN VOWEL SIGN VOCALIC R
'0F78', # TIBETAN VOWEL SIGN VOCALIC L
'0F93', # TIBETAN SUBJOINED LETTER GHA
'0F9D', # TIBETAN SUBJOINED LETTER DDHA
'0FA2', # TIBETAN SUBJOINED LETTER DHA
'0FA7', # TIBETAN SUBJOINED LETTER BHA
'0FAC', # TIBETAN SUBJOINED LETTER DZHA
'0FB9', # TIBETAN SUBJOINED LETTER KSSA
'FB1D', # HEBREW LETTER YOD WITH HIRIQ
'FB1F', # HEBREW LIGATURE YIDDISH YOD YOD PATAH
'FB2A', # HEBREW LETTER SHIN WITH SHIN DOT
'FB2B', # HEBREW LETTER SHIN WITH SIN DOT
'FB2C', # HEBREW LETTER SHIN WITH DAGESH AND SHIN DOT
'FB2D', # HEBREW LETTER SHIN WITH DAGESH AND SIN DOT
'FB2E', # HEBREW LETTER ALEF WITH PATAH
'FB2F', # HEBREW LETTER ALEF WITH QAMATS
'FB30', # HEBREW LETTER ALEF WITH MAPIQ
'FB31', # HEBREW LETTER BET WITH DAGESH
'FB32', # HEBREW LETTER GIMEL WITH DAGESH
'FB33', # HEBREW LETTER DALET WITH DAGESH
'FB34', # HEBREW LETTER HE WITH MAPIQ
'FB35', # HEBREW LETTER VAV WITH DAGESH
'FB36', # HEBREW LETTER ZAYIN WITH DAGESH
'FB38', # HEBREW LETTER TET WITH DAGESH
'FB39', # HEBREW LETTER YOD WITH DAGESH
'FB3A', # HEBREW LETTER FINAL KAF WITH DAGESH
'FB3B', # HEBREW LETTER KAF WITH DAGESH
'FB3C', # HEBREW LETTER LAMED WITH DAGESH
'FB3E', # HEBREW LETTER MEM WITH DAGESH
'FB40', # HEBREW LETTER NUN WITH DAGESH
'FB41', # HEBREW LETTER SAMEKH WITH DAGESH
'FB43', # HEBREW LETTER FINAL PE WITH DAGESH
'FB44', # HEBREW LETTER PE WITH DAGESH
'FB46', # HEBREW LETTER TSADI WITH DAGESH
'FB47', # HEBREW LETTER QOF WITH DAGESH
'FB48', # HEBREW LETTER RESH WITH DAGESH
'FB49', # HEBREW LETTER SHIN WITH DAGESH
'FB4A', # HEBREW LETTER TAV WITH DAGESH
'FB4B', # HEBREW LETTER VAV WITH HOLAM
'FB4C', # HEBREW LETTER BET WITH RAFE
'FB4D', # HEBREW LETTER KAF WITH RAFE
'FB4E', # HEBREW LETTER PE WITH RAFE
# Post composition exclusions
'2ADC', # FORKING
'1D15E', # MUSICAL SYMBOL HALF NOTE
'1D15F', # MUSICAL SYMBOL QUARTER NOTE
'1D160', # MUSICAL SYMBOL EIGHTH NOTE
'1D161', # MUSICAL SYMBOL SIXTEENTH NOTE
'1D162', # MUSICAL SYMBOL THIRTY-SECOND NOTE
'1D163', # MUSICAL SYMBOL SIXTY-FOURTH NOTE
'1D164', # MUSICAL SYMBOL ONE HUNDRED TWENTY-EIGHTH NOTE
'1D1BB', # MUSICAL SYMBOL MINIMA
'1D1BC', # MUSICAL SYMBOL MINIMA BLACK
'1D1BD', # MUSICAL SYMBOL SEMIMINIMA WHITE
'1D1BE', # MUSICAL SYMBOL SEMIMINIMA BLACK
'1D1BF', # MUSICAL SYMBOL FUSA WHITE
'1D1C0' # MUSICAL SYMBOL FUSA BLACK
);
# First pass over the input file: count the entries that will be written to
# the main conversion table, so that the array can be declared with its
# exact size.  The filters applied here have to match the ones used when
# the entries are printed.
my $input_lines = 0;

# Three-argument open, so that the file name can never be interpreted as an
# open mode or a command.
open(my $FH, '<', $input_file)
  or die "Could not open input file $input_file: $!.";

# Index the exclusion list once rather than scanning it for every line.
my %count_excluded = map { $_ => 1 } @no_recomp_codes;

while (my $line = <$FH>)
{
    my @elts = split(';', $line);
    my $code = get_hexa_code($elts[0]);

    # Skip codes longer than 4 bytes, or 8 characters.
    next if length($code) > 8;

    # Skip codes that cannot be composed.
    next if $count_excluded{ $elts[0] };

    # Skip characters with no decompositions and a class of 0.
    next if $elts[3] eq '0' && $elts[5] eq '';

    $input_lines++;
}
close $FH;
# Open the input file again for the pass that writes the tables, and create
# the output header.  Three-argument open keeps the mode separate from the
# file names, so a name with a leading '>', '<' or '|' or a trailing '|'
# cannot change what is opened.
open(my $INPUT, '<', $input_file)
  or die "Could not open input file $input_file: $!";
open(my $OUTPUT, '>', $output_file)
  or die "Could not open output file $output_file: $!\n";

# Print header of output file: the declaration of the structure used for
# the entries of the main conversion table.
print $OUTPUT <<HEADER;
/*
* File auto-generated from generate_conv.pl, do not edit. There is
* deliberately not an #ifndef PG_UTF8_TABLE_H here.
*/
typedef struct
{
uint32 utf; /* UTF-8 */
uint8 class; /* combining class of character */
uint8 dec_size; /* size of decomposition code list */
} pg_utf_decomposition;
/* conversion table */
HEADER

# Start of the main conversion table, sized with the entry count computed
# during the first pass.
print $OUTPUT "static const pg_utf_decomposition SASLPrepConv[ $input_lines ] =\n{\n";
# Rows of the decomposition sub-tables, keyed by decomposition length.  Each
# value is a reference to the list of C initializer strings collected for
# that length; they are written out once the main table is complete.
my %decomp_tabs = ();
my $first_item = 1;
while (my $line = <$INPUT>)
{
    # Fields used from each line: code point (column 0), combining class
    # (column 3) and decomposition mapping (column 5).
    my @elts  = split(';', $line);
    my $code  = get_hexa_code($elts[0]);
    my $class = sprintf("0x%02x", $elts[3]);
    my $decom = $elts[5];

    # Ignore codes whose hexadecimal form is longer than 8 characters, that
    # is to say more than 4 bytes.
    next if length($code) > 8;

    # Characters with a class of 0 and no decomposition are left out to
    # reduce the table size.
    next if $elts[3] eq '0' && $elts[5] eq '';

    # Leave out the codes that cannot be composed.
    next if grep { $_ eq $elts[0] } @no_recomp_codes;

    # Entries are separated by a comma, printed in front of every entry
    # except the first one.
    print $OUTPUT ",\n" unless $first_item;
    $first_item = 0;

    # Remove the decomposition type if any, keeping only character codes.
    $decom =~ s/\<[^][]*\>//g;
    my @decom_elts = split(" ", $decom);
    my $decom_size = scalar(@decom_elts);

    # One entry of the main table: code, combining class and size of the
    # decomposition.
    print $OUTPUT "\t{0x$code, $class, $decom_size}";

    # If the character has no decomposition we are done.
    next if $decom_size == 0;

    # Build the sub-table entry made of the code and its decomposition, and
    # store it in the list dedicated to this decomposition size.
    my $decomp_list =
      join(", ", map("0x" . get_hexa_code($_), @decom_elts));
    push(@{ $decomp_tabs{$decom_size} },
        "{0x$code, {" . $decomp_list . "}}");
}
print $OUTPUT "\n};\n\n\n";
# Print the decomposition tables, one per decomposition size.  The sizes are
# sorted numerically so that the tables come out in ascending order (a plain
# sort would place 18 between 1 and 2).
foreach my $decomp_size (sort { $a <=> $b } keys %decomp_tabs)
{
    my @decomp_entries = @{ $decomp_tabs{$decomp_size} };
    my $decomp_length  = scalar(@decomp_entries);

    # First print the header: the structure used for entries of this size
    # and the start of the array holding them.
    print $OUTPUT <<HEADER;
\n\n/* Decomposition table with entries of list length of $decomp_size */
typedef struct
{
uint32 utf; /* UTF-8 */
uint32 decomp[$decomp_size]; /* size of decomposition code list */
} pg_utf_decomposition_size_$decomp_size;
static const pg_utf_decomposition_size_$decomp_size UtfDecomp_$decomp_size [ $decomp_length ] =
{
HEADER

    # Print each entry, separated by commas.
    print $OUTPUT join(",\n", map("\t$_", @decomp_entries));
    print $OUTPUT "\n};\n";
}

# Buffered write errors only show up when the handle is closed, so the
# result has to be checked for the output file.
close $OUTPUT
  or die "Could not close output file $output_file: $!\n";
close $INPUT;
Loading
Loading
@@ -4,27 +4,7 @@
\echo Use "CREATE EXTENSION pg_sasl_prepare" to load this file. \quit
 
-- This is a pg_sasl_prepare
CREATE FUNCTION pg_sasl_prepare(_int4)
RETURNS _int4
AS 'MODULE_PATHNAME'
LANGUAGE C STRICT;
-- Conversion functions
CREATE FUNCTION utf8_to_array(text)
RETURNS _int4
AS 'MODULE_PATHNAME'
LANGUAGE C STRICT IMMUTABLE PARALLEL SAFE;
CREATE FUNCTION array_to_utf8(_int4)
CREATE FUNCTION pg_sasl_prepare(text)
RETURNS text
AS 'MODULE_PATHNAME'
LANGUAGE C STRICT IMMUTABLE PARALLEL SAFE;
-- Conversion table fetch
CREATE OR REPLACE FUNCTION utf8_conv_table(
OUT code int,
OUT class smallint,
OUT decomposition _int4)
RETURNS SETOF record
AS 'MODULE_PATHNAME'
LANGUAGE C STRICT IMMUTABLE PARALLEL SAFE;
LANGUAGE C STRICT;
/*-------------------------------------------------------------------------
*
* pg_sasl_prepare.c
* Set of functions for a minimal extension template
* Wrapper on top of upstream implementation of SASLprep, changing
* a UTF-8 string into a prepared string for a SCRAM exchange.
*
* Copyright (c) 1996-2017, PostgreSQL Global Development Group
*
Loading
Loading
@@ -13,729 +14,37 @@
 
#include "postgres.h"
 
#include "access/tupdesc.h"
#include "catalog/pg_type.h"
#include "common/saslprep.h"
#include "fmgr.h"
#include "funcapi.h"
#include "mb/pg_wchar.h"
#include "miscadmin.h"
#include "nodes/execnodes.h"
#include "utils/array.h"
#include "utils/builtins.h"
#include "utils/tuplestore.h"
/* local includes */
#include "utf8_table.h"
 
PG_MODULE_MAGIC;
 
/* Utilities for array manipulation */
#define ARRPTR(x)  ((int32 *) ARR_DATA_PTR(x))
#define ARRNELEMS(x)  ArrayGetNItems(ARR_NDIM(x), ARR_DIMS(x))

/*
 * Constants for calculations with Hangul characters.  The base values are
 * the UTF-8 byte sequences of the code points given in the comments.
 * Derived counts are parenthesized so that they expand safely next to
 * operators such as % or /.
 */
#define SBASE		0xEAB080	/* U+AC00 */
#define LBASE		0xE18480	/* U+1100 */
#define VBASE		0xE185A1	/* U+1161 */
#define TBASE		0xE186A7	/* U+11A7 */
#define LCOUNT		19
#define VCOUNT		21
#define TCOUNT		28
#define NCOUNT		(VCOUNT * TCOUNT)
#define SCOUNT		(LCOUNT * NCOUNT)
/*
 * Allocate a one-dimensional int4 array able to hold "num" elements, with
 * no null bitmap and a lower bound of 1.  The element area is zeroed.
 * Taken from contrib/intarray/.
 */
static ArrayType *
new_intArrayType(int num)
{
	int			total_size = ARR_OVERHEAD_NONULLS(1) + sizeof(int) * num;
	ArrayType  *array = (ArrayType *) palloc0(total_size);

	SET_VARSIZE(array, total_size);
	ARR_NDIM(array) = 1;
	array->dataoffset = 0;		/* marker for no null bitmap */
	ARR_ELEMTYPE(array) = INT4OID;
	ARR_LBOUND(array)[0] = 1;
	ARR_DIMS(array)[0] = num;

	return array;
}
/*
 * bsearch() comparison routine for the main conversion table.  The first
 * argument points to the UTF-8 code searched for, the second one to a
 * table entry whose "utf" field it is compared with.
 */
static int
conv_compare(const void *p1, const void *p2)
{
	uint32		key = *(const uint32 *) p1;
	uint32		candidate = ((const pg_utf_decomposition *) p2)->utf;

	if (key > candidate)
		return 1;
	if (key < candidate)
		return -1;
	return 0;
}
/*
 * Generate one bsearch() comparison routine per decomposition sub-table.
 * Each routine compares the UTF-8 code searched for with the "utf" field
 * of an entry of the sub-table of the given decomposition size.
 */
#define CONV_COMPARE_SIZE(type) \
static int \
conv_compare_size_##type(const void *p1, const void *p2) \
{ \
	uint32		key = *(const uint32 *) p1; \
	uint32		candidate = ((const pg_utf_decomposition_size_##type *) p2)->utf; \
	if (key > candidate) \
		return 1; \
	return (key == candidate) ? 0 : -1; \
}

/* Update this list if new sub-tables are present in utf8_table.h */
CONV_COMPARE_SIZE(1);
CONV_COMPARE_SIZE(2);
CONV_COMPARE_SIZE(3);
CONV_COMPARE_SIZE(4);
CONV_COMPARE_SIZE(5);
CONV_COMPARE_SIZE(6);
CONV_COMPARE_SIZE(7);
CONV_COMPARE_SIZE(8);
CONV_COMPARE_SIZE(18);
/*
 * Look up "code" in the main conversion table and return its entry, or
 * NULL if the table has none.  Centralizing the lookup avoids repeating
 * the bsearch() call at every place needing it.
 */
static pg_utf_decomposition *
get_code_entry(uint32 code)
{
	/*
	 * bsearch() takes the key to look for, the base of the table, the
	 * number of elements of the table, the size of one element and the
	 * comparison function.  It returns NULL when nothing matches.
	 */
	return bsearch(&code,
				   (void *) SASLPrepConv,
				   lengthof(SASLPrepConv),
				   sizeof(pg_utf_decomposition),
				   conv_compare);
}
/*
 * Search the sub-table holding decompositions of the given size for "code"
 * and return from the enclosing function with a pointer to the matching
 * decomposition list.  A missing entry means that the main table and the
 * sub-tables disagree, which is reported as an error instead of
 * dereferencing a NULL pointer.
 */
#define CONV_SEARCH_SIZE(type) \
do { \
	pg_utf_decomposition_size_##type *item; \
	item = bsearch(&code, \
				   (void *) UtfDecomp_##type, \
				   lengthof(UtfDecomp_##type), \
				   sizeof(pg_utf_decomposition_size_##type), \
				   conv_compare_size_##type); \
	if (item == NULL) \
		elog(ERROR, "no decomposition of size %d found for code 0x%08x", \
			 type, code); \
	return item->decomp; \
} while (0)

/*
 * Using an entry from the main decomposition table, return a pointer to
 * the list of codes it decomposes into.  The list holds entry->dec_size
 * elements and lives in the sub-table matching that size.
 */
static uint32 *
get_code_decomposition(pg_utf_decomposition *entry)
{
	uint32		code = entry->utf;

	/* Each case returns from this function through the macro. */
	switch (entry->dec_size)
	{
		case 1:
			CONV_SEARCH_SIZE(1);
		case 2:
			CONV_SEARCH_SIZE(2);
		case 3:
			CONV_SEARCH_SIZE(3);
		case 4:
			CONV_SEARCH_SIZE(4);
		case 5:
			CONV_SEARCH_SIZE(5);
		case 6:
			CONV_SEARCH_SIZE(6);
		case 7:
			CONV_SEARCH_SIZE(7);
		case 8:
			CONV_SEARCH_SIZE(8);
		case 18:
			CONV_SEARCH_SIZE(18);
		default:
			/* No sub-table exists for this size. */
			elog(ERROR, "unexpected decomposition size %d for code 0x%08x",
				 (int) entry->dec_size, code);
	}

	/* should not come here */
	return NULL;
}
/*
* Recursively look at the number of elements in the conversion table
* to calculate how many characters are used for the given code.
*/
static int
get_decomposed_size(uint32 code)
{
pg_utf_decomposition *entry;
int size = 0;
int i;
uint32 *decomp;
/*
* Fast path for Hangul characters not stored in tables to save memory
* as decomposition is algorithmic.
* See http://unicode.org/reports/tr15/tr15-18.html, annex 10 for details
* on the matter.
*/
if (code >= SBASE && code < SBASE + SCOUNT)
{
uint32 tindex, sindex;
sindex = code - SBASE;
tindex = sindex % TCOUNT;
if (tindex != 0)
return 3;
return 2;
}
entry = get_code_entry(code);
/*
* Just count current code if no other decompositions. A NULL entry
* is equivalent to a character with class 0 and no decompositions.
*/
if (entry == NULL || entry->dec_size == 0)
return 1;
/*
* If this entry has other decomposition codes look at them as well.
* First get its decomposition in the list of tables available.
*/
decomp = get_code_decomposition(entry);
for (i = 0; i < entry->dec_size; i++)
{
uint32 lcode = decomp[i];
size += get_decomposed_size(lcode);
}
return size;
}
/*
* Recompose a set of characters. For hangul characters, the calculation
* is algorithmic. For others, an inverse lookup at the decomposition
* table is necessary. Returns true if a recomposition can be done, and
* false otherwise.
*/
static bool
recompose_code(uint32 start, uint32 code, uint32 *result)
{
/* No need to care about ascii characters */
if (start <= 0x7f || code <= 0x7f)
return false;
/* Hangul characters go here */
if (start >= LBASE && start < LBASE + LCOUNT &&
code >= VBASE && code < VBASE + VCOUNT)
{
*result = ((start - LBASE) * VCOUNT + code - VBASE) * TCOUNT + SBASE;
return true;
}
else if (start >= SBASE && start < (SBASE + SCOUNT) &&
((start - SBASE) % TCOUNT) == 0 &&
code >= TBASE && code < (TBASE + TCOUNT))
{
*result = start + code - TBASE;
return true;
}
else
{
int i;
/*
* Do an inverse lookup of the decomposition tables to see if
* anything matches. The comparison just needs to be a perfect
* match on the sub-table of size two, because the start character
* has already been recomposed partially.
*/
for (i = 0; i < lengthof(UtfDecomp_2); i++)
{
pg_utf_decomposition_size_2 entry = UtfDecomp_2[i];
if (start == entry.decomp[0] &&
code == entry.decomp[1])
{
*result = entry.utf;
return true;
}
}
}
return false;
}
/*
* Decompose the given code into the array given by caller. The
* decomposition begins at the position given by caller, saving one
* lookup at the conversion table. The current position needs to be
* updated here to let the caller know from where to continue filling
* in the array result.
*/
static void
decompose_code(uint32 code, int **result, int *current)
{
pg_utf_decomposition *entry;
int i;
uint32 *decomp;
/*
* Fast path for Hangul characters not stored in tables to save memory
* as decomposition is algorithmic.
* See http://unicode.org/reports/tr15/tr15-18.html, annex 10 for details
* on the matter.
*/
if (code >= SBASE && code < SBASE + SCOUNT)
{
uint32 l, v, tindex, sindex;
int *res = *result;
sindex = code - SBASE;
l = LBASE + sindex / (VCOUNT * TCOUNT);
v = VBASE + (sindex % (VCOUNT * TCOUNT)) / TCOUNT;
tindex = sindex % TCOUNT;
res[*current] = l;
(*current)++;
res[*current] = v;
(*current)++;
if (tindex != 0)
{
res[*current] = TBASE + tindex;
(*current)++;
}
return;
}
entry = get_code_entry(code);
/*
* Just fill in with the current decomposition if there are no
* decomposition codes to recurse to. A NULL entry is equivalent
* to a character with class 0 and no decompositions, so just leave
* also in this case.
*/
if (entry == NULL || entry->dec_size == 0)
{
int *res = *result;
res[*current] = (int) code;
(*current)++;
return;
}
/*
* If this entry has other decomposition codes look at them as well.
*/
decomp = get_code_decomposition(entry);
for (i = 0; i < entry->dec_size; i++)
{
uint32 lcode = decomp[i];
/* Leave if no more decompositions */
decompose_code(lcode, result, current);
}
}
/*
* pg_sasl_prepare
*
* Perform SASLprepare (NKFC) on a integer array identifying individual
* multibyte UTF-8 characters.
* multibyte UTF-8 characters. This is a simple wrapper on top of
* PostgreSQL implementation.
*/
PG_FUNCTION_INFO_V1(pg_sasl_prepare);
Datum
pg_sasl_prepare(PG_FUNCTION_ARGS)
{
ArrayType *input = PG_GETARG_ARRAYTYPE_P(0);
int *input_ptr = ARRPTR(input);
ArrayType *result;
int *result_ptr;
int *decomp_ptr;
int *recomp_ptr;
int count;
int size = 0;
int decomp_size = 0;
int recomp_size = 0;
/* variables for recomposition */
int last_class;
int starter_pos;
int target_pos;
uint32 starter_ch;
/* First do the compatibility decomposition */
/*
* Look recursively at the convertion table to understand the number
* of elements that need to be created.
*/
for (count = 0; count < ARRNELEMS(input); count++)
{
uint32 code = input_ptr[count];
/*
* Recursively look at the conversion table to determine into
* how many characters the given code need to be decomposed.
*/
decomp_size += get_decomposed_size(code);
}
/*
* Now fill in each entry recursively. This needs a second pass on
* the conversion table.
*/
decomp_ptr = (int *) palloc(decomp_size * sizeof(int));
size = 0;
for (count = 0; count < ARRNELEMS(input); count++)
{
uint32 code = input_ptr[count];
decompose_code(code, &decomp_ptr, &size);
/*
* XXX: Is it necessary to reorder the combining marks here?
*/
}
/*
* Now that the decomposition is done, apply the combining class
* for each multibyte character.
*/
for (count = 1; count < decomp_size; count++)
{
uint32 prev = decomp_ptr[count - 1];
uint32 next = decomp_ptr[count];
uint32 tmp;
pg_utf_decomposition *prevEntry = get_code_entry(prev);
pg_utf_decomposition *nextEntry = get_code_entry(next);
/*
* If no entries are found, the character used is either an Hangul
* character or a character with a class of 0 and no decompositions,
* so move to next result.
*/
if (prevEntry == NULL || nextEntry == NULL)
continue;
/*
* Per Unicode (http://unicode.org/reports/tr15/tr15-18.html) annex 4,
* a sequence of two adjacent characters in a string is an exchangeable
* pair if the combining class (from the Unicode Character Database)
* for the first character is greater than the combining class for the
* second, and the second is not a starter. A character is a starter
* if its combining class is 0.
*/
if (nextEntry->class == 0x0 || prevEntry->class == 0x0)
continue;
if (prevEntry->class <= nextEntry->class)
continue;
/* exchange can happen */
tmp = decomp_ptr[count - 1];
decomp_ptr[count - 1] = decomp_ptr[count];
decomp_ptr[count] = tmp;
/* backtrack to check again */
if (count > 1)
count -= 2;
}
/*
* The last phase of NFKC is the recomposition of the multibyte string
* that has been reordered previously using combining classes. The
* recomposed string cannot be longer than the decomposed one, so
* make the allocation of the recomposed string based on that assumption.
*/
recomp_ptr = (int *) palloc(decomp_size * sizeof(int));
last_class = -1; /* this eliminates a special check */
starter_pos = 0;
target_pos = 1;
starter_ch = recomp_ptr[0] = decomp_ptr[0];
for (count = 1; count < decomp_size; count++)
{
uint32 ch = (uint32) decomp_ptr[count];
pg_utf_decomposition *ch_entry = get_code_entry(ch);
int ch_class = ch_entry == NULL ? 0 : ch_entry->class;
pg_wchar composite;
if (last_class < ch_class &&
recompose_code(starter_ch, ch, &composite))
{
recomp_ptr[starter_pos] = composite;
starter_ch = composite;
}
else if (ch_class == 0)
{
starter_pos = target_pos;
starter_ch = ch;
last_class = -1;
recomp_ptr[target_pos++] = ch;
}
else
{
last_class = ch_class;
recomp_ptr[target_pos++] = ch;
}
}
recomp_size = target_pos;
/* And finally fill-in the result */
result = new_intArrayType(recomp_size);
result_ptr = ARRPTR(result);
memcpy(result_ptr, recomp_ptr, recomp_size * sizeof(uint32));
PG_RETURN_POINTER(result);
}
/*
* utf8_to_array
* Convert a UTF-8 string into an integer array.
*/
PG_FUNCTION_INFO_V1(utf8_to_array);
Datum
utf8_to_array(PG_FUNCTION_ARGS)
{
char *input = text_to_cstring(PG_GETARG_TEXT_PP(0));
ArrayType *result;
int *result_ptr;
int size = 0;
int count;
int encoding = GetDatabaseEncoding();
const unsigned char *utf = (unsigned char *) input;
char *password = text_to_cstring(PG_GETARG_TEXT_PP(0));
char *prep_password = NULL;
 
if (encoding != PG_UTF8)
if (GetDatabaseEncoding() != PG_UTF8)
ereport(ERROR,
(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
errmsg("Database encoding is not UTF-8")));
 
/*
* Calculate the array size first by doing a first pass on the UTF-8 string
*/
while (*utf)
{
int l;
l = pg_utf_mblen(utf);
if (!pg_utf8_islegal(utf, l))
elog(ERROR, "incorrect utf-8 input");
size++;
utf += l;
}
/*
* And now fill in the array with all the data from each character by
* doing a second pass.
*/
result = new_intArrayType(size);
result_ptr = ARRPTR(result);
utf = (unsigned char *) input;
count = 0;
while (*utf)
{
uint32 iutf = 0;
int l;
l = pg_utf_mblen(utf);
/* Calculate entry for character input for conversion table lookup */
if (l == 1)
{
iutf = *utf++;
}
else if (l == 2)
{
iutf = *utf++ << 8;
iutf |= *utf++;
}
else if (l == 3)
{
iutf = *utf++ << 16;
iutf |= *utf++ << 8;
iutf |= *utf++;
}
else if (l == 4)
{
iutf = *utf++ << 24;
iutf |= *utf++ << 16;
iutf |= *utf++ << 8;
iutf |= *utf++;
}
else
elog(ERROR, "incorrect multibyte length %d", l);
/* Let's not care about any signing */
result_ptr[count++] = (int32) iutf;
}
Assert(count == ARRNELEMS(result));
PG_RETURN_POINTER(result);
}
/*
 * array_to_utf8
 * Convert an integer array into a UTF-8 string.  Each element contributes
 * its non-zero bytes to the result, from the most significant one to the
 * least significant one.
 */
PG_FUNCTION_INFO_V1(array_to_utf8);
Datum
array_to_utf8(PG_FUNCTION_ARGS)
{
	ArrayType  *input = PG_GETARG_ARRAYTYPE_P(0);
	int		   *codes = ARRPTR(input);
	int			ncodes = ARRNELEMS(input);
	char	   *buf;
	int			nbytes = 0;
	int			pos = 0;
	int			shift;
	int			i;

	/*
	 * A first pass on the array elements gives the number of bytes of the
	 * string to return.
	 */
	for (i = 0; i < ncodes; i++)
	{
		uint32		code = codes[i];

		for (shift = 24; shift >= 0; shift -= 8)
		{
			if ((code >> shift) & 0xff)
				nbytes++;
		}
	}

	/* The second pass copies the bytes, keeping room for the terminator */
	buf = palloc0(nbytes + 1);
	for (i = 0; i < ncodes; i++)
	{
		uint32		code = codes[i];

		for (shift = 24; shift >= 0; shift -= 8)
		{
			if ((code >> shift) & 0xff)
				buf[pos++] = code >> shift;
		}
	}
	buf[pos] = '\0';

	Assert(pos == nbytes);
	PG_RETURN_TEXT_P(cstring_to_text(buf));
}
/*
* utf8_conv_table
* Return a full copy of the UTF-8 conversion table.
*/
PG_FUNCTION_INFO_V1(utf8_conv_table);
Datum
utf8_conv_table(PG_FUNCTION_ARGS)
{
TupleDesc tupdesc;
Tuplestorestate *tupstore;
ReturnSetInfo *rsinfo = (ReturnSetInfo *) fcinfo->resultinfo;
MemoryContext per_query_ctx;
MemoryContext oldcontext;
int i;
/* check to see if caller supports us returning a tuplestore */
if (rsinfo == NULL || !IsA(rsinfo, ReturnSetInfo))
if (pg_saslprep(password, &prep_password) != SASLPREP_SUCCESS)
ereport(ERROR,
(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
errmsg("set-valued function called in context that cannot accept a set")));
if (!(rsinfo->allowedModes & SFRM_Materialize))
ereport(ERROR,
(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
errmsg("materialize mode required, but it is not " \
"allowed in this context")));
per_query_ctx = rsinfo->econtext->ecxt_per_query_memory;
oldcontext = MemoryContextSwitchTo(per_query_ctx);
/* Build tuple descriptor */
if (get_call_result_type(fcinfo, NULL, &tupdesc) != TYPEFUNC_COMPOSITE)
elog(ERROR, "return type must be a row type");
tupstore = tuplestore_begin_heap(true, false, work_mem);
rsinfo->returnMode = SFRM_Materialize;
rsinfo->setResult = tupstore;
rsinfo->setDesc = tupdesc;
MemoryContextSwitchTo(oldcontext);
/* Print out all the values on the table */
for (i = 0; i < lengthof(SASLPrepConv); i++)
{
Datum values[3];
bool nulls[3];
pg_utf_decomposition entry = SASLPrepConv[i];
int count;
ArrayType *decomp = NULL;
int *decomp_ptr = NULL;
MemSet(values, 0, sizeof(values));
MemSet(nulls, 0, sizeof(nulls));
/* Fill in values, code first */
values[0] = Int32GetDatum(entry.utf);
/* class */
values[1] = Int16GetDatum((int16) entry.class);
/* decomposition array */
if (entry.dec_size == 0)
nulls[2] = true;
else
{
uint32 *entry_decomp;
/* Get decomposition of entry */
entry_decomp = get_code_decomposition(&entry);
decomp = new_intArrayType(entry.dec_size);
decomp_ptr = ARRPTR(decomp);
for (count = 0; count < entry.dec_size; count++)
decomp_ptr[count] = (int) entry_decomp[count];
values[2] = PointerGetDatum(decomp);
}
/* Save tuple values */
tuplestore_putvalues(tupstore, tupdesc, values, nulls);
if (decomp != NULL)
pfree(decomp);
}
/* clean up and return the tuplestore */
tuplestore_donestoring(tupstore);
(errcode(ERRCODE_INTERNAL_ERROR),
errmsg("Error while processing SASLprep")));
 
return (Datum) 0;
PG_RETURN_TEXT_P(cstring_to_text(prep_password));
}
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment