Skip to content
Snippets Groups Projects
Commit 3fb9011c authored by Nick Thomas's avatar Nick Thomas
Browse files

Fork icu to fix error handling

parent 79e50fee
No related branches found
No related tags found
1 merge request: !1 Initial implementation of an elasticsearch indexer in Go
Pipeline #
package indexer
 
import (
"github.com/goodsign/icu"
"fmt"
"log"
"github.com/lupine/icu"
)
 
var (
Loading
Loading
@@ -20,6 +23,7 @@ func init() {
func tryEncodeString(s string) string {
encoded, err := encodeString(s)
if err != nil {
log.Println(err)
return s // TODO: Run it through the UTF-8 replacement encoder
}
 
Loading
Loading
@@ -29,6 +33,7 @@ func tryEncodeString(s string) string {
func tryEncodeBytes(b []byte) string {
encoded, err := encodeBytes(b)
if err != nil {
log.Println(err)
s := string(b)
return s // TODO: Run it through the UTF-8 replacement encoder
}
Loading
Loading
@@ -42,15 +47,22 @@ func encodeString(s string) (string, error) {
 
// encodeBytes converts a byte slice from an arbitrary encoding to UTF-8
func encodeBytes(b []byte) (string, error) {
if len(b) == 0 {
return "", nil
}
matches, err := detector.GuessCharset(b)
if err != nil {
return "", err
return "", fmt.Errorf("Couldn't guess charset: %s", err)
}
 
utf8, err := converter.ConvertToUtf8(b, matches[0].Charset)
if err != nil {
return "", err
// Try encoding for each match, returning the first that succeeds
for _, match := range matches {
utf8, err := converter.ConvertToUtf8(b, match.Charset)
if err == nil {
return string(utf8), nil
}
}
 
return string(utf8), nil
return "", fmt.Errorf("Failed to convert from %s to UTF-8", matches[0].Charset)
}
Loading
Loading
@@ -6,16 +6,16 @@
#include <unicode/ucnv.h>
 
// See description in c_bridge.h
const int detectCharset(void *detector,
void *input,
int input_len,
int *status,
MatchData *matchBuffer,
const int detectCharset(void *detector,
void *input,
int input_len,
int *status,
MatchData *matchBuffer,
int matchBufferSize) {
 
// Put input bytes in the detector.
ucsdet_setText((UCharsetDetector*)detector, (char*)input, input_len, status);
if (*status != U_ZERO_ERROR) {
if U_FAILURE(*status) {
return 0;
}
 
Loading
Loading
@@ -25,7 +25,7 @@ const int detectCharset(void *detector,
 
// Perform analysis and return all guesses and their count.
bestGuesses = ucsdet_detectAll((UCharsetDetector*)detector, &matchCount, status);
if (*status != U_ZERO_ERROR) {
if U_FAILURE(*status) {
return 0;
}
 
Loading
Loading
@@ -42,19 +42,19 @@ const int detectCharset(void *detector,
 
// Fill guessed encoding
bestGuessedCharset = ucsdet_getName(bestGuess, status);
if (*status != U_ZERO_ERROR) {
if U_FAILURE(*status) {
return 0;
}
 
// Fill guessed language
bestGuessedLanguage = ucsdet_getLanguage(bestGuess, status);
if (*status != U_ZERO_ERROR) {
if U_FAILURE(*status) {
return 0;
}
 
// Fill its confidence rating
int32_t conf = ucsdet_getConfidence(bestGuess, status);
if (*status != U_ZERO_ERROR) {
if U_FAILURE(*status) {
return 0;
}
 
Loading
Loading
@@ -69,7 +69,7 @@ const int detectCharset(void *detector,
 
// See description in c_bridge.h
int convertToUtf16(const char *srcEncoding,
UChar *dest,
UChar *dest,
int32_t destCapacity,
const char *src,
int32_t srcLength,
Loading
Loading
@@ -77,13 +77,13 @@ int convertToUtf16(const char *srcEncoding,
UConverter *conv;
 
conv = ucnv_open(srcEncoding, status);
if (*status != U_ZERO_ERROR) {
if U_FAILURE(*status) {
return 0;
}
 
/* Convert from original encoding to UTF-16 */
int len = ucnv_toUChars(conv, dest, destCapacity, src, srcLength, status);
if (*status != U_ZERO_ERROR) {
if U_FAILURE(*status) {
return 0;
}
 
Loading
Loading
@@ -94,7 +94,7 @@ int convertToUtf16(const char *srcEncoding,
 
// See description in c_bridge.h
int convertFromUtf16(const char *destEncoding,
char *dest,
char *dest,
int32_t destCapacity,
const UChar *src,
int32_t srcLength,
Loading
Loading
@@ -102,17 +102,17 @@ int convertFromUtf16(const char *destEncoding,
UConverter *conv;
 
conv = ucnv_open(destEncoding, status);
if (*status != U_ZERO_ERROR) {
if U_FAILURE(*status) {
return 0;
}
 
/* Convert from UTF-16 to destination encoding */
int len = ucnv_fromUChars(conv, dest, destCapacity, src, srcLength, status);
if (*status != U_ZERO_ERROR) {
if U_FAILURE(*status) {
return 0;
}
 
ucnv_close(conv);
 
return len;
}
\ No newline at end of file
}
Loading
Loading
@@ -14,7 +14,7 @@ const (
DefaultMaxTextSize = 1024 * 1024 // Default value for the max text length in conversion operations
utf8MaxCharSize = 4
utf16MaxCharSize = 4
)
)
 
var (
Utf8CString = C.CString("UTF-8")
Loading
Loading
@@ -35,7 +35,7 @@ type CharsetConverter struct {
// are created in memory once and then used. 'maxTextSize' sets the size of these buffers.
// ICU library would return error if any processed text is longer than this parameter.
//
// NOTE:
// NOTE:
//
// UTF8 uses 1 to 4 bytes for each symbol.
// UTF16 uses 2 bytes to 4 bytes for each symbol.
Loading
Loading
@@ -79,7 +79,7 @@ func (conv *CharsetConverter) ConvertToUtf8(input []byte, srcEncoding string) ([
C.int32_t(len(input)),
(*C.int)(unsafe.Pointer(&status)))
 
if status == U_ZERO_ERROR {
if isSuccess(status) {
nConvLen := C.convertFromUtf16(
Utf8CString,
(*C.char)(unsafe.Pointer(&conv.utf8Buffer[0])),
Loading
Loading
@@ -88,7 +88,7 @@ func (conv *CharsetConverter) ConvertToUtf8(input []byte, srcEncoding string) ([
C.int32_t(convLen),
(*C.int)(unsafe.Pointer(&status)))
 
if status == U_ZERO_ERROR {
if isSuccess(status) {
resStr := conv.utf8Buffer[:nConvLen]
return ([]byte)(resStr), nil
}
Loading
Loading
Loading
Loading
@@ -11,9 +11,21 @@ import (
)
 
const (
U_ZERO_ERROR = 0 // ICU common constant error code which means that no error occured
MatchDataBufferSize = 25 // Size of the buffer for detection results (Max count of returned guesses per detect call)
)
U_ZERO_ERROR = 0 // ICU common constant error code which means that no error occurred
U_ERROR_LIMIT = 0x7FFFFFFF // Dirty hack, negative error codes are being turned into large positive ints
MatchDataBufferSize = 25 // Size of the buffer for detection results (Max count of returned guesses per detect call)
)
// isSuccess mirrors ICU's U_SUCCESS macro for a status code held in a Go int.
// Zero and negative codes (warnings) count as success. Values at or above
// U_ERROR_LIMIT are accepted as well: per the note on that constant, negative
// codes can surface as large positive ints, so they are treated as warnings.
func isSuccess(status int) bool {
	// Only the open interval (U_ZERO_ERROR, U_ERROR_LIMIT) denotes a real error.
	if status > U_ZERO_ERROR && status < U_ERROR_LIMIT {
		return false
	}
	return true
}
// isFailure mirrors ICU's U_FAILURE macro for a status code held in a Go int.
// It reports true only for codes strictly between U_ZERO_ERROR and
// U_ERROR_LIMIT; zero, negative codes, and values at or above U_ERROR_LIMIT
// are not failures.
func isFailure(status int) bool {
	if status <= U_ZERO_ERROR {
		return false
	}
	if status >= U_ERROR_LIMIT {
		return false
	}
	return true
}
 
// CharsetDetector provides ICU charset detection functionality.
type CharsetDetector struct {
Loading
Loading
@@ -39,7 +51,7 @@ func NewCharsetDetector() (*CharsetDetector, error) {
 
det.ptr = C.ucsdet_open((*C.UErrorCode)(statusPtr))
 
if status != U_ZERO_ERROR {
if isFailure(status) {
return nil, fmt.Errorf("ICU Error code returned: %d", status)
}
 
Loading
Loading
@@ -63,14 +75,14 @@ func (det *CharsetDetector) GuessCharset(input []byte) (matches []Match, err err
// Perform detection. Guess count is the number of matches returned.
// The matches themselves are put in the result buffer
guessCount := C.detectCharset(
unsafe.Pointer(det.ptr),
unsafe.Pointer(&input[0]),
C.int(inputLen),
(*C.int)(unsafe.Pointer(&status)),
unsafe.Pointer(det.ptr),
unsafe.Pointer(&input[0]),
C.int(inputLen),
(*C.int)(unsafe.Pointer(&status)),
(*C.MatchData)(unsafe.Pointer(&det.resBuffer[0])),
C.int(MatchDataBufferSize))
 
if status == U_ZERO_ERROR {
if isSuccess(status) {
// Convert the returned number of entries from result buffer to a slice
// that will be returned
count := int(guessCount)
Loading
Loading
Loading
Loading
@@ -69,12 +69,6 @@
"revision": "2b26ad567f305510849d93c5d2025a8b561f2367",
"revisionTime": "2017-03-15T18:41:46Z"
},
{
"checksumSHA1": "dV9cqzwPhmplniCi4zr97Vo7hbo=",
"path": "github.com/goodsign/icu",
"revision": "029aa0206e822780c92c8da74c2a32bbe1682a0e",
"revisionTime": "2012-12-27T17:27:29Z"
},
{
"checksumSHA1": "0ZrwvB6KoGPj2PoDNSEJwxQ6Mog=",
"origin": "github.com/aws/aws-sdk-go/vendor/github.com/jmespath/go-jmespath",
Loading
Loading
@@ -82,6 +76,12 @@
"revision": "2b26ad567f305510849d93c5d2025a8b561f2367",
"revisionTime": "2017-03-15T18:41:46Z"
},
{
"checksumSHA1": "hcIgD9IEGtHu8o4NibzTl4qMtMU=",
"path": "github.com/lupine/icu",
"revision": "03c771153cfff2627e0e7c8326f6b3c6891b6acf",
"revisionTime": "2017-03-29T16:09:13Z"
},
{
"checksumSHA1": "iWCtyR1TkJ22Bi/ygzfKDvOQdQY=",
"origin": "gopkg.in/src-d/go-git.v4/vendor/github.com/sergi/go-diff/diffmatchpatch",
Loading
Loading
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment