Skip to content
Snippets Groups Projects

Initial implementation of an elasticsearch indexer in Go

Merged Nick Thomas requested to merge 1-initial-implementation into master
All threads resolved!
56 files
+ 858
161489
Compare changes
  • Side-by-side
  • Inline
Files
56
+ 15
31
package indexer
import (
"fmt"
"github.com/goodsign/icu"
)
"github.com/saintfish/chardet"
"golang.org/x/text/encoding/ianaindex"
// Package-level charset detection and conversion state.
// NOTE(review): detector is declared twice in this view (as an
// *icu.CharsetDetector inside the var block and as a chardet text detector
// right after it). This looks like both sides of a diff rendered together —
// confirm which declaration the real file keeps.
var (
// detector is assigned in init().
detector *icu.CharsetDetector
// converter is constructed with maxBlobSize, which is defined outside this view.
converter = icu.NewCharsetConverter(maxBlobSize)
)
var detector = chardet.NewTextDetector()
// init builds the package-level ICU charset detector when the package is
// loaded. It panics if the detector cannot be constructed, since the code
// below has no way to detect encodings without it.
func init() {
	d, err := icu.NewCharsetDetector()
	if err != nil {
		panic(err)
	}
	detector = d
}
func tryEncodeString(s string) string {
encoded, err := encodeString(s)
@@ -34,39 +42,15 @@ func encodeString(s string) (string, error) {
// encodeBytes converts a byte slice from a detected source encoding to UTF-8
// and returns the result as a string, or an error if detection or conversion
// fails.
//
// NOTE(review): this span appears to be a rendered diff with removed and added
// lines interleaved and no +/- markers, so it does not read as one coherent
// function body. Judging by the hunk header (39 old lines, 15 new lines), the
// chardet/ianaindex path (DetectBest, the charset switch, the ianaindex lookup
// and encoding.NewDecoder) is presumably the removed side, and the icu path
// (GuessCharset, ConvertToUtf8) the added side — confirm against the real file.
func encodeBytes(b []byte) (string, error) {
best, err := detector.DetectBest(b)
matches, err := detector.GuessCharset(b)
if err != nil {
return "", err
}
charset := best.Charset
// chardet has some incompatibilities with ianaindex
switch charset {
case "UTF-8":
// Input is reported as UTF-8 already, so it is returned unchanged.
return string(b), nil
case "GB-18030":
charset = "GB18030"
case "IBM420_ltr", "IBM420_rtl":
charset = "cp420"
case "IBM424_ltr", "IBM424_rtl":
charset = "IBM424"
}
encoding, err := ianaindex.IANA.Encoding(charset)
if err != nil {
return "", fmt.Errorf("Encoding %q: %v", charset, err)
}
// TODO(nick): Does this actually mean 'nothing to do'?
if encoding == nil {
return string(b), nil
}
decoded, err := encoding.NewDecoder().Bytes(b)
// NOTE(review): matches[0] is indexed without a visible length check; confirm
// GuessCharset cannot return an empty slice together with a nil error.
utf8, err := converter.ConvertToUtf8(b, matches[0].Charset)
if err != nil {
return "", err
}
return string(decoded), nil
return string(utf8), nil
}
Loading