Skip to content
Snippets Groups Projects
Commit f04f4f3e authored by Nick Thomas's avatar Nick Thomas
Browse files

Initial encode-to-UTF8 support

parent 9a572119
No related branches found
No related tags found
1 merge request!1Initial implementation of an elasticsearch indexer in Go
Pipeline #
Showing
with 10110 additions and 7 deletions
Loading
Loading
@@ -84,7 +84,8 @@ func (i *Indexer) BuildBlob(file *object.File, commitSHA string) (*Blob, error)
return nil, skipBinaryBlob
}
 
content := string(b)
content := tryEncodeBytes(b)
filename := tryEncodeString(file.Name)
 
return &Blob{
Type: "blob",
Loading
Loading
@@ -93,9 +94,9 @@ func (i *Indexer) BuildBlob(file *object.File, commitSHA string) (*Blob, error)
RepoID: parentID,
CommitSHA: commitSHA,
Content: content,
Path: file.Name,
Filename: file.Name,
Language: DetectLanguage(file.Name, b),
Path: filename,
Filename: filename,
Language: DetectLanguage(filename, b),
}, nil
}
 
Loading
Loading
Loading
Loading
@@ -30,7 +30,7 @@ func (i *Indexer) BuildCommit(c *object.Commit) *Commit {
Committer: BuildPerson(c.Committer),
ID: GenerateCommitID(parentID, sha),
RepoID: parentID,
Message: c.Message,
Message: tryEncodeString(c.Message),
SHA: sha,
}
}
package indexer
import (
"fmt"
"github.com/saintfish/chardet"
"golang.org/x/text/encoding/ianaindex"
)
var detector = chardet.NewTextDetector()
func tryEncodeString(s string) string {
encoded, err := encodeString(s)
if err != nil {
return s // TODO: Run it through the UTF-8 replacement encoder
}
return encoded
}
func tryEncodeBytes(b []byte) string {
encoded, err := encodeBytes(b)
if err != nil {
s := string(b)
return s // TODO: Run it through the UTF-8 replacement encoder
}
return encoded
}
func encodeString(s string) (string, error) {
return encodeBytes([]byte(s))
}
// encodeString converts a string from an arbitrary encoding to UTF-8
func encodeBytes(b []byte) (string, error) {
best, err := detector.DetectBest(b)
if err != nil {
return "", err
}
charset := best.Charset
// chardet has some incompatibilities with ianaindex
switch charset {
case "UTF-8":
return string(b), nil
case "GB-18030":
charset = "GB18030"
case "IBM420_ltr", "IBM420_rtl":
charset = "cp420"
case "IBM424_ltr", "IBM424_rtl":
charset = "IBM424"
}
encoding, err := ianaindex.IANA.Encoding(charset)
if err != nil {
return "", fmt.Errorf("Encoding %q: %v", charset, err)
}
// TODO(nick): Does this actually mean 'nothing to do'?
if encoding == nil {
return string(b), nil
}
decoded, err := encoding.NewDecoder().Bytes(b)
if err != nil {
return "", err
}
return string(decoded), nil
}
Loading
Loading
@@ -22,8 +22,8 @@ func GenerateDate(t time.Time) string {
 
func BuildPerson(p object.Signature) *Person {
return &Person{
Name: p.Name,
Email: p.Email,
Name: tryEncodeString(p.Name),
Email: tryEncodeString(p.Email),
Time: GenerateDate(p.When),
}
}
package chardet
import (
"bytes"
)
type recognizer2022 struct {
charset string
escapes [][]byte
}
func (r *recognizer2022) Match(input *recognizerInput) (output recognizerOutput) {
return recognizerOutput{
Charset: r.charset,
Confidence: r.matchConfidence(input.input),
}
}
func (r *recognizer2022) matchConfidence(input []byte) int {
var hits, misses, shifts int
input:
for i := 0; i < len(input); i++ {
c := input[i]
if c == 0x1B {
for _, esc := range r.escapes {
if bytes.HasPrefix(input[i+1:], esc) {
hits++
i += len(esc)
continue input
}
}
misses++
} else if c == 0x0E || c == 0x0F {
shifts++
}
}
if hits == 0 {
return 0
}
quality := (100*hits - 100*misses) / (hits + misses)
if hits+shifts < 5 {
quality -= (5 - (hits + shifts)) * 10
}
if quality < 0 {
quality = 0
}
return quality
}
var escapeSequences_2022JP = [][]byte{
{0x24, 0x28, 0x43}, // KS X 1001:1992
{0x24, 0x28, 0x44}, // JIS X 212-1990
{0x24, 0x40}, // JIS C 6226-1978
{0x24, 0x41}, // GB 2312-80
{0x24, 0x42}, // JIS X 208-1983
{0x26, 0x40}, // JIS X 208 1990, 1997
{0x28, 0x42}, // ASCII
{0x28, 0x48}, // JIS-Roman
{0x28, 0x49}, // Half-width katakana
{0x28, 0x4a}, // JIS-Roman
{0x2e, 0x41}, // ISO 8859-1
{0x2e, 0x46}, // ISO 8859-7
}
var escapeSequences_2022KR = [][]byte{
{0x24, 0x29, 0x43},
}
var escapeSequences_2022CN = [][]byte{
{0x24, 0x29, 0x41}, // GB 2312-80
{0x24, 0x29, 0x47}, // CNS 11643-1992 Plane 1
{0x24, 0x2A, 0x48}, // CNS 11643-1992 Plane 2
{0x24, 0x29, 0x45}, // ISO-IR-165
{0x24, 0x2B, 0x49}, // CNS 11643-1992 Plane 3
{0x24, 0x2B, 0x4A}, // CNS 11643-1992 Plane 4
{0x24, 0x2B, 0x4B}, // CNS 11643-1992 Plane 5
{0x24, 0x2B, 0x4C}, // CNS 11643-1992 Plane 6
{0x24, 0x2B, 0x4D}, // CNS 11643-1992 Plane 7
{0x4e}, // SS2
{0x4f}, // SS3
}
func newRecognizer_2022JP() *recognizer2022 {
return &recognizer2022{
"ISO-2022-JP",
escapeSequences_2022JP,
}
}
func newRecognizer_2022KR() *recognizer2022 {
return &recognizer2022{
"ISO-2022-KR",
escapeSequences_2022KR,
}
}
func newRecognizer_2022CN() *recognizer2022 {
return &recognizer2022{
"ISO-2022-CN",
escapeSequences_2022CN,
}
}
Sheng Yu (yusheng dot sjtu at gmail dot com)
Copyright (c) 2012 chardet Authors
Permission is hereby granted, free of charge, to any person obtaining a copy of
this software and associated documentation files (the "Software"), to deal in
the Software without restriction, including without limitation the rights to
use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
of the Software, and to permit persons to whom the Software is furnished to do
so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
Partial of the Software is derived from ICU project. See icu-license.html for
license of the derivative portions.
# chardet
chardet is library to automatically detect
[charset](http://en.wikipedia.org/wiki/Character_encoding) of texts for [Go
programming language](http://golang.org/). It's based on the algorithm and data
in [ICU](http://icu-project.org/)'s implementation.
## Documentation and Usage
See [pkgdoc](http://go.pkgdoc.org/github.com/saintfish/chardet)
// Package chardet ports character set detection from ICU.
package chardet
import (
"errors"
"sort"
)
// Result contains all the information that charset detector gives.
type Result struct {
// IANA name of the detected charset.
Charset string
// IANA name of the detected language. It may be empty for some charsets.
Language string
// Confidence of the Result. Scale from 1 to 100. The bigger, the more confident.
Confidence int
}
// Detector implements charset detection.
type Detector struct {
recognizers []recognizer
stripTag bool
}
// List of charset recognizers
var recognizers = []recognizer{
newRecognizer_utf8(),
newRecognizer_utf16be(),
newRecognizer_utf16le(),
newRecognizer_utf32be(),
newRecognizer_utf32le(),
newRecognizer_8859_1_en(),
newRecognizer_8859_1_da(),
newRecognizer_8859_1_de(),
newRecognizer_8859_1_es(),
newRecognizer_8859_1_fr(),
newRecognizer_8859_1_it(),
newRecognizer_8859_1_nl(),
newRecognizer_8859_1_no(),
newRecognizer_8859_1_pt(),
newRecognizer_8859_1_sv(),
newRecognizer_8859_2_cs(),
newRecognizer_8859_2_hu(),
newRecognizer_8859_2_pl(),
newRecognizer_8859_2_ro(),
newRecognizer_8859_5_ru(),
newRecognizer_8859_6_ar(),
newRecognizer_8859_7_el(),
newRecognizer_8859_8_I_he(),
newRecognizer_8859_8_he(),
newRecognizer_windows_1251(),
newRecognizer_windows_1256(),
newRecognizer_KOI8_R(),
newRecognizer_8859_9_tr(),
newRecognizer_sjis(),
newRecognizer_gb_18030(),
newRecognizer_euc_jp(),
newRecognizer_euc_kr(),
newRecognizer_big5(),
newRecognizer_2022JP(),
newRecognizer_2022KR(),
newRecognizer_2022CN(),
newRecognizer_IBM424_he_rtl(),
newRecognizer_IBM424_he_ltr(),
newRecognizer_IBM420_ar_rtl(),
newRecognizer_IBM420_ar_ltr(),
}
// NewTextDetector creates a Detector for plain text.
func NewTextDetector() *Detector {
return &Detector{recognizers, false}
}
// NewHtmlDetector creates a Detector for Html.
func NewHtmlDetector() *Detector {
return &Detector{recognizers, true}
}
var (
NotDetectedError = errors.New("Charset not detected.")
)
// DetectBest returns the Result with highest Confidence.
func (d *Detector) DetectBest(b []byte) (r *Result, err error) {
var all []Result
if all, err = d.DetectAll(b); err == nil {
r = &all[0]
}
return
}
// DetectAll returns all Results which have non-zero Confidence. The Results are sorted by Confidence in descending order.
func (d *Detector) DetectAll(b []byte) ([]Result, error) {
input := newRecognizerInput(b, d.stripTag)
outputChan := make(chan recognizerOutput)
for _, r := range d.recognizers {
go matchHelper(r, input, outputChan)
}
outputs := make([]recognizerOutput, 0, len(d.recognizers))
for i := 0; i < len(d.recognizers); i++ {
o := <-outputChan
if o.Confidence > 0 {
outputs = append(outputs, o)
}
}
if len(outputs) == 0 {
return nil, NotDetectedError
}
sort.Sort(recognizerOutputs(outputs))
dedupOutputs := make([]Result, 0, len(outputs))
foundCharsets := make(map[string]struct{}, len(outputs))
for _, o := range outputs {
if _, found := foundCharsets[o.Charset]; !found {
dedupOutputs = append(dedupOutputs, Result(o))
foundCharsets[o.Charset] = struct{}{}
}
}
if len(dedupOutputs) == 0 {
return nil, NotDetectedError
}
return dedupOutputs, nil
}
func matchHelper(r recognizer, input *recognizerInput, outputChan chan<- recognizerOutput) {
outputChan <- r.Match(input)
}
type recognizerOutputs []recognizerOutput
func (r recognizerOutputs) Len() int { return len(r) }
func (r recognizerOutputs) Less(i, j int) bool { return r[i].Confidence > r[j].Confidence }
func (r recognizerOutputs) Swap(i, j int) { r[i], r[j] = r[j], r[i] }
<html>
<head>
<meta http-equiv="Content-Type" content="text/html; charset=us-ascii"></meta>
<title>ICU License - ICU 1.8.1 and later</title>
</head>
<body BGCOLOR="#ffffff">
<h2>ICU License - ICU 1.8.1 and later</h2>
<p>COPYRIGHT AND PERMISSION NOTICE</p>
<p>
Copyright (c) 1995-2012 International Business Machines Corporation and others
</p>
<p>
All rights reserved.
</p>
<p>
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"),
to deal in the Software without restriction, including without limitation
the rights to use, copy, modify, merge, publish, distribute, and/or sell
copies of the Software, and to permit persons
to whom the Software is furnished to do so, provided that the above
copyright notice(s) and this permission notice appear in all copies
of the Software and that both the above copyright notice(s) and this
permission notice appear in supporting documentation.
</p>
<p>
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED,
INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A
PARTICULAR PURPOSE AND NONINFRINGEMENT OF THIRD PARTY RIGHTS. IN NO EVENT SHALL
THE COPYRIGHT HOLDER OR HOLDERS INCLUDED IN THIS NOTICE BE LIABLE FOR ANY CLAIM,
OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER
RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT,
NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE
USE OR PERFORMANCE OF THIS SOFTWARE.
</p>
<p>
Except as contained in this notice, the name of a copyright holder shall not be
used in advertising or otherwise to promote the sale, use or other dealings in
this Software without prior written authorization of the copyright holder.
</p>
<hr>
<p><small>
All trademarks and registered trademarks mentioned herein are the property of their respective owners.
</small></p>
</body>
</html>
package chardet
import (
"errors"
"math"
)
type recognizerMultiByte struct {
charset string
language string
decoder charDecoder
commonChars []uint16
}
type charDecoder interface {
DecodeOneChar([]byte) (c uint16, remain []byte, err error)
}
func (r *recognizerMultiByte) Match(input *recognizerInput) (output recognizerOutput) {
return recognizerOutput{
Charset: r.charset,
Language: r.language,
Confidence: r.matchConfidence(input),
}
}
func (r *recognizerMultiByte) matchConfidence(input *recognizerInput) int {
raw := input.raw
var c uint16
var err error
var totalCharCount, badCharCount, singleByteCharCount, doubleByteCharCount, commonCharCount int
for c, raw, err = r.decoder.DecodeOneChar(raw); len(raw) > 0; c, raw, err = r.decoder.DecodeOneChar(raw) {
totalCharCount++
if err != nil {
badCharCount++
} else if c <= 0xFF {
singleByteCharCount++
} else {
doubleByteCharCount++
if r.commonChars != nil && binarySearch(r.commonChars, c) {
commonCharCount++
}
}
if badCharCount >= 2 && badCharCount*5 >= doubleByteCharCount {
return 0
}
}
if doubleByteCharCount <= 10 && badCharCount == 0 {
if doubleByteCharCount == 0 && totalCharCount < 10 {
return 0
} else {
return 10
}
}
if doubleByteCharCount < 20*badCharCount {
return 0
}
if r.commonChars == nil {
confidence := 30 + doubleByteCharCount - 20*badCharCount
if confidence > 100 {
confidence = 100
}
return confidence
}
maxVal := math.Log(float64(doubleByteCharCount) / 4)
scaleFactor := 90 / maxVal
confidence := int(math.Log(float64(commonCharCount)+1)*scaleFactor + 10)
if confidence > 100 {
confidence = 100
}
if confidence < 0 {
confidence = 0
}
return confidence
}
func binarySearch(l []uint16, c uint16) bool {
start := 0
end := len(l) - 1
for start <= end {
mid := (start + end) / 2
if c == l[mid] {
return true
} else if c < l[mid] {
end = mid - 1
} else {
start = mid + 1
}
}
return false
}
var eobError = errors.New("End of input buffer")
var badCharError = errors.New("Decode a bad char")
type charDecoder_sjis struct {
}
func (charDecoder_sjis) DecodeOneChar(input []byte) (c uint16, remain []byte, err error) {
if len(input) == 0 {
return 0, nil, eobError
}
first := input[0]
c = uint16(first)
remain = input[1:]
if first <= 0x7F || (first > 0xA0 && first <= 0xDF) {
return
}
if len(remain) == 0 {
return c, remain, badCharError
}
second := remain[0]
remain = remain[1:]
c = c<<8 | uint16(second)
if (second >= 0x40 && second <= 0x7F) || (second >= 0x80 && second <= 0xFE) {
} else {
err = badCharError
}
return
}
var commonChars_sjis = []uint16{
0x8140, 0x8141, 0x8142, 0x8145, 0x815b, 0x8169, 0x816a, 0x8175, 0x8176, 0x82a0,
0x82a2, 0x82a4, 0x82a9, 0x82aa, 0x82ab, 0x82ad, 0x82af, 0x82b1, 0x82b3, 0x82b5,
0x82b7, 0x82bd, 0x82be, 0x82c1, 0x82c4, 0x82c5, 0x82c6, 0x82c8, 0x82c9, 0x82cc,
0x82cd, 0x82dc, 0x82e0, 0x82e7, 0x82e8, 0x82e9, 0x82ea, 0x82f0, 0x82f1, 0x8341,
0x8343, 0x834e, 0x834f, 0x8358, 0x835e, 0x8362, 0x8367, 0x8375, 0x8376, 0x8389,
0x838a, 0x838b, 0x838d, 0x8393, 0x8e96, 0x93fa, 0x95aa,
}
func newRecognizer_sjis() *recognizerMultiByte {
return &recognizerMultiByte{
"Shift_JIS",
"ja",
charDecoder_sjis{},
commonChars_sjis,
}
}
type charDecoder_euc struct {
}
func (charDecoder_euc) DecodeOneChar(input []byte) (c uint16, remain []byte, err error) {
if len(input) == 0 {
return 0, nil, eobError
}
first := input[0]
remain = input[1:]
c = uint16(first)
if first <= 0x8D {
return uint16(first), remain, nil
}
if len(remain) == 0 {
return 0, nil, eobError
}
second := remain[0]
remain = remain[1:]
c = c<<8 | uint16(second)
if first >= 0xA1 && first <= 0xFE {
if second < 0xA1 {
err = badCharError
}
return
}
if first == 0x8E {
if second < 0xA1 {
err = badCharError
}
return
}
if first == 0x8F {
if len(remain) == 0 {
return 0, nil, eobError
}
third := remain[0]
remain = remain[1:]
c = c<<0 | uint16(third)
if third < 0xa1 {
err = badCharError
}
}
return
}
var commonChars_euc_jp = []uint16{
0xa1a1, 0xa1a2, 0xa1a3, 0xa1a6, 0xa1bc, 0xa1ca, 0xa1cb, 0xa1d6, 0xa1d7, 0xa4a2,
0xa4a4, 0xa4a6, 0xa4a8, 0xa4aa, 0xa4ab, 0xa4ac, 0xa4ad, 0xa4af, 0xa4b1, 0xa4b3,
0xa4b5, 0xa4b7, 0xa4b9, 0xa4bb, 0xa4bd, 0xa4bf, 0xa4c0, 0xa4c1, 0xa4c3, 0xa4c4,
0xa4c6, 0xa4c7, 0xa4c8, 0xa4c9, 0xa4ca, 0xa4cb, 0xa4ce, 0xa4cf, 0xa4d0, 0xa4de,
0xa4df, 0xa4e1, 0xa4e2, 0xa4e4, 0xa4e8, 0xa4e9, 0xa4ea, 0xa4eb, 0xa4ec, 0xa4ef,
0xa4f2, 0xa4f3, 0xa5a2, 0xa5a3, 0xa5a4, 0xa5a6, 0xa5a7, 0xa5aa, 0xa5ad, 0xa5af,
0xa5b0, 0xa5b3, 0xa5b5, 0xa5b7, 0xa5b8, 0xa5b9, 0xa5bf, 0xa5c3, 0xa5c6, 0xa5c7,
0xa5c8, 0xa5c9, 0xa5cb, 0xa5d0, 0xa5d5, 0xa5d6, 0xa5d7, 0xa5de, 0xa5e0, 0xa5e1,
0xa5e5, 0xa5e9, 0xa5ea, 0xa5eb, 0xa5ec, 0xa5ed, 0xa5f3, 0xb8a9, 0xb9d4, 0xbaee,
0xbbc8, 0xbef0, 0xbfb7, 0xc4ea, 0xc6fc, 0xc7bd, 0xcab8, 0xcaf3, 0xcbdc, 0xcdd1,
}
var commonChars_euc_kr = []uint16{
0xb0a1, 0xb0b3, 0xb0c5, 0xb0cd, 0xb0d4, 0xb0e6, 0xb0ed, 0xb0f8, 0xb0fa, 0xb0fc,
0xb1b8, 0xb1b9, 0xb1c7, 0xb1d7, 0xb1e2, 0xb3aa, 0xb3bb, 0xb4c2, 0xb4cf, 0xb4d9,
0xb4eb, 0xb5a5, 0xb5b5, 0xb5bf, 0xb5c7, 0xb5e9, 0xb6f3, 0xb7af, 0xb7c2, 0xb7ce,
0xb8a6, 0xb8ae, 0xb8b6, 0xb8b8, 0xb8bb, 0xb8e9, 0xb9ab, 0xb9ae, 0xb9cc, 0xb9ce,
0xb9fd, 0xbab8, 0xbace, 0xbad0, 0xbaf1, 0xbbe7, 0xbbf3, 0xbbfd, 0xbcad, 0xbcba,
0xbcd2, 0xbcf6, 0xbdba, 0xbdc0, 0xbdc3, 0xbdc5, 0xbec6, 0xbec8, 0xbedf, 0xbeee,
0xbef8, 0xbefa, 0xbfa1, 0xbfa9, 0xbfc0, 0xbfe4, 0xbfeb, 0xbfec, 0xbff8, 0xc0a7,
0xc0af, 0xc0b8, 0xc0ba, 0xc0bb, 0xc0bd, 0xc0c7, 0xc0cc, 0xc0ce, 0xc0cf, 0xc0d6,
0xc0da, 0xc0e5, 0xc0fb, 0xc0fc, 0xc1a4, 0xc1a6, 0xc1b6, 0xc1d6, 0xc1df, 0xc1f6,
0xc1f8, 0xc4a1, 0xc5cd, 0xc6ae, 0xc7cf, 0xc7d1, 0xc7d2, 0xc7d8, 0xc7e5, 0xc8ad,
}
func newRecognizer_euc_jp() *recognizerMultiByte {
return &recognizerMultiByte{
"EUC-JP",
"ja",
charDecoder_euc{},
commonChars_euc_jp,
}
}
func newRecognizer_euc_kr() *recognizerMultiByte {
return &recognizerMultiByte{
"EUC-KR",
"ko",
charDecoder_euc{},
commonChars_euc_kr,
}
}
type charDecoder_big5 struct {
}
func (charDecoder_big5) DecodeOneChar(input []byte) (c uint16, remain []byte, err error) {
if len(input) == 0 {
return 0, nil, eobError
}
first := input[0]
remain = input[1:]
c = uint16(first)
if first <= 0x7F || first == 0xFF {
return
}
if len(remain) == 0 {
return c, nil, eobError
}
second := remain[0]
remain = remain[1:]
c = c<<8 | uint16(second)
if second < 0x40 || second == 0x7F || second == 0xFF {
err = badCharError
}
return
}
var commonChars_big5 = []uint16{
0xa140, 0xa141, 0xa142, 0xa143, 0xa147, 0xa149, 0xa175, 0xa176, 0xa440, 0xa446,
0xa447, 0xa448, 0xa451, 0xa454, 0xa457, 0xa464, 0xa46a, 0xa46c, 0xa477, 0xa4a3,
0xa4a4, 0xa4a7, 0xa4c1, 0xa4ce, 0xa4d1, 0xa4df, 0xa4e8, 0xa4fd, 0xa540, 0xa548,
0xa558, 0xa569, 0xa5cd, 0xa5e7, 0xa657, 0xa661, 0xa662, 0xa668, 0xa670, 0xa6a8,
0xa6b3, 0xa6b9, 0xa6d3, 0xa6db, 0xa6e6, 0xa6f2, 0xa740, 0xa751, 0xa759, 0xa7da,
0xa8a3, 0xa8a5, 0xa8ad, 0xa8d1, 0xa8d3, 0xa8e4, 0xa8fc, 0xa9c0, 0xa9d2, 0xa9f3,
0xaa6b, 0xaaba, 0xaabe, 0xaacc, 0xaafc, 0xac47, 0xac4f, 0xacb0, 0xacd2, 0xad59,
0xaec9, 0xafe0, 0xb0ea, 0xb16f, 0xb2b3, 0xb2c4, 0xb36f, 0xb44c, 0xb44e, 0xb54c,
0xb5a5, 0xb5bd, 0xb5d0, 0xb5d8, 0xb671, 0xb7ed, 0xb867, 0xb944, 0xbad8, 0xbb44,
0xbba1, 0xbdd1, 0xc2c4, 0xc3b9, 0xc440, 0xc45f,
}
func newRecognizer_big5() *recognizerMultiByte {
return &recognizerMultiByte{
"Big5",
"zh",
charDecoder_big5{},
commonChars_big5,
}
}
type charDecoder_gb_18030 struct {
}
func (charDecoder_gb_18030) DecodeOneChar(input []byte) (c uint16, remain []byte, err error) {
if len(input) == 0 {
return 0, nil, eobError
}
first := input[0]
remain = input[1:]
c = uint16(first)
if first <= 0x80 {
return
}
if len(remain) == 0 {
return 0, nil, eobError
}
second := remain[0]
remain = remain[1:]
c = c<<8 | uint16(second)
if first >= 0x81 && first <= 0xFE {
if (second >= 0x40 && second <= 0x7E) || (second >= 0x80 && second <= 0xFE) {
return
}
if second >= 0x30 && second <= 0x39 {
if len(remain) == 0 {
return 0, nil, eobError
}
third := remain[0]
remain = remain[1:]
if third >= 0x81 && third <= 0xFE {
if len(remain) == 0 {
return 0, nil, eobError
}
fourth := remain[0]
remain = remain[1:]
if fourth >= 0x30 && fourth <= 0x39 {
c = c<<16 | uint16(third)<<8 | uint16(fourth)
return
}
}
}
err = badCharError
}
return
}
var commonChars_gb_18030 = []uint16{
0xa1a1, 0xa1a2, 0xa1a3, 0xa1a4, 0xa1b0, 0xa1b1, 0xa1f1, 0xa1f3, 0xa3a1, 0xa3ac,
0xa3ba, 0xb1a8, 0xb1b8, 0xb1be, 0xb2bb, 0xb3c9, 0xb3f6, 0xb4f3, 0xb5bd, 0xb5c4,
0xb5e3, 0xb6af, 0xb6d4, 0xb6e0, 0xb7a2, 0xb7a8, 0xb7bd, 0xb7d6, 0xb7dd, 0xb8b4,
0xb8df, 0xb8f6, 0xb9ab, 0xb9c9, 0xb9d8, 0xb9fa, 0xb9fd, 0xbacd, 0xbba7, 0xbbd6,
0xbbe1, 0xbbfa, 0xbcbc, 0xbcdb, 0xbcfe, 0xbdcc, 0xbecd, 0xbedd, 0xbfb4, 0xbfc6,
0xbfc9, 0xc0b4, 0xc0ed, 0xc1cb, 0xc2db, 0xc3c7, 0xc4dc, 0xc4ea, 0xc5cc, 0xc6f7,
0xc7f8, 0xc8ab, 0xc8cb, 0xc8d5, 0xc8e7, 0xc9cf, 0xc9fa, 0xcab1, 0xcab5, 0xcac7,
0xcad0, 0xcad6, 0xcaf5, 0xcafd, 0xccec, 0xcdf8, 0xceaa, 0xcec4, 0xced2, 0xcee5,
0xcfb5, 0xcfc2, 0xcfd6, 0xd0c2, 0xd0c5, 0xd0d0, 0xd0d4, 0xd1a7, 0xd2aa, 0xd2b2,
0xd2b5, 0xd2bb, 0xd2d4, 0xd3c3, 0xd3d0, 0xd3fd, 0xd4c2, 0xd4da, 0xd5e2, 0xd6d0,
}
func newRecognizer_gb_18030() *recognizerMultiByte {
return &recognizerMultiByte{
"GB-18030",
"zh",
charDecoder_gb_18030{},
commonChars_gb_18030,
}
}
package chardet
type recognizer interface {
Match(*recognizerInput) recognizerOutput
}
type recognizerOutput Result
type recognizerInput struct {
raw []byte
input []byte
tagStripped bool
byteStats []int
hasC1Bytes bool
}
func newRecognizerInput(raw []byte, stripTag bool) *recognizerInput {
input, stripped := mayStripInput(raw, stripTag)
byteStats := computeByteStats(input)
return &recognizerInput{
raw: raw,
input: input,
tagStripped: stripped,
byteStats: byteStats,
hasC1Bytes: computeHasC1Bytes(byteStats),
}
}
func mayStripInput(raw []byte, stripTag bool) (out []byte, stripped bool) {
const inputBufferSize = 8192
out = make([]byte, 0, inputBufferSize)
var badTags, openTags int32
var inMarkup bool = false
stripped = false
if stripTag {
stripped = true
for _, c := range raw {
if c == '<' {
if inMarkup {
badTags += 1
}
inMarkup = true
openTags += 1
}
if !inMarkup {
out = append(out, c)
if len(out) >= inputBufferSize {
break
}
}
if c == '>' {
inMarkup = false
}
}
}
if openTags < 5 || openTags/5 < badTags || (len(out) < 100 && len(raw) > 600) {
limit := len(raw)
if limit > inputBufferSize {
limit = inputBufferSize
}
out = make([]byte, limit)
copy(out, raw[:limit])
stripped = false
}
return
}
func computeByteStats(input []byte) []int {
r := make([]int, 256)
for _, c := range input {
r[c] += 1
}
return r
}
func computeHasC1Bytes(byteStats []int) bool {
for _, count := range byteStats[0x80 : 0x9F+1] {
if count > 0 {
return true
}
}
return false
}
This diff is collapsed.
package chardet
import (
"bytes"
)
var (
utf16beBom = []byte{0xFE, 0xFF}
utf16leBom = []byte{0xFF, 0xFE}
utf32beBom = []byte{0x00, 0x00, 0xFE, 0xFF}
utf32leBom = []byte{0xFF, 0xFE, 0x00, 0x00}
)
type recognizerUtf16be struct {
}
func newRecognizer_utf16be() *recognizerUtf16be {
return &recognizerUtf16be{}
}
func (*recognizerUtf16be) Match(input *recognizerInput) (output recognizerOutput) {
output = recognizerOutput{
Charset: "UTF-16BE",
}
if bytes.HasPrefix(input.raw, utf16beBom) {
output.Confidence = 100
}
return
}
type recognizerUtf16le struct {
}
func newRecognizer_utf16le() *recognizerUtf16le {
return &recognizerUtf16le{}
}
func (*recognizerUtf16le) Match(input *recognizerInput) (output recognizerOutput) {
output = recognizerOutput{
Charset: "UTF-16LE",
}
if bytes.HasPrefix(input.raw, utf16leBom) && !bytes.HasPrefix(input.raw, utf32leBom) {
output.Confidence = 100
}
return
}
type recognizerUtf32 struct {
name string
bom []byte
decodeChar func(input []byte) uint32
}
func decodeUtf32be(input []byte) uint32 {
return uint32(input[0])<<24 | uint32(input[1])<<16 | uint32(input[2])<<8 | uint32(input[3])
}
func decodeUtf32le(input []byte) uint32 {
return uint32(input[3])<<24 | uint32(input[2])<<16 | uint32(input[1])<<8 | uint32(input[0])
}
func newRecognizer_utf32be() *recognizerUtf32 {
return &recognizerUtf32{
"UTF-32BE",
utf32beBom,
decodeUtf32be,
}
}
func newRecognizer_utf32le() *recognizerUtf32 {
return &recognizerUtf32{
"UTF-32LE",
utf32leBom,
decodeUtf32le,
}
}
func (r *recognizerUtf32) Match(input *recognizerInput) (output recognizerOutput) {
output = recognizerOutput{
Charset: r.name,
}
hasBom := bytes.HasPrefix(input.raw, r.bom)
var numValid, numInvalid uint32
for b := input.raw; len(b) >= 4; b = b[4:] {
if c := r.decodeChar(b); c >= 0x10FFFF || (c >= 0xD800 && c <= 0xDFFF) {
numInvalid++
} else {
numValid++
}
}
if hasBom && numInvalid == 0 {
output.Confidence = 100
} else if hasBom && numValid > numInvalid*10 {
output.Confidence = 80
} else if numValid > 3 && numInvalid == 0 {
output.Confidence = 100
} else if numValid > 0 && numInvalid == 0 {
output.Confidence = 80
} else if numValid > numInvalid*10 {
output.Confidence = 25
}
return
}
package chardet
import (
"bytes"
)
var utf8Bom = []byte{0xEF, 0xBB, 0xBF}
type recognizerUtf8 struct {
}
func newRecognizer_utf8() *recognizerUtf8 {
return &recognizerUtf8{}
}
func (*recognizerUtf8) Match(input *recognizerInput) (output recognizerOutput) {
output = recognizerOutput{
Charset: "UTF-8",
}
hasBom := bytes.HasPrefix(input.raw, utf8Bom)
inputLen := len(input.raw)
var numValid, numInvalid uint32
var trailBytes uint8
for i := 0; i < inputLen; i++ {
c := input.raw[i]
if c&0x80 == 0 {
continue
}
if c&0xE0 == 0xC0 {
trailBytes = 1
} else if c&0xF0 == 0xE0 {
trailBytes = 2
} else if c&0xF8 == 0xF0 {
trailBytes = 3
} else {
numInvalid++
if numInvalid > 5 {
break
}
trailBytes = 0
}
for i++; i < inputLen; i++ {
c = input.raw[i]
if c&0xC0 != 0x80 {
numInvalid++
break
}
if trailBytes--; trailBytes == 0 {
numValid++
break
}
}
}
if hasBom && numInvalid == 0 {
output.Confidence = 100
} else if hasBom && numValid > numInvalid*10 {
output.Confidence = 80
} else if numValid > 3 && numInvalid == 0 {
output.Confidence = 100
} else if numValid > 0 && numInvalid == 0 {
output.Confidence = 80
} else if numValid == 0 && numInvalid == 0 {
// Plain ASCII
output.Confidence = 10
} else if numValid > numInvalid*10 {
output.Confidence = 25
}
return
}
Copyright (c) 2009 The Go Authors. All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above
copyright notice, this list of conditions and the following disclaimer
in the documentation and/or other materials provided with the
distribution.
* Neither the name of Google Inc. nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
Additional IP Rights Grant (Patents)
"This implementation" means the copyrightable works distributed by
Google as part of the Go project.
Google hereby grants to You a perpetual, worldwide, non-exclusive,
no-charge, royalty-free, irrevocable (except as stated in this section)
patent license to make, have made, use, offer to sell, sell, import,
transfer and otherwise run, modify and propagate the contents of this
implementation of Go, where such license applies only to those patent
claims, both currently owned or controlled by Google and acquired in
the future, licensable by Google that are necessarily infringed by this
implementation of Go. This grant does not include claims that would be
infringed only as a consequence of further modification of this
implementation. If you or your agent or exclusive licensee institute or
order or agree to the institution of patent litigation against any
entity (including a cross-claim or counterclaim in a lawsuit) alleging
that this implementation of Go or any code incorporated within this
implementation of Go constitutes direct or contributory patent
infringement, or inducement of patent infringement, then any patent
rights granted to you under this License for this implementation of Go
shall terminate as of the date such litigation is filed.
// Copyright 2013 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
//go:generate go run maketables.go
// Package charmap provides simple character encodings such as IBM Code Page 437
// and Windows 1252.
package charmap // import "golang.org/x/text/encoding/charmap"
import (
"unicode/utf8"
"golang.org/x/text/encoding"
"golang.org/x/text/encoding/internal"
"golang.org/x/text/encoding/internal/identifier"
"golang.org/x/text/transform"
)
// These encodings vary only in the way clients should interpret them. Their
// coded character set is identical and a single implementation can be shared.
var (
// ISO8859_6E is the ISO 8859-6E encoding.
ISO8859_6E encoding.Encoding = &iso8859_6E
// ISO8859_6I is the ISO 8859-6I encoding.
ISO8859_6I encoding.Encoding = &iso8859_6I
// ISO8859_8E is the ISO 8859-8E encoding.
ISO8859_8E encoding.Encoding = &iso8859_8E
// ISO8859_8I is the ISO 8859-8I encoding.
ISO8859_8I encoding.Encoding = &iso8859_8I
iso8859_6E = internal.Encoding{
ISO8859_6,
"ISO-8859-6E",
identifier.ISO88596E,
}
iso8859_6I = internal.Encoding{
ISO8859_6,
"ISO-8859-6I",
identifier.ISO88596I,
}
iso8859_8E = internal.Encoding{
ISO8859_8,
"ISO-8859-8E",
identifier.ISO88598E,
}
iso8859_8I = internal.Encoding{
ISO8859_8,
"ISO-8859-8I",
identifier.ISO88598I,
}
)
// All is a list of all defined encodings in this package.
var All = listAll
// TODO: implement these encodings, in order of importance.
// ASCII, ISO8859_1: Rather common. Close to Windows 1252.
// ISO8859_9: Close to Windows 1254.
// utf8Enc holds a rune's UTF-8 encoding in data[:len].
type utf8Enc struct {
len uint8
data [3]byte
}
// charmap describes an 8-bit character set encoding.
type charmap struct {
// name is the encoding's name.
name string
// mib is the encoding type of this encoder.
mib identifier.MIB
// asciiSuperset states whether the encoding is a superset of ASCII.
asciiSuperset bool
// low is the lower bound of the encoded byte for a non-ASCII rune. If
// charmap.asciiSuperset is true then this will be 0x80, otherwise 0x00.
low uint8
// replacement is the encoded replacement character.
replacement byte
// decode is the map from encoded byte to UTF-8.
decode [256]utf8Enc
// encoding is the map from runes to encoded bytes. Each entry is a
// uint32: the high 8 bits are the encoded byte and the low 24 bits are
// the rune. The table entries are sorted by ascending rune.
encode [256]uint32
}
func (m *charmap) NewDecoder() *encoding.Decoder {
return &encoding.Decoder{Transformer: charmapDecoder{charmap: m}}
}
func (m *charmap) NewEncoder() *encoding.Encoder {
return &encoding.Encoder{Transformer: charmapEncoder{charmap: m}}
}
func (m *charmap) String() string {
return m.name
}
func (m *charmap) ID() (mib identifier.MIB, other string) {
return m.mib, ""
}
// charmapDecoder implements transform.Transformer by decoding to UTF-8.
type charmapDecoder struct {
transform.NopResetter
charmap *charmap
}
func (m charmapDecoder) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
for i, c := range src {
if m.charmap.asciiSuperset && c < utf8.RuneSelf {
if nDst >= len(dst) {
err = transform.ErrShortDst
break
}
dst[nDst] = c
nDst++
nSrc = i + 1
continue
}
decode := &m.charmap.decode[c]
n := int(decode.len)
if nDst+n > len(dst) {
err = transform.ErrShortDst
break
}
// It's 15% faster to avoid calling copy for these tiny slices.
for j := 0; j < n; j++ {
dst[nDst] = decode.data[j]
nDst++
}
nSrc = i + 1
}
return nDst, nSrc, err
}
// charmapEncoder implements transform.Transformer by encoding from UTF-8.
type charmapEncoder struct {
transform.NopResetter
charmap *charmap
}
func (m charmapEncoder) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
r, size := rune(0), 0
loop:
for nSrc < len(src) {
if nDst >= len(dst) {
err = transform.ErrShortDst
break
}
r = rune(src[nSrc])
// Decode a 1-byte rune.
if r < utf8.RuneSelf {
if m.charmap.asciiSuperset {
nSrc++
dst[nDst] = uint8(r)
nDst++
continue
}
size = 1
} else {
// Decode a multi-byte rune.
r, size = utf8.DecodeRune(src[nSrc:])
if size == 1 {
// All valid runes of size 1 (those below utf8.RuneSelf) were
// handled above. We have invalid UTF-8 or we haven't seen the
// full character yet.
if !atEOF && !utf8.FullRune(src[nSrc:]) {
err = transform.ErrShortSrc
} else {
err = internal.RepertoireError(m.charmap.replacement)
}
break
}
}
// Binary search in [low, high) for that rune in the m.charmap.encode table.
for low, high := int(m.charmap.low), 0x100; ; {
if low >= high {
err = internal.RepertoireError(m.charmap.replacement)
break loop
}
mid := (low + high) / 2
got := m.charmap.encode[mid]
gotRune := rune(got & (1<<24 - 1))
if gotRune < r {
low = mid + 1
} else if gotRune > r {
high = mid
} else {
dst[nDst] = byte(got >> 24)
nDst++
break
}
}
nSrc += size
}
return nDst, nSrc, err
}
// Copyright 2013 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// +build ignore
package main
import (
"bufio"
"fmt"
"log"
"net/http"
"sort"
"strings"
"unicode/utf8"
"golang.org/x/text/encoding"
"golang.org/x/text/internal/gen"
)
const ascii = "\x00\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0a\x0b\x0c\x0d\x0e\x0f" +
"\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f" +
` !"#$%&'()*+,-./0123456789:;<=>?` +
`@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_` +
"`abcdefghijklmnopqrstuvwxyz{|}~\u007f"
var encodings = []struct {
name string
mib string
comment string
varName string
replacement byte
mapping string
}{
{
"IBM Code Page 037",
"IBM037",
"",
"CodePage037",
0x3f,
"http://source.icu-project.org/repos/icu/data/trunk/charset/data/ucm/glibc-IBM037-2.1.2.ucm",
},
{
"IBM Code Page 437",
"PC8CodePage437",
"",
"CodePage437",
encoding.ASCIISub,
"http://source.icu-project.org/repos/icu/data/trunk/charset/data/ucm/glibc-IBM437-2.1.2.ucm",
},
{
"IBM Code Page 850",
"PC850Multilingual",
"",
"CodePage850",
encoding.ASCIISub,
"http://source.icu-project.org/repos/icu/data/trunk/charset/data/ucm/glibc-IBM850-2.1.2.ucm",
},
{
"IBM Code Page 852",
"PCp852",
"",
"CodePage852",
encoding.ASCIISub,
"http://source.icu-project.org/repos/icu/data/trunk/charset/data/ucm/glibc-IBM852-2.1.2.ucm",
},
{
"IBM Code Page 855",
"IBM855",
"",
"CodePage855",
encoding.ASCIISub,
"http://source.icu-project.org/repos/icu/data/trunk/charset/data/ucm/glibc-IBM855-2.1.2.ucm",
},
{
"Windows Code Page 858", // PC latin1 with Euro
"IBM00858",
"",
"CodePage858",
encoding.ASCIISub,
"http://source.icu-project.org/repos/icu/data/trunk/charset/data/ucm/windows-858-2000.ucm",
},
{
"IBM Code Page 860",
"IBM860",
"",
"CodePage860",
encoding.ASCIISub,
"http://source.icu-project.org/repos/icu/data/trunk/charset/data/ucm/glibc-IBM860-2.1.2.ucm",
},
{
"IBM Code Page 862",
"PC862LatinHebrew",
"",
"CodePage862",
encoding.ASCIISub,
"http://source.icu-project.org/repos/icu/data/trunk/charset/data/ucm/glibc-IBM862-2.1.2.ucm",
},
{
"IBM Code Page 863",
"IBM863",
"",
"CodePage863",
encoding.ASCIISub,
"http://source.icu-project.org/repos/icu/data/trunk/charset/data/ucm/glibc-IBM863-2.1.2.ucm",
},
{
"IBM Code Page 865",
"IBM865",
"",
"CodePage865",
encoding.ASCIISub,
"http://source.icu-project.org/repos/icu/data/trunk/charset/data/ucm/glibc-IBM865-2.1.2.ucm",
},
{
"IBM Code Page 866",
"IBM866",
"",
"CodePage866",
encoding.ASCIISub,
"http://encoding.spec.whatwg.org/index-ibm866.txt",
},
{
"IBM Code Page 1047",
"IBM1047",
"",
"CodePage1047",
0x3f,
"http://source.icu-project.org/repos/icu/data/trunk/charset/data/ucm/glibc-IBM1047-2.1.2.ucm",
},
{
"IBM Code Page 1140",
"IBM01140",
"",
"CodePage1140",
0x3f,
"http://source.icu-project.org/repos/icu/data/trunk/charset/data/ucm/ibm-1140_P100-1997.ucm",
},
{
"ISO 8859-1",
"ISOLatin1",
"",
"ISO8859_1",
encoding.ASCIISub,
"http://source.icu-project.org/repos/icu/data/trunk/charset/data/ucm/iso-8859_1-1998.ucm",
},
{
"ISO 8859-2",
"ISOLatin2",
"",
"ISO8859_2",
encoding.ASCIISub,
"http://encoding.spec.whatwg.org/index-iso-8859-2.txt",
},
{
"ISO 8859-3",
"ISOLatin3",
"",
"ISO8859_3",
encoding.ASCIISub,
"http://encoding.spec.whatwg.org/index-iso-8859-3.txt",
},
{
"ISO 8859-4",
"ISOLatin4",
"",
"ISO8859_4",
encoding.ASCIISub,
"http://encoding.spec.whatwg.org/index-iso-8859-4.txt",
},
{
"ISO 8859-5",
"ISOLatinCyrillic",
"",
"ISO8859_5",
encoding.ASCIISub,
"http://encoding.spec.whatwg.org/index-iso-8859-5.txt",
},
{
"ISO 8859-6",
"ISOLatinArabic",
"",
"ISO8859_6,ISO8859_6E,ISO8859_6I",
encoding.ASCIISub,
"http://encoding.spec.whatwg.org/index-iso-8859-6.txt",
},
{
"ISO 8859-7",
"ISOLatinGreek",
"",
"ISO8859_7",
encoding.ASCIISub,
"http://encoding.spec.whatwg.org/index-iso-8859-7.txt",
},
{
"ISO 8859-8",
"ISOLatinHebrew",
"",
"ISO8859_8,ISO8859_8E,ISO8859_8I",
encoding.ASCIISub,
"http://encoding.spec.whatwg.org/index-iso-8859-8.txt",
},
{
"ISO 8859-9",
"ISOLatin5",
"",
"ISO8859_9",
encoding.ASCIISub,
"http://source.icu-project.org/repos/icu/data/trunk/charset/data/ucm/iso-8859_9-1999.ucm",
},
{
"ISO 8859-10",
"ISOLatin6",
"",
"ISO8859_10",
encoding.ASCIISub,
"http://encoding.spec.whatwg.org/index-iso-8859-10.txt",
},
{
"ISO 8859-13",
"ISO885913",
"",
"ISO8859_13",
encoding.ASCIISub,
"http://encoding.spec.whatwg.org/index-iso-8859-13.txt",
},
{
"ISO 8859-14",
"ISO885914",
"",
"ISO8859_14",
encoding.ASCIISub,
"http://encoding.spec.whatwg.org/index-iso-8859-14.txt",
},
{
"ISO 8859-15",
"ISO885915",
"",
"ISO8859_15",
encoding.ASCIISub,
"http://encoding.spec.whatwg.org/index-iso-8859-15.txt",
},
{
"ISO 8859-16",
"ISO885916",
"",
"ISO8859_16",
encoding.ASCIISub,
"http://encoding.spec.whatwg.org/index-iso-8859-16.txt",
},
{
"KOI8-R",
"KOI8R",
"",
"KOI8R",
encoding.ASCIISub,
"http://encoding.spec.whatwg.org/index-koi8-r.txt",
},
{
"KOI8-U",
"KOI8U",
"",
"KOI8U",
encoding.ASCIISub,
"http://encoding.spec.whatwg.org/index-koi8-u.txt",
},
{
"Macintosh",
"Macintosh",
"",
"Macintosh",
encoding.ASCIISub,
"http://encoding.spec.whatwg.org/index-macintosh.txt",
},
{
"Macintosh Cyrillic",
"MacintoshCyrillic",
"",
"MacintoshCyrillic",
encoding.ASCIISub,
"http://encoding.spec.whatwg.org/index-x-mac-cyrillic.txt",
},
{
"Windows 874",
"Windows874",
"",
"Windows874",
encoding.ASCIISub,
"http://encoding.spec.whatwg.org/index-windows-874.txt",
},
{
"Windows 1250",
"Windows1250",
"",
"Windows1250",
encoding.ASCIISub,
"http://encoding.spec.whatwg.org/index-windows-1250.txt",
},
{
"Windows 1251",
"Windows1251",
"",
"Windows1251",
encoding.ASCIISub,
"http://encoding.spec.whatwg.org/index-windows-1251.txt",
},
{
"Windows 1252",
"Windows1252",
"",
"Windows1252",
encoding.ASCIISub,
"http://encoding.spec.whatwg.org/index-windows-1252.txt",
},
{
"Windows 1253",
"Windows1253",
"",
"Windows1253",
encoding.ASCIISub,
"http://encoding.spec.whatwg.org/index-windows-1253.txt",
},
{
"Windows 1254",
"Windows1254",
"",
"Windows1254",
encoding.ASCIISub,
"http://encoding.spec.whatwg.org/index-windows-1254.txt",
},
{
"Windows 1255",
"Windows1255",
"",
"Windows1255",
encoding.ASCIISub,
"http://encoding.spec.whatwg.org/index-windows-1255.txt",
},
{
"Windows 1256",
"Windows1256",
"",
"Windows1256",
encoding.ASCIISub,
"http://encoding.spec.whatwg.org/index-windows-1256.txt",
},
{
"Windows 1257",
"Windows1257",
"",
"Windows1257",
encoding.ASCIISub,
"http://encoding.spec.whatwg.org/index-windows-1257.txt",
},
{
"Windows 1258",
"Windows1258",
"",
"Windows1258",
encoding.ASCIISub,
"http://encoding.spec.whatwg.org/index-windows-1258.txt",
},
{
"X-User-Defined",
"XUserDefined",
"It is defined at http://encoding.spec.whatwg.org/#x-user-defined",
"XUserDefined",
encoding.ASCIISub,
ascii +
"\uf780\uf781\uf782\uf783\uf784\uf785\uf786\uf787" +
"\uf788\uf789\uf78a\uf78b\uf78c\uf78d\uf78e\uf78f" +
"\uf790\uf791\uf792\uf793\uf794\uf795\uf796\uf797" +
"\uf798\uf799\uf79a\uf79b\uf79c\uf79d\uf79e\uf79f" +
"\uf7a0\uf7a1\uf7a2\uf7a3\uf7a4\uf7a5\uf7a6\uf7a7" +
"\uf7a8\uf7a9\uf7aa\uf7ab\uf7ac\uf7ad\uf7ae\uf7af" +
"\uf7b0\uf7b1\uf7b2\uf7b3\uf7b4\uf7b5\uf7b6\uf7b7" +
"\uf7b8\uf7b9\uf7ba\uf7bb\uf7bc\uf7bd\uf7be\uf7bf" +
"\uf7c0\uf7c1\uf7c2\uf7c3\uf7c4\uf7c5\uf7c6\uf7c7" +
"\uf7c8\uf7c9\uf7ca\uf7cb\uf7cc\uf7cd\uf7ce\uf7cf" +
"\uf7d0\uf7d1\uf7d2\uf7d3\uf7d4\uf7d5\uf7d6\uf7d7" +
"\uf7d8\uf7d9\uf7da\uf7db\uf7dc\uf7dd\uf7de\uf7df" +
"\uf7e0\uf7e1\uf7e2\uf7e3\uf7e4\uf7e5\uf7e6\uf7e7" +
"\uf7e8\uf7e9\uf7ea\uf7eb\uf7ec\uf7ed\uf7ee\uf7ef" +
"\uf7f0\uf7f1\uf7f2\uf7f3\uf7f4\uf7f5\uf7f6\uf7f7" +
"\uf7f8\uf7f9\uf7fa\uf7fb\uf7fc\uf7fd\uf7fe\uf7ff",
},
}
func getWHATWG(url string) string {
res, err := http.Get(url)
if err != nil {
log.Fatalf("%q: Get: %v", url, err)
}
defer res.Body.Close()
mapping := make([]rune, 128)
for i := range mapping {
mapping[i] = '\ufffd'
}
scanner := bufio.NewScanner(res.Body)
for scanner.Scan() {
s := strings.TrimSpace(scanner.Text())
if s == "" || s[0] == '#' {
continue
}
x, y := 0, 0
if _, err := fmt.Sscanf(s, "%d\t0x%x", &x, &y); err != nil {
log.Fatalf("could not parse %q", s)
}
if x < 0 || 128 <= x {
log.Fatalf("code %d is out of range", x)
}
if 0x80 <= y && y < 0xa0 {
// We diverge from the WHATWG spec by mapping control characters
// in the range [0x80, 0xa0) to U+FFFD.
continue
}
mapping[x] = rune(y)
}
return ascii + string(mapping)
}
func getUCM(url string) string {
res, err := http.Get(url)
if err != nil {
log.Fatalf("%q: Get: %v", url, err)
}
defer res.Body.Close()
mapping := make([]rune, 256)
for i := range mapping {
mapping[i] = '\ufffd'
}
charsFound := 0
scanner := bufio.NewScanner(res.Body)
for scanner.Scan() {
s := strings.TrimSpace(scanner.Text())
if s == "" || s[0] == '#' {
continue
}
var c byte
var r rune
if _, err := fmt.Sscanf(s, `<U%x> \x%x |0`, &r, &c); err != nil {
continue
}
mapping[c] = r
charsFound++
}
if charsFound < 200 {
log.Fatalf("%q: only %d characters found (wrong page format?)", url, charsFound)
}
return string(mapping)
}
func main() {
mibs := map[string]bool{}
all := []string{}
w := gen.NewCodeWriter()
defer w.WriteGoFile("tables.go", "charmap")
printf := func(s string, a ...interface{}) { fmt.Fprintf(w, s, a...) }
printf("import (\n")
printf("\t\"golang.org/x/text/encoding\"\n")
printf("\t\"golang.org/x/text/encoding/internal/identifier\"\n")
printf(")\n\n")
for _, e := range encodings {
varNames := strings.Split(e.varName, ",")
all = append(all, varNames...)
varName := varNames[0]
switch {
case strings.HasPrefix(e.mapping, "http://encoding.spec.whatwg.org/"):
e.mapping = getWHATWG(e.mapping)
case strings.HasPrefix(e.mapping, "http://source.icu-project.org/repos/icu/data/trunk/charset/data/ucm/"):
e.mapping = getUCM(e.mapping)
}
asciiSuperset, low := strings.HasPrefix(e.mapping, ascii), 0x00
if asciiSuperset {
low = 0x80
}
lvn := 1
if strings.HasPrefix(varName, "ISO") || strings.HasPrefix(varName, "KOI") {
lvn = 3
}
lowerVarName := strings.ToLower(varName[:lvn]) + varName[lvn:]
printf("// %s is the %s encoding.\n", varName, e.name)
if e.comment != "" {
printf("//\n// %s\n", e.comment)
}
printf("var %s encoding.Encoding = &%s\n\nvar %s = charmap{\nname: %q,\n",
varName, lowerVarName, lowerVarName, e.name)
if mibs[e.mib] {
log.Fatalf("MIB type %q declared multiple times.", e.mib)
}
printf("mib: identifier.%s,\n", e.mib)
printf("asciiSuperset: %t,\n", asciiSuperset)
printf("low: 0x%02x,\n", low)
printf("replacement: 0x%02x,\n", e.replacement)
printf("decode: [256]utf8Enc{\n")
i, backMapping := 0, map[rune]byte{}
for _, c := range e.mapping {
if _, ok := backMapping[c]; !ok && c != utf8.RuneError {
backMapping[c] = byte(i)
}
var buf [8]byte
n := utf8.EncodeRune(buf[:], c)
if n > 3 {
panic(fmt.Sprintf("rune %q (%U) is too long", c, c))
}
printf("{%d,[3]byte{0x%02x,0x%02x,0x%02x}},", n, buf[0], buf[1], buf[2])
if i%2 == 1 {
printf("\n")
}
i++
}
printf("},\n")
printf("encode: [256]uint32{\n")
encode := make([]uint32, 0, 256)
for c, i := range backMapping {
encode = append(encode, uint32(i)<<24|uint32(c))
}
sort.Sort(byRune(encode))
for len(encode) < cap(encode) {
encode = append(encode, encode[len(encode)-1])
}
for i, enc := range encode {
printf("0x%08x,", enc)
if i%8 == 7 {
printf("\n")
}
}
printf("},\n}\n")
// Add an estimate of the size of a single charmap{} struct value, which
// includes two 256 elem arrays of 4 bytes and some extra fields, which
// align to 3 uint64s on 64-bit architectures.
w.Size += 2*4*256 + 3*8
}
// TODO: add proper line breaking.
printf("var listAll = []encoding.Encoding{\n%s,\n}\n\n", strings.Join(all, ",\n"))
}
type byRune []uint32
func (b byRune) Len() int { return len(b) }
func (b byRune) Less(i, j int) bool { return b[i]&0xffffff < b[j]&0xffffff }
func (b byRune) Swap(i, j int) { b[i], b[j] = b[j], b[i] }
This diff is collapsed.
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment