Skip to content
Snippets Groups Projects
Commit e9abdd57 authored by Nick Thomas's avatar Nick Thomas
Browse files

Merge branch '1-initial-icu' into '1-initial-implementation'

Use icu4c for encoding to UTF-8

See merge request !3
parents f04f4f3e 038f1a77
No related branches found
No related tags found
2 merge requests!3Use icu4c for encoding to UTF-8,!1Initial implementation of an elasticsearch indexer in Go
Pipeline #
Showing
with 855 additions and 1838 deletions
Loading
Loading
@@ -6,10 +6,12 @@
ELASTIC_CONNECTION_INFO: '{"url":["http://elasticsearch:9200"]}'
stage: test
script:
- apt-get update && apt-get -yy install libicu-dev
- make setup
- make format
- make test
- make cover
- make
- make test
 
test 1.7:
<<: *test
Loading
Loading
package indexer
 
import (
"fmt"
"github.com/goodsign/icu"
)
 
"github.com/saintfish/chardet"
"golang.org/x/text/encoding/ianaindex"
var (
detector *icu.CharsetDetector
converter = icu.NewCharsetConverter(maxBlobSize)
)
 
var detector = chardet.NewTextDetector()
// init creates the package-level ICU charset detector once at start-up.
// The program panics if the detector cannot be created.
func init() {
	d, err := icu.NewCharsetDetector()
	if err != nil {
		panic(err)
	}
	detector = d
}
 
func tryEncodeString(s string) string {
encoded, err := encodeString(s)
Loading
Loading
@@ -34,39 +42,15 @@ func encodeString(s string) (string, error) {
 
// encodeBytes converts a byte slice from an arbitrary encoding to UTF-8
func encodeBytes(b []byte) (string, error) {
best, err := detector.DetectBest(b)
matches, err := detector.GuessCharset(b)
if err != nil {
return "", err
}
 
charset := best.Charset
// chardet has some incompatibilities with ianaindex
switch charset {
case "UTF-8":
return string(b), nil
case "GB-18030":
charset = "GB18030"
case "IBM420_ltr", "IBM420_rtl":
charset = "cp420"
case "IBM424_ltr", "IBM424_rtl":
charset = "IBM424"
}
encoding, err := ianaindex.IANA.Encoding(charset)
if err != nil {
return "", fmt.Errorf("Encoding %q: %v", charset, err)
}
// TODO(nick): Does this actually mean 'nothing to do'?
if encoding == nil {
return string(b), nil
}
decoded, err := encoding.NewDecoder().Bytes(b)
utf8, err := converter.ConvertToUtf8(b, matches[0].Charset)
if err != nil {
return "", err
}
 
return string(decoded), nil
return string(utf8), nil
}
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY
EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT
SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT
OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR
TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
\ No newline at end of file
<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN"
"http://www.w3.org/TR/html4/loose.dtd">
<html>
<head>
<meta http-equiv="Content-Type" content="text/html;charset=UTF-8">
<title>ICU License - ICU 1.8.1 and later</title>
</head>
<body BGCOLOR="#ffffff">
<h2>ICU License - ICU 1.8.1 and later</h2>
<p>COPYRIGHT AND PERMISSION NOTICE</p>
<p>
Copyright (c) 1995-2012 International Business Machines Corporation and others
</p>
<p>
All rights reserved.
</p>
<p>
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"),
to deal in the Software without restriction, including without limitation
the rights to use, copy, modify, merge, publish, distribute, and/or sell
copies of the Software, and to permit persons
to whom the Software is furnished to do so, provided that the above
copyright notice(s) and this permission notice appear in all copies
of the Software and that both the above copyright notice(s) and this
permission notice appear in supporting documentation.
</p>
<p>
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED,
INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A
PARTICULAR PURPOSE AND NONINFRINGEMENT OF THIRD PARTY RIGHTS. IN NO EVENT SHALL
THE COPYRIGHT HOLDER OR HOLDERS INCLUDED IN THIS NOTICE BE LIABLE FOR ANY CLAIM,
OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER
RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT,
NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE
USE OR PERFORMANCE OF THIS SOFTWARE.
</p>
<p>
Except as contained in this notice, the name of a copyright holder shall not be
used in advertising or otherwise to promote the sale, use or other dealings in
this Software without prior written authorization of the copyright holder.
</p>
<hr style="color:gray;background-color:gray">
<p><small>
All trademarks and registered trademarks mentioned herein are the property of their respective owners.
</small></p>
<hr style="height:3px;color:black;background-color:black">
<h2>Third-Party Software Licenses</h2>
This section contains third-party software notices and/or additional terms for licensed
third-party software components included within ICU libraries.
<h3>1. Unicode Data Files and Software</h3>
<h3 align="center"><a name="Exhibit1">EXHIBIT 1</a><br>
UNICODE, INC. LICENSE AGREEMENT - DATA FILES AND SOFTWARE</h3>
<blockquote>
<p>Unicode Data Files include all data files under the directories
<a href="http://www.unicode.org/Public/">http://www.unicode.org/Public/</a>,
<a href="http://www.unicode.org/reports/">http://www.unicode.org/reports/</a>,
and
<a title="http://www.unicode.org/cldr/data/" onClick="return top.js.OpenExtLink(window,event,this)" target="_blank" href="http://www.unicode.org/cldr/data/">
http://www.unicode.org/cldr/data/</a>. Unicode Data Files do not include PDF online code charts under the directory <a href="http://www.unicode.org/Public/">http://www.unicode.org/Public/</a>. Software includes any source code
published in the Unicode Standard or under the directories <a href="http://www.unicode.org/Public/">http://www.unicode.org/Public/</a>,
<a href="http://www.unicode.org/reports/">http://www.unicode.org/reports/</a>,
and
<a title="http://www.unicode.org/cldr/data/" onClick="return top.js.OpenExtLink(window,event,this)" target="_blank" href="http://www.unicode.org/cldr/data/">
http://www.unicode.org/cldr/data/</a>.</p>
<p>NOTICE TO USER: Carefully read the following legal agreement. BY DOWNLOADING, INSTALLING, COPYING OR OTHERWISE USING UNICODE INC.'S DATA FILES ("DATA FILES"), AND/OR SOFTWARE ("SOFTWARE"), YOU UNEQUIVOCALLY ACCEPT, AND AGREE TO BE BOUND BY, ALL OF THE TERMS AND CONDITIONS OF THIS AGREEMENT. IF YOU DO NOT AGREE, DO NOT DOWNLOAD, INSTALL, COPY, DISTRIBUTE OR USE THE DATA FILES OR SOFTWARE.</p>
<p>COPYRIGHT AND PERMISSION NOTICE</p>
<p>Copyright © 1991-2012 Unicode, Inc. All rights reserved. Distributed under the Terms of Use in
<a href="http://www.unicode.org/copyright.html">http://www.unicode.org/copyright.html</a>.</p>
<p>Permission is hereby granted, free of charge, to any person obtaining a copy of the Unicode data files and
any associated documentation (the "Data Files") or Unicode software and any associated documentation (the "Software") to deal in the Data Files or Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, and/or sell copies of the Data Files or Software, and to permit persons to whom the Data Files or Software are furnished to do so, provided that (a) the above copyright notice(s) and this permission notice appear
with all copies of the Data Files or Software, (b) both the above copyright notice(s) and this permission notice appear in associated documentation, and (c) there is clear notice in each modified Data File or in the Software as well as in the documentation associated with the Data File(s) or Software that the data or software has been modified.</p>
<p>THE DATA FILES AND SOFTWARE ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT OF THIRD PARTY RIGHTS. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR HOLDERS INCLUDED IN THIS NOTICE BE LIABLE FOR ANY CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
PERFORMANCE OF THE DATA FILES OR SOFTWARE.</p>
<p>Except as contained in this notice, the name of a copyright holder shall not be used in advertising or otherwise to promote the sale, use or other dealings in these Data Files or Software without prior written authorization of the copyright holder.</p>
<hr width="80%">
<p>Unicode and the Unicode logo are trademarks of Unicode, Inc. in the United States and other countries. All third party trademarks referenced herein are the property of their respective owners.</p>
</blockquote>
<h3>2. Chinese/Japanese Word Break Dictionary Data (cjdict.txt)</h3>
<pre>
# The Google Chrome software developed by Google is licensed under the BSD license. Other software included in this distribution is provided under other licenses, as set forth below.
#
# The BSD License
# http://opensource.org/licenses/bsd-license.php
# Copyright (C) 2006-2008, Google Inc.
#
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
#
# Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
# Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
# Neither the name of Google Inc. nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission.
#
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
#
# The word list in cjdict.txt are generated by combining three word lists listed
# below with further processing for compound word breaking. The frequency is generated
# with an iterative training against Google web corpora.
#
# * Libtabe (Chinese)
# - https://sourceforge.net/project/?group_id=1519
# - Its license terms and conditions are shown below.
#
# * IPADIC (Japanese)
# - http://chasen.aist-nara.ac.jp/chasen/distribution.html
# - Its license terms and conditions are shown below.
#
# ---------COPYING.libtabe ---- BEGIN--------------------
#
# /*
# * Copyrighy (c) 1999 TaBE Project.
# * Copyright (c) 1999 Pai-Hsiang Hsiao.
# * All rights reserved.
# *
# * Redistribution and use in source and binary forms, with or without
# * modification, are permitted provided that the following conditions
# * are met:
# *
# * . Redistributions of source code must retain the above copyright
# * notice, this list of conditions and the following disclaimer.
# * . Redistributions in binary form must reproduce the above copyright
# * notice, this list of conditions and the following disclaimer in
# * the documentation and/or other materials provided with the
# * distribution.
# * . Neither the name of the TaBE Project nor the names of its
# * contributors may be used to endorse or promote products derived
# * from this software without specific prior written permission.
# *
# * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
# * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
# * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
# * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
# * REGENTS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
# * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
# * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
# * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
# * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
# * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
# * OF THE POSSIBILITY OF SUCH DAMAGE.
# */
#
# /*
# * Copyright (c) 1999 Computer Systems and Communication Lab,
# * Institute of Information Science, Academia Sinica.
# * All rights reserved.
# *
# * Redistribution and use in source and binary forms, with or without
# * modification, are permitted provided that the following conditions
# * are met:
# *
# * . Redistributions of source code must retain the above copyright
# * notice, this list of conditions and the following disclaimer.
# * . Redistributions in binary form must reproduce the above copyright
# * notice, this list of conditions and the following disclaimer in
# * the documentation and/or other materials provided with the
# * distribution.
# * . Neither the name of the Computer Systems and Communication Lab
# * nor the names of its contributors may be used to endorse or
# * promote products derived from this software without specific
# * prior written permission.
# *
# * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
# * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
# * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
# * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
# * REGENTS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
# * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
# * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
# * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
# * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
# * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
# * OF THE POSSIBILITY OF SUCH DAMAGE.
# */
#
# Copyright 1996 Chih-Hao Tsai @ Beckman Institute, University of Illinois
# c-tsai4@uiuc.edu http://casper.beckman.uiuc.edu/~c-tsai4
#
# ---------------COPYING.libtabe-----END------------------------------------
#
#
# ---------------COPYING.ipadic-----BEGIN------------------------------------
#
# Copyright 2000, 2001, 2002, 2003 Nara Institute of Science
# and Technology. All Rights Reserved.
#
# Use, reproduction, and distribution of this software is permitted.
# Any copy of this software, whether in its original form or modified,
# must include both the above copyright notice and the following
# paragraphs.
#
# Nara Institute of Science and Technology (NAIST),
# the copyright holders, disclaims all warranties with regard to this
# software, including all implied warranties of merchantability and
# fitness, in no event shall NAIST be liable for
# any special, indirect or consequential damages or any damages
# whatsoever resulting from loss of use, data or profits, whether in an
# action of contract, negligence or other tortuous action, arising out
# of or in connection with the use or performance of this software.
#
# A large portion of the dictionary entries
# originate from ICOT Free Software. The following conditions for ICOT
# Free Software applies to the current dictionary as well.
#
# Each User may also freely distribute the Program, whether in its
# original form or modified, to any third party or parties, PROVIDED
# that the provisions of Section 3 ("NO WARRANTY") will ALWAYS appear
# on, or be attached to, the Program, which is distributed substantially
# in the same form as set out herein and that such intended
# distribution, if actually made, will neither violate or otherwise
# contravene any of the laws and regulations of the countries having
# jurisdiction over the User or the intended distribution itself.
#
# NO WARRANTY
#
# The program was produced on an experimental basis in the course of the
# research and development conducted during the project and is provided
# to users as so produced on an experimental basis. Accordingly, the
# program is provided without any warranty whatsoever, whether express,
# implied, statutory or otherwise. The term "warranty" used herein
# includes, but is not limited to, any warranty of the quality,
# performance, merchantability and fitness for a particular purpose of
# the program and the nonexistence of any infringement or violation of
# any right of any third party.
#
# Each user of the program will agree and understand, and be deemed to
# have agreed and understood, that there is no warranty whatsoever for
# the program and, accordingly, the entire risk arising from or
# otherwise connected with the program is assumed by the user.
#
# Therefore, neither ICOT, the copyright holder, or any other
# organization that participated in or was otherwise related to the
# development of the program and their respective officials, directors,
# officers and other employees shall be held liable for any and all
# damages, including, without limitation, general, special, incidental
# and consequential damages, arising out of or otherwise in connection
# with the use or inability to use the program or any product, material
# or result produced or otherwise obtained by using the program,
# regardless of whether they have been advised of, or otherwise had
# knowledge of, the possibility of such damages at any time during the
# project or thereafter. Each user will be deemed to have agreed to the
# foregoing by his or her commencement of use of the program. The term
# "use" as used herein includes, but is not limited to, the use,
# modification, copying and distribution of the program and the
# production of secondary products from the program.
#
# In the case where the program, whether in its original form or
# modified, was distributed or delivered to or received by a user from
# any person, organization or entity other than ICOT, unless it makes or
# grants independently of ICOT any specific warranty to the user in
# writing, such person, organization or entity, will also be exempted
# from and not be held liable to the user for any such damages as noted
# above as far as the program is concerned.
#
# ---------------COPYING.ipadic-----END------------------------------------
</pre>
<h3>3. Time Zone Database</h3>
<p>ICU uses the public domain data and code derived from <a href="http://www.iana.org/time-zones">
Time Zone Database</a> for its time zone support. The ownership of the TZ database is explained
in <a href="http://tools.ietf.org/html/rfc6557">BCP 175: Procedure for Maintaining the Time Zone
Database</a> section 7.<p>
<pre>
7. Database Ownership
The TZ database itself is not an IETF Contribution or an IETF
document. Rather it is a pre-existing and regularly updated work
that is in the public domain, and is intended to remain in the public
domain. Therefore, BCPs 78 [RFC5378] and 79 [RFC3979] do not apply
to the TZ Database or contributions that individuals make to it.
Should any claims be made and substantiated against the TZ Database,
the organization that is providing the IANA Considerations defined in
this RFC, under the memorandum of understanding with the IETF,
currently ICANN, may act in accordance with all competent court
orders. No ownership claims will be made by ICANN or the IETF Trust
on the database or the code. Any person making a contribution to the
database or code waives all rights to future claims in that
contribution or in the TZ Database.
</pre>
</body>
</html>
\ No newline at end of file
About
==========
Cgo binding for icu4c C library detection and conversion functions. Guaranteed compatibility with version 50.1.
Installation
==========
Installation consists of several simple steps. They may be a bit different on your target system (e.g. require more permissions) so adapt them to the parameters of your system.
### Install build-essential
Make sure you have **build-essential** installed. Otherwise icu would fail on the configuration stage.
Installation example using apt-get (Ubuntu):
```
sudo apt-get install build-essential
```
### Install pkg-config
Make sure you have **pkg-config** installed.
Installation example using apt-get (Ubuntu):
```
sudo apt-get install pkg-config
```
### Get icu4c C library code
Download and unarchive original icu4c archive from [icu download section](http://site.icu-project.org/download).
Example (for version 50.1):
```
wget http://download.icu-project.org/files/icu4c/50.1/icu4c-50_1-src.tgz
tar -zxvf icu4c-50_1-src.tgz
mv -i ./icu ~/where-you-store-libs
```
NOTE: If this link is not working or there are some problems with downloading, there is a stable version 50.1 snapshot saved in [Github Downloads](https://github.com/downloads/goodsign/icu/icu4c-50_1-src.tgz).
### Build and install icu4c C library
From the directory, where you unarchived icu4c, run:
```
cd source
./configure
make
sudo make install
sudo ldconfig
```
### Install Go wrapper
```
go get github.com/goodsign/icu
go test github.com/goodsign/icu (must PASS)
```
Installation notes
==========
* Make sure that you have your local library paths set correctly and that installation was successful. Otherwise, **go build** or **go test** may fail.
* icu4c is installed in your local library directory (e.g. **/usr/local/lib**) and puts its libraries there. This path should be registered in your system (using ldconfig or exporting LD_LIBRARY_PATH, etc.) or the linker would fail.
* icu4c installs its header files to local include folders (e.g. **/usr/local/include/unicode**) so there is no need to have additional .h files with this package, but the system must be properly set up to detect .h files in those directories.
Usage
==========
Note: check icu documentation for returned encoding identifiers.
Detector
----------
```go
// Create detector
detector, err := NewCharsetDetector()
if err != nil {
//... Handle error ...
}
defer detector.Close()
// Guess encoding
encMatches, err := detector.GuessCharset(encodedText)
if err != nil {
//... Handle error ...
}
// Get charset with max confidence (goes first)
maxenc := encMatches[0].Charset
// Use maxenc.
// ...
```
Converter
----------
```go
...
// Create converter
converter := NewCharsetConverter(DefaultMaxTextSize)
// Convert to utf-8
converted, err := converter.ConvertToUtf8(encodedText, maxenc)
if nil != err {
//... Handle error ...
}
```
Usage notes
==========
* Check **NewCharsetConverter** func comments for details on max text size parameter.
* You will often use the detector and the converter as a pair, so the 'converter' usage example continues the 'detector' example and uses the 'maxenc' result from it.
More info
----------
For more information on icu refer to the original [website](http://site.icu-project.org/), which contains links on theory and other details.
icu4c Licence
==========
ICU is released under a nonrestrictive open source license that is suitable for use with both commercial software and with other open source or free software.
[LICENCE file](https://github.com/goodsign/icu/blob/master/LICENCE_icu)
Licence
==========
The goodsign/icu binding is released under the [BSD Licence](http://opensource.org/licenses/bsd-license.php)
[LICENCE file](https://github.com/goodsign/icu/blob/master/LICENCE)
\ No newline at end of file
#include "c_bridge.h"
#include <string.h>
#include <unicode/utypes.h>
#include <unicode/ucsdet.h>
#include <stdlib.h>
#include <unicode/ucnv.h>
// See description in c_bridge.h
//
// Hands the input bytes to the ICU detector, asks for every candidate match
// and copies at most matchBufferSize of them into matchBuffer. Returns the
// number of entries written, or 0 as soon as an ICU call leaves *status
// different from U_ZERO_ERROR.
const int detectCharset(void *detector,
                        void *input,
                        int input_len,
                        int *status,
                        MatchData *matchBuffer,
                        int matchBufferSize) {
    UCharsetDetector *csd = (UCharsetDetector*)detector;

    // Give the detector the text to analyse.
    ucsdet_setText(csd, (char*)input, input_len, status);
    if (*status != U_ZERO_ERROR) {
        return 0;
    }

    // Run the analysis; ICU reports all candidates and how many there are.
    int found;
    const UCharsetMatch **candidates = ucsdet_detectAll(csd, &found, status);
    if (*status != U_ZERO_ERROR) {
        return 0;
    }

    // Never write more entries than the caller's buffer can hold.
    int limit = matchBufferSize;
    if (found <= matchBufferSize) {
        limit = found;
    }

    int idx;
    for (idx = 0; idx < limit; idx++) {
        const UCharsetMatch *candidate = candidates[idx];

        // Guessed encoding name
        const char *name = ucsdet_getName(candidate, status);
        if (*status != U_ZERO_ERROR) {
            return 0;
        }

        // Guessed language
        const char *lang = ucsdet_getLanguage(candidate, status);
        if (*status != U_ZERO_ERROR) {
            return 0;
        }

        // Confidence rating of this guess
        int32_t rating = ucsdet_getConfidence(candidate, status);
        if (*status != U_ZERO_ERROR) {
            return 0;
        }

        // The entry is stored only after all three lookups succeeded.
        matchBuffer[idx].confidence = rating;
        matchBuffer[idx].charset = name;
        matchBuffer[idx].language = lang;
    }

    // Number of guesses put into matchBuffer.
    return limit;
}
// See description in c_bridge.h
//
// Opens an ICU converter for srcEncoding, converts srcLength bytes of src into
// UTF-16 code units written to dest (at most destCapacity UChars) and returns
// the number of UChars produced. Returns 0 whenever *status ends up different
// from U_ZERO_ERROR. The converter is always closed before returning, on the
// error paths as well, so no ICU converter is leaked.
int convertToUtf16(const char *srcEncoding,
                   UChar *dest,
                   int32_t destCapacity,
                   const char *src,
                   int32_t srcLength,
                   int *status){
    UConverter *conv = ucnv_open(srcEncoding, status);
    if (*status != U_ZERO_ERROR) {
        // A non-zero status may still come with a valid converter (ICU
        // warnings); release it so it is not leaked.
        if (conv != NULL) {
            ucnv_close(conv);
        }
        return 0;
    }

    /* Convert from original encoding to UTF-16 */
    int len = ucnv_toUChars(conv, dest, destCapacity, src, srcLength, status);

    // Close before inspecting the status so that the error path releases the
    // converter too.
    ucnv_close(conv);

    if (*status != U_ZERO_ERROR) {
        return 0;
    }
    return len;
}
// See description in c_bridge.h
//
// Opens an ICU converter for destEncoding, converts srcLength UTF-16 code
// units of src into that encoding, writing at most destCapacity bytes to dest,
// and returns the number of bytes produced. Returns 0 whenever *status ends up
// different from U_ZERO_ERROR. The converter is always closed before
// returning, on the error paths as well, so no ICU converter is leaked.
int convertFromUtf16(const char *destEncoding,
                     char *dest,
                     int32_t destCapacity,
                     const UChar *src,
                     int32_t srcLength,
                     int *status){
    UConverter *conv = ucnv_open(destEncoding, status);
    if (*status != U_ZERO_ERROR) {
        // A non-zero status may still come with a valid converter (ICU
        // warnings); release it so it is not leaked.
        if (conv != NULL) {
            ucnv_close(conv);
        }
        return 0;
    }

    /* Convert from UTF-16 to destination encoding */
    int len = ucnv_fromUChars(conv, dest, destCapacity, src, srcLength, status);

    // Close before inspecting the status so that the error path releases the
    // converter too.
    ucnv_close(conv);

    if (*status != U_ZERO_ERROR) {
        return 0;
    }
    return len;
}
\ No newline at end of file
#ifndef __C_BRIDGE_H__
#define __C_BRIDGE_H__
// C_BRIDGE is a bridge between go and native pure c functions used to
// operate with ICU library code.
#include <unicode/utypes.h>
#include <unicode/ucsdet.h>
// MatchData contains information about one 'guess' of the
// encoding detector. It contains the guessed charset (ICU string identifiers,
// see ICU documentation for them) and a confidence coefficient, which is a
// number between 0 and 100 (100 is the best).
typedef struct MatchData {
// Name of the guessed charset. detectCharset stores the pointer returned by
// ucsdet_getName as-is (the string is not copied).
const char* charset;
// Guessed language. detectCharset stores the pointer returned by
// ucsdet_getLanguage as-is (the string is not copied).
const char* language;
// Confidence of the guess as reported by ucsdet_getConfidence (0..100).
short int confidence;
} MatchData;
// detectCharset performs the detection (guessing) operation using a given detector (ICU internals),
// input data (bytes), input length and error status pointer (read the ICU docs about error codes).
//
// After the detection is performed, all possible matches are put into the matchBuffer. If there are
// more results than matchBufferSize, then only matchBufferSize entries are put (So no overflow can
// ever happen).
//
// The results of this function are put into the matchBuffer, so it MUST NOT be called asynchronously.
// Caller should guarantee thread safety and perform locks while working with it.
const int detectCharset(void *detector,
void *input,
int input_len,
int *status,
MatchData *matchBuffer,
int matchBufferSize);
// convertToUtf16 performs conversion from any encoding to utf16. Utf16 is the ICU standard so
// it is easier to convert to/from it.
//
// The results of this function are put into the dest buffer, so it MUST NOT be called asynchronously.
// Caller should guarantee thread safety and perform locks while working with it.
int convertToUtf16(const char *srcEncoding,
UChar *dest,
int32_t destCapacity,
const char *src,
int32_t srcLength,
int *status);
// convertFromUtf16 performs conversion from utf16 to any other encoding. Utf16 is the ICU standard so
// it is easier to convert to/from it.
//
// The results of this function are put into the dest buffer, so it MUST NOT be called asynchronously.
// Caller should guarantee thread safety and perform locks while working with it.
int convertFromUtf16(const char *destEncoding,
char *dest,
int32_t destCapacity,
const UChar *src,
int32_t srcLength,
int *status);
#endif //__C_BRIDGE_H__
\ No newline at end of file
package icu
// #cgo pkg-config: icu-i18n
// #include "c_bridge.h"
// #include "stdlib.h"
import "C"
import (
"fmt"
"sync"
"unsafe"
)
const (
DefaultMaxTextSize = 1024 * 1024 // Default value for the max text length in conversion operations
// utf8MaxCharSize is the factor NewCharsetConverter multiplies maxTextSize by
// when sizing the UTF-8 buffer (UTF-8 uses up to 4 bytes per symbol).
utf8MaxCharSize = 4
// utf16MaxCharSize is the factor NewCharsetConverter multiplies maxTextSize by
// when sizing the UTF-16 buffer (UTF-16 uses up to 4 bytes per symbol).
utf16MaxCharSize = 4
)
var (
// Utf8CString is the C string "UTF-8", created once with C.CString at package
// initialisation. ConvertToUtf8 passes it to convertFromUtf16 as the
// destination encoding name.
Utf8CString = C.CString("UTF-8")
)
// CharsetConverter provides ICU charset conversion functionality.
//
// It owns two scratch buffers that are reused by every conversion; access to
// them is serialised with cMutex.
type CharsetConverter struct {
// utf16Buffer receives the intermediate UTF-16 text produced by convertToUtf16.
utf16Buffer []byte
// utf8Buffer receives the final UTF-8 text produced by convertFromUtf16.
utf8Buffer []byte
// NOTE(review): presumably the maxTextSize given to NewCharsetConverter —
// confirm where it is set and read.
maxTextSize int
cMutex sync.Mutex // Mutex used to guarantee thread safety for ICU calls
}
// NewCharsetConverter creates a new charset converter. It doesn't need to be closed as
// it doesn't allocate any native resources.
//
// For better performance, conversion buffers are not allocated on each operation. Instead they
// are created in memory once and then reused. 'maxTextSize' sets the size of these buffers and
// is recorded on the converter. ICU returns an error if a processed text does not fit into them.
//
// NOTE:
//
// UTF8 uses 1 to 4 bytes for each symbol.
// UTF16 uses 2 bytes to 4 bytes for each symbol.
//
// So, to guarantee successful conversion of text with size = 'maxTextSize' we need:
// maxTextSize * 8 bytes (utf8 buffer + utf16 buffer).
func NewCharsetConverter(maxTextSize int) *CharsetConverter {
	conv := new(CharsetConverter)
	// Remember the limit the buffers were sized for; the field was previously
	// left at its zero value.
	conv.maxTextSize = maxTextSize
	conv.utf16Buffer = make([]byte, utf16MaxCharSize*maxTextSize)
	conv.utf8Buffer = make([]byte, utf8MaxCharSize*maxTextSize)
	return conv
}
// ConvertToUtf8 converts input bytes encoded with srcEncoding to UTF-8.
//
// The text is first converted to UTF-16 in the converter's internal buffer and
// then from UTF-16 to UTF-8. The returned slice is a fresh copy, so it stays
// valid after later calls reuse the internal buffers.
//
// An error is returned when input is empty, when the converter was created
// with no buffer space, or when ICU reports a status other than U_ZERO_ERROR
// (for example when the text does not fit into the buffers).
func (conv *CharsetConverter) ConvertToUtf8(input []byte, srcEncoding string) ([]byte, error) {
	// As described in c_bridge.h, conversion operations are not thread safe and
	// must be called sequentially, so a mutex is used here.
	conv.cMutex.Lock()
	defer conv.cMutex.Unlock()

	if len(input) == 0 {
		return nil, fmt.Errorf("Nil length of input")
	}
	// Taking &buffer[0] below would panic on an empty buffer.
	if len(conv.utf16Buffer) == 0 || len(conv.utf8Buffer) == 0 {
		return nil, fmt.Errorf("conversion buffers are empty")
	}

	// ICU writes a C int through the status pointer, so the variable must be
	// exactly that wide (a Go int is 8 bytes on 64-bit platforms).
	var status C.int

	encCString := C.CString(srcEncoding)
	defer C.free(unsafe.Pointer(encCString))
	inputCString := C.CString(string(input))
	defer C.free(unsafe.Pointer(inputCString))

	// The destination capacity of convertToUtf16 is counted in UChars, not in
	// bytes; passing the byte length would let ICU write past the buffer.
	utf16Capacity := len(conv.utf16Buffer) / int(unsafe.Sizeof(C.UChar(0)))

	utf16Len := C.convertToUtf16(
		encCString,
		(*C.UChar)(unsafe.Pointer(&conv.utf16Buffer[0])),
		C.int32_t(utf16Capacity),
		inputCString,
		C.int32_t(len(input)),
		&status)
	if status != U_ZERO_ERROR {
		return nil, fmt.Errorf("ICU Error code returned: %d", status)
	}

	utf8Len := C.convertFromUtf16(
		Utf8CString,
		(*C.char)(unsafe.Pointer(&conv.utf8Buffer[0])),
		C.int32_t(len(conv.utf8Buffer)),
		(*C.UChar)(unsafe.Pointer(&conv.utf16Buffer[0])),
		C.int32_t(utf16Len),
		&status)
	if status != U_ZERO_ERROR {
		return nil, fmt.Errorf("ICU Error code returned: %d", status)
	}

	// Copy the result out of the shared buffer: the next conversion overwrites
	// it as soon as the mutex is released.
	out := make([]byte, int(utf8Len))
	copy(out, conv.utf8Buffer[:int(utf8Len)])
	return out, nil
}
package icu
// #cgo pkg-config: icu-i18n
// #include "c_bridge.h"
// #include "stdlib.h"
import "C"
import (
"fmt"
"sync"
"unsafe"
)
const (
U_ZERO_ERROR = 0 // ICU common status code which means that no error occurred
MatchDataBufferSize = 25 // Size of the buffer for detection results (max count of guesses returned per detect call)
)
// CharsetDetector provides ICU charset detection functionality.
// It holds the native ICU detector handle together with a fixed-size result
// buffer that GuessCharset hands to the C bridge on every call.
type CharsetDetector struct {
ptr *C.UCharsetDetector // ICU struct needed for detection
resBuffer [MatchDataBufferSize]C.MatchData // Receives up to MatchDataBufferSize guesses per detection call
gMutex sync.Mutex // Mutex used to guarantee thread safety for ICU calls
}
// Match is the Go equivalent of the MatchData C structure (see c_bridge.h).
// GuessCharset returns one Match per guess reported by the detector.
type Match struct {
Charset string // Charset name copied from the C result
Language string // Language copied from the C result
Confidence int // Confidence value copied from the C result
}
// NewCharsetDetector creates a new charset detector. If it is successfully
// created, it must be closed with Close, as it holds native ICU resources.
//
// An error carrying the ICU status code is returned when ucsdet_open reports
// a non-zero status.
func NewCharsetDetector() (*CharsetDetector, error) {
	// The status variable has the exact C type ucsdet_open writes through,
	// rather than a wider Go int reinterpreted through unsafe.Pointer.
	var status C.UErrorCode

	handle := C.ucsdet_open(&status)
	if status != U_ZERO_ERROR {
		return nil, fmt.Errorf("ICU Error code returned: %d", status)
	}

	return &CharsetDetector{ptr: handle}, nil
}
// GuessCharset runs ICU charset detection over input and returns the guesses
// reported by the C bridge, in the order the bridge stored them.
//
// An error is returned when input is empty, when the detector has no native
// handle, or when ICU reports a non-zero status code. At most
// MatchDataBufferSize matches are returned.
func (det *CharsetDetector) GuessCharset(input []byte) (matches []Match, err error) {
	// As described in c_bridge.h, detection operations are not thread safe and
	// must be called sequentially. So a mutex is used here.
	det.gMutex.Lock()
	defer det.gMutex.Unlock()

	if len(input) == 0 {
		return nil, fmt.Errorf("Input data len is 0")
	}
	if det.ptr == nil {
		return nil, fmt.Errorf("Charset detector is not initialized")
	}

	// status has the exact C type the bridge writes through.
	var status C.int

	// Perform detection. The return value is the number of matches; the
	// matches themselves are written into the result buffer.
	guessCount := C.detectCharset(
		unsafe.Pointer(det.ptr),
		unsafe.Pointer(&input[0]),
		C.int(len(input)),
		&status,
		(*C.MatchData)(unsafe.Pointer(&det.resBuffer[0])),
		C.int(MatchDataBufferSize))
	if status != U_ZERO_ERROR {
		return nil, fmt.Errorf("ICU Error code returned: %d", status)
	}

	// Never trust the reported count beyond what the buffer can hold.
	count := int(guessCount)
	if count < 0 {
		count = 0
	}
	if count > MatchDataBufferSize {
		count = MatchDataBufferSize
	}

	// Copy the C strings into Go memory while the mutex is still held, since
	// the result buffer is overwritten by the next detection.
	matches = make([]Match, count)
	for i := range matches {
		entry := &det.resBuffer[i]
		matches[i] = Match{
			Charset:    C.GoString(entry.charset),
			Language:   C.GoString(entry.language),
			Confidence: int(entry.confidence),
		}
	}
	return matches, nil
}
// Close frees the native ICU detector. It is safe to call more than once:
// the handle is cleared after the first call so it is never released twice.
func (det *CharsetDetector) Close() {
	det.gMutex.Lock()
	defer det.gMutex.Unlock()

	if det.ptr == nil {
		return
	}
	C.ucsdet_close(det.ptr)
	det.ptr = nil
}
package chardet
import (
"bytes"
)
// recognizer2022 scores input for one escape-sequence based charset by
// counting which of its known escape sequences follow ESC (0x1B) bytes.
type recognizer2022 struct {
charset string // Charset name reported in the match output
escapes [][]byte // Byte sequences that count as a hit when they directly follow an ESC byte
}
// Match reports the recognizer's charset together with the confidence
// computed over the (possibly tag-stripped) input bytes.
func (r *recognizer2022) Match(input *recognizerInput) (output recognizerOutput) {
	output.Charset = r.charset
	output.Confidence = r.matchConfidence(input.input)
	return output
}
// matchConfidence returns a 0..100 score for input.
//
// Every ESC (0x1B) byte followed by one of the recognizer's escape sequences
// is a hit (and the sequence is skipped); any other ESC byte is a miss.
// Shift-out/shift-in bytes (0x0E, 0x0F) are counted separately and only
// soften the penalty applied when little evidence was seen.
func (r *recognizer2022) matchConfidence(input []byte) int {
	hits, misses, shifts := 0, 0, 0

	for pos := 0; pos < len(input); pos++ {
		switch input[pos] {
		case 0x1B:
			known := false
			for _, seq := range r.escapes {
				if bytes.HasPrefix(input[pos+1:], seq) {
					// Skip the sequence; the loop increment steps past its last byte.
					pos += len(seq)
					known = true
					break
				}
			}
			if known {
				hits++
			} else {
				misses++
			}
		case 0x0E, 0x0F:
			shifts++
		}
	}

	if hits == 0 {
		return 0
	}

	quality := 100 * (hits - misses) / (hits + misses)
	// Fewer than five hits and shifts in total: subtract 10 per missing one.
	if evidence := hits + shifts; evidence < 5 {
		quality -= (5 - evidence) * 10
	}
	if quality < 0 {
		return 0
	}
	return quality
}
// escapeSequences_2022JP lists the byte sequences that count as hits for the
// ISO-2022-JP recognizer when they directly follow an ESC (0x1B) byte. The
// trailing comments name the character set associated with each sequence.
var escapeSequences_2022JP = [][]byte{
{0x24, 0x28, 0x43}, // KS X 1001:1992
{0x24, 0x28, 0x44}, // JIS X 212-1990
{0x24, 0x40}, // JIS C 6226-1978
{0x24, 0x41}, // GB 2312-80
{0x24, 0x42}, // JIS X 208-1983
{0x26, 0x40}, // JIS X 208 1990, 1997
{0x28, 0x42}, // ASCII
{0x28, 0x48}, // JIS-Roman
{0x28, 0x49}, // Half-width katakana
{0x28, 0x4a}, // JIS-Roman
{0x2e, 0x41}, // ISO 8859-1
{0x2e, 0x46}, // ISO 8859-7
}
// escapeSequences_2022KR lists the single byte sequence that counts as a hit
// for the ISO-2022-KR recognizer when it directly follows an ESC (0x1B) byte.
var escapeSequences_2022KR = [][]byte{
{0x24, 0x29, 0x43},
}
// escapeSequences_2022CN lists the byte sequences that count as hits for the
// ISO-2022-CN recognizer when they directly follow an ESC (0x1B) byte. The
// trailing comments name what each sequence selects.
var escapeSequences_2022CN = [][]byte{
{0x24, 0x29, 0x41}, // GB 2312-80
{0x24, 0x29, 0x47}, // CNS 11643-1992 Plane 1
{0x24, 0x2A, 0x48}, // CNS 11643-1992 Plane 2
{0x24, 0x29, 0x45}, // ISO-IR-165
{0x24, 0x2B, 0x49}, // CNS 11643-1992 Plane 3
{0x24, 0x2B, 0x4A}, // CNS 11643-1992 Plane 4
{0x24, 0x2B, 0x4B}, // CNS 11643-1992 Plane 5
{0x24, 0x2B, 0x4C}, // CNS 11643-1992 Plane 6
{0x24, 0x2B, 0x4D}, // CNS 11643-1992 Plane 7
{0x4e}, // SS2
{0x4f}, // SS3
}
// newRecognizer_2022JP builds the ISO-2022-JP escape-sequence recognizer.
func newRecognizer_2022JP() *recognizer2022 {
	rec := new(recognizer2022)
	rec.charset = "ISO-2022-JP"
	rec.escapes = escapeSequences_2022JP
	return rec
}
// newRecognizer_2022KR builds the ISO-2022-KR escape-sequence recognizer.
func newRecognizer_2022KR() *recognizer2022 {
	rec := new(recognizer2022)
	rec.charset = "ISO-2022-KR"
	rec.escapes = escapeSequences_2022KR
	return rec
}
// newRecognizer_2022CN builds the ISO-2022-CN escape-sequence recognizer.
func newRecognizer_2022CN() *recognizer2022 {
	rec := new(recognizer2022)
	rec.charset = "ISO-2022-CN"
	rec.escapes = escapeSequences_2022CN
	return rec
}
Sheng Yu (yusheng dot sjtu at gmail dot com)
Copyright (c) 2012 chardet Authors
Permission is hereby granted, free of charge, to any person obtaining a copy of
this software and associated documentation files (the "Software"), to deal in
the Software without restriction, including without limitation the rights to
use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
of the Software, and to permit persons to whom the Software is furnished to do
so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
Partial of the Software is derived from ICU project. See icu-license.html for
license of the derivative portions.
# chardet
chardet is a library that automatically detects the
[charset](http://en.wikipedia.org/wiki/Character_encoding) of texts for the [Go
programming language](http://golang.org/). It is based on the algorithm and data
in [ICU](http://icu-project.org/)'s implementation.
## Documentation and Usage
See [pkgdoc](http://go.pkgdoc.org/github.com/saintfish/chardet)
// Package chardet ports character set detection from ICU.
package chardet
import (
"errors"
"sort"
)
// Result contains all the information that the charset detector gives for
// one candidate charset.
type Result struct {
// IANA name of the detected charset.
Charset string
// IANA name of the detected language. It may be empty for some charsets.
Language string
// Confidence of the Result, on a scale from 1 to 100. The bigger, the more confident.
Confidence int
}
// Detector implements charset detection by running a set of recognizers over
// the input and ranking their outputs.
type Detector struct {
recognizers []recognizer // Recognizers run by DetectAll
stripTag bool // Passed to newRecognizerInput to request markup stripping before matching
}
// recognizers is the list of charset recognizers shared by the detectors
// created with NewTextDetector and NewHtmlDetector.
var recognizers = []recognizer{
newRecognizer_utf8(),
newRecognizer_utf16be(),
newRecognizer_utf16le(),
newRecognizer_utf32be(),
newRecognizer_utf32le(),
newRecognizer_8859_1_en(),
newRecognizer_8859_1_da(),
newRecognizer_8859_1_de(),
newRecognizer_8859_1_es(),
newRecognizer_8859_1_fr(),
newRecognizer_8859_1_it(),
newRecognizer_8859_1_nl(),
newRecognizer_8859_1_no(),
newRecognizer_8859_1_pt(),
newRecognizer_8859_1_sv(),
newRecognizer_8859_2_cs(),
newRecognizer_8859_2_hu(),
newRecognizer_8859_2_pl(),
newRecognizer_8859_2_ro(),
newRecognizer_8859_5_ru(),
newRecognizer_8859_6_ar(),
newRecognizer_8859_7_el(),
newRecognizer_8859_8_I_he(),
newRecognizer_8859_8_he(),
newRecognizer_windows_1251(),
newRecognizer_windows_1256(),
newRecognizer_KOI8_R(),
newRecognizer_8859_9_tr(),
newRecognizer_sjis(),
newRecognizer_gb_18030(),
newRecognizer_euc_jp(),
newRecognizer_euc_kr(),
newRecognizer_big5(),
newRecognizer_2022JP(),
newRecognizer_2022KR(),
newRecognizer_2022CN(),
newRecognizer_IBM424_he_rtl(),
newRecognizer_IBM424_he_ltr(),
newRecognizer_IBM420_ar_rtl(),
newRecognizer_IBM420_ar_ltr(),
}
// NewTextDetector creates a Detector for plain text: it uses the shared
// recognizer list and does not strip markup from the input.
func NewTextDetector() *Detector {
	return &Detector{
		recognizers: recognizers,
		stripTag:    false,
	}
}
// NewHtmlDetector creates a Detector for HTML: it uses the shared recognizer
// list and asks for markup to be stripped from the input before matching.
func NewHtmlDetector() *Detector {
	return &Detector{
		recognizers: recognizers,
		stripTag:    true,
	}
}
var (
// NotDetectedError is returned by DetectAll (and therefore DetectBest) when
// no recognizer reports a non-zero confidence for the input.
NotDetectedError = errors.New("Charset not detected.")
)
// DetectBest returns the Result with the highest Confidence, i.e. the first
// entry produced by DetectAll. Any error from DetectAll is passed through
// with a nil Result.
func (d *Detector) DetectBest(b []byte) (r *Result, err error) {
	ranked, err := d.DetectAll(b)
	if err != nil {
		return nil, err
	}
	return &ranked[0], nil
}
// DetectAll returns all Results which have non-zero Confidence. The Results
// are sorted by Confidence in descending order, and only the highest-ranked
// Result is kept for each charset. NotDetectedError is returned when no
// recognizer reports a non-zero confidence.
//
// Recognizers run concurrently, but their outputs are collected in the order
// the recognizers are registered and sorted with a stable sort, so results
// with equal Confidence always come back in the same order for a given input.
func (d *Detector) DetectAll(b []byte) ([]Result, error) {
	input := newRecognizerInput(b, d.stripTag)

	// One single-slot channel per recognizer: every goroutine can deliver its
	// result without blocking, and reading the channels in index order removes
	// any dependence on goroutine scheduling.
	pending := make([]chan recognizerOutput, len(d.recognizers))
	for i, r := range d.recognizers {
		pending[i] = make(chan recognizerOutput, 1)
		go matchHelper(r, input, pending[i])
	}

	outputs := make([]recognizerOutput, 0, len(d.recognizers))
	for _, ch := range pending {
		if o := <-ch; o.Confidence > 0 {
			outputs = append(outputs, o)
		}
	}
	if len(outputs) == 0 {
		return nil, NotDetectedError
	}

	sort.Stable(recognizerOutputs(outputs))

	// Keep only the first (highest-confidence) entry per charset. outputs is
	// non-empty here, so the deduplicated list is non-empty as well.
	dedupOutputs := make([]Result, 0, len(outputs))
	foundCharsets := make(map[string]struct{}, len(outputs))
	for _, o := range outputs {
		if _, found := foundCharsets[o.Charset]; found {
			continue
		}
		foundCharsets[o.Charset] = struct{}{}
		dedupOutputs = append(dedupOutputs, Result(o))
	}
	return dedupOutputs, nil
}
// matchHelper runs a single recognizer over input and sends its output on
// outputChan; it is meant to be started as a goroutine.
func matchHelper(r recognizer, input *recognizerInput, outputChan chan<- recognizerOutput) {
	result := r.Match(input)
	outputChan <- result
}
// recognizerOutputs implements sort.Interface, ordering outputs by
// descending Confidence.
type recognizerOutputs []recognizerOutput

// Len returns the number of outputs.
func (r recognizerOutputs) Len() int {
	return len(r)
}

// Less reports whether output i is more confident than output j.
func (r recognizerOutputs) Less(i, j int) bool {
	return r[j].Confidence < r[i].Confidence
}

// Swap exchanges outputs i and j.
func (r recognizerOutputs) Swap(i, j int) {
	tmp := r[i]
	r[i] = r[j]
	r[j] = tmp
}
<html>
<head>
<meta http-equiv="Content-Type" content="text/html; charset=us-ascii"></meta>
<title>ICU License - ICU 1.8.1 and later</title>
</head>
<body BGCOLOR="#ffffff">
<h2>ICU License - ICU 1.8.1 and later</h2>
<p>COPYRIGHT AND PERMISSION NOTICE</p>
<p>
Copyright (c) 1995-2012 International Business Machines Corporation and others
</p>
<p>
All rights reserved.
</p>
<p>
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"),
to deal in the Software without restriction, including without limitation
the rights to use, copy, modify, merge, publish, distribute, and/or sell
copies of the Software, and to permit persons
to whom the Software is furnished to do so, provided that the above
copyright notice(s) and this permission notice appear in all copies
of the Software and that both the above copyright notice(s) and this
permission notice appear in supporting documentation.
</p>
<p>
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED,
INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A
PARTICULAR PURPOSE AND NONINFRINGEMENT OF THIRD PARTY RIGHTS. IN NO EVENT SHALL
THE COPYRIGHT HOLDER OR HOLDERS INCLUDED IN THIS NOTICE BE LIABLE FOR ANY CLAIM,
OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER
RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT,
NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE
USE OR PERFORMANCE OF THIS SOFTWARE.
</p>
<p>
Except as contained in this notice, the name of a copyright holder shall not be
used in advertising or otherwise to promote the sale, use or other dealings in
this Software without prior written authorization of the copyright holder.
</p>
<hr>
<p><small>
All trademarks and registered trademarks mentioned herein are the property of their respective owners.
</small></p>
</body>
</html>
package chardet
import (
"errors"
"math"
)
// recognizerMultiByte scores input for one multi-byte charset by decoding it
// character by character with decoder and counting valid, bad and common
// characters.
type recognizerMultiByte struct {
charset string // Charset name reported in the match output
language string // Language reported in the match output
decoder charDecoder // Decodes one character at a time from the raw input
commonChars []uint16 // Character codes counted as common; searched with binarySearch, may be nil
}
// charDecoder decodes a single character from the front of a byte slice.
type charDecoder interface {
// DecodeOneChar returns the decoded character code, the bytes remaining
// after it, and a non-nil error when the input is exhausted or the
// character is malformed.
DecodeOneChar([]byte) (c uint16, remain []byte, err error)
}
// Match reports the recognizer's charset and language together with the
// confidence computed for input.
func (r *recognizerMultiByte) Match(input *recognizerInput) (output recognizerOutput) {
	output.Charset = r.charset
	output.Language = r.language
	output.Confidence = r.matchConfidence(input)
	return output
}
// matchConfidence decodes the raw input one character at a time and derives
// a 0..100 confidence from the counts of double-byte, malformed and common
// characters.
//
// The scan stops as soon as a decode leaves no remaining bytes, so the
// character produced by that final decode is not tallied.
func (r *recognizerMultiByte) matchConfidence(input *recognizerInput) int {
	var total, bad, single, double, common int

	c, rest, err := r.decoder.DecodeOneChar(input.raw)
	for len(rest) > 0 {
		total++
		switch {
		case err != nil:
			bad++
		case c <= 0xFF:
			single++
		default:
			double++
			if r.commonChars != nil && binarySearch(r.commonChars, c) {
				common++
			}
		}
		// Bail out early once malformed characters dominate.
		if bad >= 2 && bad*5 >= double {
			return 0
		}
		c, rest, err = r.decoder.DecodeOneChar(rest)
	}

	// Too little multi-byte data to judge: either nothing or a weak guess.
	if double <= 10 && bad == 0 {
		if double == 0 && total < 10 {
			return 0
		}
		return 10
	}

	if double < 20*bad {
		return 0
	}

	var confidence int
	if r.commonChars == nil {
		// No frequency table: score on the raw double-byte count alone.
		confidence = 30 + double - 20*bad
		if confidence > 100 {
			confidence = 100
		}
		return confidence
	}

	// Scale the number of common characters logarithmically into 10..100.
	maxVal := math.Log(float64(double) / 4)
	scaleFactor := 90 / maxVal
	confidence = int(math.Log(float64(common)+1)*scaleFactor + 10)
	switch {
	case confidence > 100:
		return 100
	case confidence < 0:
		return 0
	}
	return confidence
}
// binarySearch reports whether c occurs in l, which must be sorted in
// ascending order.
func binarySearch(l []uint16, c uint16) bool {
	for lo, hi := 0, len(l)-1; lo <= hi; {
		mid := (lo + hi) / 2
		switch probe := l[mid]; {
		case probe == c:
			return true
		case c < probe:
			hi = mid - 1
		default:
			lo = mid + 1
		}
	}
	return false
}
var (
	// eobError signals that the input ended before a character could be read.
	eobError = errors.New("End of input buffer")
	// badCharError signals a malformed multi-byte character.
	badCharError = errors.New("Decode a bad char")
)

// charDecoder_sjis decodes Shift_JIS characters.
type charDecoder_sjis struct{}

// DecodeOneChar reads one Shift_JIS character from the front of input.
//
// Bytes up to 0x7F and bytes in 0xA1..0xDF are single-byte characters. Any
// other lead byte needs a trail byte; the two are combined into a 16-bit
// code. A missing trail byte, or one outside 0x40..0xFE, yields badCharError.
func (charDecoder_sjis) DecodeOneChar(input []byte) (c uint16, remain []byte, err error) {
	if len(input) == 0 {
		return 0, nil, eobError
	}

	lead := input[0]
	c, remain = uint16(lead), input[1:]

	singleByte := lead <= 0x7F || (lead > 0xA0 && lead <= 0xDF)
	if singleByte {
		return c, remain, nil
	}
	if len(remain) == 0 {
		return c, remain, badCharError
	}

	trail := remain[0]
	c = c<<8 | uint16(trail)
	remain = remain[1:]
	// Valid trail bytes are 0x40..0xFE.
	if trail < 0x40 || trail > 0xFE {
		err = badCharError
	}
	return c, remain, err
}
// commonChars_sjis holds the double-byte codes that the Shift_JIS recognizer
// counts as common characters. It is searched with binarySearch, so it must
// stay sorted in ascending order.
var commonChars_sjis = []uint16{
0x8140, 0x8141, 0x8142, 0x8145, 0x815b, 0x8169, 0x816a, 0x8175, 0x8176, 0x82a0,
0x82a2, 0x82a4, 0x82a9, 0x82aa, 0x82ab, 0x82ad, 0x82af, 0x82b1, 0x82b3, 0x82b5,
0x82b7, 0x82bd, 0x82be, 0x82c1, 0x82c4, 0x82c5, 0x82c6, 0x82c8, 0x82c9, 0x82cc,
0x82cd, 0x82dc, 0x82e0, 0x82e7, 0x82e8, 0x82e9, 0x82ea, 0x82f0, 0x82f1, 0x8341,
0x8343, 0x834e, 0x834f, 0x8358, 0x835e, 0x8362, 0x8367, 0x8375, 0x8376, 0x8389,
0x838a, 0x838b, 0x838d, 0x8393, 0x8e96, 0x93fa, 0x95aa,
}
// newRecognizer_sjis builds the Shift_JIS (Japanese) recognizer.
func newRecognizer_sjis() *recognizerMultiByte {
	return &recognizerMultiByte{
		charset:     "Shift_JIS",
		language:    "ja",
		decoder:     charDecoder_sjis{},
		commonChars: commonChars_sjis,
	}
}
// charDecoder_euc decodes characters of the EUC family (used for EUC-JP and
// EUC-KR).
type charDecoder_euc struct{}

// DecodeOneChar reads one EUC character from the front of input.
//
// Lead bytes up to 0x8D are single-byte characters. Every other lead byte
// consumes a second byte, and a 0x8F lead byte consumes a third. Input that
// ends in the middle of a character yields (0, nil, eobError); a trail byte
// below 0xA1 (second byte for leads 0xA1..0xFE and 0x8E, third byte for lead
// 0x8F) yields badCharError.
func (charDecoder_euc) DecodeOneChar(input []byte) (c uint16, remain []byte, err error) {
	if len(input) == 0 {
		return 0, nil, eobError
	}

	lead := input[0]
	remain = input[1:]
	if lead <= 0x8D {
		return uint16(lead), remain, nil
	}
	if len(remain) == 0 {
		return 0, nil, eobError
	}

	second := remain[0]
	remain = remain[1:]
	c = uint16(lead)<<8 | uint16(second)

	switch {
	case lead == 0x8E, lead >= 0xA1 && lead <= 0xFE:
		if second < 0xA1 {
			err = badCharError
		}
	case lead == 0x8F:
		if len(remain) == 0 {
			return 0, nil, eobError
		}
		third := remain[0]
		remain = remain[1:]
		// The 16-bit code has no room for a third byte, so it is merged
		// into the low byte of the existing value.
		c |= uint16(third)
		if third < 0xA1 {
			err = badCharError
		}
	}
	return c, remain, err
}
// commonChars_euc_jp holds the double-byte codes that the EUC-JP recognizer
// counts as common characters. It is searched with binarySearch, so it must
// stay sorted in ascending order.
var commonChars_euc_jp = []uint16{
0xa1a1, 0xa1a2, 0xa1a3, 0xa1a6, 0xa1bc, 0xa1ca, 0xa1cb, 0xa1d6, 0xa1d7, 0xa4a2,
0xa4a4, 0xa4a6, 0xa4a8, 0xa4aa, 0xa4ab, 0xa4ac, 0xa4ad, 0xa4af, 0xa4b1, 0xa4b3,
0xa4b5, 0xa4b7, 0xa4b9, 0xa4bb, 0xa4bd, 0xa4bf, 0xa4c0, 0xa4c1, 0xa4c3, 0xa4c4,
0xa4c6, 0xa4c7, 0xa4c8, 0xa4c9, 0xa4ca, 0xa4cb, 0xa4ce, 0xa4cf, 0xa4d0, 0xa4de,
0xa4df, 0xa4e1, 0xa4e2, 0xa4e4, 0xa4e8, 0xa4e9, 0xa4ea, 0xa4eb, 0xa4ec, 0xa4ef,
0xa4f2, 0xa4f3, 0xa5a2, 0xa5a3, 0xa5a4, 0xa5a6, 0xa5a7, 0xa5aa, 0xa5ad, 0xa5af,
0xa5b0, 0xa5b3, 0xa5b5, 0xa5b7, 0xa5b8, 0xa5b9, 0xa5bf, 0xa5c3, 0xa5c6, 0xa5c7,
0xa5c8, 0xa5c9, 0xa5cb, 0xa5d0, 0xa5d5, 0xa5d6, 0xa5d7, 0xa5de, 0xa5e0, 0xa5e1,
0xa5e5, 0xa5e9, 0xa5ea, 0xa5eb, 0xa5ec, 0xa5ed, 0xa5f3, 0xb8a9, 0xb9d4, 0xbaee,
0xbbc8, 0xbef0, 0xbfb7, 0xc4ea, 0xc6fc, 0xc7bd, 0xcab8, 0xcaf3, 0xcbdc, 0xcdd1,
}
// commonChars_euc_kr holds the double-byte codes that the EUC-KR recognizer
// counts as common characters. It is searched with binarySearch, so it must
// stay sorted in ascending order.
var commonChars_euc_kr = []uint16{
0xb0a1, 0xb0b3, 0xb0c5, 0xb0cd, 0xb0d4, 0xb0e6, 0xb0ed, 0xb0f8, 0xb0fa, 0xb0fc,
0xb1b8, 0xb1b9, 0xb1c7, 0xb1d7, 0xb1e2, 0xb3aa, 0xb3bb, 0xb4c2, 0xb4cf, 0xb4d9,
0xb4eb, 0xb5a5, 0xb5b5, 0xb5bf, 0xb5c7, 0xb5e9, 0xb6f3, 0xb7af, 0xb7c2, 0xb7ce,
0xb8a6, 0xb8ae, 0xb8b6, 0xb8b8, 0xb8bb, 0xb8e9, 0xb9ab, 0xb9ae, 0xb9cc, 0xb9ce,
0xb9fd, 0xbab8, 0xbace, 0xbad0, 0xbaf1, 0xbbe7, 0xbbf3, 0xbbfd, 0xbcad, 0xbcba,
0xbcd2, 0xbcf6, 0xbdba, 0xbdc0, 0xbdc3, 0xbdc5, 0xbec6, 0xbec8, 0xbedf, 0xbeee,
0xbef8, 0xbefa, 0xbfa1, 0xbfa9, 0xbfc0, 0xbfe4, 0xbfeb, 0xbfec, 0xbff8, 0xc0a7,
0xc0af, 0xc0b8, 0xc0ba, 0xc0bb, 0xc0bd, 0xc0c7, 0xc0cc, 0xc0ce, 0xc0cf, 0xc0d6,
0xc0da, 0xc0e5, 0xc0fb, 0xc0fc, 0xc1a4, 0xc1a6, 0xc1b6, 0xc1d6, 0xc1df, 0xc1f6,
0xc1f8, 0xc4a1, 0xc5cd, 0xc6ae, 0xc7cf, 0xc7d1, 0xc7d2, 0xc7d8, 0xc7e5, 0xc8ad,
}
// newRecognizer_euc_jp builds the EUC-JP (Japanese) recognizer.
func newRecognizer_euc_jp() *recognizerMultiByte {
	return &recognizerMultiByte{
		charset:     "EUC-JP",
		language:    "ja",
		decoder:     charDecoder_euc{},
		commonChars: commonChars_euc_jp,
	}
}
// newRecognizer_euc_kr builds the EUC-KR (Korean) recognizer.
func newRecognizer_euc_kr() *recognizerMultiByte {
	return &recognizerMultiByte{
		charset:     "EUC-KR",
		language:    "ko",
		decoder:     charDecoder_euc{},
		commonChars: commonChars_euc_kr,
	}
}
// charDecoder_big5 decodes Big5 characters.
type charDecoder_big5 struct{}

// DecodeOneChar reads one Big5 character from the front of input.
//
// Bytes up to 0x7F and the byte 0xFF are single-byte characters. Any other
// lead byte consumes a trail byte; if the input ends there the lead byte is
// returned with eobError. A trail byte below 0x40, or equal to 0x7F or 0xFF,
// yields badCharError.
func (charDecoder_big5) DecodeOneChar(input []byte) (c uint16, remain []byte, err error) {
	if len(input) == 0 {
		return 0, nil, eobError
	}

	lead := input[0]
	c, remain = uint16(lead), input[1:]
	if lead <= 0x7F || lead == 0xFF {
		return c, remain, nil
	}
	if len(remain) == 0 {
		return c, nil, eobError
	}

	trail := remain[0]
	remain = remain[1:]
	c = c<<8 | uint16(trail)

	switch {
	case trail < 0x40, trail == 0x7F, trail == 0xFF:
		err = badCharError
	}
	return c, remain, err
}
// commonChars_big5 holds the double-byte codes that the Big5 recognizer
// counts as common characters. It is searched with binarySearch, so it must
// stay sorted in ascending order.
var commonChars_big5 = []uint16{
0xa140, 0xa141, 0xa142, 0xa143, 0xa147, 0xa149, 0xa175, 0xa176, 0xa440, 0xa446,
0xa447, 0xa448, 0xa451, 0xa454, 0xa457, 0xa464, 0xa46a, 0xa46c, 0xa477, 0xa4a3,
0xa4a4, 0xa4a7, 0xa4c1, 0xa4ce, 0xa4d1, 0xa4df, 0xa4e8, 0xa4fd, 0xa540, 0xa548,
0xa558, 0xa569, 0xa5cd, 0xa5e7, 0xa657, 0xa661, 0xa662, 0xa668, 0xa670, 0xa6a8,
0xa6b3, 0xa6b9, 0xa6d3, 0xa6db, 0xa6e6, 0xa6f2, 0xa740, 0xa751, 0xa759, 0xa7da,
0xa8a3, 0xa8a5, 0xa8ad, 0xa8d1, 0xa8d3, 0xa8e4, 0xa8fc, 0xa9c0, 0xa9d2, 0xa9f3,
0xaa6b, 0xaaba, 0xaabe, 0xaacc, 0xaafc, 0xac47, 0xac4f, 0xacb0, 0xacd2, 0xad59,
0xaec9, 0xafe0, 0xb0ea, 0xb16f, 0xb2b3, 0xb2c4, 0xb36f, 0xb44c, 0xb44e, 0xb54c,
0xb5a5, 0xb5bd, 0xb5d0, 0xb5d8, 0xb671, 0xb7ed, 0xb867, 0xb944, 0xbad8, 0xbb44,
0xbba1, 0xbdd1, 0xc2c4, 0xc3b9, 0xc440, 0xc45f,
}
// newRecognizer_big5 builds the Big5 (Chinese) recognizer.
func newRecognizer_big5() *recognizerMultiByte {
	return &recognizerMultiByte{
		charset:     "Big5",
		language:    "zh",
		decoder:     charDecoder_big5{},
		commonChars: commonChars_big5,
	}
}
// charDecoder_gb_18030 decodes GB 18030 characters.
type charDecoder_gb_18030 struct{}

// DecodeOneChar reads one GB 18030 character from the front of input.
//
// Lead bytes up to 0x80 are single-byte characters. Other lead bytes consume
// a second byte: 0x40..0x7E or 0x80..0xFE completes a two-byte character,
// while 0x30..0x39 starts a four-byte character whose third byte must be in
// 0x81..0xFE and fourth byte in 0x30..0x39. Any other combination yields
// badCharError with the bytes examined so far consumed; input that ends in
// the middle of a character yields (0, nil, eobError). A 0xFF lead byte is
// returned together with its following byte and no error.
func (charDecoder_gb_18030) DecodeOneChar(input []byte) (c uint16, remain []byte, err error) {
	if len(input) == 0 {
		return 0, nil, eobError
	}

	lead := input[0]
	c, remain = uint16(lead), input[1:]
	if lead <= 0x80 {
		return c, remain, nil
	}
	if len(remain) == 0 {
		return 0, nil, eobError
	}

	second := remain[0]
	remain = remain[1:]
	c = c<<8 | uint16(second)

	// Only 0xFF is left outside 0x81..0xFE at this point; it is not validated.
	if lead == 0xFF {
		return c, remain, nil
	}

	// Two-byte form.
	if (second >= 0x40 && second <= 0x7E) || (second >= 0x80 && second <= 0xFE) {
		return c, remain, nil
	}
	if second < 0x30 || second > 0x39 {
		return c, remain, badCharError
	}

	// Four-byte form.
	if len(remain) == 0 {
		return 0, nil, eobError
	}
	third := remain[0]
	remain = remain[1:]
	if third < 0x81 || third > 0xFE {
		return c, remain, badCharError
	}

	if len(remain) == 0 {
		return 0, nil, eobError
	}
	fourth := remain[0]
	remain = remain[1:]
	if fourth < 0x30 || fourth > 0x39 {
		return c, remain, badCharError
	}

	// c is only 16 bits wide, so the shift discards the first two bytes and
	// the result is built from the third and fourth bytes.
	c = c<<16 | uint16(third)<<8 | uint16(fourth)
	return c, remain, nil
}
// commonChars_gb_18030 holds the double-byte codes that the GB-18030
// recognizer counts as common characters. It is searched with binarySearch,
// so it must stay sorted in ascending order.
var commonChars_gb_18030 = []uint16{
0xa1a1, 0xa1a2, 0xa1a3, 0xa1a4, 0xa1b0, 0xa1b1, 0xa1f1, 0xa1f3, 0xa3a1, 0xa3ac,
0xa3ba, 0xb1a8, 0xb1b8, 0xb1be, 0xb2bb, 0xb3c9, 0xb3f6, 0xb4f3, 0xb5bd, 0xb5c4,
0xb5e3, 0xb6af, 0xb6d4, 0xb6e0, 0xb7a2, 0xb7a8, 0xb7bd, 0xb7d6, 0xb7dd, 0xb8b4,
0xb8df, 0xb8f6, 0xb9ab, 0xb9c9, 0xb9d8, 0xb9fa, 0xb9fd, 0xbacd, 0xbba7, 0xbbd6,
0xbbe1, 0xbbfa, 0xbcbc, 0xbcdb, 0xbcfe, 0xbdcc, 0xbecd, 0xbedd, 0xbfb4, 0xbfc6,
0xbfc9, 0xc0b4, 0xc0ed, 0xc1cb, 0xc2db, 0xc3c7, 0xc4dc, 0xc4ea, 0xc5cc, 0xc6f7,
0xc7f8, 0xc8ab, 0xc8cb, 0xc8d5, 0xc8e7, 0xc9cf, 0xc9fa, 0xcab1, 0xcab5, 0xcac7,
0xcad0, 0xcad6, 0xcaf5, 0xcafd, 0xccec, 0xcdf8, 0xceaa, 0xcec4, 0xced2, 0xcee5,
0xcfb5, 0xcfc2, 0xcfd6, 0xd0c2, 0xd0c5, 0xd0d0, 0xd0d4, 0xd1a7, 0xd2aa, 0xd2b2,
0xd2b5, 0xd2bb, 0xd2d4, 0xd3c3, 0xd3d0, 0xd3fd, 0xd4c2, 0xd4da, 0xd5e2, 0xd6d0,
}
// newRecognizer_gb_18030 builds the GB-18030 (Chinese) recognizer.
func newRecognizer_gb_18030() *recognizerMultiByte {
	return &recognizerMultiByte{
		charset:     "GB-18030",
		language:    "zh",
		decoder:     charDecoder_gb_18030{},
		commonChars: commonChars_gb_18030,
	}
}
package chardet
// recognizer is implemented by every charset recognizer.
type recognizer interface {
// Match scores the prepared input and returns the recognizer's output.
Match(*recognizerInput) recognizerOutput
}
// recognizerOutput is what a recognizer returns; it has the same fields as
// Result and is converted to it by the detector.
type recognizerOutput Result
// recognizerInput bundles the data shared by all recognizers for one
// detection; it is built by newRecognizerInput.
type recognizerInput struct {
raw []byte // Bytes exactly as passed to the detector
input []byte // Output of mayStripInput: at most 8192 bytes, with markup removed when stripping succeeded
tagStripped bool // Whether input has had markup stripped
byteStats []int // 256-entry histogram of the byte values in input
hasC1Bytes bool // Whether input contains any byte in the range 0x80..0x9F
}
// newRecognizerInput prepares raw for the recognizers: it optionally strips
// markup, then derives the byte histogram and the C1-byte flag from the
// resulting input.
func newRecognizerInput(raw []byte, stripTag bool) *recognizerInput {
	prepared := &recognizerInput{raw: raw}
	prepared.input, prepared.tagStripped = mayStripInput(raw, stripTag)
	prepared.byteStats = computeByteStats(prepared.input)
	prepared.hasC1Bytes = computeHasC1Bytes(prepared.byteStats)
	return prepared
}
// mayStripInput returns the bytes the recognizers should examine, capped at
// 8192 bytes, and whether markup was stripped from them.
//
// When stripTag is set, everything between '<' and '>' is dropped. The
// stripped text is discarded in favour of a plain copy of the first 8192
// bytes of raw when fewer than 5 tags were opened, when too many tags were
// opened inside another tag, or when stripping left under 100 bytes from an
// input longer than 600 bytes.
func mayStripInput(raw []byte, stripTag bool) (out []byte, stripped bool) {
	const inputBufferSize = 8192

	out = make([]byte, 0, inputBufferSize)
	var badTags, openTags int32

	if stripTag {
		stripped = true
		inMarkup := false
	scan:
		for i := 0; i < len(raw); i++ {
			c := raw[i]
			switch {
			case c == '<':
				// A '<' while already inside a tag marks the markup as malformed.
				if inMarkup {
					badTags++
				}
				inMarkup = true
				openTags++
			case !inMarkup:
				out = append(out, c)
				if len(out) >= inputBufferSize {
					break scan
				}
			}
			if c == '>' {
				inMarkup = false
			}
		}
	}

	tooFewTags := openTags < 5
	tooManyBadTags := openTags/5 < badTags
	tooLittleText := len(out) < 100 && len(raw) > 600
	if tooFewTags || tooManyBadTags || tooLittleText {
		limit := len(raw)
		if limit > inputBufferSize {
			limit = inputBufferSize
		}
		out = make([]byte, limit)
		copy(out, raw[:limit])
		stripped = false
	}
	return out, stripped
}
// computeByteStats returns a 256-entry histogram: entry b holds the number
// of times byte value b occurs in input.
func computeByteStats(input []byte) []int {
	histogram := make([]int, 256)
	for i := 0; i < len(input); i++ {
		histogram[input[i]]++
	}
	return histogram
}
// computeHasC1Bytes reports whether the byte histogram records at least one
// byte in the range 0x80..0x9F.
func computeHasC1Bytes(byteStats []int) bool {
	c1 := byteStats[0x80 : 0x9F+1]
	for i := range c1 {
		if c1[i] > 0 {
			return true
		}
	}
	return false
}
This diff is collapsed.
package chardet
import (
"bytes"
)
// Byte order marks checked at the start of the raw input by the UTF-16 and
// UTF-32 recognizers.
var (
utf16beBom = []byte{0xFE, 0xFF}
utf16leBom = []byte{0xFF, 0xFE}
utf32beBom = []byte{0x00, 0x00, 0xFE, 0xFF}
utf32leBom = []byte{0xFF, 0xFE, 0x00, 0x00} // Starts with the UTF-16LE mark, so UTF-16LE must rule it out
)
// recognizerUtf16be recognizes UTF-16BE purely by its byte order mark.
type recognizerUtf16be struct{}

// newRecognizer_utf16be builds the UTF-16BE recognizer.
func newRecognizer_utf16be() *recognizerUtf16be {
	return new(recognizerUtf16be)
}

// Match reports confidence 100 when the raw input starts with the UTF-16BE
// byte order mark, and 0 otherwise.
func (*recognizerUtf16be) Match(input *recognizerInput) (output recognizerOutput) {
	output.Charset = "UTF-16BE"
	if bytes.HasPrefix(input.raw, utf16beBom) {
		output.Confidence = 100
	}
	return output
}
// recognizerUtf16le recognizes UTF-16LE purely by its byte order mark.
type recognizerUtf16le struct{}

// newRecognizer_utf16le builds the UTF-16LE recognizer.
func newRecognizer_utf16le() *recognizerUtf16le {
	return new(recognizerUtf16le)
}

// Match reports confidence 100 when the raw input starts with the UTF-16LE
// byte order mark but not with the longer UTF-32LE mark, and 0 otherwise.
func (*recognizerUtf16le) Match(input *recognizerInput) (output recognizerOutput) {
	output.Charset = "UTF-16LE"
	raw := input.raw
	if !bytes.HasPrefix(raw, utf16leBom) {
		return output
	}
	if bytes.HasPrefix(raw, utf32leBom) {
		return output
	}
	output.Confidence = 100
	return output
}
// recognizerUtf32 recognizes one byte order of UTF-32 by its byte order mark
// and by the validity of the decoded 32-bit units.
type recognizerUtf32 struct {
name string // Charset name reported in the match output
bom []byte // Byte order mark expected at the start of the raw input
decodeChar func(input []byte) uint32 // Decodes the first four bytes of input into one 32-bit value
}
// decodeUtf32be combines the first four bytes of input into a 32-bit value,
// most significant byte first. It panics if input has fewer than four bytes.
func decodeUtf32be(input []byte) uint32 {
	high := uint32(input[0])<<8 | uint32(input[1])
	low := uint32(input[2])<<8 | uint32(input[3])
	return high<<16 | low
}
// decodeUtf32le combines the first four bytes of input into a 32-bit value,
// least significant byte first. It panics if input has fewer than four bytes.
func decodeUtf32le(input []byte) uint32 {
	high := uint32(input[3])<<8 | uint32(input[2])
	low := uint32(input[1])<<8 | uint32(input[0])
	return high<<16 | low
}
// newRecognizer_utf32be builds the big-endian UTF-32 recognizer.
func newRecognizer_utf32be() *recognizerUtf32 {
	return &recognizerUtf32{
		name:       "UTF-32BE",
		bom:        utf32beBom,
		decodeChar: decodeUtf32be,
	}
}
// newRecognizer_utf32le builds the little-endian UTF-32 recognizer.
func newRecognizer_utf32le() *recognizerUtf32 {
	return &recognizerUtf32{
		name:       "UTF-32LE",
		bom:        utf32leBom,
		decodeChar: decodeUtf32le,
	}
}
// Match decodes the raw input in four-byte units and scores it from the
// presence of the byte order mark and the ratio of valid to invalid units.
// A unit is invalid when it is 0x10FFFF or above, or lies in the surrogate
// range 0xD800..0xDFFF. Trailing bytes that do not fill a unit are ignored.
func (r *recognizerUtf32) Match(input *recognizerInput) (output recognizerOutput) {
	output.Charset = r.name

	raw := input.raw
	hasBom := bytes.HasPrefix(raw, r.bom)

	var valid, invalid uint32
	for off := 0; off+4 <= len(raw); off += 4 {
		c := r.decodeChar(raw[off:])
		surrogate := c >= 0xD800 && c <= 0xDFFF
		if c >= 0x10FFFF || surrogate {
			invalid++
			continue
		}
		valid++
	}

	switch {
	case hasBom && invalid == 0:
		output.Confidence = 100
	case hasBom && valid > invalid*10:
		output.Confidence = 80
	case valid > 3 && invalid == 0:
		output.Confidence = 100
	case valid > 0 && invalid == 0:
		output.Confidence = 80
	case valid > invalid*10:
		output.Confidence = 25
	}
	return output
}
package chardet
import (
"bytes"
)
// utf8Bom is the UTF-8 byte order mark checked at the start of the raw input.
var utf8Bom = []byte{0xEF, 0xBB, 0xBF}
// recognizerUtf8 recognizes UTF-8 by its byte order mark and by counting
// well-formed and malformed multi-byte sequences.
type recognizerUtf8 struct{}

// newRecognizer_utf8 builds the UTF-8 recognizer.
func newRecognizer_utf8() *recognizerUtf8 {
	return new(recognizerUtf8)
}

// Match scans the raw input for multi-byte sequences. A lead byte announces
// one, two or three trail bytes; the sequence is valid once that many
// continuation bytes (10xxxxxx) have followed and invalid as soon as another
// byte appears instead. A byte that is neither ASCII nor a known lead byte is
// invalid as well, and scanning stops after more than five such bytes.
// Input without any multi-byte sequence at all (plain ASCII) scores 10.
func (*recognizerUtf8) Match(input *recognizerInput) (output recognizerOutput) {
	output.Charset = "UTF-8"

	raw := input.raw
	size := len(raw)
	hasBom := bytes.HasPrefix(raw, utf8Bom)

	var valid, invalid uint32
	var pending uint8

scan:
	for pos := 0; pos < size; pos++ {
		lead := raw[pos]
		switch {
		case lead&0x80 == 0:
			// ASCII byte: nothing to verify.
			continue scan
		case lead&0xE0 == 0xC0:
			pending = 1
		case lead&0xF0 == 0xE0:
			pending = 2
		case lead&0xF8 == 0xF0:
			pending = 3
		default:
			invalid++
			if invalid > 5 {
				break scan
			}
			pending = 0
		}

		// Consume trail bytes. The byte that ends this loop is not examined
		// again, because the outer loop steps past it.
		for pos++; pos < size; pos++ {
			if raw[pos]&0xC0 != 0x80 {
				invalid++
				break
			}
			pending--
			if pending == 0 {
				valid++
				break
			}
		}
	}

	switch {
	case hasBom && invalid == 0:
		output.Confidence = 100
	case hasBom && valid > invalid*10:
		output.Confidence = 80
	case valid > 3 && invalid == 0:
		output.Confidence = 100
	case valid > 0 && invalid == 0:
		output.Confidence = 80
	case valid == 0 && invalid == 0:
		// Plain ASCII
		output.Confidence = 10
	case valid > invalid*10:
		output.Confidence = 25
	}
	return output
}
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment