update
parent
46c68a68e8
commit
f22123d788
@ -0,0 +1,3 @@
|
|||||||
|
# This source code refers to The Go Authors for copyright purposes.
|
||||||
|
# The master list of authors is in the main Go distribution,
|
||||||
|
# visible at http://tip.golang.org/AUTHORS.
|
@ -0,0 +1,3 @@
|
|||||||
|
# This source code was written by the Go contributors.
|
||||||
|
# The master list of contributors is in the main Go distribution,
|
||||||
|
# visible at http://tip.golang.org/CONTRIBUTORS.
|
@ -0,0 +1,27 @@
|
|||||||
|
Copyright (c) 2009 The Go Authors. All rights reserved.
|
||||||
|
|
||||||
|
Redistribution and use in source and binary forms, with or without
|
||||||
|
modification, are permitted provided that the following conditions are
|
||||||
|
met:
|
||||||
|
|
||||||
|
* Redistributions of source code must retain the above copyright
|
||||||
|
notice, this list of conditions and the following disclaimer.
|
||||||
|
* Redistributions in binary form must reproduce the above
|
||||||
|
copyright notice, this list of conditions and the following disclaimer
|
||||||
|
in the documentation and/or other materials provided with the
|
||||||
|
distribution.
|
||||||
|
* Neither the name of Google Inc. nor the names of its
|
||||||
|
contributors may be used to endorse or promote products derived from
|
||||||
|
this software without specific prior written permission.
|
||||||
|
|
||||||
|
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||||
|
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||||
|
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||||
|
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||||
|
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||||
|
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||||
|
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||||
|
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||||
|
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||||
|
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||||
|
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
@ -0,0 +1,22 @@
|
|||||||
|
Additional IP Rights Grant (Patents)
|
||||||
|
|
||||||
|
"This implementation" means the copyrightable works distributed by
|
||||||
|
Google as part of the Go project.
|
||||||
|
|
||||||
|
Google hereby grants to You a perpetual, worldwide, non-exclusive,
|
||||||
|
no-charge, royalty-free, irrevocable (except as stated in this section)
|
||||||
|
patent license to make, have made, use, offer to sell, sell, import,
|
||||||
|
transfer and otherwise run, modify and propagate the contents of this
|
||||||
|
implementation of Go, where such license applies only to those patent
|
||||||
|
claims, both currently owned or controlled by Google and acquired in
|
||||||
|
the future, licensable by Google that are necessarily infringed by this
|
||||||
|
implementation of Go. This grant does not include claims that would be
|
||||||
|
infringed only as a consequence of further modification of this
|
||||||
|
implementation. If you or your agent or exclusive licensee institute or
|
||||||
|
order or agree to the institution of patent litigation against any
|
||||||
|
entity (including a cross-claim or counterclaim in a lawsuit) alleging
|
||||||
|
that this implementation of Go or any code incorporated within this
|
||||||
|
implementation of Go constitutes direct or contributory patent
|
||||||
|
infringement, or inducement of patent infringement, then any patent
|
||||||
|
rights granted to you under this License for this implementation of Go
|
||||||
|
shall terminate as of the date such litigation is filed.
|
@ -0,0 +1,335 @@
|
|||||||
|
// Copyright 2013 The Go Authors. All rights reserved.
|
||||||
|
// Use of this source code is governed by a BSD-style
|
||||||
|
// license that can be found in the LICENSE file.
|
||||||
|
|
||||||
|
// Package encoding defines an interface for character encodings, such as Shift
|
||||||
|
// JIS and Windows 1252, that can convert to and from UTF-8.
|
||||||
|
//
|
||||||
|
// Encoding implementations are provided in other packages, such as
|
||||||
|
// golang.org/x/text/encoding/charmap and
|
||||||
|
// golang.org/x/text/encoding/japanese.
|
||||||
|
package encoding // import "golang.org/x/text/encoding"
|
||||||
|
|
||||||
|
import (
|
||||||
|
"errors"
|
||||||
|
"io"
|
||||||
|
"strconv"
|
||||||
|
"unicode/utf8"
|
||||||
|
|
||||||
|
"golang.org/x/text/encoding/internal/identifier"
|
||||||
|
"golang.org/x/text/transform"
|
||||||
|
)
|
||||||
|
|
||||||
|
// TODO:
|
||||||
|
// - There seems to be some inconsistency in when decoders return errors
|
||||||
|
// and when not. Also documentation seems to suggest they shouldn't return
|
||||||
|
// errors at all (except for UTF-16).
|
||||||
|
// - Encoders seem to rely on or at least benefit from the input being in NFC
|
||||||
|
// normal form. Perhaps add an example how users could prepare their output.
|
||||||
|
|
||||||
|
// Encoding is a character set encoding that can be transformed to and from
|
||||||
|
// UTF-8.
|
||||||
|
type Encoding interface {
|
||||||
|
// NewDecoder returns a Decoder.
|
||||||
|
NewDecoder() *Decoder
|
||||||
|
|
||||||
|
// NewEncoder returns an Encoder.
|
||||||
|
NewEncoder() *Encoder
|
||||||
|
}
|
||||||
|
|
||||||
|
// A Decoder converts bytes to UTF-8. It implements transform.Transformer.
|
||||||
|
//
|
||||||
|
// Transforming source bytes that are not of that encoding will not result in an
|
||||||
|
// error per se. Each byte that cannot be transcoded will be represented in the
|
||||||
|
// output by the UTF-8 encoding of '\uFFFD', the replacement rune.
|
||||||
|
type Decoder struct {
|
||||||
|
transform.Transformer
|
||||||
|
|
||||||
|
// This forces external creators of Decoders to use names in struct
|
||||||
|
// initializers, allowing for future extendibility without having to break
|
||||||
|
// code.
|
||||||
|
_ struct{}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Bytes converts the given encoded bytes to UTF-8. It returns the converted
|
||||||
|
// bytes or nil, err if any error occurred.
|
||||||
|
func (d *Decoder) Bytes(b []byte) ([]byte, error) {
|
||||||
|
b, _, err := transform.Bytes(d, b)
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
return b, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// String converts the given encoded string to UTF-8. It returns the converted
|
||||||
|
// string or "", err if any error occurred.
|
||||||
|
func (d *Decoder) String(s string) (string, error) {
|
||||||
|
s, _, err := transform.String(d, s)
|
||||||
|
if err != nil {
|
||||||
|
return "", err
|
||||||
|
}
|
||||||
|
return s, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// Reader wraps another Reader to decode its bytes.
|
||||||
|
//
|
||||||
|
// The Decoder may not be used for any other operation as long as the returned
|
||||||
|
// Reader is in use.
|
||||||
|
func (d *Decoder) Reader(r io.Reader) io.Reader {
|
||||||
|
return transform.NewReader(r, d)
|
||||||
|
}
|
||||||
|
|
||||||
|
// An Encoder converts bytes from UTF-8. It implements transform.Transformer.
|
||||||
|
//
|
||||||
|
// Each rune that cannot be transcoded will result in an error. In this case,
|
||||||
|
// the transform will consume all source byte up to, not including the offending
|
||||||
|
// rune. Transforming source bytes that are not valid UTF-8 will be replaced by
|
||||||
|
// `\uFFFD`. To return early with an error instead, use transform.Chain to
|
||||||
|
// preprocess the data with a UTF8Validator.
|
||||||
|
type Encoder struct {
|
||||||
|
transform.Transformer
|
||||||
|
|
||||||
|
// This forces external creators of Encoders to use names in struct
|
||||||
|
// initializers, allowing for future extendibility without having to break
|
||||||
|
// code.
|
||||||
|
_ struct{}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Bytes converts bytes from UTF-8. It returns the converted bytes or nil, err if
|
||||||
|
// any error occurred.
|
||||||
|
func (e *Encoder) Bytes(b []byte) ([]byte, error) {
|
||||||
|
b, _, err := transform.Bytes(e, b)
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
return b, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// String converts a string from UTF-8. It returns the converted string or
|
||||||
|
// "", err if any error occurred.
|
||||||
|
func (e *Encoder) String(s string) (string, error) {
|
||||||
|
s, _, err := transform.String(e, s)
|
||||||
|
if err != nil {
|
||||||
|
return "", err
|
||||||
|
}
|
||||||
|
return s, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// Writer wraps another Writer to encode its UTF-8 output.
|
||||||
|
//
|
||||||
|
// The Encoder may not be used for any other operation as long as the returned
|
||||||
|
// Writer is in use.
|
||||||
|
func (e *Encoder) Writer(w io.Writer) io.Writer {
|
||||||
|
return transform.NewWriter(w, e)
|
||||||
|
}
|
||||||
|
|
||||||
|
// ASCIISub is the ASCII substitute character, as recommended by
|
||||||
|
// https://unicode.org/reports/tr36/#Text_Comparison
|
||||||
|
const ASCIISub = '\x1a'
|
||||||
|
|
||||||
|
// Nop is the nop encoding. Its transformed bytes are the same as the source
|
||||||
|
// bytes; it does not replace invalid UTF-8 sequences.
|
||||||
|
var Nop Encoding = nop{}
|
||||||
|
|
||||||
|
type nop struct{}
|
||||||
|
|
||||||
|
func (nop) NewDecoder() *Decoder {
|
||||||
|
return &Decoder{Transformer: transform.Nop}
|
||||||
|
}
|
||||||
|
func (nop) NewEncoder() *Encoder {
|
||||||
|
return &Encoder{Transformer: transform.Nop}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Replacement is the replacement encoding. Decoding from the replacement
|
||||||
|
// encoding yields a single '\uFFFD' replacement rune. Encoding from UTF-8 to
|
||||||
|
// the replacement encoding yields the same as the source bytes except that
|
||||||
|
// invalid UTF-8 is converted to '\uFFFD'.
|
||||||
|
//
|
||||||
|
// It is defined at http://encoding.spec.whatwg.org/#replacement
|
||||||
|
var Replacement Encoding = replacement{}
|
||||||
|
|
||||||
|
type replacement struct{}
|
||||||
|
|
||||||
|
func (replacement) NewDecoder() *Decoder {
|
||||||
|
return &Decoder{Transformer: replacementDecoder{}}
|
||||||
|
}
|
||||||
|
|
||||||
|
func (replacement) NewEncoder() *Encoder {
|
||||||
|
return &Encoder{Transformer: replacementEncoder{}}
|
||||||
|
}
|
||||||
|
|
||||||
|
func (replacement) ID() (mib identifier.MIB, other string) {
|
||||||
|
return identifier.Replacement, ""
|
||||||
|
}
|
||||||
|
|
||||||
|
type replacementDecoder struct{ transform.NopResetter }
|
||||||
|
|
||||||
|
func (replacementDecoder) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
|
||||||
|
if len(dst) < 3 {
|
||||||
|
return 0, 0, transform.ErrShortDst
|
||||||
|
}
|
||||||
|
if atEOF {
|
||||||
|
const fffd = "\ufffd"
|
||||||
|
dst[0] = fffd[0]
|
||||||
|
dst[1] = fffd[1]
|
||||||
|
dst[2] = fffd[2]
|
||||||
|
nDst = 3
|
||||||
|
}
|
||||||
|
return nDst, len(src), nil
|
||||||
|
}
|
||||||
|
|
||||||
|
type replacementEncoder struct{ transform.NopResetter }
|
||||||
|
|
||||||
|
func (replacementEncoder) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
|
||||||
|
r, size := rune(0), 0
|
||||||
|
|
||||||
|
for ; nSrc < len(src); nSrc += size {
|
||||||
|
r = rune(src[nSrc])
|
||||||
|
|
||||||
|
// Decode a 1-byte rune.
|
||||||
|
if r < utf8.RuneSelf {
|
||||||
|
size = 1
|
||||||
|
|
||||||
|
} else {
|
||||||
|
// Decode a multi-byte rune.
|
||||||
|
r, size = utf8.DecodeRune(src[nSrc:])
|
||||||
|
if size == 1 {
|
||||||
|
// All valid runes of size 1 (those below utf8.RuneSelf) were
|
||||||
|
// handled above. We have invalid UTF-8 or we haven't seen the
|
||||||
|
// full character yet.
|
||||||
|
if !atEOF && !utf8.FullRune(src[nSrc:]) {
|
||||||
|
err = transform.ErrShortSrc
|
||||||
|
break
|
||||||
|
}
|
||||||
|
r = '\ufffd'
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if nDst+utf8.RuneLen(r) > len(dst) {
|
||||||
|
err = transform.ErrShortDst
|
||||||
|
break
|
||||||
|
}
|
||||||
|
nDst += utf8.EncodeRune(dst[nDst:], r)
|
||||||
|
}
|
||||||
|
return nDst, nSrc, err
|
||||||
|
}
|
||||||
|
|
||||||
|
// HTMLEscapeUnsupported wraps encoders to replace source runes outside the
|
||||||
|
// repertoire of the destination encoding with HTML escape sequences.
|
||||||
|
//
|
||||||
|
// This wrapper exists to comply to URL and HTML forms requiring a
|
||||||
|
// non-terminating legacy encoder. The produced sequences may lead to data
|
||||||
|
// loss as they are indistinguishable from legitimate input. To avoid this
|
||||||
|
// issue, use UTF-8 encodings whenever possible.
|
||||||
|
func HTMLEscapeUnsupported(e *Encoder) *Encoder {
|
||||||
|
return &Encoder{Transformer: &errorHandler{e, errorToHTML}}
|
||||||
|
}
|
||||||
|
|
||||||
|
// ReplaceUnsupported wraps encoders to replace source runes outside the
|
||||||
|
// repertoire of the destination encoding with an encoding-specific
|
||||||
|
// replacement.
|
||||||
|
//
|
||||||
|
// This wrapper is only provided for backwards compatibility and legacy
|
||||||
|
// handling. Its use is strongly discouraged. Use UTF-8 whenever possible.
|
||||||
|
func ReplaceUnsupported(e *Encoder) *Encoder {
|
||||||
|
return &Encoder{Transformer: &errorHandler{e, errorToReplacement}}
|
||||||
|
}
|
||||||
|
|
||||||
|
type errorHandler struct {
|
||||||
|
*Encoder
|
||||||
|
handler func(dst []byte, r rune, err repertoireError) (n int, ok bool)
|
||||||
|
}
|
||||||
|
|
||||||
|
// TODO: consider making this error public in some form.
|
||||||
|
type repertoireError interface {
|
||||||
|
Replacement() byte
|
||||||
|
}
|
||||||
|
|
||||||
|
func (h errorHandler) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
|
||||||
|
nDst, nSrc, err = h.Transformer.Transform(dst, src, atEOF)
|
||||||
|
for err != nil {
|
||||||
|
rerr, ok := err.(repertoireError)
|
||||||
|
if !ok {
|
||||||
|
return nDst, nSrc, err
|
||||||
|
}
|
||||||
|
r, sz := utf8.DecodeRune(src[nSrc:])
|
||||||
|
n, ok := h.handler(dst[nDst:], r, rerr)
|
||||||
|
if !ok {
|
||||||
|
return nDst, nSrc, transform.ErrShortDst
|
||||||
|
}
|
||||||
|
err = nil
|
||||||
|
nDst += n
|
||||||
|
if nSrc += sz; nSrc < len(src) {
|
||||||
|
var dn, sn int
|
||||||
|
dn, sn, err = h.Transformer.Transform(dst[nDst:], src[nSrc:], atEOF)
|
||||||
|
nDst += dn
|
||||||
|
nSrc += sn
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return nDst, nSrc, err
|
||||||
|
}
|
||||||
|
|
||||||
|
func errorToHTML(dst []byte, r rune, err repertoireError) (n int, ok bool) {
|
||||||
|
buf := [8]byte{}
|
||||||
|
b := strconv.AppendUint(buf[:0], uint64(r), 10)
|
||||||
|
if n = len(b) + len("&#;"); n >= len(dst) {
|
||||||
|
return 0, false
|
||||||
|
}
|
||||||
|
dst[0] = '&'
|
||||||
|
dst[1] = '#'
|
||||||
|
dst[copy(dst[2:], b)+2] = ';'
|
||||||
|
return n, true
|
||||||
|
}
|
||||||
|
|
||||||
|
func errorToReplacement(dst []byte, r rune, err repertoireError) (n int, ok bool) {
|
||||||
|
if len(dst) == 0 {
|
||||||
|
return 0, false
|
||||||
|
}
|
||||||
|
dst[0] = err.Replacement()
|
||||||
|
return 1, true
|
||||||
|
}
|
||||||
|
|
||||||
|
// ErrInvalidUTF8 means that a transformer encountered invalid UTF-8.
|
||||||
|
var ErrInvalidUTF8 = errors.New("encoding: invalid UTF-8")
|
||||||
|
|
||||||
|
// UTF8Validator is a transformer that returns ErrInvalidUTF8 on the first
|
||||||
|
// input byte that is not valid UTF-8.
|
||||||
|
var UTF8Validator transform.Transformer = utf8Validator{}
|
||||||
|
|
||||||
|
type utf8Validator struct{ transform.NopResetter }
|
||||||
|
|
||||||
|
func (utf8Validator) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
|
||||||
|
n := len(src)
|
||||||
|
if n > len(dst) {
|
||||||
|
n = len(dst)
|
||||||
|
}
|
||||||
|
for i := 0; i < n; {
|
||||||
|
if c := src[i]; c < utf8.RuneSelf {
|
||||||
|
dst[i] = c
|
||||||
|
i++
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
_, size := utf8.DecodeRune(src[i:])
|
||||||
|
if size == 1 {
|
||||||
|
// All valid runes of size 1 (those below utf8.RuneSelf) were
|
||||||
|
// handled above. We have invalid UTF-8 or we haven't seen the
|
||||||
|
// full character yet.
|
||||||
|
err = ErrInvalidUTF8
|
||||||
|
if !atEOF && !utf8.FullRune(src[i:]) {
|
||||||
|
err = transform.ErrShortSrc
|
||||||
|
}
|
||||||
|
return i, i, err
|
||||||
|
}
|
||||||
|
if i+size > len(dst) {
|
||||||
|
return i, i, transform.ErrShortDst
|
||||||
|
}
|
||||||
|
for ; size > 0; size-- {
|
||||||
|
dst[i] = src[i]
|
||||||
|
i++
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if len(src) > len(dst) {
|
||||||
|
err = transform.ErrShortDst
|
||||||
|
}
|
||||||
|
return n, n, err
|
||||||
|
}
|
@ -0,0 +1,81 @@
|
|||||||
|
// Copyright 2015 The Go Authors. All rights reserved.
|
||||||
|
// Use of this source code is governed by a BSD-style
|
||||||
|
// license that can be found in the LICENSE file.
|
||||||
|
|
||||||
|
//go:generate go run gen.go
|
||||||
|
|
||||||
|
// Package identifier defines the contract between implementations of Encoding
|
||||||
|
// and Index by defining identifiers that uniquely identify standardized coded
|
||||||
|
// character sets (CCS) and character encoding schemes (CES), which we will
|
||||||
|
// together refer to as encodings, for which Encoding implementations provide
|
||||||
|
// converters to and from UTF-8. This package is typically only of concern to
|
||||||
|
// implementers of Indexes and Encodings.
|
||||||
|
//
|
||||||
|
// One part of the identifier is the MIB code, which is defined by IANA and
|
||||||
|
// uniquely identifies a CCS or CES. Each code is associated with data that
|
||||||
|
// references authorities, official documentation as well as aliases and MIME
|
||||||
|
// names.
|
||||||
|
//
|
||||||
|
// Not all CESs are covered by the IANA registry. The "other" string that is
|
||||||
|
// returned by ID can be used to identify other character sets or versions of
|
||||||
|
// existing ones.
|
||||||
|
//
|
||||||
|
// It is recommended that each package that provides a set of Encodings provide
|
||||||
|
// the All and Common variables to reference all supported encodings and
|
||||||
|
// commonly used subset. This allows Index implementations to include all
|
||||||
|
// available encodings without explicitly referencing or knowing about them.
|
||||||
|
package identifier
|
||||||
|
|
||||||
|
// Note: this package is internal, but could be made public if there is a need
|
||||||
|
// for writing third-party Indexes and Encodings.
|
||||||
|
|
||||||
|
// References:
|
||||||
|
// - http://source.icu-project.org/repos/icu/icu/trunk/source/data/mappings/convrtrs.txt
|
||||||
|
// - http://www.iana.org/assignments/character-sets/character-sets.xhtml
|
||||||
|
// - http://www.iana.org/assignments/ianacharset-mib/ianacharset-mib
|
||||||
|
// - http://www.ietf.org/rfc/rfc2978.txt
|
||||||
|
// - https://www.unicode.org/reports/tr22/
|
||||||
|
// - http://www.w3.org/TR/encoding/
|
||||||
|
// - https://encoding.spec.whatwg.org/
|
||||||
|
// - https://encoding.spec.whatwg.org/encodings.json
|
||||||
|
// - https://tools.ietf.org/html/rfc6657#section-5
|
||||||
|
|
||||||
|
// Interface can be implemented by Encodings to define the CCS or CES for which
|
||||||
|
// it implements conversions.
|
||||||
|
type Interface interface {
|
||||||
|
// ID returns an encoding identifier. Exactly one of the mib and other
|
||||||
|
// values should be non-zero.
|
||||||
|
//
|
||||||
|
// In the usual case it is only necessary to indicate the MIB code. The
|
||||||
|
// other string can be used to specify encodings for which there is no MIB,
|
||||||
|
// such as "x-mac-dingbat".
|
||||||
|
//
|
||||||
|
// The other string may only contain the characters a-z, A-Z, 0-9, - and _.
|
||||||
|
ID() (mib MIB, other string)
|
||||||
|
|
||||||
|
// NOTE: the restrictions on the encoding are to allow extending the syntax
|
||||||
|
// with additional information such as versions, vendors and other variants.
|
||||||
|
}
|
||||||
|
|
||||||
|
// A MIB identifies an encoding. It is derived from the IANA MIB codes and adds
|
||||||
|
// some identifiers for some encodings that are not covered by the IANA
|
||||||
|
// standard.
|
||||||
|
//
|
||||||
|
// See http://www.iana.org/assignments/ianacharset-mib.
|
||||||
|
type MIB uint16
|
||||||
|
|
||||||
|
// These additional MIB types are not defined in IANA. They are added because
|
||||||
|
// they are common and defined within the text repo.
|
||||||
|
const (
|
||||||
|
// Unofficial marks the start of encodings not registered by IANA.
|
||||||
|
Unofficial MIB = 10000 + iota
|
||||||
|
|
||||||
|
// Replacement is the WhatWG replacement encoding.
|
||||||
|
Replacement
|
||||||
|
|
||||||
|
// XUserDefined is the code for x-user-defined.
|
||||||
|
XUserDefined
|
||||||
|
|
||||||
|
// MacintoshCyrillic is the code for x-mac-cyrillic.
|
||||||
|
MacintoshCyrillic
|
||||||
|
)
|
File diff suppressed because it is too large
Load Diff
@ -0,0 +1,75 @@
|
|||||||
|
// Copyright 2015 The Go Authors. All rights reserved.
|
||||||
|
// Use of this source code is governed by a BSD-style
|
||||||
|
// license that can be found in the LICENSE file.
|
||||||
|
|
||||||
|
// Package internal contains code that is shared among encoding implementations.
|
||||||
|
package internal
|
||||||
|
|
||||||
|
import (
|
||||||
|
"golang.org/x/text/encoding"
|
||||||
|
"golang.org/x/text/encoding/internal/identifier"
|
||||||
|
"golang.org/x/text/transform"
|
||||||
|
)
|
||||||
|
|
||||||
|
// Encoding is an implementation of the Encoding interface that adds the String
|
||||||
|
// and ID methods to an existing encoding.
|
||||||
|
type Encoding struct {
|
||||||
|
encoding.Encoding
|
||||||
|
Name string
|
||||||
|
MIB identifier.MIB
|
||||||
|
}
|
||||||
|
|
||||||
|
// _ verifies that Encoding implements identifier.Interface.
|
||||||
|
var _ identifier.Interface = (*Encoding)(nil)
|
||||||
|
|
||||||
|
func (e *Encoding) String() string {
|
||||||
|
return e.Name
|
||||||
|
}
|
||||||
|
|
||||||
|
func (e *Encoding) ID() (mib identifier.MIB, other string) {
|
||||||
|
return e.MIB, ""
|
||||||
|
}
|
||||||
|
|
||||||
|
// SimpleEncoding is an Encoding that combines two Transformers.
|
||||||
|
type SimpleEncoding struct {
|
||||||
|
Decoder transform.Transformer
|
||||||
|
Encoder transform.Transformer
|
||||||
|
}
|
||||||
|
|
||||||
|
func (e *SimpleEncoding) NewDecoder() *encoding.Decoder {
|
||||||
|
return &encoding.Decoder{Transformer: e.Decoder}
|
||||||
|
}
|
||||||
|
|
||||||
|
func (e *SimpleEncoding) NewEncoder() *encoding.Encoder {
|
||||||
|
return &encoding.Encoder{Transformer: e.Encoder}
|
||||||
|
}
|
||||||
|
|
||||||
|
// FuncEncoding is an Encoding that combines two functions returning a new
|
||||||
|
// Transformer.
|
||||||
|
type FuncEncoding struct {
|
||||||
|
Decoder func() transform.Transformer
|
||||||
|
Encoder func() transform.Transformer
|
||||||
|
}
|
||||||
|
|
||||||
|
func (e FuncEncoding) NewDecoder() *encoding.Decoder {
|
||||||
|
return &encoding.Decoder{Transformer: e.Decoder()}
|
||||||
|
}
|
||||||
|
|
||||||
|
func (e FuncEncoding) NewEncoder() *encoding.Encoder {
|
||||||
|
return &encoding.Encoder{Transformer: e.Encoder()}
|
||||||
|
}
|
||||||
|
|
||||||
|
// A RepertoireError indicates a rune is not in the repertoire of a destination
|
||||||
|
// encoding. It is associated with an encoding-specific suggested replacement
|
||||||
|
// byte.
|
||||||
|
type RepertoireError byte
|
||||||
|
|
||||||
|
// Error implements the error interrface.
|
||||||
|
func (r RepertoireError) Error() string {
|
||||||
|
return "encoding: rune not supported by encoding."
|
||||||
|
}
|
||||||
|
|
||||||
|
// Replacement returns the replacement string associated with this error.
|
||||||
|
func (r RepertoireError) Replacement() byte { return byte(r) }
|
||||||
|
|
||||||
|
var ErrASCIIReplacement = RepertoireError(encoding.ASCIISub)
|
@ -0,0 +1,12 @@
|
|||||||
|
// Copyright 2015 The Go Authors. All rights reserved.
|
||||||
|
// Use of this source code is governed by a BSD-style
|
||||||
|
// license that can be found in the LICENSE file.
|
||||||
|
|
||||||
|
package simplifiedchinese
|
||||||
|
|
||||||
|
import (
|
||||||
|
"golang.org/x/text/encoding"
|
||||||
|
)
|
||||||
|
|
||||||
|
// All is a list of all defined encodings in this package.
|
||||||
|
var All = []encoding.Encoding{GB18030, GBK, HZGB2312}
|
@ -0,0 +1,269 @@
|
|||||||
|
// Copyright 2013 The Go Authors. All rights reserved.
|
||||||
|
// Use of this source code is governed by a BSD-style
|
||||||
|
// license that can be found in the LICENSE file.
|
||||||
|
|
||||||
|
package simplifiedchinese
|
||||||
|
|
||||||
|
import (
|
||||||
|
"unicode/utf8"
|
||||||
|
|
||||||
|
"golang.org/x/text/encoding"
|
||||||
|
"golang.org/x/text/encoding/internal"
|
||||||
|
"golang.org/x/text/encoding/internal/identifier"
|
||||||
|
"golang.org/x/text/transform"
|
||||||
|
)
|
||||||
|
|
||||||
|
var (
|
||||||
|
// GB18030 is the GB18030 encoding.
|
||||||
|
GB18030 encoding.Encoding = &gbk18030
|
||||||
|
// GBK is the GBK encoding. It encodes an extension of the GB2312 character set
|
||||||
|
// and is also known as Code Page 936.
|
||||||
|
GBK encoding.Encoding = &gbk
|
||||||
|
)
|
||||||
|
|
||||||
|
var gbk = internal.Encoding{
|
||||||
|
&internal.SimpleEncoding{
|
||||||
|
gbkDecoder{gb18030: false},
|
||||||
|
gbkEncoder{gb18030: false},
|
||||||
|
},
|
||||||
|
"GBK",
|
||||||
|
identifier.GBK,
|
||||||
|
}
|
||||||
|
|
||||||
|
var gbk18030 = internal.Encoding{
|
||||||
|
&internal.SimpleEncoding{
|
||||||
|
gbkDecoder{gb18030: true},
|
||||||
|
gbkEncoder{gb18030: true},
|
||||||
|
},
|
||||||
|
"GB18030",
|
||||||
|
identifier.GB18030,
|
||||||
|
}
|
||||||
|
|
||||||
|
type gbkDecoder struct {
|
||||||
|
transform.NopResetter
|
||||||
|
gb18030 bool
|
||||||
|
}
|
||||||
|
|
||||||
|
func (d gbkDecoder) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
|
||||||
|
r, size := rune(0), 0
|
||||||
|
loop:
|
||||||
|
for ; nSrc < len(src); nSrc += size {
|
||||||
|
switch c0 := src[nSrc]; {
|
||||||
|
case c0 < utf8.RuneSelf:
|
||||||
|
r, size = rune(c0), 1
|
||||||
|
|
||||||
|
// Microsoft's Code Page 936 extends GBK 1.0 to encode the euro sign U+20AC
|
||||||
|
// as 0x80. The HTML5 specification at http://encoding.spec.whatwg.org/#gbk
|
||||||
|
// says to treat "gbk" as Code Page 936.
|
||||||
|
case c0 == 0x80:
|
||||||
|
r, size = '€', 1
|
||||||
|
|
||||||
|
case c0 < 0xff:
|
||||||
|
if nSrc+1 >= len(src) {
|
||||||
|
if !atEOF {
|
||||||
|
err = transform.ErrShortSrc
|
||||||
|
break loop
|
||||||
|
}
|
||||||
|
r, size = utf8.RuneError, 1
|
||||||
|
goto write
|
||||||
|
}
|
||||||
|
c1 := src[nSrc+1]
|
||||||
|
switch {
|
||||||
|
case 0x40 <= c1 && c1 < 0x7f:
|
||||||
|
c1 -= 0x40
|
||||||
|
case 0x80 <= c1 && c1 < 0xff:
|
||||||
|
c1 -= 0x41
|
||||||
|
case d.gb18030 && 0x30 <= c1 && c1 < 0x40:
|
||||||
|
if nSrc+3 >= len(src) {
|
||||||
|
if !atEOF {
|
||||||
|
err = transform.ErrShortSrc
|
||||||
|
break loop
|
||||||
|
}
|
||||||
|
// The second byte here is always ASCII, so we can set size
|
||||||
|
// to 1 in all cases.
|
||||||
|
r, size = utf8.RuneError, 1
|
||||||
|
goto write
|
||||||
|
}
|
||||||
|
c2 := src[nSrc+2]
|
||||||
|
if c2 < 0x81 || 0xff <= c2 {
|
||||||
|
r, size = utf8.RuneError, 1
|
||||||
|
goto write
|
||||||
|
}
|
||||||
|
c3 := src[nSrc+3]
|
||||||
|
if c3 < 0x30 || 0x3a <= c3 {
|
||||||
|
r, size = utf8.RuneError, 1
|
||||||
|
goto write
|
||||||
|
}
|
||||||
|
size = 4
|
||||||
|
r = ((rune(c0-0x81)*10+rune(c1-0x30))*126+rune(c2-0x81))*10 + rune(c3-0x30)
|
||||||
|
if r < 39420 {
|
||||||
|
i, j := 0, len(gb18030)
|
||||||
|
for i < j {
|
||||||
|
h := i + (j-i)/2
|
||||||
|
if r >= rune(gb18030[h][0]) {
|
||||||
|
i = h + 1
|
||||||
|
} else {
|
||||||
|
j = h
|
||||||
|
}
|
||||||
|
}
|
||||||
|
dec := &gb18030[i-1]
|
||||||
|
r += rune(dec[1]) - rune(dec[0])
|
||||||
|
goto write
|
||||||
|
}
|
||||||
|
r -= 189000
|
||||||
|
if 0 <= r && r < 0x100000 {
|
||||||
|
r += 0x10000
|
||||||
|
} else {
|
||||||
|
r, size = utf8.RuneError, 1
|
||||||
|
}
|
||||||
|
goto write
|
||||||
|
default:
|
||||||
|
r, size = utf8.RuneError, 1
|
||||||
|
goto write
|
||||||
|
}
|
||||||
|
r, size = '\ufffd', 2
|
||||||
|
if i := int(c0-0x81)*190 + int(c1); i < len(decode) {
|
||||||
|
r = rune(decode[i])
|
||||||
|
if r == 0 {
|
||||||
|
r = '\ufffd'
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
default:
|
||||||
|
r, size = utf8.RuneError, 1
|
||||||
|
}
|
||||||
|
|
||||||
|
write:
|
||||||
|
if nDst+utf8.RuneLen(r) > len(dst) {
|
||||||
|
err = transform.ErrShortDst
|
||||||
|
break loop
|
||||||
|
}
|
||||||
|
nDst += utf8.EncodeRune(dst[nDst:], r)
|
||||||
|
}
|
||||||
|
return nDst, nSrc, err
|
||||||
|
}
|
||||||
|
|
||||||
|
type gbkEncoder struct {
|
||||||
|
transform.NopResetter
|
||||||
|
gb18030 bool
|
||||||
|
}
|
||||||
|
|
||||||
|
func (e gbkEncoder) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
|
||||||
|
r, r2, size := rune(0), rune(0), 0
|
||||||
|
for ; nSrc < len(src); nSrc += size {
|
||||||
|
r = rune(src[nSrc])
|
||||||
|
|
||||||
|
// Decode a 1-byte rune.
|
||||||
|
if r < utf8.RuneSelf {
|
||||||
|
size = 1
|
||||||
|
|
||||||
|
} else {
|
||||||
|
// Decode a multi-byte rune.
|
||||||
|
r, size = utf8.DecodeRune(src[nSrc:])
|
||||||
|
if size == 1 {
|
||||||
|
// All valid runes of size 1 (those below utf8.RuneSelf) were
|
||||||
|
// handled above. We have invalid UTF-8 or we haven't seen the
|
||||||
|
// full character yet.
|
||||||
|
if !atEOF && !utf8.FullRune(src[nSrc:]) {
|
||||||
|
err = transform.ErrShortSrc
|
||||||
|
break
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// func init checks that the switch covers all tables.
|
||||||
|
switch {
|
||||||
|
case encode0Low <= r && r < encode0High:
|
||||||
|
if r2 = rune(encode0[r-encode0Low]); r2 != 0 {
|
||||||
|
goto write2
|
||||||
|
}
|
||||||
|
case encode1Low <= r && r < encode1High:
|
||||||
|
// Microsoft's Code Page 936 extends GBK 1.0 to encode the euro sign U+20AC
|
||||||
|
// as 0x80. The HTML5 specification at http://encoding.spec.whatwg.org/#gbk
|
||||||
|
// says to treat "gbk" as Code Page 936.
|
||||||
|
if r == '€' {
|
||||||
|
r = 0x80
|
||||||
|
goto write1
|
||||||
|
}
|
||||||
|
if r2 = rune(encode1[r-encode1Low]); r2 != 0 {
|
||||||
|
goto write2
|
||||||
|
}
|
||||||
|
case encode2Low <= r && r < encode2High:
|
||||||
|
if r2 = rune(encode2[r-encode2Low]); r2 != 0 {
|
||||||
|
goto write2
|
||||||
|
}
|
||||||
|
case encode3Low <= r && r < encode3High:
|
||||||
|
if r2 = rune(encode3[r-encode3Low]); r2 != 0 {
|
||||||
|
goto write2
|
||||||
|
}
|
||||||
|
case encode4Low <= r && r < encode4High:
|
||||||
|
if r2 = rune(encode4[r-encode4Low]); r2 != 0 {
|
||||||
|
goto write2
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if e.gb18030 {
|
||||||
|
if r < 0x10000 {
|
||||||
|
i, j := 0, len(gb18030)
|
||||||
|
for i < j {
|
||||||
|
h := i + (j-i)/2
|
||||||
|
if r >= rune(gb18030[h][1]) {
|
||||||
|
i = h + 1
|
||||||
|
} else {
|
||||||
|
j = h
|
||||||
|
}
|
||||||
|
}
|
||||||
|
dec := &gb18030[i-1]
|
||||||
|
r += rune(dec[0]) - rune(dec[1])
|
||||||
|
goto write4
|
||||||
|
} else if r < 0x110000 {
|
||||||
|
r += 189000 - 0x10000
|
||||||
|
goto write4
|
||||||
|
}
|
||||||
|
}
|
||||||
|
err = internal.ErrASCIIReplacement
|
||||||
|
break
|
||||||
|
}
|
||||||
|
|
||||||
|
write1:
|
||||||
|
if nDst >= len(dst) {
|
||||||
|
err = transform.ErrShortDst
|
||||||
|
break
|
||||||
|
}
|
||||||
|
dst[nDst] = uint8(r)
|
||||||
|
nDst++
|
||||||
|
continue
|
||||||
|
|
||||||
|
write2:
|
||||||
|
if nDst+2 > len(dst) {
|
||||||
|
err = transform.ErrShortDst
|
||||||
|
break
|
||||||
|
}
|
||||||
|
dst[nDst+0] = uint8(r2 >> 8)
|
||||||
|
dst[nDst+1] = uint8(r2)
|
||||||
|
nDst += 2
|
||||||
|
continue
|
||||||
|
|
||||||
|
write4:
|
||||||
|
if nDst+4 > len(dst) {
|
||||||
|
err = transform.ErrShortDst
|
||||||
|
break
|
||||||
|
}
|
||||||
|
dst[nDst+3] = uint8(r%10 + 0x30)
|
||||||
|
r /= 10
|
||||||
|
dst[nDst+2] = uint8(r%126 + 0x81)
|
||||||
|
r /= 126
|
||||||
|
dst[nDst+1] = uint8(r%10 + 0x30)
|
||||||
|
r /= 10
|
||||||
|
dst[nDst+0] = uint8(r + 0x81)
|
||||||
|
nDst += 4
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
return nDst, nSrc, err
|
||||||
|
}
|
||||||
|
|
||||||
|
func init() {
|
||||||
|
// Check that the hard-coded encode switch covers all tables.
|
||||||
|
if numEncodeTables != 5 {
|
||||||
|
panic("bad numEncodeTables")
|
||||||
|
}
|
||||||
|
}
|
@ -0,0 +1,245 @@
|
|||||||
|
// Copyright 2013 The Go Authors. All rights reserved.
|
||||||
|
// Use of this source code is governed by a BSD-style
|
||||||
|
// license that can be found in the LICENSE file.
|
||||||
|
|
||||||
|
package simplifiedchinese
|
||||||
|
|
||||||
|
import (
|
||||||
|
"unicode/utf8"
|
||||||
|
|
||||||
|
"golang.org/x/text/encoding"
|
||||||
|
"golang.org/x/text/encoding/internal"
|
||||||
|
"golang.org/x/text/encoding/internal/identifier"
|
||||||
|
"golang.org/x/text/transform"
|
||||||
|
)
|
||||||
|
|
||||||
|
// HZGB2312 is the HZ-GB2312 encoding.
|
||||||
|
var HZGB2312 encoding.Encoding = &hzGB2312
|
||||||
|
|
||||||
|
var hzGB2312 = internal.Encoding{
|
||||||
|
internal.FuncEncoding{hzGB2312NewDecoder, hzGB2312NewEncoder},
|
||||||
|
"HZ-GB2312",
|
||||||
|
identifier.HZGB2312,
|
||||||
|
}
|
||||||
|
|
||||||
|
func hzGB2312NewDecoder() transform.Transformer {
|
||||||
|
return new(hzGB2312Decoder)
|
||||||
|
}
|
||||||
|
|
||||||
|
func hzGB2312NewEncoder() transform.Transformer {
|
||||||
|
return new(hzGB2312Encoder)
|
||||||
|
}
|
||||||
|
|
||||||
|
const (
|
||||||
|
asciiState = iota
|
||||||
|
gbState
|
||||||
|
)
|
||||||
|
|
||||||
|
type hzGB2312Decoder int
|
||||||
|
|
||||||
|
func (d *hzGB2312Decoder) Reset() {
|
||||||
|
*d = asciiState
|
||||||
|
}
|
||||||
|
|
||||||
|
func (d *hzGB2312Decoder) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
|
||||||
|
r, size := rune(0), 0
|
||||||
|
loop:
|
||||||
|
for ; nSrc < len(src); nSrc += size {
|
||||||
|
c0 := src[nSrc]
|
||||||
|
if c0 >= utf8.RuneSelf {
|
||||||
|
r, size = utf8.RuneError, 1
|
||||||
|
goto write
|
||||||
|
}
|
||||||
|
|
||||||
|
if c0 == '~' {
|
||||||
|
if nSrc+1 >= len(src) {
|
||||||
|
if !atEOF {
|
||||||
|
err = transform.ErrShortSrc
|
||||||
|
break loop
|
||||||
|
}
|
||||||
|
r, size = utf8.RuneError, 1
|
||||||
|
goto write
|
||||||
|
}
|
||||||
|
size = 2
|
||||||
|
switch src[nSrc+1] {
|
||||||
|
case '{':
|
||||||
|
*d = gbState
|
||||||
|
continue
|
||||||
|
case '}':
|
||||||
|
*d = asciiState
|
||||||
|
continue
|
||||||
|
case '~':
|
||||||
|
if nDst >= len(dst) {
|
||||||
|
err = transform.ErrShortDst
|
||||||
|
break loop
|
||||||
|
}
|
||||||
|
dst[nDst] = '~'
|
||||||
|
nDst++
|
||||||
|
continue
|
||||||
|
case '\n':
|
||||||
|
continue
|
||||||
|
default:
|
||||||
|
r = utf8.RuneError
|
||||||
|
goto write
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if *d == asciiState {
|
||||||
|
r, size = rune(c0), 1
|
||||||
|
} else {
|
||||||
|
if nSrc+1 >= len(src) {
|
||||||
|
if !atEOF {
|
||||||
|
err = transform.ErrShortSrc
|
||||||
|
break loop
|
||||||
|
}
|
||||||
|
r, size = utf8.RuneError, 1
|
||||||
|
goto write
|
||||||
|
}
|
||||||
|
size = 2
|
||||||
|
c1 := src[nSrc+1]
|
||||||
|
if c0 < 0x21 || 0x7e <= c0 || c1 < 0x21 || 0x7f <= c1 {
|
||||||
|
// error
|
||||||
|
} else if i := int(c0-0x01)*190 + int(c1+0x3f); i < len(decode) {
|
||||||
|
r = rune(decode[i])
|
||||||
|
if r != 0 {
|
||||||
|
goto write
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if c1 > utf8.RuneSelf {
|
||||||
|
// Be consistent and always treat non-ASCII as a single error.
|
||||||
|
size = 1
|
||||||
|
}
|
||||||
|
r = utf8.RuneError
|
||||||
|
}
|
||||||
|
|
||||||
|
write:
|
||||||
|
if nDst+utf8.RuneLen(r) > len(dst) {
|
||||||
|
err = transform.ErrShortDst
|
||||||
|
break loop
|
||||||
|
}
|
||||||
|
nDst += utf8.EncodeRune(dst[nDst:], r)
|
||||||
|
}
|
||||||
|
return nDst, nSrc, err
|
||||||
|
}
|
||||||
|
|
||||||
|
type hzGB2312Encoder int
|
||||||
|
|
||||||
|
func (d *hzGB2312Encoder) Reset() {
|
||||||
|
*d = asciiState
|
||||||
|
}
|
||||||
|
|
||||||
|
func (e *hzGB2312Encoder) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
|
||||||
|
r, size := rune(0), 0
|
||||||
|
for ; nSrc < len(src); nSrc += size {
|
||||||
|
r = rune(src[nSrc])
|
||||||
|
|
||||||
|
// Decode a 1-byte rune.
|
||||||
|
if r < utf8.RuneSelf {
|
||||||
|
size = 1
|
||||||
|
if r == '~' {
|
||||||
|
if nDst+2 > len(dst) {
|
||||||
|
err = transform.ErrShortDst
|
||||||
|
break
|
||||||
|
}
|
||||||
|
dst[nDst+0] = '~'
|
||||||
|
dst[nDst+1] = '~'
|
||||||
|
nDst += 2
|
||||||
|
continue
|
||||||
|
} else if *e != asciiState {
|
||||||
|
if nDst+3 > len(dst) {
|
||||||
|
err = transform.ErrShortDst
|
||||||
|
break
|
||||||
|
}
|
||||||
|
*e = asciiState
|
||||||
|
dst[nDst+0] = '~'
|
||||||
|
dst[nDst+1] = '}'
|
||||||
|
nDst += 2
|
||||||
|
} else if nDst >= len(dst) {
|
||||||
|
err = transform.ErrShortDst
|
||||||
|
break
|
||||||
|
}
|
||||||
|
dst[nDst] = uint8(r)
|
||||||
|
nDst += 1
|
||||||
|
continue
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
// Decode a multi-byte rune.
|
||||||
|
r, size = utf8.DecodeRune(src[nSrc:])
|
||||||
|
if size == 1 {
|
||||||
|
// All valid runes of size 1 (those below utf8.RuneSelf) were
|
||||||
|
// handled above. We have invalid UTF-8 or we haven't seen the
|
||||||
|
// full character yet.
|
||||||
|
if !atEOF && !utf8.FullRune(src[nSrc:]) {
|
||||||
|
err = transform.ErrShortSrc
|
||||||
|
break
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// func init checks that the switch covers all tables.
|
||||||
|
switch {
|
||||||
|
case encode0Low <= r && r < encode0High:
|
||||||
|
if r = rune(encode0[r-encode0Low]); r != 0 {
|
||||||
|
goto writeGB
|
||||||
|
}
|
||||||
|
case encode1Low <= r && r < encode1High:
|
||||||
|
if r = rune(encode1[r-encode1Low]); r != 0 {
|
||||||
|
goto writeGB
|
||||||
|
}
|
||||||
|
case encode2Low <= r && r < encode2High:
|
||||||
|
if r = rune(encode2[r-encode2Low]); r != 0 {
|
||||||
|
goto writeGB
|
||||||
|
}
|
||||||
|
case encode3Low <= r && r < encode3High:
|
||||||
|
if r = rune(encode3[r-encode3Low]); r != 0 {
|
||||||
|
goto writeGB
|
||||||
|
}
|
||||||
|
case encode4Low <= r && r < encode4High:
|
||||||
|
if r = rune(encode4[r-encode4Low]); r != 0 {
|
||||||
|
goto writeGB
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
terminateInASCIIState:
|
||||||
|
// Switch back to ASCII state in case of error so that an ASCII
|
||||||
|
// replacement character can be written in the correct state.
|
||||||
|
if *e != asciiState {
|
||||||
|
if nDst+2 > len(dst) {
|
||||||
|
err = transform.ErrShortDst
|
||||||
|
break
|
||||||
|
}
|
||||||
|
dst[nDst+0] = '~'
|
||||||
|
dst[nDst+1] = '}'
|
||||||
|
nDst += 2
|
||||||
|
}
|
||||||
|
err = internal.ErrASCIIReplacement
|
||||||
|
break
|
||||||
|
|
||||||
|
writeGB:
|
||||||
|
c0 := uint8(r>>8) - 0x80
|
||||||
|
c1 := uint8(r) - 0x80
|
||||||
|
if c0 < 0x21 || 0x7e <= c0 || c1 < 0x21 || 0x7f <= c1 {
|
||||||
|
goto terminateInASCIIState
|
||||||
|
}
|
||||||
|
if *e == asciiState {
|
||||||
|
if nDst+4 > len(dst) {
|
||||||
|
err = transform.ErrShortDst
|
||||||
|
break
|
||||||
|
}
|
||||||
|
*e = gbState
|
||||||
|
dst[nDst+0] = '~'
|
||||||
|
dst[nDst+1] = '{'
|
||||||
|
nDst += 2
|
||||||
|
} else if nDst+2 > len(dst) {
|
||||||
|
err = transform.ErrShortDst
|
||||||
|
break
|
||||||
|
}
|
||||||
|
dst[nDst+0] = c0
|
||||||
|
dst[nDst+1] = c1
|
||||||
|
nDst += 2
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
// TODO: should one always terminate in ASCII state to make it safe to
|
||||||
|
// concatenate two HZ-GB2312-encoded strings?
|
||||||
|
return nDst, nSrc, err
|
||||||
|
}
|
File diff suppressed because it is too large
Load Diff
@ -0,0 +1,709 @@
|
|||||||
|
// Copyright 2013 The Go Authors. All rights reserved.
|
||||||
|
// Use of this source code is governed by a BSD-style
|
||||||
|
// license that can be found in the LICENSE file.
|
||||||
|
|
||||||
|
// Package transform provides reader and writer wrappers that transform the
|
||||||
|
// bytes passing through as well as various transformations. Example
|
||||||
|
// transformations provided by other packages include normalization and
|
||||||
|
// conversion between character sets.
|
||||||
|
package transform // import "golang.org/x/text/transform"
|
||||||
|
|
||||||
|
import (
|
||||||
|
"bytes"
|
||||||
|
"errors"
|
||||||
|
"io"
|
||||||
|
"unicode/utf8"
|
||||||
|
)
|
||||||
|
|
||||||
|
var (
|
||||||
|
// ErrShortDst means that the destination buffer was too short to
|
||||||
|
// receive all of the transformed bytes.
|
||||||
|
ErrShortDst = errors.New("transform: short destination buffer")
|
||||||
|
|
||||||
|
// ErrShortSrc means that the source buffer has insufficient data to
|
||||||
|
// complete the transformation.
|
||||||
|
ErrShortSrc = errors.New("transform: short source buffer")
|
||||||
|
|
||||||
|
// ErrEndOfSpan means that the input and output (the transformed input)
|
||||||
|
// are not identical.
|
||||||
|
ErrEndOfSpan = errors.New("transform: input and output are not identical")
|
||||||
|
|
||||||
|
// errInconsistentByteCount means that Transform returned success (nil
|
||||||
|
// error) but also returned nSrc inconsistent with the src argument.
|
||||||
|
errInconsistentByteCount = errors.New("transform: inconsistent byte count returned")
|
||||||
|
|
||||||
|
// errShortInternal means that an internal buffer is not large enough
|
||||||
|
// to make progress and the Transform operation must be aborted.
|
||||||
|
errShortInternal = errors.New("transform: short internal buffer")
|
||||||
|
)
|
||||||
|
|
||||||
|
// Transformer transforms bytes.
|
||||||
|
type Transformer interface {
|
||||||
|
// Transform writes to dst the transformed bytes read from src, and
|
||||||
|
// returns the number of dst bytes written and src bytes read. The
|
||||||
|
// atEOF argument tells whether src represents the last bytes of the
|
||||||
|
// input.
|
||||||
|
//
|
||||||
|
// Callers should always process the nDst bytes produced and account
|
||||||
|
// for the nSrc bytes consumed before considering the error err.
|
||||||
|
//
|
||||||
|
// A nil error means that all of the transformed bytes (whether freshly
|
||||||
|
// transformed from src or left over from previous Transform calls)
|
||||||
|
// were written to dst. A nil error can be returned regardless of
|
||||||
|
// whether atEOF is true. If err is nil then nSrc must equal len(src);
|
||||||
|
// the converse is not necessarily true.
|
||||||
|
//
|
||||||
|
// ErrShortDst means that dst was too short to receive all of the
|
||||||
|
// transformed bytes. ErrShortSrc means that src had insufficient data
|
||||||
|
// to complete the transformation. If both conditions apply, then
|
||||||
|
// either error may be returned. Other than the error conditions listed
|
||||||
|
// here, implementations are free to report other errors that arise.
|
||||||
|
Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error)
|
||||||
|
|
||||||
|
// Reset resets the state and allows a Transformer to be reused.
|
||||||
|
Reset()
|
||||||
|
}
|
||||||
|
|
||||||
|
// SpanningTransformer extends the Transformer interface with a Span method
|
||||||
|
// that determines how much of the input already conforms to the Transformer.
|
||||||
|
type SpanningTransformer interface {
|
||||||
|
Transformer
|
||||||
|
|
||||||
|
// Span returns a position in src such that transforming src[:n] results in
|
||||||
|
// identical output src[:n] for these bytes. It does not necessarily return
|
||||||
|
// the largest such n. The atEOF argument tells whether src represents the
|
||||||
|
// last bytes of the input.
|
||||||
|
//
|
||||||
|
// Callers should always account for the n bytes consumed before
|
||||||
|
// considering the error err.
|
||||||
|
//
|
||||||
|
// A nil error means that all input bytes are known to be identical to the
|
||||||
|
// output produced by the Transformer. A nil error can be returned
|
||||||
|
// regardless of whether atEOF is true. If err is nil, then n must
|
||||||
|
// equal len(src); the converse is not necessarily true.
|
||||||
|
//
|
||||||
|
// ErrEndOfSpan means that the Transformer output may differ from the
|
||||||
|
// input after n bytes. Note that n may be len(src), meaning that the output
|
||||||
|
// would contain additional bytes after otherwise identical output.
|
||||||
|
// ErrShortSrc means that src had insufficient data to determine whether the
|
||||||
|
// remaining bytes would change. Other than the error conditions listed
|
||||||
|
// here, implementations are free to report other errors that arise.
|
||||||
|
//
|
||||||
|
// Calling Span can modify the Transformer state as a side effect. In
|
||||||
|
// effect, it does the transformation just as calling Transform would, only
|
||||||
|
// without copying to a destination buffer and only up to a point it can
|
||||||
|
// determine the input and output bytes are the same. This is obviously more
|
||||||
|
// limited than calling Transform, but can be more efficient in terms of
|
||||||
|
// copying and allocating buffers. Calls to Span and Transform may be
|
||||||
|
// interleaved.
|
||||||
|
Span(src []byte, atEOF bool) (n int, err error)
|
||||||
|
}
|
||||||
|
|
||||||
|
// NopResetter can be embedded by implementations of Transformer to add a nop
|
||||||
|
// Reset method.
|
||||||
|
type NopResetter struct{}
|
||||||
|
|
||||||
|
// Reset implements the Reset method of the Transformer interface.
|
||||||
|
func (NopResetter) Reset() {}
|
||||||
|
|
||||||
|
// Reader wraps another io.Reader by transforming the bytes read.
|
||||||
|
type Reader struct {
|
||||||
|
r io.Reader
|
||||||
|
t Transformer
|
||||||
|
err error
|
||||||
|
|
||||||
|
// dst[dst0:dst1] contains bytes that have been transformed by t but
|
||||||
|
// not yet copied out via Read.
|
||||||
|
dst []byte
|
||||||
|
dst0, dst1 int
|
||||||
|
|
||||||
|
// src[src0:src1] contains bytes that have been read from r but not
|
||||||
|
// yet transformed through t.
|
||||||
|
src []byte
|
||||||
|
src0, src1 int
|
||||||
|
|
||||||
|
// transformComplete is whether the transformation is complete,
|
||||||
|
// regardless of whether or not it was successful.
|
||||||
|
transformComplete bool
|
||||||
|
}
|
||||||
|
|
||||||
|
const defaultBufSize = 4096
|
||||||
|
|
||||||
|
// NewReader returns a new Reader that wraps r by transforming the bytes read
|
||||||
|
// via t. It calls Reset on t.
|
||||||
|
func NewReader(r io.Reader, t Transformer) *Reader {
|
||||||
|
t.Reset()
|
||||||
|
return &Reader{
|
||||||
|
r: r,
|
||||||
|
t: t,
|
||||||
|
dst: make([]byte, defaultBufSize),
|
||||||
|
src: make([]byte, defaultBufSize),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Read implements the io.Reader interface.
|
||||||
|
func (r *Reader) Read(p []byte) (int, error) {
|
||||||
|
n, err := 0, error(nil)
|
||||||
|
for {
|
||||||
|
// Copy out any transformed bytes and return the final error if we are done.
|
||||||
|
if r.dst0 != r.dst1 {
|
||||||
|
n = copy(p, r.dst[r.dst0:r.dst1])
|
||||||
|
r.dst0 += n
|
||||||
|
if r.dst0 == r.dst1 && r.transformComplete {
|
||||||
|
return n, r.err
|
||||||
|
}
|
||||||
|
return n, nil
|
||||||
|
} else if r.transformComplete {
|
||||||
|
return 0, r.err
|
||||||
|
}
|
||||||
|
|
||||||
|
// Try to transform some source bytes, or to flush the transformer if we
|
||||||
|
// are out of source bytes. We do this even if r.r.Read returned an error.
|
||||||
|
// As the io.Reader documentation says, "process the n > 0 bytes returned
|
||||||
|
// before considering the error".
|
||||||
|
if r.src0 != r.src1 || r.err != nil {
|
||||||
|
r.dst0 = 0
|
||||||
|
r.dst1, n, err = r.t.Transform(r.dst, r.src[r.src0:r.src1], r.err == io.EOF)
|
||||||
|
r.src0 += n
|
||||||
|
|
||||||
|
switch {
|
||||||
|
case err == nil:
|
||||||
|
if r.src0 != r.src1 {
|
||||||
|
r.err = errInconsistentByteCount
|
||||||
|
}
|
||||||
|
// The Transform call was successful; we are complete if we
|
||||||
|
// cannot read more bytes into src.
|
||||||
|
r.transformComplete = r.err != nil
|
||||||
|
continue
|
||||||
|
case err == ErrShortDst && (r.dst1 != 0 || n != 0):
|
||||||
|
// Make room in dst by copying out, and try again.
|
||||||
|
continue
|
||||||
|
case err == ErrShortSrc && r.src1-r.src0 != len(r.src) && r.err == nil:
|
||||||
|
// Read more bytes into src via the code below, and try again.
|
||||||
|
default:
|
||||||
|
r.transformComplete = true
|
||||||
|
// The reader error (r.err) takes precedence over the
|
||||||
|
// transformer error (err) unless r.err is nil or io.EOF.
|
||||||
|
if r.err == nil || r.err == io.EOF {
|
||||||
|
r.err = err
|
||||||
|
}
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Move any untransformed source bytes to the start of the buffer
|
||||||
|
// and read more bytes.
|
||||||
|
if r.src0 != 0 {
|
||||||
|
r.src0, r.src1 = 0, copy(r.src, r.src[r.src0:r.src1])
|
||||||
|
}
|
||||||
|
n, r.err = r.r.Read(r.src[r.src1:])
|
||||||
|
r.src1 += n
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// TODO: implement ReadByte (and ReadRune??).
|
||||||
|
|
||||||
|
// Writer wraps another io.Writer by transforming the bytes read.
|
||||||
|
// The user needs to call Close to flush unwritten bytes that may
|
||||||
|
// be buffered.
|
||||||
|
type Writer struct {
|
||||||
|
w io.Writer
|
||||||
|
t Transformer
|
||||||
|
dst []byte
|
||||||
|
|
||||||
|
// src[:n] contains bytes that have not yet passed through t.
|
||||||
|
src []byte
|
||||||
|
n int
|
||||||
|
}
|
||||||
|
|
||||||
|
// NewWriter returns a new Writer that wraps w by transforming the bytes written
|
||||||
|
// via t. It calls Reset on t.
|
||||||
|
func NewWriter(w io.Writer, t Transformer) *Writer {
|
||||||
|
t.Reset()
|
||||||
|
return &Writer{
|
||||||
|
w: w,
|
||||||
|
t: t,
|
||||||
|
dst: make([]byte, defaultBufSize),
|
||||||
|
src: make([]byte, defaultBufSize),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Write implements the io.Writer interface. If there are not enough
|
||||||
|
// bytes available to complete a Transform, the bytes will be buffered
|
||||||
|
// for the next write. Call Close to convert the remaining bytes.
|
||||||
|
func (w *Writer) Write(data []byte) (n int, err error) {
|
||||||
|
src := data
|
||||||
|
if w.n > 0 {
|
||||||
|
// Append bytes from data to the last remainder.
|
||||||
|
// TODO: limit the amount copied on first try.
|
||||||
|
n = copy(w.src[w.n:], data)
|
||||||
|
w.n += n
|
||||||
|
src = w.src[:w.n]
|
||||||
|
}
|
||||||
|
for {
|
||||||
|
nDst, nSrc, err := w.t.Transform(w.dst, src, false)
|
||||||
|
if _, werr := w.w.Write(w.dst[:nDst]); werr != nil {
|
||||||
|
return n, werr
|
||||||
|
}
|
||||||
|
src = src[nSrc:]
|
||||||
|
if w.n == 0 {
|
||||||
|
n += nSrc
|
||||||
|
} else if len(src) <= n {
|
||||||
|
// Enough bytes from w.src have been consumed. We make src point
|
||||||
|
// to data instead to reduce the copying.
|
||||||
|
w.n = 0
|
||||||
|
n -= len(src)
|
||||||
|
src = data[n:]
|
||||||
|
if n < len(data) && (err == nil || err == ErrShortSrc) {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
}
|
||||||
|
switch err {
|
||||||
|
case ErrShortDst:
|
||||||
|
// This error is okay as long as we are making progress.
|
||||||
|
if nDst > 0 || nSrc > 0 {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
case ErrShortSrc:
|
||||||
|
if len(src) < len(w.src) {
|
||||||
|
m := copy(w.src, src)
|
||||||
|
// If w.n > 0, bytes from data were already copied to w.src and n
|
||||||
|
// was already set to the number of bytes consumed.
|
||||||
|
if w.n == 0 {
|
||||||
|
n += m
|
||||||
|
}
|
||||||
|
w.n = m
|
||||||
|
err = nil
|
||||||
|
} else if nDst > 0 || nSrc > 0 {
|
||||||
|
// Not enough buffer to store the remainder. Keep processing as
|
||||||
|
// long as there is progress. Without this case, transforms that
|
||||||
|
// require a lookahead larger than the buffer may result in an
|
||||||
|
// error. This is not something one may expect to be common in
|
||||||
|
// practice, but it may occur when buffers are set to small
|
||||||
|
// sizes during testing.
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
case nil:
|
||||||
|
if w.n > 0 {
|
||||||
|
err = errInconsistentByteCount
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return n, err
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Close implements the io.Closer interface.
|
||||||
|
func (w *Writer) Close() error {
|
||||||
|
src := w.src[:w.n]
|
||||||
|
for {
|
||||||
|
nDst, nSrc, err := w.t.Transform(w.dst, src, true)
|
||||||
|
if _, werr := w.w.Write(w.dst[:nDst]); werr != nil {
|
||||||
|
return werr
|
||||||
|
}
|
||||||
|
if err != ErrShortDst {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
src = src[nSrc:]
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
type nop struct{ NopResetter }
|
||||||
|
|
||||||
|
func (nop) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
|
||||||
|
n := copy(dst, src)
|
||||||
|
if n < len(src) {
|
||||||
|
err = ErrShortDst
|
||||||
|
}
|
||||||
|
return n, n, err
|
||||||
|
}
|
||||||
|
|
||||||
|
func (nop) Span(src []byte, atEOF bool) (n int, err error) {
|
||||||
|
return len(src), nil
|
||||||
|
}
|
||||||
|
|
||||||
|
type discard struct{ NopResetter }
|
||||||
|
|
||||||
|
func (discard) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
|
||||||
|
return 0, len(src), nil
|
||||||
|
}
|
||||||
|
|
||||||
|
var (
|
||||||
|
// Discard is a Transformer for which all Transform calls succeed
|
||||||
|
// by consuming all bytes and writing nothing.
|
||||||
|
Discard Transformer = discard{}
|
||||||
|
|
||||||
|
// Nop is a SpanningTransformer that copies src to dst.
|
||||||
|
Nop SpanningTransformer = nop{}
|
||||||
|
)
|
||||||
|
|
||||||
|
// chain is a sequence of links. A chain with N Transformers has N+1 links and
|
||||||
|
// N+1 buffers. Of those N+1 buffers, the first and last are the src and dst
|
||||||
|
// buffers given to chain.Transform and the middle N-1 buffers are intermediate
|
||||||
|
// buffers owned by the chain. The i'th link transforms bytes from the i'th
|
||||||
|
// buffer chain.link[i].b at read offset chain.link[i].p to the i+1'th buffer
|
||||||
|
// chain.link[i+1].b at write offset chain.link[i+1].n, for i in [0, N).
|
||||||
|
type chain struct {
|
||||||
|
link []link
|
||||||
|
err error
|
||||||
|
// errStart is the index at which the error occurred plus 1. Processing
|
||||||
|
// errStart at this level at the next call to Transform. As long as
|
||||||
|
// errStart > 0, chain will not consume any more source bytes.
|
||||||
|
errStart int
|
||||||
|
}
|
||||||
|
|
||||||
|
func (c *chain) fatalError(errIndex int, err error) {
|
||||||
|
if i := errIndex + 1; i > c.errStart {
|
||||||
|
c.errStart = i
|
||||||
|
c.err = err
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
type link struct {
|
||||||
|
t Transformer
|
||||||
|
// b[p:n] holds the bytes to be transformed by t.
|
||||||
|
b []byte
|
||||||
|
p int
|
||||||
|
n int
|
||||||
|
}
|
||||||
|
|
||||||
|
func (l *link) src() []byte {
|
||||||
|
return l.b[l.p:l.n]
|
||||||
|
}
|
||||||
|
|
||||||
|
func (l *link) dst() []byte {
|
||||||
|
return l.b[l.n:]
|
||||||
|
}
|
||||||
|
|
||||||
|
// Chain returns a Transformer that applies t in sequence.
|
||||||
|
func Chain(t ...Transformer) Transformer {
|
||||||
|
if len(t) == 0 {
|
||||||
|
return nop{}
|
||||||
|
}
|
||||||
|
c := &chain{link: make([]link, len(t)+1)}
|
||||||
|
for i, tt := range t {
|
||||||
|
c.link[i].t = tt
|
||||||
|
}
|
||||||
|
// Allocate intermediate buffers.
|
||||||
|
b := make([][defaultBufSize]byte, len(t)-1)
|
||||||
|
for i := range b {
|
||||||
|
c.link[i+1].b = b[i][:]
|
||||||
|
}
|
||||||
|
return c
|
||||||
|
}
|
||||||
|
|
||||||
|
// Reset resets the state of Chain. It calls Reset on all the Transformers.
|
||||||
|
func (c *chain) Reset() {
|
||||||
|
for i, l := range c.link {
|
||||||
|
if l.t != nil {
|
||||||
|
l.t.Reset()
|
||||||
|
}
|
||||||
|
c.link[i].p, c.link[i].n = 0, 0
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// TODO: make chain use Span (is going to be fun to implement!)
|
||||||
|
|
||||||
|
// Transform applies the transformers of c in sequence.
|
||||||
|
func (c *chain) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
|
||||||
|
// Set up src and dst in the chain.
|
||||||
|
srcL := &c.link[0]
|
||||||
|
dstL := &c.link[len(c.link)-1]
|
||||||
|
srcL.b, srcL.p, srcL.n = src, 0, len(src)
|
||||||
|
dstL.b, dstL.n = dst, 0
|
||||||
|
var lastFull, needProgress bool // for detecting progress
|
||||||
|
|
||||||
|
// i is the index of the next Transformer to apply, for i in [low, high].
|
||||||
|
// low is the lowest index for which c.link[low] may still produce bytes.
|
||||||
|
// high is the highest index for which c.link[high] has a Transformer.
|
||||||
|
// The error returned by Transform determines whether to increase or
|
||||||
|
// decrease i. We try to completely fill a buffer before converting it.
|
||||||
|
for low, i, high := c.errStart, c.errStart, len(c.link)-2; low <= i && i <= high; {
|
||||||
|
in, out := &c.link[i], &c.link[i+1]
|
||||||
|
nDst, nSrc, err0 := in.t.Transform(out.dst(), in.src(), atEOF && low == i)
|
||||||
|
out.n += nDst
|
||||||
|
in.p += nSrc
|
||||||
|
if i > 0 && in.p == in.n {
|
||||||
|
in.p, in.n = 0, 0
|
||||||
|
}
|
||||||
|
needProgress, lastFull = lastFull, false
|
||||||
|
switch err0 {
|
||||||
|
case ErrShortDst:
|
||||||
|
// Process the destination buffer next. Return if we are already
|
||||||
|
// at the high index.
|
||||||
|
if i == high {
|
||||||
|
return dstL.n, srcL.p, ErrShortDst
|
||||||
|
}
|
||||||
|
if out.n != 0 {
|
||||||
|
i++
|
||||||
|
// If the Transformer at the next index is not able to process any
|
||||||
|
// source bytes there is nothing that can be done to make progress
|
||||||
|
// and the bytes will remain unprocessed. lastFull is used to
|
||||||
|
// detect this and break out of the loop with a fatal error.
|
||||||
|
lastFull = true
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
// The destination buffer was too small, but is completely empty.
|
||||||
|
// Return a fatal error as this transformation can never complete.
|
||||||
|
c.fatalError(i, errShortInternal)
|
||||||
|
case ErrShortSrc:
|
||||||
|
if i == 0 {
|
||||||
|
// Save ErrShortSrc in err. All other errors take precedence.
|
||||||
|
err = ErrShortSrc
|
||||||
|
break
|
||||||
|
}
|
||||||
|
// Source bytes were depleted before filling up the destination buffer.
|
||||||
|
// Verify we made some progress, move the remaining bytes to the errStart
|
||||||
|
// and try to get more source bytes.
|
||||||
|
if needProgress && nSrc == 0 || in.n-in.p == len(in.b) {
|
||||||
|
// There were not enough source bytes to proceed while the source
|
||||||
|
// buffer cannot hold any more bytes. Return a fatal error as this
|
||||||
|
// transformation can never complete.
|
||||||
|
c.fatalError(i, errShortInternal)
|
||||||
|
break
|
||||||
|
}
|
||||||
|
// in.b is an internal buffer and we can make progress.
|
||||||
|
in.p, in.n = 0, copy(in.b, in.src())
|
||||||
|
fallthrough
|
||||||
|
case nil:
|
||||||
|
// if i == low, we have depleted the bytes at index i or any lower levels.
|
||||||
|
// In that case we increase low and i. In all other cases we decrease i to
|
||||||
|
// fetch more bytes before proceeding to the next index.
|
||||||
|
if i > low {
|
||||||
|
i--
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
default:
|
||||||
|
c.fatalError(i, err0)
|
||||||
|
}
|
||||||
|
// Exhausted level low or fatal error: increase low and continue
|
||||||
|
// to process the bytes accepted so far.
|
||||||
|
i++
|
||||||
|
low = i
|
||||||
|
}
|
||||||
|
|
||||||
|
// If c.errStart > 0, this means we found a fatal error. We will clear
|
||||||
|
// all upstream buffers. At this point, no more progress can be made
|
||||||
|
// downstream, as Transform would have bailed while handling ErrShortDst.
|
||||||
|
if c.errStart > 0 {
|
||||||
|
for i := 1; i < c.errStart; i++ {
|
||||||
|
c.link[i].p, c.link[i].n = 0, 0
|
||||||
|
}
|
||||||
|
err, c.errStart, c.err = c.err, 0, nil
|
||||||
|
}
|
||||||
|
return dstL.n, srcL.p, err
|
||||||
|
}
|
||||||
|
|
||||||
|
// Deprecated: Use runes.Remove instead.
|
||||||
|
func RemoveFunc(f func(r rune) bool) Transformer {
|
||||||
|
return removeF(f)
|
||||||
|
}
|
||||||
|
|
||||||
|
type removeF func(r rune) bool
|
||||||
|
|
||||||
|
func (removeF) Reset() {}
|
||||||
|
|
||||||
|
// Transform implements the Transformer interface.
|
||||||
|
func (t removeF) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
|
||||||
|
for r, sz := rune(0), 0; len(src) > 0; src = src[sz:] {
|
||||||
|
|
||||||
|
if r = rune(src[0]); r < utf8.RuneSelf {
|
||||||
|
sz = 1
|
||||||
|
} else {
|
||||||
|
r, sz = utf8.DecodeRune(src)
|
||||||
|
|
||||||
|
if sz == 1 {
|
||||||
|
// Invalid rune.
|
||||||
|
if !atEOF && !utf8.FullRune(src) {
|
||||||
|
err = ErrShortSrc
|
||||||
|
break
|
||||||
|
}
|
||||||
|
// We replace illegal bytes with RuneError. Not doing so might
|
||||||
|
// otherwise turn a sequence of invalid UTF-8 into valid UTF-8.
|
||||||
|
// The resulting byte sequence may subsequently contain runes
|
||||||
|
// for which t(r) is true that were passed unnoticed.
|
||||||
|
if !t(r) {
|
||||||
|
if nDst+3 > len(dst) {
|
||||||
|
err = ErrShortDst
|
||||||
|
break
|
||||||
|
}
|
||||||
|
nDst += copy(dst[nDst:], "\uFFFD")
|
||||||
|
}
|
||||||
|
nSrc++
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if !t(r) {
|
||||||
|
if nDst+sz > len(dst) {
|
||||||
|
err = ErrShortDst
|
||||||
|
break
|
||||||
|
}
|
||||||
|
nDst += copy(dst[nDst:], src[:sz])
|
||||||
|
}
|
||||||
|
nSrc += sz
|
||||||
|
}
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
// grow returns a new []byte that is longer than b, and copies the first n bytes
|
||||||
|
// of b to the start of the new slice.
|
||||||
|
func grow(b []byte, n int) []byte {
|
||||||
|
m := len(b)
|
||||||
|
if m <= 32 {
|
||||||
|
m = 64
|
||||||
|
} else if m <= 256 {
|
||||||
|
m *= 2
|
||||||
|
} else {
|
||||||
|
m += m >> 1
|
||||||
|
}
|
||||||
|
buf := make([]byte, m)
|
||||||
|
copy(buf, b[:n])
|
||||||
|
return buf
|
||||||
|
}
|
||||||
|
|
||||||
|
const initialBufSize = 128
|
||||||
|
|
||||||
|
// String returns a string with the result of converting s[:n] using t, where
|
||||||
|
// n <= len(s). If err == nil, n will be len(s). It calls Reset on t.
|
||||||
|
func String(t Transformer, s string) (result string, n int, err error) {
|
||||||
|
t.Reset()
|
||||||
|
if s == "" {
|
||||||
|
// Fast path for the common case for empty input. Results in about a
|
||||||
|
// 86% reduction of running time for BenchmarkStringLowerEmpty.
|
||||||
|
if _, _, err := t.Transform(nil, nil, true); err == nil {
|
||||||
|
return "", 0, nil
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Allocate only once. Note that both dst and src escape when passed to
|
||||||
|
// Transform.
|
||||||
|
buf := [2 * initialBufSize]byte{}
|
||||||
|
dst := buf[:initialBufSize:initialBufSize]
|
||||||
|
src := buf[initialBufSize : 2*initialBufSize]
|
||||||
|
|
||||||
|
// The input string s is transformed in multiple chunks (starting with a
|
||||||
|
// chunk size of initialBufSize). nDst and nSrc are per-chunk (or
|
||||||
|
// per-Transform-call) indexes, pDst and pSrc are overall indexes.
|
||||||
|
nDst, nSrc := 0, 0
|
||||||
|
pDst, pSrc := 0, 0
|
||||||
|
|
||||||
|
// pPrefix is the length of a common prefix: the first pPrefix bytes of the
|
||||||
|
// result will equal the first pPrefix bytes of s. It is not guaranteed to
|
||||||
|
// be the largest such value, but if pPrefix, len(result) and len(s) are
|
||||||
|
// all equal after the final transform (i.e. calling Transform with atEOF
|
||||||
|
// being true returned nil error) then we don't need to allocate a new
|
||||||
|
// result string.
|
||||||
|
pPrefix := 0
|
||||||
|
for {
|
||||||
|
// Invariant: pDst == pPrefix && pSrc == pPrefix.
|
||||||
|
|
||||||
|
n := copy(src, s[pSrc:])
|
||||||
|
nDst, nSrc, err = t.Transform(dst, src[:n], pSrc+n == len(s))
|
||||||
|
pDst += nDst
|
||||||
|
pSrc += nSrc
|
||||||
|
|
||||||
|
// TODO: let transformers implement an optional Spanner interface, akin
|
||||||
|
// to norm's QuickSpan. This would even allow us to avoid any allocation.
|
||||||
|
if !bytes.Equal(dst[:nDst], src[:nSrc]) {
|
||||||
|
break
|
||||||
|
}
|
||||||
|
pPrefix = pSrc
|
||||||
|
if err == ErrShortDst {
|
||||||
|
// A buffer can only be short if a transformer modifies its input.
|
||||||
|
break
|
||||||
|
} else if err == ErrShortSrc {
|
||||||
|
if nSrc == 0 {
|
||||||
|
// No progress was made.
|
||||||
|
break
|
||||||
|
}
|
||||||
|
// Equal so far and !atEOF, so continue checking.
|
||||||
|
} else if err != nil || pPrefix == len(s) {
|
||||||
|
return string(s[:pPrefix]), pPrefix, err
|
||||||
|
}
|
||||||
|
}
|
||||||
|
// Post-condition: pDst == pPrefix + nDst && pSrc == pPrefix + nSrc.
|
||||||
|
|
||||||
|
// We have transformed the first pSrc bytes of the input s to become pDst
|
||||||
|
// transformed bytes. Those transformed bytes are discontiguous: the first
|
||||||
|
// pPrefix of them equal s[:pPrefix] and the last nDst of them equal
|
||||||
|
// dst[:nDst]. We copy them around, into a new dst buffer if necessary, so
|
||||||
|
// that they become one contiguous slice: dst[:pDst].
|
||||||
|
if pPrefix != 0 {
|
||||||
|
newDst := dst
|
||||||
|
if pDst > len(newDst) {
|
||||||
|
newDst = make([]byte, len(s)+nDst-nSrc)
|
||||||
|
}
|
||||||
|
copy(newDst[pPrefix:pDst], dst[:nDst])
|
||||||
|
copy(newDst[:pPrefix], s[:pPrefix])
|
||||||
|
dst = newDst
|
||||||
|
}
|
||||||
|
|
||||||
|
// Prevent duplicate Transform calls with atEOF being true at the end of
|
||||||
|
// the input. Also return if we have an unrecoverable error.
|
||||||
|
if (err == nil && pSrc == len(s)) ||
|
||||||
|
(err != nil && err != ErrShortDst && err != ErrShortSrc) {
|
||||||
|
return string(dst[:pDst]), pSrc, err
|
||||||
|
}
|
||||||
|
|
||||||
|
// Transform the remaining input, growing dst and src buffers as necessary.
|
||||||
|
for {
|
||||||
|
n := copy(src, s[pSrc:])
|
||||||
|
atEOF := pSrc+n == len(s)
|
||||||
|
nDst, nSrc, err := t.Transform(dst[pDst:], src[:n], atEOF)
|
||||||
|
pDst += nDst
|
||||||
|
pSrc += nSrc
|
||||||
|
|
||||||
|
// If we got ErrShortDst or ErrShortSrc, do not grow as long as we can
|
||||||
|
// make progress. This may avoid excessive allocations.
|
||||||
|
if err == ErrShortDst {
|
||||||
|
if nDst == 0 {
|
||||||
|
dst = grow(dst, pDst)
|
||||||
|
}
|
||||||
|
} else if err == ErrShortSrc {
|
||||||
|
if atEOF {
|
||||||
|
return string(dst[:pDst]), pSrc, err
|
||||||
|
}
|
||||||
|
if nSrc == 0 {
|
||||||
|
src = grow(src, 0)
|
||||||
|
}
|
||||||
|
} else if err != nil || pSrc == len(s) {
|
||||||
|
return string(dst[:pDst]), pSrc, err
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Bytes returns a new byte slice with the result of converting b[:n] using t,
|
||||||
|
// where n <= len(b). If err == nil, n will be len(b). It calls Reset on t.
|
||||||
|
func Bytes(t Transformer, b []byte) (result []byte, n int, err error) {
|
||||||
|
return doAppend(t, 0, make([]byte, len(b)), b)
|
||||||
|
}
|
||||||
|
|
||||||
|
// Append appends the result of converting src[:n] using t to dst, where
|
||||||
|
// n <= len(src), If err == nil, n will be len(src). It calls Reset on t.
|
||||||
|
func Append(t Transformer, dst, src []byte) (result []byte, n int, err error) {
|
||||||
|
if len(dst) == cap(dst) {
|
||||||
|
n := len(src) + len(dst) // It is okay for this to be 0.
|
||||||
|
b := make([]byte, n)
|
||||||
|
dst = b[:copy(b, dst)]
|
||||||
|
}
|
||||||
|
return doAppend(t, len(dst), dst[:cap(dst)], src)
|
||||||
|
}
|
||||||
|
|
||||||
|
func doAppend(t Transformer, pDst int, dst, src []byte) (result []byte, n int, err error) {
|
||||||
|
t.Reset()
|
||||||
|
pSrc := 0
|
||||||
|
for {
|
||||||
|
nDst, nSrc, err := t.Transform(dst[pDst:], src[pSrc:], true)
|
||||||
|
pDst += nDst
|
||||||
|
pSrc += nSrc
|
||||||
|
if err != ErrShortDst {
|
||||||
|
return dst[:pDst], pSrc, err
|
||||||
|
}
|
||||||
|
|
||||||
|
// Grow the destination buffer, but do not grow as long as we can make
|
||||||
|
// progress. This may avoid excessive allocations.
|
||||||
|
if nDst == 0 {
|
||||||
|
dst = grow(dst, pDst)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
Loading…
Reference in New Issue