From 98787c1cc0db10490e09ea06bc4c89b2807f57be Mon Sep 17 00:00:00 2001 From: tom twinkle Date: Tue, 23 Jan 2024 14:04:08 +0900 Subject: [PATCH] fix: Supports bufio use also for strings larger than 4096 bytes. --- main.go | 30 ++++++++---- main_test.go | 133 ++++++++++++++++++++++++++++++++++++++++++++++++--- 2 files changed, 148 insertions(+), 15 deletions(-) diff --git a/main.go b/main.go index c84a484..86a3e25 100644 --- a/main.go +++ b/main.go @@ -32,15 +32,27 @@ func (t *replacer) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err e if len(_src) == 0 && atEOF { return } - if !utf8.Valid(_src) { - // If not a string, do not process - err = ErrInvalidUTF8 - return - } for len(_src) > 0 { - _, n := utf8.DecodeRune(_src) - buf := _src[:n] + r, size := utf8.DecodeRune(_src) + if r < utf8.RuneSelf { + size = 1 + } else { + if size == 1 { + // All valid runes of size 1 (those below utf8.RuneSelf) were + // handled above. We have invalid UTF-8, or we haven't seen the + // full character yet. + if !atEOF && !utf8.FullRune(_src) { + err = transform.ErrShortSrc + break + } + // If the last string cannot be converted to rune, it is not replaced. + if atEOF && !utf8.FullRune(_src) { + break + } + } + } + buf := _src[:size] if _, encErr := t.enc.Bytes(buf); encErr != nil { // Replace strings that cannot be converted buf = []byte(string(t.replaceRune)) @@ -54,9 +66,9 @@ func (t *replacer) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err e if dstN <= 0 { break } - nSrc += n + nSrc += size nDst += dstN - _src = _src[n:] + _src = _src[size:] } return } diff --git a/main_test.go b/main_test.go index 8324af2..5b9135a 100644 --- a/main_test.go +++ b/main_test.go @@ -1,6 +1,7 @@ package garbledreplacer_test import ( + "bufio" "bytes" "errors" "github.com/tomtwinkle/garbledreplacer" @@ -46,10 +47,10 @@ func TestNewTransformer(t *testing.T) { want: "?", }, "UTF-8->ShiftJIS:Invalid UTF-8 character": { - encoding: japanese.ShiftJIS, - in: "\xe4", - replace: '?', - wantError: garbledreplacer.ErrInvalidUTF8, + encoding: japanese.ShiftJIS, + in: "\xe4", + replace: '?', + want: "", }, "UTF-8->EUCJP:with garbled text": { encoding: japanese.EUCJP, @@ -63,6 +64,18 @@ func TestNewTransformer(t *testing.T) { replace: '?', want: strings.Repeat("咖呸咕咀呻?呷咄咒咆呼咐?呱呶和咚呢", 3000), }, + "UTF-8->ShiftJIS:with garbled text:larger than 4096bytes": { + encoding: japanese.ShiftJIS, + in: strings.Repeat("一二三四🍣五六七八九🍺十拾壱", 4000), + replace: '?', + want: strings.Repeat("一二三四?五六七八九?十拾壱", 4000), + }, + "UTF-8->ShiftJIS:all garbled text:larger than 4096bytes": { + encoding: japanese.ShiftJIS, + in: strings.Repeat("🍣🍣🍣🍺🍣🍣🍣", 4000), + replace: '?', + want: strings.Repeat("???????", 4000), + }, } for n, v := range tests { @@ -72,8 +85,12 @@ func TestNewTransformer(t *testing.T) { t.Run(name, func(t *testing.T) { var buf bytes.Buffer w := transform.NewWriter(&buf, garbledreplacer.NewTransformer(tt.encoding, tt.replace)) - if _, err := w.Write([]byte(tt.in)); err != nil { - if tt.wantError != nil && errors.Is(err, tt.wantError) { + _, err := w.Write([]byte(tt.in)) + if tt.wantError != nil { + if err == nil { + t.Errorf("want error %v, got nil", tt.wantError) + } + if errors.Is(err, tt.wantError) { return } t.Error(err) @@ -101,6 +118,110 @@ func TestNewTransformer(t *testing.T) { } } +func TestNewTransformerWithBufio(t *testing.T) { + tests := map[string]struct { + encoding encoding.Encoding + in string + replace rune + want string + wantError error + }{ + "UTF-8->ShiftJIS:no garbled text": { + encoding: japanese.ShiftJIS, + in: strings.Repeat("一二三四五六七八九十拾壱", 1000), + replace: '?', + want: strings.Repeat("一二三四五六七八九十拾壱", 1000), + }, + "UTF-8->ShiftJIS:with garbled text": { + encoding: japanese.ShiftJIS, + in: strings.Repeat("一二三四五六七八九十拾壱🍺", 1000), + replace: '?', + want: strings.Repeat("一二三四五六七八九十拾壱?", 1000), + }, + "UTF-8->ShiftJIS:with garbled text:other replaceRune": { + encoding: japanese.ShiftJIS, + in: strings.Repeat("一二三四🍣五六七八九🍺十拾壱", 3000), + replace: '@', + want: strings.Repeat("一二三四@五六七八九@十拾壱", 3000), + }, + "UTF-8->ShiftJIS:RuneError only": { + encoding: japanese.ShiftJIS, + in: string(utf8.RuneError), + replace: '?', + want: "?", + }, + "UTF-8->ShiftJIS:Invalid UTF-8 character": { + encoding: japanese.ShiftJIS, + in: "\xe4", + replace: '?', + want: "", + }, + "UTF-8->EUCJP:with garbled text": { + encoding: japanese.EUCJP, + in: strings.Repeat("一二三四🍣五六七八九🍺十拾壱", 3000), + replace: '?', + want: strings.Repeat("一二三四?五六七八九?十拾壱", 3000), + }, + "UTF-8->Big5:with garbled text": { + encoding: traditionalchinese.Big5, + in: strings.Repeat("咖呸咕咀呻🍣呷咄咒咆呼咐🍺呱呶和咚呢", 3000), + replace: '?', + want: strings.Repeat("咖呸咕咀呻?呷咄咒咆呼咐?呱呶和咚呢", 3000), + }, + "UTF-8->ShiftJIS:with garbled text:larger than 4096bytes": { + encoding: japanese.ShiftJIS, + in: strings.Repeat("一二三四🍣五六七八九🍺十拾壱", 4000), + replace: '?', + want: strings.Repeat("一二三四?五六七八九?十拾壱", 4000), + }, + "UTF-8->ShiftJIS:all garbled text:larger than 4096bytes": { + encoding: japanese.ShiftJIS, + in: strings.Repeat("🍣🍣🍣🍺🍣🍣🍣", 4000), + replace: '?', + want: strings.Repeat("???????", 4000), + }, + } + + for n, v := range tests { + name := n + tt := v + + t.Run(name, func(t *testing.T) { + var buf bytes.Buffer + w := bufio.NewWriter(transform.NewWriter(&buf, garbledreplacer.NewTransformer(tt.encoding, tt.replace))) + _, err := w.WriteString(tt.in) + if tt.wantError != nil { + if err == nil { + t.Errorf("want error %v, got nil", tt.wantError) + } + if errors.Is(err, tt.wantError) { + return + } + t.Error(err) + } + if err := w.Flush(); err != nil { + t.Error(err) + } + + var actual bytes.Buffer + aw := transform.NewWriter(&actual, tt.encoding.NewDecoder()) + if _, err := aw.Write(buf.Bytes()); err != nil { + t.Error(err) + } + if err := aw.Close(); err != nil { + t.Error(err) + } + + if len([]rune(tt.want)) != len([]rune(actual.String())) { + t.Errorf("string length does not match %d=%d", len([]rune(tt.want)), len([]rune(actual.String()))) + } + if tt.want != actual.String() { + t.Errorf("string does not match\n%s", actual.String()) + } + }) + } +} + // nolint: typecheck func FuzzTransformer(f *testing.F) { f.Skip()