Skip to content

Commit

Permalink
fix: Support use with bufio, including strings larger than 4096 bytes.
Browse files Browse the repository at this point in the history
  • Loading branch information
tomtwinkle committed Jan 23, 2024
1 parent 4a5af67 commit 98787c1
Show file tree
Hide file tree
Showing 2 changed files with 148 additions and 15 deletions.
30 changes: 21 additions & 9 deletions main.go
Original file line number Diff line number Diff line change
Expand Up @@ -32,15 +32,27 @@ func (t *replacer) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err e
if len(_src) == 0 && atEOF {
return
}
if !utf8.Valid(_src) {
// If not a string, do not process
err = ErrInvalidUTF8
return
}

for len(_src) > 0 {
_, n := utf8.DecodeRune(_src)
buf := _src[:n]
r, size := utf8.DecodeRune(_src)
if r < utf8.RuneSelf {
size = 1
} else {
if size == 1 {
// All valid runes of size 1 (those below utf8.RuneSelf) were
// handled above. We have invalid UTF-8, or we haven't seen the
// full character yet.
if !atEOF && !utf8.FullRune(_src) {
err = transform.ErrShortSrc
break
}
// At EOF, trailing bytes that do not form a complete rune are not replaced.
if atEOF && !utf8.FullRune(_src) {
break
}
}
}
buf := _src[:size]
if _, encErr := t.enc.Bytes(buf); encErr != nil {
// Replace strings that cannot be converted
buf = []byte(string(t.replaceRune))
Expand All @@ -54,9 +66,9 @@ func (t *replacer) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err e
if dstN <= 0 {
break
}
nSrc += n
nSrc += size
nDst += dstN
_src = _src[n:]
_src = _src[size:]
}
return
}
133 changes: 127 additions & 6 deletions main_test.go
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
package garbledreplacer_test

import (
"bufio"
"bytes"
"errors"
"github.com/tomtwinkle/garbledreplacer"
Expand Down Expand Up @@ -46,10 +47,10 @@ func TestNewTransformer(t *testing.T) {
want: "?",
},
"UTF-8->ShiftJIS:Invalid UTF-8 character": {
encoding: japanese.ShiftJIS,
in: "\xe4",
replace: '?',
wantError: garbledreplacer.ErrInvalidUTF8,
encoding: japanese.ShiftJIS,
in: "\xe4",
replace: '?',
want: "",
},
"UTF-8->EUCJP:with garbled text": {
encoding: japanese.EUCJP,
Expand All @@ -63,6 +64,18 @@ func TestNewTransformer(t *testing.T) {
replace: '?',
want: strings.Repeat("咖呸咕咀呻?呷咄咒咆呼咐?呱呶和咚呢", 3000),
},
"UTF-8->ShiftJIS:with garbled text:larger than 4096bytes": {
encoding: japanese.ShiftJIS,
in: strings.Repeat("一二三四🍣五六七八九🍺十拾壱", 4000),
replace: '?',
want: strings.Repeat("一二三四?五六七八九?十拾壱", 4000),
},
"UTF-8->ShiftJIS:all garbled text:larger than 4096bytes": {
encoding: japanese.ShiftJIS,
in: strings.Repeat("🍣🍣🍣🍺🍣🍣🍣", 4000),
replace: '?',
want: strings.Repeat("???????", 4000),
},
}

for n, v := range tests {
Expand All @@ -72,8 +85,12 @@ func TestNewTransformer(t *testing.T) {
t.Run(name, func(t *testing.T) {
var buf bytes.Buffer
w := transform.NewWriter(&buf, garbledreplacer.NewTransformer(tt.encoding, tt.replace))
if _, err := w.Write([]byte(tt.in)); err != nil {
if tt.wantError != nil && errors.Is(err, tt.wantError) {
_, err := w.Write([]byte(tt.in))
if tt.wantError != nil {
if err == nil {
t.Errorf("want error %v, got nil", tt.wantError)
}
if errors.Is(err, tt.wantError) {
return
}
t.Error(err)
Expand Down Expand Up @@ -101,6 +118,110 @@ func TestNewTransformer(t *testing.T) {
}
}

// TestNewTransformerWithBufio checks the replacing transformer when it is
// driven through a bufio.Writer instead of being written to directly.
//
// Each case writes tt.in through bufio -> transform.Writer -> bytes.Buffer,
// flushes the bufio.Writer, decodes the captured bytes with the target
// encoding's decoder, and compares the decoded text with tt.want. Runes that
// the target encoding cannot represent are expected to show up as tt.replace.
//
// Only the bufio.Writer is flushed; the transform.Writer underneath it is
// never closed here, so the transformer is not driven through a final Close
// call by this test.
func TestNewTransformerWithBufio(t *testing.T) {
	tests := map[string]struct {
		encoding  encoding.Encoding
		in        string
		replace   rune
		want      string
		wantError error
	}{
		"UTF-8->ShiftJIS:no garbled text": {
			encoding: japanese.ShiftJIS,
			in:       strings.Repeat("一二三四五六七八九十拾壱", 1000),
			replace:  '?',
			want:     strings.Repeat("一二三四五六七八九十拾壱", 1000),
		},
		"UTF-8->ShiftJIS:with garbled text": {
			encoding: japanese.ShiftJIS,
			in:       strings.Repeat("一二三四五六七八九十拾壱🍺", 1000),
			replace:  '?',
			want:     strings.Repeat("一二三四五六七八九十拾壱?", 1000),
		},
		"UTF-8->ShiftJIS:with garbled text:other replaceRune": {
			encoding: japanese.ShiftJIS,
			in:       strings.Repeat("一二三四🍣五六七八九🍺十拾壱", 3000),
			replace:  '@',
			want:     strings.Repeat("一二三四@五六七八九@十拾壱", 3000),
		},
		"UTF-8->ShiftJIS:RuneError only": {
			encoding: japanese.ShiftJIS,
			in:       string(utf8.RuneError),
			replace:  '?',
			want:     "?",
		},
		// A lone lead byte is an incomplete rune; the expected output is empty.
		"UTF-8->ShiftJIS:Invalid UTF-8 character": {
			encoding: japanese.ShiftJIS,
			in:       "\xe4",
			replace:  '?',
			want:     "",
		},
		"UTF-8->EUCJP:with garbled text": {
			encoding: japanese.EUCJP,
			in:       strings.Repeat("一二三四🍣五六七八九🍺十拾壱", 3000),
			replace:  '?',
			want:     strings.Repeat("一二三四?五六七八九?十拾壱", 3000),
		},
		"UTF-8->Big5:with garbled text": {
			encoding: traditionalchinese.Big5,
			in:       strings.Repeat("咖呸咕咀呻🍣呷咄咒咆呼咐🍺呱呶和咚呢", 3000),
			replace:  '?',
			want:     strings.Repeat("咖呸咕咀呻?呷咄咒咆呼咐?呱呶和咚呢", 3000),
		},
		"UTF-8->ShiftJIS:with garbled text:larger than 4096bytes": {
			encoding: japanese.ShiftJIS,
			in:       strings.Repeat("一二三四🍣五六七八九🍺十拾壱", 4000),
			replace:  '?',
			want:     strings.Repeat("一二三四?五六七八九?十拾壱", 4000),
		},
		"UTF-8->ShiftJIS:all garbled text:larger than 4096bytes": {
			encoding: japanese.ShiftJIS,
			in:       strings.Repeat("🍣🍣🍣🍺🍣🍣🍣", 4000),
			replace:  '?',
			want:     strings.Repeat("???????", 4000),
		},
	}

	for n, v := range tests {
		name := n
		tt := v

		t.Run(name, func(t *testing.T) {
			var buf bytes.Buffer
			w := bufio.NewWriter(transform.NewWriter(&buf, garbledreplacer.NewTransformer(tt.encoding, tt.replace)))

			// bufio may hold short inputs in its own buffer, so an error from
			// the underlying writer can first surface on Flush rather than on
			// WriteString. Treat write+flush as one step before judging it.
			_, err := w.WriteString(tt.in)
			if err == nil {
				err = w.Flush()
			}

			if tt.wantError != nil {
				if !errors.Is(err, tt.wantError) {
					t.Errorf("want error %v, got %v", tt.wantError, err)
				}
				return
			}
			if err != nil {
				// Nothing meaningful can be compared after a failed write.
				t.Fatal(err)
			}

			// Decode what was produced back to UTF-8 so it can be compared
			// against the expected string.
			var actual bytes.Buffer
			aw := transform.NewWriter(&actual, tt.encoding.NewDecoder())
			if _, err := aw.Write(buf.Bytes()); err != nil {
				t.Error(err)
			}
			if err := aw.Close(); err != nil {
				t.Error(err)
			}

			got := actual.String()
			if wantLen, gotLen := len([]rune(tt.want)), len([]rune(got)); wantLen != gotLen {
				t.Errorf("string length does not match %d=%d", wantLen, gotLen)
			}
			if tt.want != got {
				t.Errorf("string does not match\n%s", got)
			}
		})
	}
}

// nolint: typecheck
func FuzzTransformer(f *testing.F) {
f.Skip()
Expand Down

0 comments on commit 98787c1

Please sign in to comment.