Detect encoding changes while parsing diff (#16330)

* Detect encoding changes while parsing diff
This commit is contained in:
Jimmy Praet 2021-07-13 03:13:52 +02:00 committed by GitHub
parent 2614309a58
commit 4ce32c9e93
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

View File

@@ -32,6 +32,7 @@ import (
"github.com/sergi/go-diff/diffmatchpatch" "github.com/sergi/go-diff/diffmatchpatch"
stdcharset "golang.org/x/net/html/charset" stdcharset "golang.org/x/net/html/charset"
"golang.org/x/text/encoding"
"golang.org/x/text/transform" "golang.org/x/text/transform"
) )
@@ -883,35 +884,46 @@ parsingLoop:
} }
// FIXME: There are numerous issues with this: // TODO: There are numerous issues with this:
// - we might want to consider detecting encoding while parsing but... // - we might want to consider detecting encoding while parsing but...
// - we're likely to fail to get the correct encoding here anyway as we won't have enough information // - we're likely to fail to get the correct encoding here anyway as we won't have enough information
// - and this doesn't really account for changes in encoding var diffLineTypeBuffers = make(map[DiffLineType]*bytes.Buffer, 3)
var buf bytes.Buffer var diffLineTypeDecoders = make(map[DiffLineType]*encoding.Decoder, 3)
diffLineTypeBuffers[DiffLinePlain] = new(bytes.Buffer)
diffLineTypeBuffers[DiffLineAdd] = new(bytes.Buffer)
diffLineTypeBuffers[DiffLineDel] = new(bytes.Buffer)
for _, f := range diff.Files { for _, f := range diff.Files {
buf.Reset() for _, buffer := range diffLineTypeBuffers {
buffer.Reset()
}
for _, sec := range f.Sections { for _, sec := range f.Sections {
for _, l := range sec.Lines { for _, l := range sec.Lines {
if l.Type == DiffLineSection { if l.Type == DiffLineSection {
continue continue
} }
buf.WriteString(l.Content[1:]) diffLineTypeBuffers[l.Type].WriteString(l.Content[1:])
buf.WriteString("\n") diffLineTypeBuffers[l.Type].WriteString("\n")
} }
} }
charsetLabel, err := charset.DetectEncoding(buf.Bytes()) for lineType, buffer := range diffLineTypeBuffers {
if charsetLabel != "UTF-8" && err == nil { diffLineTypeDecoders[lineType] = nil
encoding, _ := stdcharset.Lookup(charsetLabel) if buffer.Len() == 0 {
if encoding != nil { continue
d := encoding.NewDecoder() }
for _, sec := range f.Sections { charsetLabel, err := charset.DetectEncoding(buffer.Bytes())
for _, l := range sec.Lines { if charsetLabel != "UTF-8" && err == nil {
if l.Type == DiffLineSection { encoding, _ := stdcharset.Lookup(charsetLabel)
continue if encoding != nil {
} diffLineTypeDecoders[lineType] = encoding.NewDecoder()
if c, _, err := transform.String(d, l.Content[1:]); err == nil { }
l.Content = l.Content[0:1] + c }
} }
for _, sec := range f.Sections {
for _, l := range sec.Lines {
decoder := diffLineTypeDecoders[l.Type]
if decoder != nil {
if c, _, err := transform.String(decoder, l.Content[1:]); err == nil {
l.Content = l.Content[0:1] + c
} }
} }
} }