Skip to content

Commit 99efa02

Browse files
authored
Switch Unicode Escaping to a VSCode-like system (#19990)
This PR rewrites the invisible Unicode detection algorithm to more closely match that of the Monaco editor on the system. It provides a technique for detecting ambiguous characters and relaxes the detection of combining marks. In addition, control characters are detected as invisible in this implementation, whereas they are not in Monaco; this difference is related to font issues. Close #19913. Signed-off-by: Andrew Thornton <art27@cantab.net>
1 parent 11dc6df commit 99efa02

29 files changed

+2106
-370
lines changed

modules/charset/ambiguous.go

+54
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,54 @@
1+
// This file is generated by modules/charset/ambiguous/generate.go DO NOT EDIT
2+
// Copyright 2022 The Gitea Authors. All rights reserved.
3+
// Use of this source code is governed by a MIT-style
4+
// license that can be found in the LICENSE file.
5+
6+
package charset
7+
8+
import (
9+
"sort"
10+
"strings"
11+
"unicode"
12+
13+
"code.gitea.io/gitea/modules/translation"
14+
)
15+
16+
// AmbiguousTablesForLocale provides the table of ambiguous characters for this locale.
17+
func AmbiguousTablesForLocale(locale translation.Locale) []*AmbiguousTable {
18+
key := locale.Language()
19+
var table *AmbiguousTable
20+
var ok bool
21+
for len(key) > 0 {
22+
if table, ok = AmbiguousCharacters[key]; ok {
23+
break
24+
}
25+
idx := strings.LastIndexAny(key, "-_")
26+
if idx < 0 {
27+
key = ""
28+
} else {
29+
key = key[:idx]
30+
}
31+
}
32+
if table == nil {
33+
table = AmbiguousCharacters["_default"]
34+
}
35+
36+
return []*AmbiguousTable{
37+
table,
38+
AmbiguousCharacters["_common"],
39+
}
40+
}
41+
42+
func isAmbiguous(r rune, confusableTo *rune, tables ...*AmbiguousTable) bool {
43+
for _, table := range tables {
44+
if !unicode.Is(table.RangeTable, r) {
45+
continue
46+
}
47+
i := sort.Search(len(table.Confusable), func(i int) bool {
48+
return table.Confusable[i] >= r
49+
})
50+
(*confusableTo) = table.With[i]
51+
return true
52+
}
53+
return false
54+
}

modules/charset/ambiguous/ambiguous.json

+1
Large diffs are not rendered by default.

modules/charset/ambiguous/generate.go

+178
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,178 @@
1+
// Copyright 2022 The Gitea Authors. All rights reserved.
2+
// Use of this source code is governed by a MIT-style
3+
// license that can be found in the LICENSE file.
4+
5+
package main
6+
7+
import (
8+
"bytes"
9+
"flag"
10+
"fmt"
11+
"go/format"
12+
"os"
13+
"sort"
14+
"text/template"
15+
"unicode"
16+
17+
"code.gitea.io/gitea/modules/json"
18+
19+
"golang.org/x/text/unicode/rangetable"
20+
)
21+
22+
// ambiguous.json provides a one to one mapping of ambiguous characters to other characters
23+
// See https://github.com/hediet/vscode-unicode-data/blob/main/out/ambiguous.json
24+
25+
// AmbiguousTable holds the confusable runes of one locale as read from
// ambiguous.json.
//
// Confusable and With are parallel slices: With[i] is the rune that
// Confusable[i] can be mistaken for. RangeTable covers the runes in
// Confusable.
type AmbiguousTable struct {
	Confusable, With []rune
	Locale           string
	RangeTable       *unicode.RangeTable
}
31+
32+
// RunePair couples a confusable rune with the rune it can be mistaken for, so
// that both can be sorted together by the confusable rune.
type RunePair struct {
	Confusable, With rune
}
36+
37+
// verbose turns on the extra diagnostics printed through verbosef; main binds
// it to the -v command line flag.
var verbose bool
38+
39+
func main() {
40+
flag.Usage = func() {
41+
fmt.Fprintf(os.Stderr, `%s: Generate AmbiguousCharacter
42+
43+
Usage: %[1]s [-v] [-o output.go] ambiguous.json
44+
`, os.Args[0])
45+
flag.PrintDefaults()
46+
}
47+
48+
output := ""
49+
flag.BoolVar(&verbose, "v", false, "verbose output")
50+
flag.StringVar(&output, "o", "ambiguous_gen.go", "file to output to")
51+
flag.Parse()
52+
input := flag.Arg(0)
53+
if input == "" {
54+
input = "ambiguous.json"
55+
}
56+
57+
bs, err := os.ReadFile(input)
58+
if err != nil {
59+
fatalf("Unable to read: %s Err: %v", input, err)
60+
}
61+
62+
var unwrapped string
63+
if err := json.Unmarshal(bs, &unwrapped); err != nil {
64+
fatalf("Unable to unwrap content in: %s Err: %v", input, err)
65+
}
66+
67+
fromJSON := map[string][]uint32{}
68+
if err := json.Unmarshal([]byte(unwrapped), &fromJSON); err != nil {
69+
fatalf("Unable to unmarshal content in: %s Err: %v", input, err)
70+
}
71+
72+
tables := make([]*AmbiguousTable, 0, len(fromJSON))
73+
for locale, chars := range fromJSON {
74+
table := &AmbiguousTable{Locale: locale}
75+
table.Confusable = make([]rune, 0, len(chars)/2)
76+
table.With = make([]rune, 0, len(chars)/2)
77+
pairs := make([]RunePair, len(chars)/2)
78+
for i := 0; i < len(chars); i += 2 {
79+
pairs[i/2].Confusable, pairs[i/2].With = rune(chars[i]), rune(chars[i+1])
80+
}
81+
sort.Slice(pairs, func(i, j int) bool {
82+
return pairs[i].Confusable < pairs[j].Confusable
83+
})
84+
for _, pair := range pairs {
85+
table.Confusable = append(table.Confusable, pair.Confusable)
86+
table.With = append(table.With, pair.With)
87+
}
88+
table.RangeTable = rangetable.New(table.Confusable...)
89+
tables = append(tables, table)
90+
}
91+
sort.Slice(tables, func(i, j int) bool {
92+
return tables[i].Locale < tables[j].Locale
93+
})
94+
data := map[string]interface{}{
95+
"Tables": tables,
96+
}
97+
98+
if err := runTemplate(generatorTemplate, output, &data); err != nil {
99+
fatalf("Unable to run template: %v", err)
100+
}
101+
}
102+
103+
func runTemplate(t *template.Template, filename string, data interface{}) error {
104+
buf := bytes.NewBuffer(nil)
105+
if err := t.Execute(buf, data); err != nil {
106+
return fmt.Errorf("unable to execute template: %w", err)
107+
}
108+
bs, err := format.Source(buf.Bytes())
109+
if err != nil {
110+
verbosef("Bad source:\n%s", buf.String())
111+
return fmt.Errorf("unable to format source: %w", err)
112+
}
113+
file, err := os.Create(filename)
114+
if err != nil {
115+
return fmt.Errorf("failed to create file %s because %w", filename, err)
116+
}
117+
defer file.Close()
118+
_, err = file.Write(bs)
119+
if err != nil {
120+
return fmt.Errorf("unable to write generated source: %w", err)
121+
}
122+
return nil
123+
}
124+
125+
// generatorTemplate renders the generated Go file for package charset: the
// AmbiguousTable type and the AmbiguousCharacters map, with one entry per
// element of .Tables. The rendered text is not gofmt-clean on its own;
// runTemplate passes it through format.Source before writing it out.
var generatorTemplate = template.Must(template.New("ambiguousTemplate").Parse(`// This file is generated by modules/charset/ambiguous/generate.go DO NOT EDIT
// Copyright 2022 The Gitea Authors. All rights reserved.
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.

package charset

import "unicode"

// This file is generated from https://github.com/hediet/vscode-unicode-data/blob/main/out/ambiguous.json

// AmbiguousTable matches a confusable rune with its partner for the Locale
type AmbiguousTable struct {
Confusable []rune
With []rune
Locale string
RangeTable *unicode.RangeTable
}

// AmbiguousCharacters provides a map by locale name to the confusable characters in that locale
var AmbiguousCharacters = map[string]*AmbiguousTable{
{{range .Tables}}{{printf "%q:" .Locale}} {
Confusable: []rune{ {{range .Confusable}}{{.}},{{end}} },
With: []rune{ {{range .With}}{{.}},{{end}} },
Locale: {{printf "%q" .Locale}},
RangeTable: &unicode.RangeTable{
R16: []unicode.Range16{
{{range .RangeTable.R16 }} {Lo:{{.Lo}}, Hi:{{.Hi}}, Stride: {{.Stride}}},
{{end}} },
R32: []unicode.Range32{
{{range .RangeTable.R32}} {Lo:{{.Lo}}, Hi:{{.Hi}}, Stride: {{.Stride}}},
{{end}} },
LatinOffset: {{.RangeTable.LatinOffset}},
},
},
{{end}}
}

`))
164+
165+
// logf formats a message and writes it, followed by a newline, to standard
// error.
func logf(format string, args ...interface{}) {
	line := format + "\n"
	fmt.Fprintf(os.Stderr, line, args...)
}
168+
169+
func verbosef(format string, args ...interface{}) {
170+
if verbose {
171+
logf(format, args...)
172+
}
173+
}
174+
175+
func fatalf(format string, args ...interface{}) {
176+
logf("fatal: "+format+"\n", args...)
177+
os.Exit(1)
178+
}

modules/charset/ambiguous_gen.go

+837
Large diffs are not rendered by default.

modules/charset/ambiguous_gen_test.go

+32
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,32 @@
1+
// Copyright 2022 The Gitea Authors. All rights reserved.
2+
// Use of this source code is governed by a MIT-style
3+
// license that can be found in the LICENSE file.
4+
5+
package charset
6+
7+
import (
8+
"sort"
9+
"testing"
10+
"unicode"
11+
12+
"github.com/stretchr/testify/assert"
13+
)
14+
15+
func TestAmbiguousCharacters(t *testing.T) {
16+
for locale, ambiguous := range AmbiguousCharacters {
17+
assert.Equal(t, locale, ambiguous.Locale)
18+
assert.Equal(t, len(ambiguous.Confusable), len(ambiguous.With))
19+
assert.True(t, sort.SliceIsSorted(ambiguous.Confusable, func(i, j int) bool {
20+
return ambiguous.Confusable[i] < ambiguous.Confusable[j]
21+
}))
22+
23+
for _, confusable := range ambiguous.Confusable {
24+
assert.True(t, unicode.Is(ambiguous.RangeTable, confusable))
25+
i := sort.Search(len(ambiguous.Confusable), func(j int) bool {
26+
return ambiguous.Confusable[j] >= confusable
27+
})
28+
found := i < len(ambiguous.Confusable) && ambiguous.Confusable[i] == confusable
29+
assert.True(t, found, "%c is not in %d", confusable, i)
30+
}
31+
}
32+
}

modules/charset/breakwriter.go

+44
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,44 @@
1+
// Copyright 2022 The Gitea Authors. All rights reserved.
2+
// Use of this source code is governed by a MIT-style
3+
// license that can be found in the LICENSE file.
4+
5+
package charset
6+
7+
import (
8+
"bytes"
9+
"io"
10+
)
11+
12+
// BreakWriter is an io.Writer decorator that emits "<br>" in place of every
// '\n' byte and passes all other bytes through to the wrapped Writer
// unchanged.
type BreakWriter struct {
	io.Writer
}

// Write forwards bs to the wrapped Writer, substituting "<br>" for each '\n'.
//
// The returned count is expressed in bytes of bs that were consumed, not in
// bytes sent to the wrapped Writer: each replaced newline counts as one byte.
// On the first error from the wrapped Writer, Write stops and returns the
// count accumulated so far together with that error.
func (b *BreakWriter) Write(bs []byte) (n int, err error) {
	rest := bs
	for len(rest) > 0 {
		nl := bytes.IndexByte(rest, '\n')
		if nl < 0 {
			// No more newlines: flush the tail in one go.
			written, werr := b.Writer.Write(rest)
			return n + written, werr
		}

		// Emit the text preceding the newline, if there is any.
		if nl > 0 {
			written, werr := b.Writer.Write(rest[:nl])
			n += written
			if werr != nil {
				return n, werr
			}
		}

		if _, werr := b.Writer.Write([]byte("<br>")); werr != nil {
			return n, werr
		}

		// The newline itself has now been consumed.
		n++
		rest = rest[nl+1:]
	}

	return n, nil
}

modules/charset/breakwriter_test.go

+69
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,69 @@
1+
// Copyright 2022 The Gitea Authors. All rights reserved.
2+
// Use of this source code is governed by a MIT-style
3+
// license that can be found in the LICENSE file.
4+
5+
package charset
6+
7+
import (
8+
"strings"
9+
"testing"
10+
)
11+
12+
func TestBreakWriter_Write(t *testing.T) {
13+
tests := []struct {
14+
name string
15+
kase string
16+
expect string
17+
wantErr bool
18+
}{
19+
{
20+
name: "noline",
21+
kase: "abcdefghijklmnopqrstuvwxyz",
22+
expect: "abcdefghijklmnopqrstuvwxyz",
23+
},
24+
{
25+
name: "endline",
26+
kase: "abcdefghijklmnopqrstuvwxyz\n",
27+
expect: "abcdefghijklmnopqrstuvwxyz<br>",
28+
},
29+
{
30+
name: "startline",
31+
kase: "\nabcdefghijklmnopqrstuvwxyz",
32+
expect: "<br>abcdefghijklmnopqrstuvwxyz",
33+
},
34+
{
35+
name: "onlyline",
36+
kase: "\n\n\n",
37+
expect: "<br><br><br>",
38+
},
39+
{
40+
name: "empty",
41+
kase: "",
42+
expect: "",
43+
},
44+
{
45+
name: "midline",
46+
kase: "\nabc\ndefghijkl\nmnopqrstuvwxy\nz",
47+
expect: "<br>abc<br>defghijkl<br>mnopqrstuvwxy<br>z",
48+
},
49+
}
50+
for _, tt := range tests {
51+
t.Run(tt.name, func(t *testing.T) {
52+
buf := &strings.Builder{}
53+
b := &BreakWriter{
54+
Writer: buf,
55+
}
56+
n, err := b.Write([]byte(tt.kase))
57+
if (err != nil) != tt.wantErr {
58+
t.Errorf("BreakWriter.Write() error = %v, wantErr %v", err, tt.wantErr)
59+
return
60+
}
61+
if n != len(tt.kase) {
62+
t.Errorf("BreakWriter.Write() = %v, want %v", n, len(tt.kase))
63+
}
64+
if buf.String() != tt.expect {
65+
t.Errorf("BreakWriter.Write() wrote %q, want %v", buf.String(), tt.expect)
66+
}
67+
})
68+
}
69+
}

0 commit comments

Comments
 (0)