Skip to content

Commit 1fb9c04

Browse files
committed
Implement rustc_mixed_script_confusable_detection.
1 parent 8195ca8 commit 1fb9c04

File tree

4 files changed

+497
-9
lines changed

4 files changed

+497
-9
lines changed

scripts/unicode.py

+323-9
Original file line numberDiff line numberDiff line change
@@ -47,6 +47,15 @@ def fetch(f):
4747
sys.stderr.write("cannot load %s\n" % f)
4848
exit(1)
4949

50+
def fetch_unidata(f):
    """Make sure a Unicode Character Database file is present locally.

    The file is looked up in the current working directory under the base
    name of ``f``.  When it is missing, ``curl`` is invoked to download
    ``f`` from the ``ucd`` directory of the Unicode version named by
    ``UNICODE_VERSION_NUMBER``.

    If the file still does not exist afterwards, an error is written to
    stderr and the process exits with status 1.
    """
    local_name = os.path.basename(f)
    if not os.path.exists(local_name):
        os.system("curl -O http://www.unicode.org/Public/%s/ucd/%s"
                  % (UNICODE_VERSION_NUMBER, f))

    if not os.path.exists(local_name):
        # Terminate the message with a newline so it does not run into
        # whatever is printed to the terminal next.
        sys.stderr.write("cannot load %s\n" % f)
        # sys.exit is always available; the bare ``exit`` helper is only
        # injected by the ``site`` module.
        sys.exit(1)
58+
5059
# Implementation from unicode-segmentation
5160
def load_properties(f, interestingprops = None):
5261
fetch(f)
@@ -81,6 +90,41 @@ def load_properties(f, interestingprops = None):
8190

8291
return props
8392

93+
def load_script_properties(f, interestingprops):
    """Parse a UCD property file into ``{property: [(lo, hi), ...]}``.

    Each data line is either ``XXXX ; Prop # ...`` (a single code point) or
    ``XXXX..YYYY ; Prop # ...`` (an inclusive range).  Lines matching
    neither shape are skipped.  When ``interestingprops`` is non-empty,
    only properties named in it are kept.
    """
    fetch_unidata(f)
    # Note: these regexes are different from those in unicode-segmentation,
    # because we need to handle spaces here
    single_re = re.compile(r"^ *([0-9A-F]+) *; *([^#]+) *#")
    range_re = re.compile(r"^ *([0-9A-F]+)\.\.([0-9A-F]+) *; *([^#]+) *#")

    props = {}
    for line in fileinput.input(os.path.basename(f)):
        single = single_re.match(line)
        if single:
            # A lone code point is stored as a one-element range.
            lo_hex = hi_hex = single.group(1)
            prop = single.group(2).strip()
        else:
            ranged = range_re.match(line)
            if not ranged:
                continue
            lo_hex = ranged.group(1)
            hi_hex = ranged.group(2)
            prop = ranged.group(3).strip()

        if interestingprops and prop not in interestingprops:
            continue

        props.setdefault(prop, []).append((int(lo_hex, 16), int(hi_hex, 16)))

    return props
127+
84128
def load_confusables(f):
85129
fetch(f)
86130
confusables = []
@@ -97,12 +141,244 @@ def load_confusables(f):
97141
raise Exception('More than one code point in first column')
98142
d_input = int(d_inputs[0].strip(), 16)
99143
for d_output in m.group(2).split():
100-
d_outputitem = int(d_output, 16);
101-
d_outputs.append(d_outputitem);
144+
d_outputitem = int(d_output, 16)
145+
d_outputs.append(d_outputitem)
102146
confusables.append((d_input, d_outputs))
103147

104148
return confusables
105149

150+
def aliases():
    """
    Fetch the shorthand aliases for each longhand Script name.

    Only the ``sc`` rows of PropertyValueAliases.txt are read.  Returns a
    pair ``(longforms, shortforms)``: ``longforms`` maps the first value
    column (short name) to the second (long name) and ``shortforms`` is the
    reverse mapping.  Both names are asserted to be unique.
    """
    fetch_unidata("PropertyValueAliases.txt")
    sc_row = re.compile(r"^ *sc *; *(\w+) *; *(\w+)")

    longforms = {}
    shortforms = {}
    for line in fileinput.input(os.path.basename("PropertyValueAliases.txt")):
        found = sc_row.match(line)
        if not found:
            continue
        short_name = found.group(1).strip()
        long_name = found.group(2).strip()
        assert(short_name not in longforms)
        assert(long_name not in shortforms)
        longforms[short_name] = long_name
        shortforms[long_name] = short_name

    return (longforms, shortforms)
171+
172+
def load_scripts(f):
    """Load the script ranges from ``f`` keyed by short script name.

    Returns ``(longforms, script_table)`` where ``longforms`` is the
    short-name -> long-name mapping from ``aliases()`` and ``script_table``
    is a list of ``(lo, hi, short_script_name)`` tuples sorted by ``lo``.

    Raises ``KeyError`` if the file names a script that has no entry in the
    alias table.
    """
    longforms, shortforms = aliases()
    scripts = load_script_properties(f, [])

    # The previous version also accumulated a local ``script_list`` that was
    # never read and shadowed the module-level function of the same name;
    # it has been dropped.
    script_table = [
        (lo, hi, shortforms[script])
        for script, ranges in scripts.items()
        for (lo, hi) in ranges
    ]
    script_table.sort(key=lambda w: w[0])
    return (longforms, script_table)
186+
187+
def is_script_ignored_in_mixedscript(source):
    """Return True for the script codes ``Zinh``, ``Zyyy`` and ``Zzzz``."""
    return source in ('Zinh', 'Zyyy', 'Zzzz')
189+
190+
def process_mixedscript_single_to_multi(item_i, script_i, proto_lst, scripts):
    """Classify a source whose confusable prototype has several code points.

    ``script_i`` is the script of the source and ``proto_lst`` the code
    points of its prototype.  Returns ``(processed, should_add)``:

    * ``(True, False)``  - a rule matched, do not record the source;
    * ``(True, True)``   - a rule matched, record the source;
    * ``(False, False)`` - no rule matched.

    ``item_i`` is accepted but not consulted.

    Rules, written for the current version of Unicode data (13.0 at this
    time); "ignored" means Zinh, Zyyy or Zzzz:

    * the prototype uses the source's own script       -> Processed, DontAdd
    * the source's script is itself ignored            -> NotProcessed
    * prototype has one script, different from source  -> Processed, Add
    * prototype has two scripts, at least one ignored  -> Processed, Add
    * anything else                                    -> NotProcessed
    """
    proto_scripts = script_list(proto_lst, scripts)
    assert(len(proto_scripts) > 0)

    # Rules "A - A" and "A ... - A".
    if script_i in proto_scripts:
        return True, False

    # Every remaining rule requires a non-ignored source script.
    if is_script_ignored_in_mixedscript(script_i):
        return False, False

    # A single foreign script, ignored or not, is always added.
    if len(proto_scripts) == 1:
        return True, True

    # Two scripts are added only when at least one of them is ignored.
    if len(proto_scripts) == 2 and (
            is_script_ignored_in_mixedscript(proto_scripts[0])
            or is_script_ignored_in_mixedscript(proto_scripts[1])):
        return True, True

    # NotProcessed, DontAdd
    return False, False
230+
231+
def is_codepoint_identifier_allowed(c, identifier_allowed):
    """Return True if ``c`` lies inside any inclusive range of ``identifier_allowed``.

    Each entry is indexed as ``entry[0]`` (low bound) and ``entry[1]``
    (high bound).
    """
    return any(bounds[0] <= c <= bounds[1] for bounds in identifier_allowed)
236+
237+
def load_rustc_mixedscript_confusables(f, identifier_allowed, scripts):
    """Build the mixed-script confusable tables from a confusables file.

    ``f`` is passed to ``load_confusables``; ``identifier_allowed`` is a
    list of inclusive code point ranges; ``scripts`` is the table consumed
    by ``codepoint_script``.

    Returns ``(mixedscript_confusable, mixedscript_confusable_unresolved)``:

    * ``mixedscript_confusable[script][escaped_source]`` is
      ``(source, targets)`` where ``targets`` lists the code points from
      other scripts that ``source`` is confusable with, plus the marker
      string ``'multi'`` when it is confusable with a multi code point
      prototype.
    * ``mixedscript_confusable_unresolved[escaped_prototype]`` is
      ``(prototype, sources)`` for the sources that none of the rules in
      ``process_mixedscript_single_to_multi`` could classify.
    """
    confusables = load_confusables(f)

    # Escaped sources that map to a single-code-point prototype.
    seekup_map = {}
    for d_source, d_proto_list in confusables:
        assert(len(d_proto_list) > 0)
        if len(d_proto_list) == 1:
            seekup_map[escape_char(d_source)] = d_proto_list

    # collect prototypes
    codepoint_map = {}
    multicodepoint_map = {}
    for d_source, d_proto_list in confusables:
        if not is_codepoint_identifier_allowed(d_source, identifier_allowed):
            continue
        if len(d_proto_list) == 1:
            d_proto = escape_char(d_proto_list[0])
            if d_proto not in codepoint_map:
                codepoint_map[d_proto] = []
                # NOTE(review): the prototype is assumed to be seeded only
                # when its list is first created; confirm this nesting
                # against the upstream file.
                if d_proto not in seekup_map and is_codepoint_identifier_allowed(d_proto_list[0], identifier_allowed):
                    codepoint_map[d_proto].append(d_proto_list[0])
            codepoint_map[d_proto].append(d_source)
        else:
            d_protos = escape_char_list(d_proto_list)
            if d_protos not in multicodepoint_map:
                multicodepoint_map[d_protos] = (d_proto_list, [])
            multicodepoint_map[d_protos][1].append(d_source)

    mixedscript_confusable = {}

    def confusable_entry_item(confusable, script, item_text, item):
        # Return the target list for (script, item), creating it on demand.
        if script not in confusable:
            confusable[script] = {}
        script_entry = confusable[script]
        if item_text not in script_entry:
            script_entry[item_text] = (item, [])
        return script_entry[item_text][1]

    def record_cross_script_pairs(sources):
        # Record every pair of sources sharing a prototype but not a script.
        for i, item_i in enumerate(sources):
            for item_j in sources[i + 1:]:
                # Looked up per pair so that a lone source is never resolved.
                script_i = codepoint_script(item_i, scripts)
                script_j = codepoint_script(item_j, scripts)
                if script_i == script_j:
                    continue
                if not is_script_ignored_in_mixedscript(script_i):
                    confusable_entry_item(mixedscript_confusable, script_i, escape_char(item_i), item_i).append(item_j)
                if not is_script_ignored_in_mixedscript(script_j):
                    confusable_entry_item(mixedscript_confusable, script_j, escape_char(item_j), item_j).append(item_i)

    # between single code points that have a single code point prototype
    for sources in codepoint_map.values():
        record_cross_script_pairs(sources)

    # between single code points that have a multi code point prototype
    for _, sources in multicodepoint_map.values():
        record_cross_script_pairs(sources)

    mixedscript_confusable_unresolved = {}
    # single code point that has a multi code point prototype, and its prototype
    for proto_lst, sources in multicodepoint_map.values():
        # A prototype that cannot appear in an identifier is not a threat.
        if not all(is_codepoint_identifier_allowed(c, identifier_allowed) for c in proto_lst):
            continue
        for item_i in sources:
            script_i = codepoint_script(item_i, scripts)
            if is_script_ignored_in_mixedscript(script_i):
                continue
            processed, should_add = process_mixedscript_single_to_multi(item_i, script_i, proto_lst, scripts)
            if should_add:
                assert(processed)
                confusable_entry_item(mixedscript_confusable, script_i, escape_char(item_i), item_i).append('multi')
            if processed:
                continue
            # No rule matched: keep the source so the caller can report it.
            proto_lst_text = escape_char_list(proto_lst)
            if proto_lst_text not in mixedscript_confusable_unresolved:
                mixedscript_confusable_unresolved[proto_lst_text] = (proto_lst, [])
            mixedscript_confusable_unresolved[proto_lst_text][1].append(item_i)

    return (mixedscript_confusable, mixedscript_confusable_unresolved)
335+
336+
def codepoint_script(c, scripts):
    """Return the script of the first ``(lo, hi, script)`` range containing ``c``.

    Raises ``Exception`` when no range in ``scripts`` contains ``c``.
    """
    for lo, hi, name in scripts:
        if lo <= c <= hi:
            return name
    raise Exception("Not in scripts: " + escape_char(c))
341+
342+
def debug_emit_mixedscript_confusable(f, mixedscript_confusable, text, scripts):
    """Write the confusable table to ``f`` as one ``/* ... */`` comment.

    ``text`` heads the comment.  Each script gets a ``/// Script - X`` line
    followed by one ``source => [targets] // [target scripts]`` line per
    source, ordered by source code point.
    """
    f.write("/* " + text + "\n")
    for script, entries in mixedscript_confusable.items():
        f.write("/// Script - " + script + "\n")
        for source in sorted(entry[0] for entry in entries.values()):
            source_text = escape_char(source)
            # Entries are keyed by the escaped form of the source.
            targets = entries[source_text][1]
            f.write("%s => %s // %s\n" % (
                source_text,
                escape_char_list(targets),
                escape_script_list(targets, scripts),
            ))
    f.write("*/\n")
354+
355+
356+
def script_list(char_lst, scripts):
    """Return the distinct scripts of ``char_lst`` in first-seen order.

    The marker string ``'multi'`` is reported as the pseudo-script
    ``'Z~multi'``; every other item is resolved with ``codepoint_script``.
    """
    seen = []
    for c in char_lst:
        name = 'Z~multi' if c == 'multi' else codepoint_script(c, scripts)
        if name not in seen:
            seen.append(name)
    return seen
366+
367+
def escape_script_list(char_lst, scripts):
    """Return ``str()`` of the sorted list of distinct scripts in ``char_lst``."""
    return str(sorted(script_list(char_lst, scripts)))
371+
372+
def debug_emit_mixedscript_confusable_unresolved(f, map, text, scripts):
    """Report unresolved confusables on stdout, then abort.

    Returns silently when ``map`` is empty.  Otherwise prints ``text`` and
    one line per prototype, then raises ``Exception``.  ``f`` is accepted
    but not written to.
    """
    if not map:
        return
    print("// " + text + "\n")
    for prototype_text, entry in map.items():
        prototype, sources = entry[0], entry[1]
        print(prototype_text + " => " + escape_char_list(sources) + " // " + escape_script_list(prototype, scripts) + " => " + escape_script_list(sources, scripts) + "\n")
    raise Exception("update the python script to add new rules for new data")
381+
106382
def format_table_content(f, content, indent):
107383
line = " "*indent
108384
first = True
@@ -119,18 +395,20 @@ def format_table_content(f, content, indent):
119395
f.write(line)
120396

121397
def escape_char(c):
    """Format a code point as a Rust ``char`` literal such as ``'\\u{41}'``.

    The marker string ``'multi'`` is rendered as the quoted placeholder
    ``"<multiple code points>"`` instead.
    """
    return "\"<multiple code points>\"" if c == 'multi' else "'\\u{%x}'" % c
123401

124402
def escape_char_list(l):
    """Format ``l`` as a bracketed list of escaped characters.

    Items are rendered with ``escape_char`` and separated by ``", "``.
    """
    return "[" + ", ".join(escape_char(c) for c in l) + "]"
135413

136414
def emit_table(f, name, t_data, t_type = "&'static [(char, char)]", is_pub=True,
@@ -226,7 +504,7 @@ def emit_confusable_detection_module(f):
226504
confusable_table.sort(key=lambda w: w[0])
227505

228506
last_key = None
229-
for (k, v) in confusable_table:
507+
for (k, _) in confusable_table:
230508
if k == last_key:
231509
raise Exception("duplicate keys in confusables table: %s" % k)
232510
last_key = k
@@ -235,6 +513,40 @@ def emit_confusable_detection_module(f):
235513
pfun=lambda x: "(%s, &%s)" % (escape_char(x[0]), escape_char_list(x[1])))
236514
f.write("}\n\n")
237515

516+
def escape_script_constant(name, longforms):
    """Return ``Script::<long name>`` for the script keyed by ``name``.

    The long name is looked up in ``longforms`` and stripped of
    surrounding whitespace.
    """
    long_name = longforms[name].strip()
    return "Script::%s" % long_name
518+
519+
def emit_rustc_mixed_script_confusable_detection(f):
    """Write the ``rustc_mixed_script_confusable_detection`` Rust module to ``f``.

    The module holds a lookup function and a private ``CONFUSABLES`` table
    of ``(char, Script)`` pairs sorted by character.  The table is built
    from IdentifierStatus.txt, Scripts.txt and confusables.txt.
    """
    f.write("pub mod rustc_mixed_script_confusable_detection {")
    f.write("""
    use unicode_script::Script;

    #[inline]
    pub fn is_rustc_mixed_script_confusable(c: char) -> Option<Script> {
        match c as usize {
            _ => super::util::bsearch_value_table(c, CONFUSABLES)
        }
    }

""")
    identifier_status_table = load_properties("IdentifierStatus.txt")
    longforms, scripts = load_scripts("Scripts.txt")
    identifier_allowed = identifier_status_table['Allowed']
    (mixedscript_confusable, mixedscript_confusable_unresolved) = load_rustc_mixedscript_confusables("confusables.txt", identifier_allowed, scripts)

    # Flip to True by hand to dump the intermediate tables.
    # NOTE(review): both debug calls are assumed to sit under this switch;
    # confirm the nesting against the upstream file.
    debug = False
    if debug:
        debug_emit_mixedscript_confusable(f, mixedscript_confusable, "mixedscript_confusable", scripts)
        debug_emit_mixedscript_confusable_unresolved(f, mixedscript_confusable_unresolved, "mixedscript_confusable_unresolved", scripts)

    # Flatten {script: {text: (source, targets)}} into (source, script) rows.
    confusable_table = [
        (entry[0], script)
        for script, entries in mixedscript_confusable.items()
        for entry in entries.values()
    ]
    confusable_table.sort(key=lambda w: w[0])
    emit_table(f, "CONFUSABLES", confusable_table, "&'static [(char, Script)]", is_pub=False,
               pfun=lambda x: "(%s,%s)" % (escape_char(x[0]), escape_script_constant(x[1], longforms)))
    f.write("}\n\n")
549+
238550

239551
def emit_util_mod(f):
240552
f.write("""
@@ -301,3 +613,5 @@ def emit_util_mod(f):
301613
emit_identifier_module(rf)
302614
### confusable_detection module
303615
emit_confusable_detection_module(rf)
616+
### mixed_script_confusable_detection module
617+
emit_rustc_mixed_script_confusable_detection(rf)

0 commit comments

Comments
 (0)