@@ -47,6 +47,15 @@ def fetch(f):
47
47
sys .stderr .write ("cannot load %s\n " % f )
48
48
exit (1 )
49
49
50
def fetch_unidata(f):
    """
    Make sure the Unicode data file ``f`` is present in the current directory.

    The local copy is looked up under ``os.path.basename(f)``. When it is
    missing, ``curl -O`` is run against the ``ucd/`` directory of the release
    named by ``UNICODE_VERSION_NUMBER``. If the file is still missing
    afterwards, an error is written to stderr and the process exits with
    status 1.
    """
    local_name = os.path.basename(f)
    if not os.path.exists(local_name):
        os.system("curl -O http://www.unicode.org/Public/%s/ucd/%s"
                  % (UNICODE_VERSION_NUMBER, f))

    if not os.path.exists(local_name):
        # End the message with a newline so it does not run into whatever is
        # printed to the terminal next.
        sys.stderr.write("cannot load %s\n" % f)
        # sys.exit is always available; the bare `exit` helper is only
        # injected by the `site` module.
        sys.exit(1)
58
+
50
59
# Implementation from unicode-segmentation
51
60
def load_properties (f , interestingprops = None ):
52
61
fetch (f )
@@ -81,6 +90,41 @@ def load_properties(f, interestingprops = None):
81
90
82
91
return props
83
92
93
def load_script_properties(f, interestingprops):
    """
    Parse a ``code ; value # comment`` style Unicode data file.

    ``f`` is fetched with ``fetch_unidata`` and then read from the current
    directory under its basename. Lines are expected to look like
    ``XXXX ; Value # ...`` or ``XXXX..YYYY ; Value # ...``; anything else is
    skipped.

    ``interestingprops`` is a collection of values to keep; when it is empty
    (or otherwise falsy) every value is kept.

    Returns a dict mapping each value to a list of inclusive
    ``(lo, hi)`` integer code point ranges, in file order.
    """
    fetch_unidata(f)
    # Note: these regexes are different from those in unicode-segmentation,
    # because we need to handle spaces here
    re1 = re.compile(r"^ *([0-9A-F]+) *; *([^#]+) *#")
    re2 = re.compile(r"^ *([0-9A-F]+)\.\.([0-9A-F]+) *; *([^#]+) *#")

    props = {}
    # A plain `with open` keeps the file handle scoped to this call instead of
    # going through fileinput's module-level state.
    with open(os.path.basename(f)) as data:
        for line in data:
            m = re1.match(line)
            if m:
                # Single code point: the range is just that one value.
                d_lo = d_hi = m.group(1)
                prop = m.group(2).strip()
            else:
                m = re2.match(line)
                if not m:
                    continue
                d_lo, d_hi = m.group(1), m.group(2)
                prop = m.group(3).strip()
            if interestingprops and prop not in interestingprops:
                continue
            props.setdefault(prop, []).append((int(d_lo, 16), int(d_hi, 16)))

    return props
127
+
84
128
def load_confusables (f ):
85
129
fetch (f )
86
130
confusables = []
@@ -97,12 +141,244 @@ def load_confusables(f):
97
141
raise Exception ('More than one code point in first column' )
98
142
d_input = int (d_inputs [0 ].strip (), 16 )
99
143
for d_output in m .group (2 ).split ():
100
- d_outputitem = int (d_output , 16 );
101
- d_outputs .append (d_outputitem );
144
+ d_outputitem = int (d_output , 16 )
145
+ d_outputs .append (d_outputitem )
102
146
confusables .append ((d_input , d_outputs ))
103
147
104
148
return confusables
105
149
150
def aliases():
    """
    Fetch the shorthand aliases for each longhand Script name

    Reads the ``sc ; <short> ; <long>`` lines of PropertyValueAliases.txt
    (downloading the file first if needed) and returns a pair
    ``(longforms, shortforms)``: ``longforms`` maps a short alias to its long
    name, ``shortforms`` maps a long name to its short alias. Only the first
    two fields after ``sc`` are used.
    """
    filename = "PropertyValueAliases.txt"
    fetch_unidata(filename)
    longforms = {}
    shortforms = {}
    re1 = re.compile(r"^ *sc *; *(\w+) *; *(\w+)")
    with open(filename) as data:
        for line in data:
            m = re1.match(line)
            if not m:
                continue
            # \w+ cannot capture whitespace, so the groups need no stripping.
            short_name, long_name = m.group(1), m.group(2)
            # Each alias and each long name is expected to appear only once.
            assert short_name not in longforms
            assert long_name not in shortforms
            longforms[short_name] = long_name
            shortforms[long_name] = short_name

    return (longforms, shortforms)
171
+
172
def load_scripts(f):
    """
    Build the code point -> script table from the script data file ``f``.

    Returns ``(longforms, script_table)``. ``longforms`` is the short-alias to
    long-name dict produced by ``aliases()``. ``script_table`` is a list of
    ``(lo, hi, short_alias)`` tuples covering every script present in ``f``,
    sorted by ``lo``.

    Raises ``KeyError`` if ``f`` names a script that has no alias.
    """
    longforms, shortforms = aliases()
    scripts = load_script_properties(f, [])

    # The previous version also collected a separate, sorted list of script
    # aliases that was never read (and whose name hid the module-level
    # script_list function); only the range table is needed.
    script_table = [
        (lo, hi, shortforms[script])
        for script, ranges in scripts.items()
        for (lo, hi) in ranges
    ]
    script_table.sort(key=lambda w: w[0])
    return (longforms, script_table)
186
+
187
def is_script_ignored_in_mixedscript(source):
    """Return True when ``source`` is one of the script aliases 'Zinh', 'Zyyy' or 'Zzzz'."""
    return source in ('Zinh', 'Zyyy', 'Zzzz')
189
+
190
def process_mixedscript_single_to_multi(item_i, script_i, proto_lst, scripts):
    """
    Classify a source code point against its multi code point prototype.

    ``script_i`` is the script of the source, ``proto_lst`` the prototype's
    code points; the distinct scripts of the prototype are looked up through
    ``script_list``. ``item_i`` is accepted but not consulted.

    Returns ``(processed, should_add)``: ``processed`` is True when one of the
    rules below matched, ``should_add`` is True when the matching rule says
    the source should be recorded as confusable.
    """
    ignored = is_script_ignored_in_mixedscript
    proto_scripts = sorted(script_list(proto_lst, scripts))
    # here's a few rules to process current version of Unicode data (13.0 at this time)
    proto_script_count = len(proto_scripts)
    assert proto_script_count > 0
    source_counts = not ignored(script_i)

    if proto_script_count == 1:
        only = proto_scripts[0]
        # Rule: A - A -> Processed, DontAdd
        if only == script_i:
            return True, False
        # Rule: A(not in (Zinh, Zyyy, Zzzz)) - B(not in (Zinh, Zyyy, Zzzz)) -> Processed, Add
        if source_counts and not ignored(only):
            return True, True
        # Rule: (Zinh | Zyyy | Zzzz) - A(not in (Zinh, Zyyy, Zzzz)) -> Processed, Add
        if source_counts and ignored(only):
            return True, True
        # NotProcessed, DontAdd
        return False, False

    # Rule: A ... - A -> Processed, DontAdd
    if script_i in proto_scripts:
        return True, False

    if proto_script_count == 2 and source_counts:
        first, second = proto_scripts
        # Rule: (Zinh | Zyyy | Zzzz) A(not in (Zinh, Zyyy, Zzzz)) - B(not in (Zinh, Zyyy, Zzzz)) -> Processed, Add
        # Exactly one of the two prototype scripts is ignored; script_i is
        # already known to differ from both at this point.
        if ignored(first) != ignored(second):
            return True, True
        # Rule: (Zinh | Zyyy | Zzzz) (Zinh | Zyyy | Zzzz) - A(not in (Zinh, Zyyy, Zzzz)) -> Processed, Add
        if ignored(first) and ignored(second):
            return True, True

    # NotProcessed, DontAdd
    return False, False
230
+
231
def is_codepoint_identifier_allowed(c, identifier_allowed):
    """
    Return True when ``c`` falls inside any range of ``identifier_allowed``.

    ``identifier_allowed`` is an iterable of sequences whose first two items
    are the inclusive lower and upper bound of a range.
    """
    return any(data[0] <= c <= data[1] for data in identifier_allowed)
236
+
237
def load_rustc_mixedscript_confusables(f, identifier_allowed, scripts):
    """
    Derive the mixed-script confusable tables from the confusables file ``f``.

    ``identifier_allowed`` is the list of code point ranges accepted by
    ``is_codepoint_identifier_allowed``; ``scripts`` is the range table
    accepted by ``codepoint_script``.

    Returns ``(mixedscript_confusable, mixedscript_confusable_unresolved)``:

    * ``mixedscript_confusable`` maps a script to a dict keyed by the escaped
      source code point, whose values are ``(source, targets)``. ``targets``
      holds the code points from other scripts that share a prototype with
      the source, plus the marker string ``'multi'`` when the source is
      confusable with a multi code point prototype.
    * ``mixedscript_confusable_unresolved`` maps an escaped prototype list to
      ``(prototype, sources)`` for the sources that none of the rules in
      ``process_mixedscript_single_to_multi`` handled.
    """
    confusables = load_confusables(f)

    # Escaped text of every source that maps to a single code point prototype.
    single_proto_sources = set()
    for d_source, d_proto_list in confusables:
        assert len(d_proto_list) > 0
        if len(d_proto_list) == 1:
            single_proto_sources.add(escape_char(d_source))

    # collect prototypes
    codepoint_map = {}
    multicodepoint_map = {}
    for d_source, d_proto_list in confusables:
        if not is_codepoint_identifier_allowed(d_source, identifier_allowed):
            continue
        if len(d_proto_list) == 1:
            d_proto = escape_char(d_proto_list[0])
            if d_proto not in codepoint_map:
                codepoint_map[d_proto] = []
                # The prototype itself joins its group once, when the group is
                # created, provided it is not itself a source and is allowed
                # in identifiers.
                if (d_proto not in single_proto_sources
                        and is_codepoint_identifier_allowed(d_proto_list[0], identifier_allowed)):
                    codepoint_map[d_proto].append(d_proto_list[0])
            codepoint_map[d_proto].append(d_source)
        else:
            d_protos = escape_char_list(d_proto_list)
            if d_protos not in multicodepoint_map:
                multicodepoint_map[d_protos] = (d_proto_list, [])
            multicodepoint_map[d_protos][1].append(d_source)

    mixedscript_confusable = {}

    def confusable_entry_item(confusable, script, item_text, item):
        # Return the target list for `item` under `script`, creating the
        # intermediate entries on first use.
        script_entry = confusable.setdefault(script, {})
        if item_text not in script_entry:
            script_entry[item_text] = (item, [])
        return script_entry[item_text][1]

    def record_cross_script_pairs(source):
        # For every unordered pair in `source` whose scripts differ, record
        # each member as a target of the other (ignored scripts get no entry).
        for i, item_i in enumerate(source):
            for item_j in source[i + 1:]:
                script_i = codepoint_script(item_i, scripts)
                script_j = codepoint_script(item_j, scripts)
                if script_i == script_j:
                    continue
                if not is_script_ignored_in_mixedscript(script_i):
                    confusable_entry_item(mixedscript_confusable, script_i, escape_char(item_i), item_i).append(item_j)
                if not is_script_ignored_in_mixedscript(script_j):
                    confusable_entry_item(mixedscript_confusable, script_j, escape_char(item_j), item_j).append(item_i)

    # between single charpoint that has single charpoint prototype
    for source in codepoint_map.values():
        record_cross_script_pairs(source)

    # between single charpoint that has multi charpoint prototype
    for _, source in multicodepoint_map.values():
        record_cross_script_pairs(source)

    mixedscript_confusable_unresolved = {}
    # single charpoint that has multi charpoint prototype and its prototype
    for proto_lst, source in multicodepoint_map.values():
        # Skip prototypes containing a code point that cannot be part of an
        # identifier.
        if not all(is_codepoint_identifier_allowed(c, identifier_allowed) for c in proto_lst):
            continue
        for item_i in source:
            script_i = codepoint_script(item_i, scripts)
            if is_script_ignored_in_mixedscript(script_i):
                continue
            processed, should_add = process_mixedscript_single_to_multi(item_i, script_i, proto_lst, scripts)
            if should_add:
                assert processed
                confusable_entry_item(mixedscript_confusable, script_i, escape_char(item_i), item_i).append('multi')
            if processed:
                continue
            proto_lst_text = escape_char_list(proto_lst)
            if proto_lst_text not in mixedscript_confusable_unresolved:
                mixedscript_confusable_unresolved[proto_lst_text] = (proto_lst, [])
            mixedscript_confusable_unresolved[proto_lst_text][1].append(item_i)
    return (mixedscript_confusable, mixedscript_confusable_unresolved)
335
+
336
def codepoint_script(c, scripts):
    """
    Return the script of the first ``(lo, hi, script)`` entry in ``scripts``
    whose inclusive range contains ``c``.

    Raises ``Exception`` when no entry contains ``c``.
    """
    for lo, hi, name in scripts:
        if lo <= c <= hi:
            return name
    raise Exception("Not in scripts: " + escape_char(c))
341
+
342
def debug_emit_mixedscript_confusable(f, mixedscript_confusable, text, scripts):
    """
    Write a commented-out dump of ``mixedscript_confusable`` to ``f``.

    The dump is wrapped in ``/* ... */`` and starts with ``text``. Each script
    gets a ``/// Script - <name>`` line followed by one line per source code
    point (in ascending order) showing its targets and the targets' scripts.
    """
    f.write("/* " + text + "\n")
    for script, entries in mixedscript_confusable.items():
        f.write("/// Script - " + script + "\n")
        # Entries are keyed by escaped text; order by the raw source value.
        for source in sorted(pair[0] for pair in entries.values()):
            source_text = escape_char(source)
            target_lst = entries[source_text][1]
            f.write(source_text + " => " + escape_char_list(target_lst)
                    + " // " + escape_script_list(target_lst, scripts) + "\n")
    f.write("*/\n")
354
+
355
+
356
def script_list(char_lst, scripts):
    """
    Return the distinct scripts of ``char_lst`` in first-seen order.

    The marker string ``'multi'`` is reported as the pseudo script
    ``'Z~multi'``; every other item is looked up with ``codepoint_script``.
    """
    seen = []
    for c in char_lst:
        script = 'Z~multi' if c == 'multi' else codepoint_script(c, scripts)
        if script in seen:
            continue
        seen.append(script)
    return seen
366
+
367
def escape_script_list(char_lst, scripts):
    """Return ``str()`` of the sorted list of distinct scripts found in ``char_lst``."""
    return str(sorted(script_list(char_lst, scripts)))
371
+
372
def debug_emit_mixedscript_confusable_unresolved(f, map, text, scripts):
    """
    Report prototypes that no mixed-script rule handled, then fail.

    ``map`` maps an escaped prototype list to ``(prototype, sources)``. When
    it is empty nothing happens. Otherwise ``text`` and one line per entry are
    printed to standard output (``f`` is not written to) and an ``Exception``
    is raised to signal that the rules need updating.
    """
    if not map:
        return
    print("// " + text + "\n")
    for prototype_text, (prototype, source) in map.items():
        print(prototype_text + " => " + escape_char_list(source)
              + " // " + escape_script_list(prototype, scripts)
              + " => " + escape_script_list(source, scripts) + "\n")
    raise Exception("update the python script to add new rules for new data")
381
+
106
382
def format_table_content (f , content , indent ):
107
383
line = " " * indent
108
384
first = True
@@ -119,18 +395,20 @@ def format_table_content(f, content, indent):
119
395
f .write (line )
120
396
121
397
def escape_char(c):
    """
    Format ``c`` for the generated Rust source.

    An integer code point becomes a Rust char literal such as ``'\\u{41}'``;
    the marker string ``'multi'`` becomes the quoted placeholder
    ``"<multiple code points>"``.
    """
    if c == 'multi':
        return "\"<multiple code points>\""
    return "'\\u{%x}'" % c
123
401
124
402
def escape_char_list(l):
    """Return the items of ``l``, each passed through ``escape_char``, as ``[a, b, ...]``."""
    return "[" + ", ".join(escape_char(c) for c in l) + "]"
135
413
136
414
def emit_table (f , name , t_data , t_type = "&'static [(char, char)]" , is_pub = True ,
@@ -226,7 +504,7 @@ def emit_confusable_detection_module(f):
226
504
confusable_table .sort (key = lambda w : w [0 ])
227
505
228
506
last_key = None
229
- for (k , v ) in confusable_table :
507
+ for (k , _ ) in confusable_table :
230
508
if k == last_key :
231
509
raise Exception ("duplicate keys in confusables table: %s" % k )
232
510
last_key = k
@@ -235,6 +513,40 @@ def emit_confusable_detection_module(f):
235
513
pfun = lambda x : "(%s, &%s)" % (escape_char (x [0 ]), escape_char_list (x [1 ])))
236
514
f .write ("}\n \n " )
237
515
516
def escape_script_constant(name, longforms):
    """Return ``Script::<long name>`` where the long name is ``longforms[name]`` with surrounding whitespace removed."""
    long_name = longforms[name].strip()
    return "Script::%s" % long_name
518
+
519
def emit_rustc_mixed_script_confusable_detection(f):
    """
    Write the ``rustc_mixed_script_confusable_detection`` Rust module to ``f``.

    The module contains the lookup function
    ``is_rustc_mixed_script_confusable`` and a ``CONFUSABLES`` table of
    ``(char, Script)`` pairs sorted by char. The table is built from
    IdentifierStatus.txt, Scripts.txt and confusables.txt.
    """
    f.write("pub mod rustc_mixed_script_confusable_detection {")
    f.write("""
    use unicode_script::Script;

    #[inline]
    pub fn is_rustc_mixed_script_confusable(c: char) -> Option<Script> {
        match c as usize {
            _ => super::util::bsearch_value_table(c, CONFUSABLES)
        }
    }

""")
    identifier_status_table = load_properties("IdentifierStatus.txt")
    longforms, scripts = load_scripts("Scripts.txt")
    identifier_allowed = identifier_status_table['Allowed']
    (mixedscript_confusable, mixedscript_confusable_unresolved) = load_rustc_mixedscript_confusables(
        "confusables.txt", identifier_allowed, scripts)
    # Flip to True locally to dump the intermediate tables while debugging.
    debug = False
    if debug:
        debug_emit_mixedscript_confusable(f, mixedscript_confusable, "mixedscript_confusable", scripts)
        # NOTE(review): assumed to be gated by `debug` together with the call
        # above -- confirm against the upstream script's indentation.
        debug_emit_mixedscript_confusable_unresolved(f, mixedscript_confusable_unresolved, "mixedscript_confusable_unresolved", scripts)
    # One (source, script) row per recorded source code point.
    confusable_table = [
        (pair[0], script)
        for script, lst in mixedscript_confusable.items()
        for pair in lst.values()
    ]
    confusable_table.sort(key=lambda w: w[0])
    emit_table(f, "CONFUSABLES", confusable_table, "&'static [(char, Script)]", is_pub=False,
               pfun=lambda x: "(%s,%s)" % (escape_char(x[0]), escape_script_constant(x[1], longforms)))
    f.write("}\n\n")
549
+
238
550
239
551
def emit_util_mod (f ):
240
552
f .write ("""
@@ -301,3 +613,5 @@ def emit_util_mod(f):
301
613
emit_identifier_module (rf )
302
614
### confusable_detection module
303
615
emit_confusable_detection_module (rf )
616
+ ### mixed_script_confusable_detection module
617
+ emit_rustc_mixed_script_confusable_detection (rf )
0 commit comments