Skip to content

Commit 4dd1c9d

Browse files
closes bpo-37966: Fully implement the UAX GH-15 quick-check algorithm. (GH-15558)
The purpose of the `unicodedata.is_normalized` function is to answer the question `str == unicodedata.normalized(form, str)` more efficiently than writing just that, by using the "quick check" optimization described in the Unicode standard in UAX GH-15. However, it turns out the code doesn't implement the full algorithm from the standard, and as a result we often miss the optimization and end up having to compute the whole normalized string after all. Implement the standard's algorithm. This greatly speeds up `unicodedata.is_normalized` in many cases where our partial variant of quick-check had been returning MAYBE and the standard algorithm returns NO. At a quick test on my desktop, the existing code takes about 4.4 ms/MB (so 4.4 ns per byte) when the partial quick-check returns MAYBE and it has to do the slow normalize-and-compare: $ build.base/python -m timeit -s 'import unicodedata; s = "\uf900"*500000' \ -- 'unicodedata.is_normalized("NFD", s)' 50 loops, best of 5: 4.39 msec per loop With this patch, it gets the answer instantly (58 ns) on the same 1 MB string: $ build.dev/python -m timeit -s 'import unicodedata; s = "\uf900"*500000' \ -- 'unicodedata.is_normalized("NFD", s)' 5000000 loops, best of 5: 58.2 nsec per loop This restores a small optimization that the original version of this code had for the `unicodedata.normalize` use case. With this, that case is actually faster than in master! $ build.base/python -m timeit -s 'import unicodedata; s = "\u0338"*500000' \ -- 'unicodedata.normalize("NFD", s)' 500 loops, best of 5: 561 usec per loop $ build.dev/python -m timeit -s 'import unicodedata; s = "\u0338"*500000' \ -- 'unicodedata.normalize("NFD", s)' 500 loops, best of 5: 512 usec per loop (cherry picked from commit 2f09413) Co-authored-by: Greg Price <gnprice@gmail.com>
1 parent 952ea67 commit 4dd1c9d

File tree

4 files changed

+59
-26
lines changed

4 files changed

+59
-26
lines changed

Doc/whatsnew/3.8.rst

+3-2
Original file line numberDiff line numberDiff line change
@@ -1090,8 +1090,9 @@ unicodedata
10901090
<http://blog.unicode.org/2019/05/unicode-12-1-en.html>`_ release.
10911091

10921092
* New function :func:`~unicodedata.is_normalized` can be used to verify a string
1093-
is in a specific normal form. (Contributed by Max Belanger and David Euresti in
1094-
:issue:`32285`).
1093+
is in a specific normal form, often much faster than by actually normalizing
1094+
the string. (Contributed by Max Belanger, David Euresti, and Greg Price in
1095+
:issue:`32285` and :issue:`37966`).
10951096

10961097

10971098
unittest

Lib/test/test_unicodedata.py

+2
Original file line numberDiff line numberDiff line change
@@ -220,6 +220,8 @@ def test_issue29456(self):
220220
self.assertEqual(self.db.normalize('NFC', u11a7_str_a), u11a7_str_b)
221221
self.assertEqual(self.db.normalize('NFC', u11c3_str_a), u11c3_str_b)
222222

223+
# For tests of unicodedata.is_normalized / self.db.is_normalized ,
224+
# see test_normalization.py .
223225

224226
def test_east_asian_width(self):
225227
eaw = self.db.east_asian_width
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
The implementation of :func:`~unicodedata.is_normalized` has been greatly
2+
sped up on strings that aren't normalized, by implementing the full
3+
normalization-quick-check algorithm from the Unicode standard.

Modules/unicodedata.c

+51-24
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,8 @@
1919
#include "ucnhash.h"
2020
#include "structmember.h"
2121

22+
#include <stdbool.h>
23+
2224
_Py_IDENTIFIER(NFC);
2325
_Py_IDENTIFIER(NFD);
2426
_Py_IDENTIFIER(NFKC);
@@ -775,25 +777,40 @@ nfc_nfkc(PyObject *self, PyObject *input, int k)
775777
return result;
776778
}
777779

778-
typedef enum {YES, NO, MAYBE} NormalMode;
779-
780-
/* Return YES if the input is certainly normalized, NO or MAYBE if it might not be. */
781-
static NormalMode
782-
is_normalized(PyObject *self, PyObject *input, int nfc, int k)
780+
// This needs to match the logic in makeunicodedata.py
781+
// which constructs the quickcheck data.
782+
typedef enum {YES = 0, MAYBE = 1, NO = 2} QuickcheckResult;
783+
784+
/* Run the Unicode normalization "quickcheck" algorithm.
785+
*
786+
* Return YES or NO if quickcheck determines the input is certainly
787+
* normalized or certainly not, and MAYBE if quickcheck is unable to
788+
* tell.
789+
*
790+
* If `yes_only` is true, then return MAYBE as soon as we determine
791+
* the answer is not YES.
792+
*
793+
* For background and details on the algorithm, see UAX #15:
794+
* https://www.unicode.org/reports/tr15/#Detecting_Normalization_Forms
795+
*/
796+
static QuickcheckResult
797+
is_normalized_quickcheck(PyObject *self, PyObject *input,
798+
int nfc, int k, bool yes_only)
783799
{
784-
Py_ssize_t i, len;
785-
int kind;
786-
void *data;
787-
unsigned char prev_combining = 0, quickcheck_mask;
788-
789800
/* An older version of the database is requested, quickchecks must be
790801
disabled. */
791802
if (self && UCD_Check(self))
792803
return NO;
793804

794-
/* The two quickcheck bits at this shift mean 0=Yes, 1=Maybe, 2=No,
795-
as described in http://unicode.org/reports/tr15/#Annex8. */
796-
quickcheck_mask = 3 << ((nfc ? 4 : 0) + (k ? 2 : 0));
805+
Py_ssize_t i, len;
806+
int kind;
807+
void *data;
808+
unsigned char prev_combining = 0;
809+
810+
/* The two quickcheck bits at this shift have type QuickcheckResult. */
811+
int quickcheck_shift = (nfc ? 4 : 0) + (k ? 2 : 0);
812+
813+
QuickcheckResult result = YES; /* certainly normalized, unless we find something */
797814

798815
i = 0;
799816
kind = PyUnicode_KIND(input);
@@ -802,16 +819,26 @@ is_normalized(PyObject *self, PyObject *input, int nfc, int k)
802819
while (i < len) {
803820
Py_UCS4 ch = PyUnicode_READ(kind, data, i++);
804821
const _PyUnicode_DatabaseRecord *record = _getrecord_ex(ch);
805-
unsigned char combining = record->combining;
806-
unsigned char quickcheck = record->normalization_quick_check;
807822

808-
if (quickcheck & quickcheck_mask)
809-
return MAYBE; /* this string might need normalization */
823+
unsigned char combining = record->combining;
810824
if (combining && prev_combining > combining)
811825
return NO; /* non-canonical sort order, not normalized */
812826
prev_combining = combining;
827+
828+
unsigned char quickcheck_whole = record->normalization_quick_check;
829+
if (yes_only) {
830+
if (quickcheck_whole & (3 << quickcheck_shift))
831+
return MAYBE;
832+
} else {
833+
switch ((quickcheck_whole >> quickcheck_shift) & 3) {
834+
case NO:
835+
return NO;
836+
case MAYBE:
837+
result = MAYBE; /* this string might need normalization */
838+
}
839+
}
813840
}
814-
return YES; /* certainly normalized */
841+
return result;
815842
}
816843

817844
/*[clinic input]
@@ -844,7 +871,7 @@ unicodedata_UCD_is_normalized_impl(PyObject *self, PyObject *form,
844871
PyObject *result;
845872
int nfc = 0;
846873
int k = 0;
847-
NormalMode m;
874+
QuickcheckResult m;
848875

849876
PyObject *cmp;
850877
int match = 0;
@@ -867,7 +894,7 @@ unicodedata_UCD_is_normalized_impl(PyObject *self, PyObject *form,
867894
return NULL;
868895
}
869896

870-
m = is_normalized(self, input, nfc, k);
897+
m = is_normalized_quickcheck(self, input, nfc, k, false);
871898

872899
if (m == MAYBE) {
873900
cmp = (nfc ? nfc_nfkc : nfd_nfkd)(self, input, k);
@@ -913,28 +940,28 @@ unicodedata_UCD_normalize_impl(PyObject *self, PyObject *form,
913940
}
914941

915942
if (_PyUnicode_EqualToASCIIId(form, &PyId_NFC)) {
916-
if (is_normalized(self, input, 1, 0) == YES) {
943+
if (is_normalized_quickcheck(self, input, 1, 0, true) == YES) {
917944
Py_INCREF(input);
918945
return input;
919946
}
920947
return nfc_nfkc(self, input, 0);
921948
}
922949
if (_PyUnicode_EqualToASCIIId(form, &PyId_NFKC)) {
923-
if (is_normalized(self, input, 1, 1) == YES) {
950+
if (is_normalized_quickcheck(self, input, 1, 1, true) == YES) {
924951
Py_INCREF(input);
925952
return input;
926953
}
927954
return nfc_nfkc(self, input, 1);
928955
}
929956
if (_PyUnicode_EqualToASCIIId(form, &PyId_NFD)) {
930-
if (is_normalized(self, input, 0, 0) == YES) {
957+
if (is_normalized_quickcheck(self, input, 0, 0, true) == YES) {
931958
Py_INCREF(input);
932959
return input;
933960
}
934961
return nfd_nfkd(self, input, 0);
935962
}
936963
if (_PyUnicode_EqualToASCIIId(form, &PyId_NFKD)) {
937-
if (is_normalized(self, input, 0, 1) == YES) {
964+
if (is_normalized_quickcheck(self, input, 0, 1, true) == YES) {
938965
Py_INCREF(input);
939966
return input;
940967
}

0 commit comments

Comments
 (0)