Skip to content

Commit 8cb1b67

Browse files
committed
Strings: added support for UTF8 offsets in regexp
1 parent 57536d2 commit 8cb1b67

File tree

5 files changed

+102
-20
lines changed

5 files changed

+102
-20
lines changed

src/Utils/Strings.php

+44-4
Original file line numberDiff line numberDiff line change
@@ -476,9 +476,14 @@ public static function split(
476476
int $flags = 0,
477477
bool $captureOffset = false,
478478
bool $noEmpty = false,
479+
bool $utf8Offset = false,
479480
): array {
480481
$flags |= ($captureOffset ? PREG_SPLIT_OFFSET_CAPTURE : 0) | ($noEmpty ? PREG_SPLIT_NO_EMPTY : 0) | PREG_SPLIT_DELIM_CAPTURE;
481-
return self::pcre('preg_split', [$pattern, $subject, -1, $flags]);
482+
$m = self::pcre('preg_split', [$pattern, $subject, -1, $flags]);
483+
if ($utf8Offset && ($flags & PREG_SPLIT_OFFSET_CAPTURE)) {
484+
return self::bytesToChars($subject, [$m])[0];
485+
}
486+
return $m;
482487
}
483488

484489

@@ -493,14 +498,22 @@ public static function match(
493498
int $offset = 0,
494499
bool $captureOffset = false,
495500
bool $unmatchedAsNull = false,
501+
bool $utf8Offset = false,
496502
): ?array {
497503
$flags |= ($captureOffset ? PREG_OFFSET_CAPTURE : 0) | ($unmatchedAsNull ? PREG_UNMATCHED_AS_NULL : 0);
504+
if ($utf8Offset) {
505+
$offset = strlen(self::substring($subject, 0, $offset));
506+
}
498507
if ($offset > strlen($subject)) {
499508
return null;
500509
}
501-
return self::pcre('preg_match', [$pattern, $subject, &$m, $flags, $offset])
502-
? $m
503-
: null;
510+
if (!self::pcre('preg_match', [$pattern, $subject, &$m, $flags, $offset])) {
511+
return null;
512+
}
513+
if ($utf8Offset && ($flags & PREG_OFFSET_CAPTURE)) {
514+
return self::bytesToChars($subject, [$m])[0];
515+
}
516+
return $m;
504517
}
505518

506519

@@ -516,8 +529,12 @@ public static function matchAll(
516529
bool $captureOffset = false,
517530
bool $unmatchedAsNull = false,
518531
bool $patternOrder = false,
532+
bool $utf8Offset = false,
519533
): array {
520534
$flags |= ($captureOffset ? PREG_OFFSET_CAPTURE : 0) | ($unmatchedAsNull ? PREG_UNMATCHED_AS_NULL : 0) | ($patternOrder ? PREG_PATTERN_ORDER : 0);
535+
if ($utf8Offset) {
536+
$offset = strlen(self::substring($subject, 0, $offset));
537+
}
521538
if ($offset > strlen($subject)) {
522539
return [];
523540
}
@@ -526,6 +543,9 @@ public static function matchAll(
526543
($flags & PREG_PATTERN_ORDER) ? $flags : ($flags | PREG_SET_ORDER),
527544
$offset,
528545
]);
546+
if ($utf8Offset && ($flags & PREG_OFFSET_CAPTURE)) {
547+
return self::bytesToChars($subject, $m);
548+
}
529549
return $m;
530550
}
531551

@@ -540,12 +560,16 @@ public static function replace(
540560
int $limit = -1,
541561
bool $captureOffset = false,
542562
bool $unmatchedAsNull = false,
563+
bool $utf8Offset = false,
543564
): string {
544565
if (is_object($replacement) || is_array($replacement)) {
545566
if (!is_callable($replacement, false, $textual)) {
546567
throw new Nette\InvalidStateException("Callback '$textual' is not callable.");
547568
}
548569
$flags = ($captureOffset ? PREG_OFFSET_CAPTURE : 0) | ($unmatchedAsNull ? PREG_UNMATCHED_AS_NULL : 0);
570+
if ($utf8Offset && $captureOffset) {
571+
$replacement = fn($m) => $replacement(self::bytesToChars($subject, [$m])[0]);
572+
}
549573
return self::pcre('preg_replace_callback', [$pattern, $replacement, $subject, $limit, 0, $flags]);
550574

551575
} elseif (is_array($pattern) && is_string(key($pattern))) {
@@ -557,6 +581,22 @@ public static function replace(
557581
}
558582

559583

584+
/**
 * Rewrites the byte offsets stored in offset-capture results as character offsets.
 *
 * Each element of $groups is a list of [text, byteOffset] pairs as produced by the
 * preg_* functions with the offset-capture flag. Character counts are taken with
 * self::length() over the relevant slice of $s.
 *
 * Offsets do not have to be ascending: with PREG_PATTERN_ORDER every row restarts
 * at the beginning of the subject, and a capture group may start before the end of
 * the previous match. The running cursor is therefore moved in both directions.
 *
 * @param  string  $s       subject the byte offsets refer to
 * @param  array   $groups  list of lists of [text, byteOffset] pairs
 * @return array   the same structure with element [1] of every pair converted
 */
private static function bytesToChars(string $s, array $groups): array
{
	$lastBytes = $lastChars = 0;
	foreach ($groups as &$matches) {
		foreach ($matches as &$match) {
			$bytes = $match[1];
			if ($bytes < 0) {
				// PCRE reports -1 for a group that did not participate in the match; keep the marker
				continue;
			}

			if ($bytes > $lastBytes) {
				$lastChars += self::length(substr($s, $lastBytes, $bytes - $lastBytes));
			} elseif ($bytes < $lastBytes) {
				// offset lies before the cursor: step back by the characters in between
				$lastChars -= self::length(substr($s, $bytes, $lastBytes - $bytes));
			}

			$lastBytes = $bytes;
			$match[1] = $lastChars;
		}
		unset($match); // drop the dangling reference before the next row is iterated
	}
	unset($matches);

	return $groups;
}
598+
599+
560600
/** @internal */
561601
public static function pcre(string $func, array $args)
562602
{

tests/Utils/Strings.match().phpt

+9-3
Original file line numberDiff line numberDiff line change
@@ -19,10 +19,16 @@ Assert::same(['hell', 'l'], Strings::match('hello world!', '#([e-l])+#'));
1919

2020
Assert::same(['hell'], Strings::match('hello world!', '#[e-l]+#'));
2121

22-
Assert::same([['hell', 0]], Strings::match('hello world!', '#[e-l]+#', PREG_OFFSET_CAPTURE));
23-
Assert::same([['hell', 0]], Strings::match('hello world!', '#[e-l]+#', captureOffset: true));
22+
Assert::same([['l', 2]], Strings::match('žluťoučký kůň', '#[e-l]+#u', PREG_OFFSET_CAPTURE));
23+
Assert::same([['l', 2]], Strings::match('žluťoučký kůň', '#[e-l]+#u', captureOffset: true));
2424

25-
Assert::same(['ll'], Strings::match('hello world!', '#[e-l]+#', 0, 2));
25+
Assert::same([['l', 1]], Strings::match('žluťoučký kůň', '#[e-l]+#u', captureOffset: true, utf8Offset: true));
26+
27+
Assert::same(['l'], Strings::match('žluťoučký kůň', '#[e-l]+#u', 0, 2));
28+
29+
Assert::same(['k'], Strings::match('žluťoučký kůň', '#[e-l]+#u', utf8Offset: true, offset: 2));
30+
31+
Assert::same([['k', 7]], Strings::match('žluťoučký kůň', '#[e-l]+#u', captureOffset: true, utf8Offset: true, offset: 2));
2632

2733
Assert::null(Strings::match('hello world!', '', 0, 50));
2834
Assert::null(Strings::match('', '', 0, 1));

tests/Utils/Strings.matchAll().phpt

+15
Original file line numberDiff line numberDiff line change
@@ -45,14 +45,29 @@ Assert::same([
4545
[['u', 3], ['u', 7], ['', 11], ['', 15]],
4646
], Strings::matchAll('žluťoučký kůň!', '#([a-z])([a-z]*)#u', PREG_OFFSET_CAPTURE | PREG_PATTERN_ORDER));
4747

48+
Assert::same([
49+
[['lu', 1], ['l', 1], ['u', 2]],
50+
[['ou', 4], ['o', 4], ['u', 5]],
51+
[['k', 7], ['k', 7], ['', 8]],
52+
[['k', 10], ['k', 10], ['', 11]],
53+
], Strings::matchAll('žluťoučký kůň!', '#([a-z])([a-z]*)#u', captureOffset: true, utf8Offset: true));
54+
4855
Assert::same([
4956
[['lu', 2], ['ou', 6], ['k', 10], ['k', 14]],
5057
[['l', 2], ['o', 6], ['k', 10], ['k', 14]],
5158
[['u', 3], ['u', 7], ['', 11], ['', 15]],
5259
], Strings::matchAll('žluťoučký kůň!', '#([a-z])([a-z]*)#u', captureOffset: true, patternOrder: true));
5360

61+
Assert::same([
62+
[['lu', 1], ['ou', 4], ['k', 7], ['k', 10]],
63+
[['l', 10], ['o', 10], ['k', 10], ['k', 10]],
64+
[['u', 10], ['u', 10], ['', 10], ['', 11]],
65+
], Strings::matchAll('žluťoučký kůň!', '#([a-z])([a-z]*)#u', captureOffset: true, utf8Offset: true, patternOrder: true));
66+
5467
Assert::same([['l'], ['k'], ['k']], Strings::matchAll('žluťoučký kůň', '#[e-l]+#u', 0, 2));
5568

69+
Assert::same([['k'], ['k']], Strings::matchAll('žluťoučký kůň', '#[e-l]+#u', utf8Offset: true, offset: 2));
70+
5671
Assert::same([['ll', 'l']], Strings::matchAll('hello world!', '#[e-l]+#', PREG_PATTERN_ORDER, 2));
5772
Assert::same([['ll', 'l']], Strings::matchAll('hello world!', '#[e-l]+#', patternOrder: true, offset: 2));
5873

tests/Utils/Strings.replace().phpt

+2-1
Original file line numberDiff line numberDiff line change
@@ -34,5 +34,6 @@ Assert::same('#@ @@@#d!', Strings::replace('hello world!', [
3434
]));
3535
Assert::same(' !', Strings::replace('hello world!', '#\w#'));
3636
Assert::same(' !', Strings::replace('hello world!', ['#\w#']));
37-
Assert::same('hell0o worl9d!', Strings::replace('hello world!', '#[e-l]+#', fn($m) => implode($m[0]), captureOffset: true));
37+
Assert::same('žl2uťoučk10ý k14ůň!', Strings::replace('žluťoučký kůň!', '#[e-l]+#u', fn($m) => implode($m[0]), captureOffset: true));
38+
Assert::same('žl1uťoučk7ý k10ůň!', Strings::replace('žluťoučký kůň!', '#[e-l]+#u', fn($m) => implode($m[0]), captureOffset: true, utf8Offset: true));
3839
Strings::replace('hello world!', '#e(x)*#', fn($m) => Assert::null($m[1]), unmatchedAsNull: true);

tests/Utils/Strings.split().phpt

+32-12
Original file line numberDiff line numberDiff line change
@@ -38,17 +38,37 @@ Assert::same([
3838
], Strings::split('a, b, c', '#(,)\s*#', noEmpty: true));
3939

4040
Assert::same([
41-
['a', 0],
42-
[',', 1],
43-
['b', 3],
44-
[',', 4],
45-
['c', 6],
46-
], Strings::split('a, b, c', '#(,)\s*#', PREG_SPLIT_OFFSET_CAPTURE));
41+
['ž', 0],
42+
['lu', 2],
43+
['ť', 4],
44+
['ou', 6],
45+
['č', 8],
46+
['k', 10],
47+
['ý ', 11],
48+
['k', 14],
49+
['ůň', 15],
50+
], Strings::split('žluťoučký kůň', '#([a-z]+)\s*#u', PREG_SPLIT_OFFSET_CAPTURE));
4751

4852
Assert::same([
49-
['a', 0],
50-
[',', 1],
51-
['b', 3],
52-
[',', 4],
53-
['c', 6],
54-
], Strings::split('a, b, c', '#(,)\s*#', captureOffset: true));
53+
['ž', 0],
54+
['lu', 2],
55+
['ť', 4],
56+
['ou', 6],
57+
['č', 8],
58+
['k', 10],
59+
['ý ', 11],
60+
['k', 14],
61+
['ůň', 15],
62+
], Strings::split('žluťoučký kůň', '#([a-z]+)\s*#u', captureOffset: true));
63+
64+
Assert::same([
65+
['ž', 0],
66+
['lu', 1],
67+
['ť', 3],
68+
['ou', 4],
69+
['č', 6],
70+
['k', 7],
71+
['ý ', 8],
72+
['k', 10],
73+
['ůň', 11],
74+
], Strings::split('žluťoučký kůň', '#([a-z]+)\s*#u', captureOffset: true, utf8Offset: true));

0 commit comments

Comments
 (0)