Skip to content
This repository was archived by the owner on Aug 5, 2024. It is now read-only.

Commit bdc4740

Browse files
committed
Update patch for fixing (null) bug
Older versions of the Objective-C library would occasionally create patches that split surrogate pairs at invalid positions. The library starts by finding the longest common prefix, which splits a surrogate pair in half when a character is inserted between existing characters whose surrogate pairs share the same high surrogate:

`\ud83c\udd70\ud83c\udd71` -> `\ud83c\udd70\ud83c\udd72\ud83c\udd71`

`\ud83c\udd70\ud83c` + `\udd72\ud83c` + `\udd71`

Next it would try to create diff groups from these pieces and fail because of the unpaired surrogates. The middle group is entirely wrong and gets replaced with `(null)`, while the other surrogate halves stick around _as counts_, since that's how `toDelta()` converts them:

`=3\t+(null)\t=1`

When the libraries receive this patch they end up reconstructing an invalid Unicode sequence, because the counts now instruct them to split those same surrogate pairs.

---

In this patch we identify this specific sequence in `fromDelta()` and remove the additional breakage by eliminating the `(null)` group and re-joining the split surrogate halves. Note that this effectively undoes the change operation, because the `(null)` sits where the new character was inserted. _There is no way to avoid this_: the problem occurred in the past and the inserted text has already been lost to `(null)`. This patch merely removes the vestige of that failure, which would otherwise lead to additional failures if we passed the data on.
1 parent fa122d3 commit bdc4740

File tree

11 files changed

+188
-65
lines changed

11 files changed

+188
-65
lines changed

java/src/name/fraser/neil/plaintext/diff_match_patch.java

+28-11
Original file line numberDiff line numberDiff line change
@@ -1576,16 +1576,7 @@ private String decodeURI(String text) throws IllegalArgumentException {
15761576
throw new IllegalArgumentException();
15771577
}
15781578

1579-
// some objective-c versions of the library produced patches with
1580-
// (null) in the place where surrogates were split across diff
1581-
// boundaries. if we leave those in we'll be stuck with a
1582-
// high-surrogate (null) low-surrogate pattern that will break
1583-
// deeper in the library or consuming application. we'll "fix"
1584-
// these by dropping the (null) and re-joining the surrogate halves
1585-
return decoded.toString().replaceAll(
1586-
"([\\uD800-\\uDBFF])\\(null\\)([\\uDC00-\\uDFFF])",
1587-
"$1$2"
1588-
);
1579+
return decoded.toString();
15891580
}
15901581

15911582
/**
@@ -1601,7 +1592,8 @@ public LinkedList<Diff> diff_fromDelta(String text1, String delta)
16011592
LinkedList<Diff> diffs = new LinkedList<Diff>();
16021593
int pointer = 0; // Cursor in text1
16031594
String[] tokens = delta.split("\t");
1604-
for (String token : tokens) {
1595+
for (int x = 0; x < tokens.length; x++) {
1596+
String token = tokens[x];
16051597
if (token.length() == 0) {
16061598
// Blank tokens are ok (from a trailing \t).
16071599
continue;
@@ -1637,6 +1629,31 @@ public LinkedList<Diff> diff_fromDelta(String text1, String delta)
16371629
"Negative number in diff_fromDelta: " + param);
16381630
}
16391631
String text;
1632+
1633+
// some objective-c versions of the library produced patches with
1634+
// (null) in the place where surrogates were split across diff
1635+
// boundaries. if we leave those in we'll be stuck with a
1636+
// high-surrogate (null) low-surrogate pattern that will break
1637+
// deeper in the library or consuming application. we'll "fix"
1638+
// these by dropping the (null) and re-joining the surrogate halves
1639+
if (
1640+
x + 2 < tokens.length &&
1641+
Character.isHighSurrogate(text1.charAt(pointer + n - 1)) &&
1642+
tokens[x + 1].substring(1).equals("(null)") &&
1643+
Character.isLowSurrogate(text1.charAt(pointer + n))
1644+
) {
1645+
n -= 1;
1646+
tokens[x + 1] = "+";
1647+
int m;
1648+
try {
1649+
m = Integer.parseInt(tokens[x + 2].substring(1));
1650+
} catch (NumberFormatException e) {
1651+
throw new IllegalArgumentException(
1652+
"Invalid number in diff_fromDelta: " + tokens[x + 2].substring(1), e);
1653+
}
1654+
tokens[x + 2] = tokens[x + 2].charAt(0) + String.valueOf(m + 1);
1655+
}
1656+
16401657
try {
16411658
text = text1.substring(pointer, pointer += n);
16421659
} catch (StringIndexOutOfBoundsException e) {

java/tests/name/fraser/neil/plaintext/diff_match_patch_test.java

+6-6
Original file line numberDiff line numberDiff line change
@@ -376,6 +376,12 @@ public static void testDiffText() {
376376
LinkedList<Diff> diffs = diffList(new Diff(EQUAL, "jump"), new Diff(DELETE, "s"), new Diff(INSERT, "ed"), new Diff(EQUAL, " over "), new Diff(DELETE, "the"), new Diff(INSERT, "a"), new Diff(EQUAL, " lazy"));
377377
assertEquals("diff_text1:", "jumps over the lazy", dmp.diff_text1(diffs));
378378
assertEquals("diff_text2:", "jumped over a lazy", dmp.diff_text2(diffs));
379+
380+
assertEquals(
381+
"diff_text2: Objective-C (null) bug",
382+
"🙂🙁",
383+
dmp.diff_text2(dmp.diff_fromDelta("🙂🙁", "=3\t+(null)\t=1"))
384+
);
379385
}
380386

381387
public static void testDiffDelta() {
@@ -460,12 +466,6 @@ public static void testDiffDelta() {
460466
dmp.diff_toDelta(dmp.diff_fromDelta("\ud83c\udd70", "=1\t-1\t+%ED%B5%B1"))
461467
);
462468

463-
assertEquals(
464-
"diff_fromDelta: Invalid diff from objective-c with (null) string",
465-
diffList(new Diff(INSERT, "\ud83c\udd70")),
466-
dmp.diff_fromDelta("", "+%ED%A0%BC%28null%29%ED%B5%B0")
467-
);
468-
469469
// Verify pool of unchanged characters.
470470
diffs = diffList(new Diff(INSERT, "A-Z a-z 0-9 - _ . ! ~ * ' ( ) ; / ? : @ & = + $ , # "));
471471
String text2 = dmp.diff_text2(diffs);

javascript/diff_match_patch.js

+8-7
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

javascript/diff_match_patch_uncompressed.js

+23-7
Original file line numberDiff line numberDiff line change
@@ -1512,13 +1512,7 @@ diff_match_patch.prototype.decodeURI = function(text) {
15121512
throw new URIError('URI malformed');
15131513
}
15141514

1515-
// some objective-c versions of the library produced patches with
1516-
// (null) in the place where surrogates were split across diff
1517-
// boundaries. if we leave those in we'll be stuck with a
1518-
// high-surrogate (null) low-surrogate pattern that will break
1519-
// deeper in the library or consuming application. we'll "fix"
1520-
// these by dropping the (null) and re-joining the surrogate halves
1521-
return decoded.replace(/([\uD800-\uDBFF])\(null\)([\uDC00-\uDFFF])/g, "$1$2");
1515+
return decoded;
15221516
}
15231517
};
15241518

@@ -1556,6 +1550,28 @@ diff_match_patch.prototype.diff_fromDelta = function(text1, delta) {
15561550
if (isNaN(n) || n < 0) {
15571551
throw new Error('Invalid number in diff_fromDelta: ' + param);
15581552
}
1553+
1554+
// some objective-c versions of the library produced patches with
1555+
// (null) in the place where surrogates were split across diff
1556+
// boundaries. if we leave those in we'll be stuck with a
1557+
// high-surrogate (null) low-surrogate pattern that will break
1558+
// deeper in the library or consuming application. we'll "fix"
1559+
// these by dropping the (null) and re-joining the surrogate halves
1560+
if (
1561+
x + 2 < tokens.length &&
1562+
this.isHighSurrogate(text1[pointer + n - 1]) &&
1563+
'(null)' === tokens[x + 1].substring(1) &&
1564+
this.isLowSurrogate(text1[pointer + n])
1565+
) {
1566+
n -= 1;
1567+
tokens[x + 1] = "+";
1568+
var m = parseInt(tokens[x + 2].substring(1),10);
1569+
if (isNaN(m) || m < 0) {
1570+
throw new Error('Invalid number in diff_fromDelta: ' + tokens[x + 2].substring(1));
1571+
}
1572+
tokens[x + 2] = tokens[x + 2][0] + (m + 1).toString();
1573+
}
1574+
15591575
var text = text1.substring(pointer, pointer += n);
15601576
if (tokens[x].charAt(0) == '=') {
15611577
diffs[diffsLength++] = new diff_match_patch.Diff(DIFF_EQUAL, text);

javascript/tests/diff_match_patch_test.js

+6-9
Original file line numberDiff line numberDiff line change
@@ -443,6 +443,12 @@ function testDiffText() {
443443
assertEquals('jumps over the lazy', dmp.diff_text1(diffs));
444444

445445
assertEquals('jumped over a lazy', dmp.diff_text2(diffs));
446+
447+
// diff_text2: Objective-C (null) bug
448+
assertEquals(
449+
'\ud83d\ude42\ud83d\ude41',
450+
dmp.diff_text2(dmp.diff_fromDelta('\ud83d\ude42\ud83d\ude41', '=3\t+(null)\t=1'))
451+
);
446452
}
447453

448454
function testDiffDelta() {
@@ -605,15 +611,6 @@ function testDiffDelta() {
605611
assertEquals('Swap surrogate pair', 'crashed');
606612
}
607613

608-
try {
609-
assertEquivalent(
610-
dmp.diff_fromDelta('', '+%ED%A0%BC%28null%29%ED%B5%B0'),
611-
[[DIFF_INSERT, '\ud83c\udd70']]
612-
);
613-
} catch ( e ) {
614-
assertEquals('Invalid diff from objective-c with (null) string' );
615-
}
616-
617614
// Empty diff groups
618615
assertEquivalent(
619616
dmp.diff_toDelta([[DIFF_EQUAL, 'abcdef'], [DIFF_DELETE, ''], [DIFF_INSERT, 'ghijk']]),

objectivec/DiffMatchPatch.m

+43-19
Original file line numberDiff line numberDiff line change
@@ -1493,23 +1493,7 @@ - (NSString *)diff_decodeURIWithText:(NSString *)percentEncoded
14931493
return nil;
14941494
}
14951495

1496-
// some objective-c versions of the library produced patches with
1497-
// (null) in the place where surrogates were split across diff
1498-
// boundaries. if we leave those in we'll be stuck with a
1499-
// high-surrogate (null) low-surrogate pattern that will break
1500-
// deeper in the library or consuming application. we'll "fix"
1501-
// these by dropping the (null) and re-joining the surrogate halves
1502-
NSString *result = [NSString stringWithCharacters:decoded length:output];
1503-
NSRegularExpression *replacer = [NSRegularExpression
1504-
regularExpressionWithPattern:@"([\\x{D800}-\\x{DBFF}])\\(null\\)([\\x{DC00}-\\x{DFFF}])"
1505-
options:0
1506-
error:nil];
1507-
1508-
return [replacer
1509-
stringByReplacingMatchesInString:result
1510-
options:0
1511-
range:NSMakeRange(0, [result length])
1512-
withTemplate:@"$1$2"];
1496+
return [NSString stringWithCharacters:decoded length:output];
15131497
}
15141498

15151499
/**
@@ -1526,10 +1510,12 @@ - (NSMutableArray *)diff_fromDeltaWithText:(NSString *)text1
15261510
{
15271511
NSMutableArray *diffs = [NSMutableArray array];
15281512
NSUInteger thisPointer = 0; // Cursor in text1
1529-
NSArray *tokens = [delta componentsSeparatedByString:@"\t"];
1513+
NSMutableArray *tokens = [NSMutableArray arrayWithArray:([delta componentsSeparatedByString:@"\t"])];
15301514
NSInteger n;
15311515
NSDictionary *errorDetail = nil;
1532-
for (NSString *token in tokens) {
1516+
NSInteger tokenCount = [tokens count];
1517+
for (NSInteger x = 0; x < tokenCount; x++) {
1518+
NSString *token = tokens[x];
15331519
if (token.length == 0) {
15341520
// Blank tokens are ok (from a trailing \t).
15351521
continue;
@@ -1572,6 +1558,44 @@ - (NSMutableArray *)diff_fromDeltaWithText:(NSString *)text1
15721558
}
15731559
return nil;
15741560
}
1561+
1562+
// some objective-c versions of the library produced patches with
1563+
// (null) in the place where surrogates were split across diff
1564+
// boundaries. if we leave those in we'll be stuck with a
1565+
// high-surrogate (null) low-surrogate pattern that will break
1566+
// deeper in the library or consuming application. we'll "fix"
1567+
// these by dropping the (null) and re-joining the surrogate halves
1568+
if (x + 2 < tokenCount &&
1569+
CFStringIsSurrogateHighCharacter([text1 characterAtIndex:(thisPointer + n - 1)]) &&
1570+
[@"(null)" isEqualToString:([tokens[x + 1] substringFromIndex:1])] &&
1571+
CFStringIsSurrogateLowCharacter([text1 characterAtIndex:(thisPointer + n)])
1572+
) {
1573+
n -= 1;
1574+
tokens[x + 1] = @"+";
1575+
1576+
NSInteger m = [[tokens[x + 2] substringFromIndex:1] integerValue];
1577+
1578+
if (m == 0) {
1579+
if (error != NULL) {
1580+
errorDetail = [NSDictionary dictionaryWithObjectsAndKeys:
1581+
[NSString stringWithFormat:NSLocalizedString(@"Invalid number in diff_fromDelta: %@", @"Error"), param],
1582+
NSLocalizedDescriptionKey, nil];
1583+
*error = [NSError errorWithDomain:@"DiffMatchPatchErrorDomain" code:100 userInfo:errorDetail];
1584+
}
1585+
return nil;
1586+
} else if (m < 0) {
1587+
if (error != NULL) {
1588+
errorDetail = [NSDictionary dictionaryWithObjectsAndKeys:
1589+
[NSString stringWithFormat:NSLocalizedString(@"Negative number in diff_fromDelta: %@", @"Error"), param],
1590+
NSLocalizedDescriptionKey, nil];
1591+
*error = [NSError errorWithDomain:@"DiffMatchPatchErrorDomain" code:101 userInfo:errorDetail];
1592+
}
1593+
return nil;
1594+
}
1595+
1596+
tokens[x + 2] = [NSString stringWithFormat:@"=%ld", m + 1];
1597+
}
1598+
15751599
NSString *text;
15761600
@try {
15771601
text = [text1 substringWithRange:NSMakeRange(thisPointer, (NSUInteger)n)];

objectivec/Tests/DiffMatchPatchTest.m

+2-4
Original file line numberDiff line numberDiff line change
@@ -686,6 +686,8 @@ - (void)test_diff_textTest {
686686

687687
XCTAssertEqualObjects(@"jumped over a lazy", [dmp diff_text2:diffs], @"Compute the source and destination texts #2");
688688

689+
XCTAssertEqualObjects(@"🙂🙁", [dmp diff_text2:([dmp diff_fromDeltaWithText:@"🙂🙁" andDelta:@"=3\t+(null)\t=1" error:nil])]);
690+
689691
[dmp release];
690692
}
691693

@@ -814,10 +816,6 @@ - (void)test_diff_deltaTest {
814816
[Diff diffWithOperation:DIFF_INSERT andText:[NSString stringWithFormat:@"%C", 0xdd71]],
815817
nil])]);
816818

817-
// Invalid diff from objective-c with (null) string
818-
XCTAssertEqualObjects([dmp diff_fromDeltaWithText:@"" andDelta:@"+%ED%A0%BC%28null%29%ED%B5%B0" error:nil],
819-
([NSMutableArray arrayWithObjects:[Diff diffWithOperation:DIFF_INSERT andText:@"🅰"],nil]));
820-
821819
// Verify pool of unchanged characters.
822820
diffs = [NSMutableArray arrayWithObject:
823821
[Diff diffWithOperation:DIFF_INSERT andText:@"A-Z a-z 0-9 - _ . ! ~ * ' ( ) ; / ? : @ & = + $ , # "]];

0 commit comments

Comments
 (0)