about summary refs log tree commit diff stats
diff options
context:
space:
mode:
-rw-r--r-- includes/diff/TextSlotDiffRenderer.php 5
-rw-r--r-- includes/languages/LanguageZh.php 15
-rw-r--r-- tests/phpunit/includes/languages/LanguageZhTest.php 20
3 files changed, 35 insertions, 5 deletions
diff --git a/includes/diff/TextSlotDiffRenderer.php b/includes/diff/TextSlotDiffRenderer.php
index c254c08f1897..22ca94ce9124 100644
--- a/includes/diff/TextSlotDiffRenderer.php
+++ b/includes/diff/TextSlotDiffRenderer.php
@@ -92,14 +92,15 @@ class TextSlotDiffRenderer extends SlotDiffRenderer {
* Convenience helper to use getTextDiff without an instance.
* @param string $oldText
* @param string $newText
+ * @param array $options
* @return string
*/
- public static function diff( $oldText, $newText ) {
+ public static function diff( $oldText, $newText, $options = [] ) {
/** @var TextSlotDiffRenderer $slotDiffRenderer */
$slotDiffRenderer = MediaWikiServices::getInstance()
->getContentHandlerFactory()
->getContentHandler( CONTENT_MODEL_TEXT )
- ->getSlotDiffRenderer( RequestContext::getMain() );
+ ->getSlotDiffRenderer( RequestContext::getMain(), $options );
'@phan-var TextSlotDiffRenderer $slotDiffRenderer';
return $slotDiffRenderer->getTextDiff( $oldText, $newText );
}
diff --git a/includes/languages/LanguageZh.php b/includes/languages/LanguageZh.php
index 4e23ca3934bd..11aa6be44480 100644
--- a/includes/languages/LanguageZh.php
+++ b/includes/languages/LanguageZh.php
@@ -29,13 +29,22 @@
*/
class LanguageZh extends LanguageZh_hans {
/**
- * this should give much better diff info
+ * Insert a formfeed character before each non-ASCII character, so that
+ * "word-level" diffs will effectively operate on a character level. The FF
+ * characters are stripped out by unsegmentForDiff().
+ *
+ * We use FF because it is the least used character that is matched by
+ * PCRE's \s class.
+ *
+ * In the unlikely event that an FF character appears in the input, it will
+ * be displayed in the diff as a replacement character.
*
* @param string $text
* @return string
*/
public function segmentForDiff( $text ) {
- return preg_replace( '/[\xc0-\xff][\x80-\xbf]*/', ' $0', $text );
+ $text = str_replace( "\x0c", "\u{FFFD}", $text );
+ return preg_replace( '/[\xc0-\xff][\x80-\xbf]*/', "\x0c$0", $text );
}
/**
@@ -43,7 +52,7 @@ class LanguageZh extends LanguageZh_hans {
* @return string
*/
public function unsegmentForDiff( $text ) {
- return preg_replace( '/ ([\xc0-\xff][\x80-\xbf]*)/', '$1', $text );
+ return str_replace( "\x0c", '', $text );
}
/**
diff --git a/tests/phpunit/includes/languages/LanguageZhTest.php b/tests/phpunit/includes/languages/LanguageZhTest.php
new file mode 100644
index 000000000000..815890d82d90
--- /dev/null
+++ b/tests/phpunit/includes/languages/LanguageZhTest.php
@@ -0,0 +1,20 @@
+<?php
+
+use MediaWiki\MainConfigNames;
+
+/**
+ * @covers LanguageZh
+ */
+class LanguageZhTest extends LanguageClassesTestCase {
+ public function testSegmentForDiff() {
+ $this->overrideConfigValue( MainConfigNames::DiffEngine, 'php' );
+ $lhs = '维基';
+ $rhs = '维基百科';
+ $diff = TextSlotDiffRenderer::diff( $lhs, $rhs, [ 'contentLanguage' => 'zh' ] );
+ // Check that only the second part is highlighted, and word segmentation markers are not present
+ $this->assertStringContainsString(
+ '<div>维基<ins class="diffchange diffchange-inline">百科</ins></div>',
+ $diff
+ );
+ }
+}