aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author Brion Vibber <brion@users.mediawiki.org> 2004-10-07 05:59:10 +0000
committer Brion Vibber <brion@users.mediawiki.org> 2004-10-07 05:59:10 +0000
commit 0824182956884c743a6b2edc6da591e9ec1d1c02 (patch)
tree ad5decb52767ce609e097df857c5b7fa20f26ada
parent bcd1e9e844f55a4ffa13f9068edc539818bbcc4b (diff)
download mediawikicore-0824182956884c743a6b2edc6da591e9ec1d1c02.tar.gz
mediawikicore-0824182956884c743a6b2edc6da591e9ec1d1c02.zip
Add support for using ICU to perform normalization, which is much much faster than the PHP code!
Still need to add support for cleanup/verification.
Notes
Notes: http://mediawiki.org/wiki/Special:Code/MediaWiki/5730
-rw-r--r-- includes/normal/UtfNormal.php 30
-rw-r--r-- includes/normal/UtfNormalBench.php 4
-rw-r--r-- includes/normal/UtfNormalTest.php 7
3 files changed, 36 insertions, 5 deletions
diff --git a/includes/normal/UtfNormal.php b/includes/normal/UtfNormal.php
index 3ea8ef705b8b..110793a9999c 100644
--- a/includes/normal/UtfNormal.php
+++ b/includes/normal/UtfNormal.php
@@ -92,6 +92,20 @@ define( 'UTF8_FFFF', codepointToUtf8( 0xffff ) );
define( 'UTF8_HEAD', false );
define( 'UTF8_TAIL', true );
+
+/**
+ * For using the ICU wrapper
+ */
+define( 'UNORM_NONE', 1 );
+define( 'UNORM_NFD', 2 );
+define( 'UNORM_NFKD', 3 );
+define( 'UNORM_NFC', 4 );
+define( 'UNORM_DEFAULT', UNORM_NFC );
+define( 'UNORM_NFKC', 5 );
+define( 'UNORM_FCD', 6 );
+
+define( 'NORMALIZE_ICU', function_exists( 'utf8_normalize' ) );
+
/**
*
* @package MediaWiki
@@ -123,7 +137,9 @@ class UtfNormal {
* @return string a UTF-8 string in normal form C
*/
function toNFC( $string ) {
- if( UtfNormal::quickIsNFC( $string ) )
+ if( NORMALIZE_ICU )
+ return utf8_normalize( $string, UNORM_NFC );
+ elseif( UtfNormal::quickIsNFC( $string ) )
return $string;
else
return UtfNormal::NFC( $string );
@@ -137,7 +153,9 @@ class UtfNormal {
* @return string a UTF-8 string in normal form D
*/
function toNFD( $string ) {
- if( preg_match( '/[\x80-\xff]/', $string ) )
+ if( NORMALIZE_ICU )
+ return utf8_normalize( $string, UNORM_NFD );
+ elseif( preg_match( '/[\x80-\xff]/', $string ) )
return UtfNormal::NFD( $string );
else
return $string;
@@ -152,7 +170,9 @@ class UtfNormal {
* @return string a UTF-8 string in normal form KC
*/
function toNFKC( $string ) {
- if( preg_match( '/[\x80-\xff]/', $string ) )
+ if( NORMALIZE_ICU )
+ return utf8_normalize( $string, UNORM_NFKC );
+ elseif( preg_match( '/[\x80-\xff]/', $string ) )
return UtfNormal::NFKC( $string );
else
return $string;
@@ -167,7 +187,9 @@ class UtfNormal {
* @return string a UTF-8 string in normal form KD
*/
function toNFKD( $string ) {
- if( preg_match( '/[\x80-\xff]/', $string ) )
+ if( NORMALIZE_ICU )
+ return utf8_normalize( $string, UNORM_NFKD );
+ elseif( preg_match( '/[\x80-\xff]/', $string ) )
return UtfNormal::NFKD( $string );
else
return $string;
diff --git a/includes/normal/UtfNormalBench.php b/includes/normal/UtfNormalBench.php
index 2e1740c22b0c..d42d592099b4 100644
--- a/includes/normal/UtfNormalBench.php
+++ b/includes/normal/UtfNormalBench.php
@@ -23,6 +23,10 @@
*/
/** */
+if( isset( $_SERVER['argv'] ) && in_array( '--icu', $_SERVER['argv'] ) ) {
+ dl( 'php_utfnormal.so' );
+}
+
require_once 'UtfNormalUtil.php';
require_once 'UtfNormal.php';
diff --git a/includes/normal/UtfNormalTest.php b/includes/normal/UtfNormalTest.php
index 6360a7ca445c..16992be9fbcb 100644
--- a/includes/normal/UtfNormalTest.php
+++ b/includes/normal/UtfNormalTest.php
@@ -44,6 +44,10 @@ if( defined( 'PRETTY_UTF8' ) ) {
}
}
+if( isset( $_SERVER['argv'] ) && in_array( '--icu', $_SERVER['argv'] ) ) {
+ dl( 'php_utfnormal.so' );
+}
+
require_once 'UtfNormalUtil.php';
require_once 'UtfNormal.php';
@@ -106,7 +110,8 @@ while( false !== ($line = fgets( $in ) ) ) {
$cols = explode( ';', $line );
$char = codepointToUtf8( hexdec( $cols[0] ) );
$desc = $cols[0] . ": " . $cols[1];
- if( $char >= UTF8_SURROGATE_FIRST && $char <= UTF8_SURROGATE_LAST ) {
+ if( $char === "\x00" || $char >= UTF8_SURROGATE_FIRST && $char <= UTF8_SURROGATE_LAST ) {
+ # Can't check NULL with the ICU plugin, as null bytes fail in C land.
# Surrogates are illegal on their own or in UTF-8, ignore.
continue;
}