diff options
author | Brion Vibber <brion@users.mediawiki.org> | 2004-10-07 05:59:10 +0000 |
---|---|---|
committer | Brion Vibber <brion@users.mediawiki.org> | 2004-10-07 05:59:10 +0000 |
commit | 0824182956884c743a6b2edc6da591e9ec1d1c02 (patch) | |
tree | ad5decb52767ce609e097df857c5b7fa20f26ada /includes/normal | |
parent | bcd1e9e844f55a4ffa13f9068edc539818bbcc4b (diff) | |
download | mediawikicore-0824182956884c743a6b2edc6da591e9ec1d1c02.tar.gz mediawikicore-0824182956884c743a6b2edc6da591e9ec1d1c02.zip |
Add support for using ICU to perform normalization, which is much, much faster than the PHP code!
Still need to add support for cleanup/verification.
Notes:
http://mediawiki.org/wiki/Special:Code/MediaWiki/5730
Diffstat (limited to 'includes/normal')
-rw-r--r-- | includes/normal/UtfNormal.php | 30 | ||||
-rw-r--r-- | includes/normal/UtfNormalBench.php | 4 | ||||
-rw-r--r-- | includes/normal/UtfNormalTest.php | 7 |
3 files changed, 36 insertions, 5 deletions
diff --git a/includes/normal/UtfNormal.php b/includes/normal/UtfNormal.php index 3ea8ef705b8b..110793a9999c 100644 --- a/includes/normal/UtfNormal.php +++ b/includes/normal/UtfNormal.php @@ -92,6 +92,20 @@ define( 'UTF8_FFFF', codepointToUtf8( 0xffff ) ); define( 'UTF8_HEAD', false ); define( 'UTF8_TAIL', true ); + +/** + * For using the ICU wrapper + */ +define( 'UNORM_NONE', 1 ); +define( 'UNORM_NFD', 2 ); +define( 'UNORM_NFKD', 3 ); +define( 'UNORM_NFC', 4 ); +define( 'UNORM_DEFAULT', UNORM_NFC ); +define( 'UNORM_NFKC', 5 ); +define( 'UNORM_FCD', 6 ); + +define( 'NORMALIZE_ICU', function_exists( 'utf8_normalize' ) ); + /** * * @package MediaWiki @@ -123,7 +137,9 @@ class UtfNormal { * @return string a UTF-8 string in normal form C */ function toNFC( $string ) { - if( UtfNormal::quickIsNFC( $string ) ) + if( NORMALIZE_ICU ) + return utf8_normalize( $string, UNORM_NFC ); + elseif( UtfNormal::quickIsNFC( $string ) ) return $string; else return UtfNormal::NFC( $string ); @@ -137,7 +153,9 @@ class UtfNormal { * @return string a UTF-8 string in normal form D */ function toNFD( $string ) { - if( preg_match( '/[\x80-\xff]/', $string ) ) + if( NORMALIZE_ICU ) + return utf8_normalize( $string, UNORM_NFD ); + elseif( preg_match( '/[\x80-\xff]/', $string ) ) return UtfNormal::NFD( $string ); else return $string; @@ -152,7 +170,9 @@ class UtfNormal { * @return string a UTF-8 string in normal form KC */ function toNFKC( $string ) { - if( preg_match( '/[\x80-\xff]/', $string ) ) + if( NORMALIZE_ICU ) + return utf8_normalize( $string, UNORM_NFKC ); + elseif( preg_match( '/[\x80-\xff]/', $string ) ) return UtfNormal::NFKC( $string ); else return $string; @@ -167,7 +187,9 @@ class UtfNormal { * @return string a UTF-8 string in normal form KD */ function toNFKD( $string ) { - if( preg_match( '/[\x80-\xff]/', $string ) ) + if( NORMALIZE_ICU ) + return utf8_normalize( $string, UNORM_NFKD ); + elseif( preg_match( '/[\x80-\xff]/', $string ) ) return UtfNormal::NFKD( $string ); else 
return $string; diff --git a/includes/normal/UtfNormalBench.php b/includes/normal/UtfNormalBench.php index 2e1740c22b0c..d42d592099b4 100644 --- a/includes/normal/UtfNormalBench.php +++ b/includes/normal/UtfNormalBench.php @@ -23,6 +23,10 @@ */ /** */ +if( isset( $_SERVER['argv'] ) && in_array( '--icu', $_SERVER['argv'] ) ) { + dl( 'php_utfnormal.so' ); +} + require_once 'UtfNormalUtil.php'; require_once 'UtfNormal.php'; diff --git a/includes/normal/UtfNormalTest.php b/includes/normal/UtfNormalTest.php index 6360a7ca445c..16992be9fbcb 100644 --- a/includes/normal/UtfNormalTest.php +++ b/includes/normal/UtfNormalTest.php @@ -44,6 +44,10 @@ if( defined( 'PRETTY_UTF8' ) ) { } } +if( isset( $_SERVER['argv'] ) && in_array( '--icu', $_SERVER['argv'] ) ) { + dl( 'php_utfnormal.so' ); +} + require_once 'UtfNormalUtil.php'; require_once 'UtfNormal.php'; @@ -106,7 +110,8 @@ while( false !== ($line = fgets( $in ) ) ) { $cols = explode( ';', $line ); $char = codepointToUtf8( hexdec( $cols[0] ) ); $desc = $cols[0] . ": " . $cols[1]; - if( $char >= UTF8_SURROGATE_FIRST && $char <= UTF8_SURROGATE_LAST ) { + if( $char === "\x00" || $char >= UTF8_SURROGATE_FIRST && $char <= UTF8_SURROGATE_LAST ) { + # Can't check NULL with the ICU plugin, as null bytes fail in C land. # Surrogates are illegal on their own or in UTF-8, ignore. continue; } |