Add support for using ICU to perform normalization, which is much, much faster than the pure-PHP code!

Still need to add ICU support for the cleanup/verification path.
author: Brion Vibber <brion@users.mediawiki.org> 2004-10-07 05:59:10 +0000
committer: Brion Vibber <brion@users.mediawiki.org> 2004-10-07 05:59:10 +0000
commit: 0824182956884c743a6b2edc6da591e9ec1d1c02 (patch)
tree: ad5decb52767ce609e097df857c5b7fa20f26ada /includes/normal
parent: bcd1e9e844f55a4ffa13f9068edc539818bbcc4b (diff)
download: mediawikicore-0824182956884c743a6b2edc6da591e9ec1d1c02.tar.gz
mediawikicore-0824182956884c743a6b2edc6da591e9ec1d1c02.zip
3 files changed, 36 insertions, 5 deletions
diff --git a/includes/normal/UtfNormal.php b/includes/normal/UtfNormal.php
index 3ea8ef705b8b..110793a9999c 100644
--- a/includes/normal/UtfNormal.php
+++ b/includes/normal/UtfNormal.php
@@ -92,6 +92,20 @@ define( 'UTF8_FFFF', codepointToUtf8( 0xffff ) );
 define( 'UTF8_HEAD', false );
 define( 'UTF8_TAIL', true );
 
+
+/**
+ * For using the ICU wrapper
+ */
+define( 'UNORM_NONE', 1 );
+define( 'UNORM_NFD',  2 );
+define( 'UNORM_NFKD', 3 );
+define( 'UNORM_NFC',  4 );
+define( 'UNORM_DEFAULT', UNORM_NFC );
+define( 'UNORM_NFKC', 5 );
+define( 'UNORM_FCD',  6 );
+
+define( 'NORMALIZE_ICU', function_exists( 'utf8_normalize' ) );
+
 /**
  *
  * @package MediaWiki
@@ -123,7 +137,9 @@ class UtfNormal {
 	 * @return string a UTF-8 string in normal form C
 	 */
 	function toNFC( $string ) {
-		if( UtfNormal::quickIsNFC( $string ) )
+		if( NORMALIZE_ICU )
+			return utf8_normalize( $string, UNORM_NFC );
+		elseif( UtfNormal::quickIsNFC( $string ) )
 			return $string;
 		else
 			return UtfNormal::NFC( $string );
@@ -137,7 +153,9 @@ class UtfNormal {
 	 * @return string a UTF-8 string in normal form D
 	 */
 	function toNFD( $string ) {
-		if( preg_match( '/[\x80-\xff]/', $string ) )
+		if( NORMALIZE_ICU )
+			return utf8_normalize( $string, UNORM_NFD );
+		elseif( preg_match( '/[\x80-\xff]/', $string ) )
 			return UtfNormal::NFD( $string );
 		else
 			return $string;
@@ -152,7 +170,9 @@ class UtfNormal {
 	 * @return string a UTF-8 string in normal form KC
 	 */
 	function toNFKC( $string ) {
-		if( preg_match( '/[\x80-\xff]/', $string ) )
+		if( NORMALIZE_ICU )
+			return utf8_normalize( $string, UNORM_NFKC );
+		elseif( preg_match( '/[\x80-\xff]/', $string ) )
 			return UtfNormal::NFKC( $string );
 		else
 			return $string;
@@ -167,7 +187,9 @@ class UtfNormal {
 	 * @return string a UTF-8 string in normal form KD
 	 */
 	function toNFKD( $string ) {
-		if( preg_match( '/[\x80-\xff]/', $string ) )
+		if( NORMALIZE_ICU )
+			return utf8_normalize( $string, UNORM_NFKD );
+		elseif( preg_match( '/[\x80-\xff]/', $string ) )
 			return UtfNormal::NFKD( $string );
 		else
 			return $string;
diff --git a/includes/normal/UtfNormalBench.php b/includes/normal/UtfNormalBench.php
index 2e1740c22b0c..d42d592099b4 100644
--- a/includes/normal/UtfNormalBench.php
+++ b/includes/normal/UtfNormalBench.php
@@ -23,6 +23,10 @@
  */
 
 /** */
+if( isset( $_SERVER['argv'] ) && in_array( '--icu', $_SERVER['argv'] ) ) {
+	dl( 'php_utfnormal.so' );
+}
+
 require_once 'UtfNormalUtil.php';
 require_once 'UtfNormal.php';
 
diff --git a/includes/normal/UtfNormalTest.php b/includes/normal/UtfNormalTest.php
index 6360a7ca445c..16992be9fbcb 100644
--- a/includes/normal/UtfNormalTest.php
+++ b/includes/normal/UtfNormalTest.php
@@ -44,6 +44,10 @@ if( defined( 'PRETTY_UTF8' ) ) {
 	}	
 }
 
+if( isset( $_SERVER['argv'] ) && in_array( '--icu', $_SERVER['argv'] ) ) {
+	dl( 'php_utfnormal.so' );
+}
+
 require_once 'UtfNormalUtil.php';
 require_once 'UtfNormal.php';
 
@@ -106,7 +110,8 @@ while( false !== ($line = fgets( $in ) ) ) {
 	$cols = explode( ';', $line );
 	$char = codepointToUtf8( hexdec( $cols[0] ) );
 	$desc = $cols[0] . ": " . $cols[1];
-	if( $char >= UTF8_SURROGATE_FIRST && $char <= UTF8_SURROGATE_LAST ) {
+	if( $char === "\x00" || $char >= UTF8_SURROGATE_FIRST && $char <= UTF8_SURROGATE_LAST ) {
+		# Can't check NULL with the ICU plugin, as null bytes fail in C land.
 		# Surrogates are illegal on their own or in UTF-8, ignore.
 		continue;
 	}
author	Brion Vibber <brion@users.mediawiki.org>	2004-10-07 05:59:10 +0000
committer	Brion Vibber <brion@users.mediawiki.org>	2004-10-07 05:59:10 +0000
commit	0824182956884c743a6b2edc6da591e9ec1d1c02 (patch)
tree	ad5decb52767ce609e097df857c5b7fa20f26ada /includes/normal
parent	bcd1e9e844f55a4ffa13f9068edc539818bbcc4b (diff)
download	mediawikicore-0824182956884c743a6b2edc6da591e9ec1d1c02.tar.gz mediawikicore-0824182956884c743a6b2edc6da591e9ec1d1c02.zip