aboutsummaryrefslogtreecommitdiffstats
path: root/includes/StringUtils.php
diff options
context:
space:
mode:
authorFran Rogers <krimpet@users.mediawiki.org>2008-09-15 00:42:17 +0000
committerFran Rogers <krimpet@users.mediawiki.org>2008-09-15 00:42:17 +0000
commitad5f1acdb3cc78c2607e08862c5a4aa6c8f0bc1c (patch)
treea80602bfc3fdadcc90957c8ee6bd0653bb44f62d /includes/StringUtils.php
parentae414c95849d044395e32a01d27432b9b2fdd026 (diff)
downloadmediawikicore-ad5f1acdb3cc78c2607e08862c5a4aa6c8f0bc1c.tar.gz
mediawikicore-ad5f1acdb3cc78c2607e08862c5a4aa6c8f0bc1c.zip
Fix for bug #332 - all UTF-8 output is now cleaned of invalid forms as defined by RFC 3629. All output from MediaWiki should now be valid UTF-8 in all circumstances.
Notes
Notes: http://mediawiki.org/wiki/Special:Code/MediaWiki/40837
Diffstat (limited to 'includes/StringUtils.php')
-rw-r--r--includes/StringUtils.php80
1 files changed, 80 insertions, 0 deletions
diff --git a/includes/StringUtils.php b/includes/StringUtils.php
index c437b3c19e48..1e3489024ad7 100644
--- a/includes/StringUtils.php
+++ b/includes/StringUtils.php
@@ -179,6 +179,86 @@ class StringUtils {
return new ArrayIterator( explode( $separator, $subject ) );
}
}
+
+ /**
+  * Clean characters that are invalid in the given character set
+  * from a given string.
+  *
+  * Only UTF-8 is actually cleaned (by delegating to cleanUtf8());
+  * for any other character set the string is returned unchanged.
+  * The character set name is matched case-insensitively, so
+  * 'utf-8' and 'UTF-8' are treated alike.
+  *
+  * @param $string \type{$string} String to clean
+  * @param $charset \type{$string} Character set (if unspecified, assume $wgOutputEncoding)
+  * @return \type{$string} Cleaned string
+  */
+ public static function cleanForCharset( $string, $charset='' ) {
+     global $wgOutputEncoding;
+
+     # Fall back to the configured output encoding when no charset is
+     # given, and fold case so that e.g. 'utf-8' is not silently skipped.
+     $charset = strtoupper( $charset ? $charset : $wgOutputEncoding );
+
+     switch ( $charset ) {
+         # UTF-8 should be all we need to worry about. :)
+         case 'UTF-8':
+             return self::cleanUtf8( $string );
+         default:
+             return $string;
+     }
+ }
+
+ /**
+  * Clean invalid UTF-8 characters and sequences from a given string,
+  * replacing them with U+FFFD.
+  * Should be RFC 3629 compliant.
+  *
+  * Each malformed sequence is replaced by a single U+FFFD. The
+  * following are treated as malformed:
+  *  - a stray continuation byte (0x80-0xBF) or the bytes 0xFE / 0xFF;
+  *  - a lead byte followed by too few continuation bytes, including
+  *    a sequence cut short by the end of the string;
+  *  - overlong encodings;
+  *  - 5- and 6-byte forms and anything above U+10FFFF;
+  *  - surrogate code points U+D800..U+DFFF;
+  *  - the noncharacters U+FFFE and U+FFFF.
+  *
+  * @param $str \type{$string} String to clean
+  * @return \type{$string} Cleaned string
+  */
+ private static function cleanUtf8( $str ) {
+     $replacement = "\xEF\xBF\xBD"; # U+FFFD encoded as UTF-8
+
+     # Smallest code point that legitimately needs N continuation
+     # bytes; anything lower is an overlong encoding.
+     $minCodePoint = array( 1 => 0x80, 2 => 0x800, 3 => 0x10000 );
+
+     $len = strlen( $str );
+     $out = '';   # cleaned output built so far
+     $copied = 0; # offset of the first input byte not yet copied to $out
+     $i = 0;
+
+     while ( $i < $len ) {
+         $ch = ord( $str[$i] );
+         if ( $ch < 0x80 ) {
+             # Plain ASCII is always valid
+             $i++;
+             continue;
+         }
+
+         # Number of continuation bytes announced by the lead byte;
+         # 0 means this byte cannot start a sequence at all.
+         if ( ( $ch & 0xE0 ) == 0xC0 ) {
+             $need = 1;
+         } else if ( ( $ch & 0xF0 ) == 0xE0 ) {
+             $need = 2;
+         } else if ( ( $ch & 0xF8 ) == 0xF0 ) {
+             $need = 3;
+         } else if ( ( $ch & 0xFC ) == 0xF8 ) {
+             $need = 4;
+         } else if ( ( $ch & 0xFE ) == 0xFC ) {
+             $need = 5;
+         } else {
+             $need = 0;
+         }
+
+         $start = $i++;
+         $valid = false;
+
+         if ( $need ) {
+             # Payload bits of the lead byte
+             $sum = $ch & ( 0xFF >> ( $need + 2 ) );
+
+             # Consume continuation bytes, stopping early at the end of
+             # the string or at the first byte that is not 10xxxxxx.
+             # That byte is left unconsumed so it is examined afresh.
+             $got = 0;
+             while ( $got < $need && $i < $len
+                 && ( ord( $str[$i] ) & 0xC0 ) == 0x80 )
+             {
+                 $sum = ( $sum << 6 ) | ( ord( $str[$i] ) & 0x3F );
+                 $i++;
+                 $got++;
+             }
+
+             $valid = $got == $need
+                 && $need <= 3                             # no 5/6-byte forms
+                 && $sum >= $minCodePoint[$need]           # not overlong
+                 && $sum <= 0x10FFFF                       # within Unicode
+                 && !( $sum >= 0xD800 && $sum <= 0xDFFF )  # not a surrogate
+                 && $sum != 0xFFFE && $sum != 0xFFFF;      # not a noncharacter
+         }
+
+         if ( !$valid ) {
+             # Flush the valid bytes preceding the bad sequence, then
+             # emit one replacement character for the whole sequence.
+             $out .= substr( $str, $copied, $start - $copied ) . $replacement;
+             $copied = $i;
+         }
+     }
+
+     if ( $copied == 0 ) {
+         # Nothing was replaced
+         return $str;
+     }
+     return $out . substr( $str, $copied );
+ }
}
/**