aboutsummaryrefslogtreecommitdiffstats
path: root/maintenance
diff options
context:
space:
mode:
authorBrion Vibber <brion@users.mediawiki.org>2004-09-27 06:23:33 +0000
committerBrion Vibber <brion@users.mediawiki.org>2004-09-27 06:23:33 +0000
commitb2411deec4b1732e73754f69eeaf19c0bd7c78f7 (patch)
tree8d62dd74f87a952526128bf2c19aca9706a9942d /maintenance
parent75e8383ce16fdcfabd934936cba45d961aa74d40 (diff)
downloadmediawikicore-b2411deec4b1732e73754f69eeaf19c0bd7c78f7.tar.gz
mediawikicore-b2411deec4b1732e73754f69eeaf19c0bd7c78f7.zip
Far from finished in-place UTF-8 wiki converter.
I had this code lying around for a while and wanted to check it in before I forget about it.
Notes
Notes: http://mediawiki.org/wiki/Special:Code/MediaWiki/5568
Diffstat (limited to 'maintenance')
-rw-r--r--maintenance/convertUtf8.php199
1 files changed, 199 insertions, 0 deletions
diff --git a/maintenance/convertUtf8.php b/maintenance/convertUtf8.php
new file mode 100644
index 000000000000..3281d6b6ff6f
--- /dev/null
+++ b/maintenance/convertUtf8.php
@@ -0,0 +1,199 @@
+<?php
+
+die("This file is not complete; it's checked in so I don't forget it.");
+
+/*
+UTF-8 conversion of DOOOOOOOM
+
+1. Lock the wiki
+2. Make a convertlist of all pages
+3. Enable CONVERTLOCK mode and switch to UTF-8
+4. As quick as possible, convert the cur, images, *links, user, etc tables. Clear cache tables.
+5. Unlock the wiki. Attempts to access pages on the convertlist will be trapped to read-only.
+6. Go through the list, fixing up old revisions. Remove pages from the convertlist.
+*/
+
+
+if(function_exists("iconv")) {
+ # There are likely to be Windows code page 1252 chars in there.
+ # Convert them to the proper UTF-8 chars if possible.
+ function toUtf8($string) {
+ return wfStrencode(iconv("CP1252", "UTF-8", $string));
+ }
+} else {
+ # Will work from plain iso 8859-1 and may corrupt these chars
+ function toUtf8($string) {
+ return wfStrencode(utf8_encode($string));
+ }
+}
+
+
+
+# user table
+$sql = "SELECT user_id,user_name,user_real_name,user_options FROM user";
+$res = wfQuery( $sql, DB_WRITE );
+print "Converting " . wfNumResults( $res ) . " user accounts:\n";
+$n = 0;
+while( $s = wfFetchObject( $res ) ) {
+ $uname = toUtf8( $s->user_name );
+ $ureal = toUtf8( $s->user_real_name );
+ $uoptions = toUtf8( $s->user_options );
+ if( $uname != wfStrencode( $s->user_name ) ||
+ $ureal != wfStrencode( $s->user_real_name ) ||
+ $uoptions != wfStrencode( $s->user_options ) ) {
+ $now = wfTimestampNow();
+ $sql = "UPDATE user
+ SET user_name='$uname',user_real_name='$ureal',
+ user_options='$uoptions',user_touched='$now'
+ WHERE user_id={$s->user_id}";
+ wfQuery( $sql, DB_WRITE );
+ $wgMemc->delete( "$wgDBname:user:id:{$s->user_id}" );
+ $u++;
+ }
+ if( ++$n % 100 == 0 ) print "$n\n";
+}
+wfFreeResult( $res );
+if( $n ) {
+ printf("%2.02%% required conversion.\n\n", $u / $n);
+} else {
+ print "None?\n\n";
+}
+
+# ipblocks
+$sql = "SELECT DISTINCT ipb_reason FROM ipblocks";
+$res = wfQuery( $sql, DB_WRITE );
+print "Converting " . wfNumResults( $res ) . " IP block comments:\n";
+$n = 0;
+while( $s = wfFetchObject( $res ) ) {
+ $ucomment = toUtf8($s->ipb_reason);
+ $ocomment = wfStrencode( $s->ipb_reason );
+ if( $u != $o ) {
+ $sql = "UPDATE ipblocks SET ipb_reason='$ucomment' WHERE ipb_reason='$ocomment'";
+ wfQuery( $sql, DB_WRITE );
+ $u++;
+ }
+ if( ++$n % 100 == 0 ) print "$n\n";
+}
+wfFreeResult( $res );
+if( $n ) {
+ printf("%2.02%% required conversion.\n\n", $u / $n);
+} else {
+ print "None?\n\n";
+}
+
+# image
+$sql = "SELECT img_name,img_description,img_user_text FROM image";
+ img_name --> also need to rename files
+ img_description
+ img_user_text
+
+oldimage
+ oi_name
+ oi_archive_name --> also need to rename files
+ oi_user_text
+
+recentchanges
+ rc_user_text
+ rc_title
+ rc_comment
+
+# searchindex
+print "Clearing searchindex... don't forget to rebuild it.\n";
+$sql = "DELETE FROM searchindex";
+wfQuery( $sql, DB_WRITE );
+
+# linkscc
+print "Clearing linkscc...\n";
+$sql = "DELETE FROM linkscc";
+wfQuery( $sql, DB_WRITE );
+
+# querycache: just rebuild these
+print "Clearing querycache...\n";
+$sql = "DELETE FROM querycache";
+wfQuery( $sql, DB_WRITE );
+
+# objectcache
+print "Clearing objectcache...\n";
+$sql = "DELETE FROM objectcache";
+wfQuery( $sql, DB_WRITE );
+
+
+function unicodeLinks( $table, $field ) {
+ $sql = "SELECT DISTINCT $field FROM $table WHERE $field RLIKE '[\x80-\xff]'";
+ $res = wfQuery( $sql, DB_WRITE );
+ print "Converting " . wfNumResults( $res ) . " from $table:\n";
+ $n = 0;
+ while( $s = wfFetchObject( $res ) ) {
+ $ulink = toUtf8( $s->$field );
+ $olink = wfStrencode( $s->$field );
+ $sql = "UPDATE $table SET $field='$ulink' WHERE $field='$olink'";
+ wfQuery( $sql, DB_WRITE );
+ if( ++$n % 100 == 0 ) print "$n\n";
+ }
+ wfFreeResult( $res );
+ print "Done.\n\n";
+}
+unicodeLinks( "brokenlinks", "bl_to" );
+unicodeLinks( "imagelinks", "il_to" );
+unicodeLinks( "categorylinks", "cl_to" );
+
+
+# The big guys...
+$sql = "SELECT cur_id,cur_namespace,cur_title,cur_text,cur_user_text FROM cur
+WHERE cur_title rlike '[\x80-\xff]' OR cur_comment rlike '[\x80-\xff]'
+OR cur_user_text rlike '[\x80-\xff]' OR cur_text rlike '[\x80-\xff]'";
+$res = wfQuery( $sql, DB_WRITE );
+print "Converting " . wfNumResults( $res ) . " cur pages:\n";
+$n = 0;
+while( $s = wfFetchObject( $res ) ) {
+ $utitle = toUtf8( $s->cur_title );
+ $uuser = toUtf8( $s->cur_user_text );
+ $ucomment = toUtf8( $s->cur_comment );
+ $utext = toUtf8( $s->cur_text );
+ $now = wfTimestampNow();
+
+ $sql = "UPDATE cur
+ SET cur_title='$utitle',cur_user_text='$uuser',
+ cur_comment='$ucomment',cur_text='$utext'
+ WHERE cur_id={$s->cur_id}";
+ wfQuery( $sql, DB_WRITE );
+ #$wgMemc->delete( "$wgDBname:user:id:{$s->user_id}" );
+
+ $otitle = wfStrencode( $s->cur_title );
+ if( $otitle != $utitle ) {
+ # Also update titles in watchlist and old
+ $sql = "UPDATE old SET old_title='$utitle'
+ WHERE old_namespace={$s->cur_namespace} AND old_title='$otitle'";
+ wfQuery( $sql, DB_WRITE );
+
+ $ns = IntVal( $s->cur_namespace) & ~1;
+ $sql = "UPDATE watchlist SET wl_title='$utitle'
+ WHERE wl_namespace=$ns AND wl_title='$otitle'";
+ wfQuery( $sql, DB_WRITE );
+ $u++;
+ }
+
+ if( ++$n % 100 == 0 ) print "$n\n";
+}
+wfFreeResult( $res );
+if( $n ) {
+ printf("Updated old/watchlist titles on %2.02%%.\n\n", $u / $n);
+} else {
+ print "Didn't update any old/watchlist titles.\n\n";
+}
+
+/*
+old
+ old_title
+ old_text -> may be gzipped
+ old_comment
+ old_user_text
+
+archive
+ ar_title
+ ar_text -> may be gzipped
+ ar_comment
+ ar_user_text
+*/
+
+?> \ No newline at end of file