aboutsummaryrefslogtreecommitdiffstats
path: root/maintenance/cleanupTitles.php
blob: 3992cb73444164e8b6bb031ffaf023044e3608c8 (plain) (blame)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
<?php
/**
 * Clean up broken, unparseable titles.
 *
 * Copyright © 2005 Brooke Vibber <bvibber@wikimedia.org>
 * https://www.mediawiki.org/
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License along
 * with this program; if not, write to the Free Software Foundation, Inc.,
 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
 * http://www.gnu.org/copyleft/gpl.html
 *
 * @file
 * @author Brooke Vibber <bvibber@wikimedia.org>
 * @ingroup Maintenance
 */

use MediaWiki\Title\Title;
use Wikimedia\Rdbms\IDBAccessObject;

// @codeCoverageIgnoreStart
require_once __DIR__ . '/TableCleanup.php';
// @codeCoverageIgnoreEnd

/**
 * Maintenance script to clean up broken, unparseable titles.
 *
 * @ingroup Maintenance
 */
class TitleCleanup extends TableCleanup {

	/**
	 * Text prepended to the title of every page that has to be moved out of the way.
	 * Assigned in execute() from the 'prefix' option (default 'Broken') with a
	 * trailing "/" appended.
	 */
	private string $prefix;

	public function __construct() {
		parent::__construct();

		$prefixHelp = "Broken pages will be renamed to titles with  "
			. "<prefix> prepended before the article name. Defaults to 'Broken'";

		$this->addDescription( 'Script to clean up broken, unparseable titles' );
		// NOTE(review): the two trailing booleans are presumably "required" = false and
		// "takes a value" = true, per Maintenance::addOption() — confirm against the parent class.
		$this->addOption( 'prefix', $prefixHelp, false, true );
		$this->setBatchSize( 1000 );
	}

	/**
	 * Resolve and validate the rename prefix, then hand over to the parent's execute().
	 *
	 * @inheritDoc
	 */
	public function execute() {
		// The trailing slash ensures that appending a page name to the prefix can never
		// turn the prefix into a namespace or interwiki prefix.
		$this->prefix = $this->getOption( 'prefix', 'Broken' ) . "/";

		// Validate the prefix once, up front, rather than spewing an error for every page
		// being cleaned up. The move code assumes that prefix + title stays in NS0.
		$prefixTitle = Title::newFromText( $this->prefix );
		$isUsablePrefix = $prefixTitle
			&& $prefixTitle->canExist()
			&& !$prefixTitle->getInterwiki()
			&& $prefixTitle->getNamespace() === 0;

		if ( !$isUsablePrefix ) {
			$this->fatalError( "Invalid prefix {$this->prefix}. Must be a valid mainspace title." );
		}

		parent::execute();
	}

	/**
	 * Check one page row and, if its stored title does not survive a round trip
	 * through the title parser, dispatch it to the appropriate move routine.
	 *
	 * Reads page_id, page_namespace and page_title from the row.
	 * NOTE(review): progress() is defined outside this class; the argument is presumably
	 * the number of rows changed (0 = untouched, 1 = handled) — confirm in TableCleanup.
	 *
	 * @param stdClass $row
	 */
	protected function processRow( $row ) {
		$display = Title::makeName( $row->page_namespace, $row->page_title );
		$contentLanguage = $this->getServiceContainer()->getContentLanguage();
		$title = Title::newFromText( $contentLanguage->normalize( $display ) );

		// A healthy row parses back to exactly the namespace and DB key it is stored under
		$roundTrips = $title !== null
			&& $title->canExist()
			&& $title->getNamespace() == $row->page_namespace
			&& $title->getDBkey() === $row->page_title;
		if ( $roundTrips ) {
			$this->progress( 0 );

			return;
		}

		// Broken file pages that have an image record are left to cleanupImages.php
		if ( $row->page_namespace == NS_FILE && $this->fileExists( $row->page_title ) ) {
			$this->output( "file $row->page_title needs cleanup, please run cleanupImages.php.\n" );
			$this->progress( 0 );

			return;
		}

		if ( $title === null ) {
			// The stored title cannot be parsed at all
			$this->output( "page $row->page_id ($display) is illegal.\n" );
			$this->moveIllegalPage( $row );
		} else {
			// The stored title parses, but to something other than what is stored
			$this->output( "page $row->page_id ($display) doesn't match self.\n" );
			$this->moveInconsistentPage( $row, $title );
		}
		$this->progress( 1 );
	}

	/**
	 * Check whether the image table has a record with the given name.
	 *
	 * XXX: Doesn't actually check for file existence, just presence of image record.
	 * This is reasonable, since cleanupImages.php only iterates over the image table.
	 *
	 * @param string $name Value matched against image.img_name
	 * @return bool True if a matching image row was found
	 */
	protected function fileExists( $name ) {
		// Only existence matters, so select a constant instead of fetching every column.
		// fetchField() yields false when no row matches.
		$found = $this->getReplicaDB()->newSelectQueryBuilder()
			->select( '1' )
			->from( 'image' )
			->where( [ 'img_name' => $name ] )
			->caller( __METHOD__ )
			->fetchField();

		return $found !== false;
	}

	/**
	 * Rename a page whose stored title cannot be parsed into a Title at all.
	 *
	 * Destinations are tried in this order:
	 *  1. For rows in namespace 1 whose legalized title parses as a non-talk page: that
	 *     page's talk page, if it is defined and does not exist yet
	 *     ("Talk:Project:Foo" -> "Project talk:Foo").
	 *  2. "<prefix><legalized title>", with the canonical namespace name (or "NS<id>")
	 *     inserted for rows outside the main namespace.
	 *  3. The same, after hex-escaping everything outside [A-Za-z0-9_:-].
	 *  4. "<prefix>id:<page_id>" when nothing above is valid or the result is taken.
	 *
	 * If even the last resort is invalid or taken, an error is reported and the row is
	 * left untouched. In dry-run mode the rename is only reported.
	 *
	 * @param stdClass $row Page row; page_id, page_namespace and page_title are read
	 */
	protected function moveIllegalPage( $row ) {
		// Last-resort destination, shared by both fallback paths below
		$clean = $this->prefix . 'id:' . $row->page_id;

		// Escape every character that is not legal in a title.
		// hexChar() is not defined in this class (presumably inherited from TableCleanup).
		$legalChars = Title::legalChars();
		$legalizedUnprefixed = preg_replace_callback( "/([^$legalChars])/",
			[ $this, 'hexChar' ],
			$row->page_title );
		if ( $legalizedUnprefixed == '.' ) {
			$legalizedUnprefixed = '(dot)';
		}
		if ( $legalizedUnprefixed == '_' ) {
			$legalizedUnprefixed = '(space)';
		}
		$ns = (int)$row->page_namespace;

		$title = null;
		// Try to move "Talk:Project:Foo" -> "Project talk:Foo"
		if ( $ns === 1 ) {
			$subjectTitle = Title::newFromText( $legalizedUnprefixed );
			if ( $subjectTitle && !$subjectTitle->isTalkPage() ) {
				$talkTitle = $subjectTitle->getTalkPageIfDefined();
				// Read from the primary like every other existence check here: an earlier
				// row in this run may already have been renamed to this title.
				if ( $talkTitle !== null && !$talkTitle->exists( IDBAccessObject::READ_LATEST ) ) {
					$title = $talkTitle;
				}
			}
		}

		if ( $title === null ) {
			// Not a talk page or that didn't work
			// move any other broken pages to the main namespace so they can be found together
			if ( $ns !== 0 ) {
				$namespaceInfo = $this->getServiceContainer()->getNamespaceInfo();
				$namespaceName = $namespaceInfo->getCanonicalName( $ns );
				if ( $namespaceName === false ) {
					$namespaceName = "NS$ns"; // Fallback for unknown namespaces
				}
				$legalizedUnprefixed = "$namespaceName:$legalizedUnprefixed";
			}
			$title = Title::newFromText( $this->prefix . $legalizedUnprefixed );
		}

		if ( $title === null ) {
			// It's still not a valid title, try again with a much smaller
			// allowed character set. This will mangle any titles with non-ASCII
			// characters, but if we don't do this the result will be
			// falling back to the Broken/id:foo failsafe below which is worse
			$legalizedUnprefixed = preg_replace_callback( '!([^A-Za-z0-9_:\\-])!',
				[ $this, 'hexChar' ],
				$legalizedUnprefixed
			);
			$title = Title::newFromText( $this->prefix . $legalizedUnprefixed );
		}

		if ( $title === null ) {
			// Oh well, we tried
			$legalized = $this->prefix . $legalizedUnprefixed;
			$this->output( "Couldn't legalize; form '$legalized' still invalid; using '$clean'\n" );
			$title = Title::newFromText( $clean );
		} elseif ( $title->exists( IDBAccessObject::READ_LATEST ) ) {
			$conflict = $title->getDBkey();
			$this->output( "Legalized for '$conflict' exists; using '$clean'\n" );
			$title = Title::newFromText( $clean );
		}

		if ( !$title || $title->exists( IDBAccessObject::READ_LATEST ) ) {
			// This can happen in corner cases like if numbers are made not valid
			// title characters using the (deprecated) $wgLegalTitleChars or
			// a 'Broken/id:foo' title already exists
			// $title may be null here, so never dereference it unconditionally.
			$attempted = $title ? $title->getText() : $clean;
			$this->error( "Destination page $attempted is invalid or already exists, skipping." );
			return;
		}

		// Take both namespace and DB key from the final title so they always agree, even
		// when the talk-page destination was abandoned for a mainspace fallback.
		$ns = $title->getNamespace();
		$dest = $title->getDBkey();
		if ( $this->dryrun ) {
			$this->output( "DRY RUN: would rename $row->page_id ($row->page_namespace," .
				"'$row->page_title') to ($ns,'$dest')\n" );
		} else {
			$this->output( "renaming $row->page_id ($row->page_namespace," .
				"'$row->page_title') to ($ns,'$dest')\n" );
			$this->getPrimaryDB()
				->newUpdateQueryBuilder()
				->update( 'page' )
				->set( [ 'page_title' => $dest, 'page_namespace' => $ns ] )
				->where( [ 'page_id' => $row->page_id ] )
				->caller( __METHOD__ )->execute();
			// Same as moveInconsistentPage(): drop cached link data now that the
			// page table was changed behind its back.
			$this->getServiceContainer()->getLinkCache()->clear();
		}
	}

	/**
	 * Rename a page whose stored (namespace, title) parses to a valid Title that differs
	 * from what is stored.
	 *
	 * If the parsed title can exist, has no interwiki prefix and is not taken, the page is
	 * simply moved to it. Otherwise a replacement under the configured prefix is built in
	 * the main namespace, with "<prefix>id: <page_id>" as the last resort. If that last
	 * resort is invalid or taken too, an error is reported and the row is left untouched.
	 * In dry-run mode the rename is only reported.
	 *
	 * @param stdClass $row Page row; page_id, page_namespace and page_title are read
	 * @param Title $title Title obtained by parsing the row's normalized display name
	 */
	protected function moveInconsistentPage( $row, Title $title ) {
		// "Impossible" = the parsed title is an interwiki link or cannot exist as a page
		$titleImpossible = $title->getInterwiki() || !$title->canExist();
		// A replacement destination is only needed when the parsed title is taken or impossible
		if ( $title->exists( IDBAccessObject::READ_LATEST ) || $titleImpossible ) {
			// For impossible titles keep the namespace prefix as part of the text
			if ( $titleImpossible ) {
				$prior = $title->getPrefixedDBkey();
			} else {
				$prior = $title->getDBkey();
			}

			$ns = (int)$row->page_namespace;
			# If a page is saved in the main namespace with a namespace prefix then try to move it into
			# that namespace. If there's no conflict then it will succeed. Otherwise it will hit the condition
			# } else if ($ns !== 0) { and be moved to Broken/Namespace:Title
			# whereas without this check it would just go to Broken/Title
			if ( $ns === 0 ) {
				$ns = $title->getNamespace();
			}

			# Old cleanupTitles could move articles there. See T25147.
			# or a page could be stored as (0, "Special:Foo") in which case the $titleImpossible
			# condition would be true and we've already added a prefix so pretend we're in mainspace
			# and don't add another
			if ( $ns < 0 ) {
				$ns = 0;
			}

			# Namespace which no longer exists. Put the page in the main namespace
			# since we don't have any idea of the old namespace name. See T70501.
			# We build the new title ourself rather than relying on getDBKey() because
			# that will return Special:BadTitle
			$namespaceInfo = $this->getServiceContainer()->getNamespaceInfo();
			if ( !$namespaceInfo->exists( $ns ) ) {
				$clean = "{$this->prefix}NS$ns:$row->page_title";
				$ns = 0;
			} elseif ( !$titleImpossible && !$title->exists( IDBAccessObject::READ_LATEST ) ) {
				// NOTE(review): the enclosing if already requires "exists || impossible", so this
				// branch can only be taken if this second existence check disagrees with the
				// first one — looks effectively unreachable; confirm.
				// Looks like the current title, after cleaning it up, is valid and available
				$clean = $prior;
			} elseif ( $ns !== 0 ) {
				// Put all broken pages in the main namespace so that they can be found via Special:PrefixIndex
				$nsName = $namespaceInfo->getCanonicalName( $ns );
				$clean = "{$this->prefix}$nsName:{$prior}";
				$ns = 0;
			} else {
				$clean = $this->prefix . $prior;
			}
			$verified = Title::makeTitleSafe( $ns, $clean );
			if ( !$verified || $verified->exists( IDBAccessObject::READ_LATEST ) ) {
				// NOTE(review): there is a space after "id:" here, whereas moveIllegalPage()
				// builds its last resort as 'id:' . page_id without one — confirm this is intended.
				$lastResort = "{$this->prefix}id: {$row->page_id}";
				$this->output( "Couldn't legalize; form '$clean' exists; using '$lastResort'\n" );
				$verified = Title::makeTitleSafe( $ns, $lastResort );
				if ( !$verified || $verified->exists( IDBAccessObject::READ_LATEST ) ) {
					// This can happen in corner cases like if numbers are made not valid
					// title characters using the (deprecated) $wgLegalTitleChars or
					// a 'Broken/id:foo' title already exists
					$this->error( "Destination page $lastResort invalid or already exists." );
					return;
				}
			}
			$title = $verified;
		}

		// Namespace and DB key are both taken from the final destination title
		$ns = $title->getNamespace();
		$dest = $title->getDBkey();

		if ( $this->dryrun ) {
			$this->output( "DRY RUN: would rename $row->page_id ($row->page_namespace," .
				"'$row->page_title') to ($ns,'$dest')\n" );
		} else {
			$this->output( "renaming $row->page_id ($row->page_namespace," .
				"'$row->page_title') to ($ns,'$dest')\n" );
			$this->getPrimaryDB()
				->newUpdateQueryBuilder()
				->update( 'page' )
				->set( [
					'page_namespace' => $ns,
					'page_title' => $dest
				] )
				->where( [ 'page_id' => $row->page_id ] )
				->caller( __METHOD__ )->execute();
			// The page table was updated directly, so drop everything held in the link cache
			$this->getServiceContainer()->getLinkCache()->clear();
		}
	}
}

// @codeCoverageIgnoreStart
// Name the maintenance class implemented by this file.
// NOTE(review): RUN_MAINTENANCE_IF_MAIN is defined outside this file; presumably it points at
// the bootstrap that runs $maintClass when this script is the entry point — confirm.
$maintClass = TitleCleanup::class;
require_once RUN_MAINTENANCE_IF_MAIN;
// @codeCoverageIgnoreEnd