* * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License along * with this program; if not, write to the Free Software Foundation, Inc., * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. * http://www.gnu.org/copyleft/gpl.html * * @file */ namespace MediaWiki\Specials; use MediaWiki\Export\WikiExporterFactory; use MediaWiki\HTMLForm\Field\HTMLTextAreaField; use MediaWiki\HTMLForm\HTMLForm; use MediaWiki\Linker\LinksMigration; use MediaWiki\Logger\LoggerFactory; use MediaWiki\MainConfigNames; use MediaWiki\Page\PageIdentity; use MediaWiki\SpecialPage\SpecialPage; use MediaWiki\Title\Title; use MediaWiki\Title\TitleFormatter; use WikiExporter; use Wikimedia\Rdbms\IConnectionProvider; use Wikimedia\Rdbms\SelectQueryBuilder; /** * A special page that allows users to export pages in a XML file * * @ingroup SpecialPage * @ingroup Dump */ class SpecialExport extends SpecialPage { protected bool $curonly; protected bool $doExport; protected int $pageLinkDepth; protected bool $templates; private IConnectionProvider $dbProvider; private WikiExporterFactory $wikiExporterFactory; private TitleFormatter $titleFormatter; private LinksMigration $linksMigration; /** * @param IConnectionProvider $dbProvider * @param WikiExporterFactory $wikiExporterFactory * @param TitleFormatter $titleFormatter * @param LinksMigration $linksMigration */ public function __construct( IConnectionProvider $dbProvider, WikiExporterFactory $wikiExporterFactory, TitleFormatter $titleFormatter, LinksMigration $linksMigration ) { parent::__construct( 'Export' ); $this->dbProvider = $dbProvider; $this->wikiExporterFactory = $wikiExporterFactory; $this->titleFormatter = $titleFormatter; $this->linksMigration = $linksMigration; } public function execute( $par ) { $this->setHeaders(); $this->outputHeader(); $config = $this->getConfig(); $this->curonly = true; $this->doExport = false; $request = $this->getRequest(); $this->templates = $request->getCheck( 'templates' ); $this->pageLinkDepth = $this->validateLinkDepth( $request->getIntOrNull( 'pagelink-depth' ) ); $nsindex = ''; $exportall = false; if ( $request->getCheck( 'addcat' ) ) { $page = $request->getText( 'pages' ); $catname = $request->getText( 'catname' ); if ( $catname !== '' && $catname !== null && $catname !== false ) { $t = Title::makeTitleSafe( NS_MAIN, $catname ); if ( $t ) { /** * @todo FIXME: This can lead to hitting memory limit for very large * categories. Ideally we would do the lookup synchronously * during the export in a single query. */ $catpages = $this->getPagesFromCategory( $t ); if ( $catpages ) { if ( $page !== '' ) { $page .= "\n"; } $page .= implode( "\n", $catpages ); } } } } elseif ( $request->getCheck( 'addns' ) && $config->get( MainConfigNames::ExportFromNamespaces ) ) { $page = $request->getText( 'pages' ); $nsindex = $request->getText( 'nsindex', '' ); if ( strval( $nsindex ) !== '' ) { /** * Same implementation as above, so same @todo */ $nspages = $this->getPagesFromNamespace( (int)$nsindex ); if ( $nspages ) { $page .= "\n" . implode( "\n", $nspages ); } } } elseif ( $request->getCheck( 'exportall' ) && $config->get( MainConfigNames::ExportAllowAll ) ) { $this->doExport = true; $exportall = true; /* Although $page and $history are not used later on, we nevertheless set them to avoid that PHP notices about using undefined variables foul up our XML output (see call to doExport(...) further down) */ $page = ''; $history = ''; } elseif ( $request->wasPosted() && $par == '' ) { // Log to see if certain parameters are actually used. // If not, we could deprecate them and do some cleanup, here and in WikiExporter. LoggerFactory::getInstance( 'export' )->debug( 'Special:Export POST, dir: [{dir}], offset: [{offset}], limit: [{limit}]', [ 'dir' => $request->getRawVal( 'dir' ), 'offset' => $request->getRawVal( 'offset' ), 'limit' => $request->getRawVal( 'limit' ), ] ); $page = $request->getText( 'pages' ); $this->curonly = $request->getCheck( 'curonly' ); $rawOffset = $request->getVal( 'offset' ); if ( $rawOffset ) { $offset = wfTimestamp( TS_MW, $rawOffset ); } else { $offset = null; } $maxHistory = $config->get( MainConfigNames::ExportMaxHistory ); $limit = $request->getInt( 'limit' ); $dir = $request->getVal( 'dir' ); $history = [ 'dir' => 'asc', 'offset' => false, 'limit' => $maxHistory, ]; $historyCheck = $request->getCheck( 'history' ); if ( $this->curonly ) { $history = WikiExporter::CURRENT; } elseif ( !$historyCheck ) { if ( $limit > 0 && ( $maxHistory == 0 || $limit < $maxHistory ) ) { $history['limit'] = $limit; } if ( $offset !== null ) { $history['offset'] = $offset; } if ( strtolower( $dir ?? '' ) == 'desc' ) { $history['dir'] = 'desc'; } } if ( $page != '' ) { $this->doExport = true; } } else { // Default to current-only for GET requests. $page = $request->getText( 'pages', $par ?? '' ); $historyCheck = $request->getCheck( 'history' ); if ( $historyCheck ) { $history = WikiExporter::FULL; } else { $history = WikiExporter::CURRENT; } if ( $page != '' ) { $this->doExport = true; } } if ( !$config->get( MainConfigNames::ExportAllowHistory ) ) { // Override $history = WikiExporter::CURRENT; } $list_authors = $request->getCheck( 'listauthors' ); if ( !$this->curonly || !$config->get( MainConfigNames::ExportAllowListContributors ) ) { $list_authors = false; } if ( $this->doExport ) { $this->getOutput()->disable(); // Cancel output buffering and gzipping if set // This should provide safer streaming for pages with history wfResetOutputBuffers(); $request->response()->header( 'Content-type: application/xml; charset=utf-8' ); $request->response()->header( 'X-Robots-Tag: noindex,nofollow' ); if ( $request->getCheck( 'wpDownload' ) ) { // Provide a sensible filename suggestion $filename = urlencode( $config->get( MainConfigNames::Sitename ) . '-' . wfTimestampNow() . '.xml' ); $request->response()->header( "Content-disposition: attachment;filename={$filename}" ); } // @phan-suppress-next-next-line PhanPossiblyUndeclaredVariable // @phan-suppress-next-line PhanTypeMismatchArgumentNullable history is set when used $this->doExport( $page, $history, $list_authors, $exportall ); return; } $out = $this->getOutput(); $out->addWikiMsg( 'exporttext' ); if ( $page == '' ) { $categoryName = $request->getText( 'catname' ); } else { $categoryName = ''; } $canExportAll = $config->get( MainConfigNames::ExportAllowAll ); $hideIf = $canExportAll ? [ 'hide-if' => [ '===', 'exportall', '1' ] ] : []; $formDescriptor = [ 'catname' => [ 'type' => 'textwithbutton', 'name' => 'catname', 'horizontal-label' => true, 'label-message' => 'export-addcattext', 'default' => $categoryName, 'size' => 40, 'buttontype' => 'submit', 'buttonname' => 'addcat', 'buttondefault' => $this->msg( 'export-addcat' )->text(), ] + $hideIf, ]; if ( $config->get( MainConfigNames::ExportFromNamespaces ) ) { $formDescriptor += [ 'nsindex' => [ 'type' => 'namespaceselectwithbutton', 'default' => $nsindex, 'label-message' => 'export-addnstext', 'horizontal-label' => true, 'name' => 'nsindex', 'id' => 'namespace', 'cssclass' => 'namespaceselector', 'buttontype' => 'submit', 'buttonname' => 'addns', 'buttondefault' => $this->msg( 'export-addns' )->text(), ] + $hideIf, ]; } if ( $canExportAll ) { $formDescriptor += [ 'exportall' => [ 'type' => 'check', 'label-message' => 'exportall', 'name' => 'exportall', 'id' => 'exportall', 'default' => $request->wasPosted() && $request->getCheck( 'exportall' ), ], ]; } $formDescriptor += [ 'textarea' => [ 'class' => HTMLTextAreaField::class, 'name' => 'pages', 'label-message' => 'export-manual', 'nodata' => true, 'rows' => 10, 'default' => $page, ] + $hideIf, ]; if ( $config->get( MainConfigNames::ExportAllowHistory ) ) { $formDescriptor += [ 'curonly' => [ 'type' => 'check', 'label-message' => 'exportcuronly', 'name' => 'curonly', 'id' => 'curonly', 'default' => !$request->wasPosted() || $request->getCheck( 'curonly' ), ], ]; } else { $out->addWikiMsg( 'exportnohistory' ); } $formDescriptor += [ 'templates' => [ 'type' => 'check', 'label-message' => 'export-templates', 'name' => 'templates', 'id' => 'wpExportTemplates', 'default' => $request->wasPosted() && $request->getCheck( 'templates' ), ], ]; if ( $config->get( MainConfigNames::ExportMaxLinkDepth ) || $this->userCanOverrideExportDepth() ) { $formDescriptor += [ 'pagelink-depth' => [ 'type' => 'text', 'name' => 'pagelink-depth', 'id' => 'pagelink-depth', 'label-message' => 'export-pagelinks', 'default' => '0', 'size' => 20, ], ]; } $formDescriptor += [ 'wpDownload' => [ 'type' => 'check', 'name' => 'wpDownload', 'id' => 'wpDownload', 'default' => !$request->wasPosted() || $request->getCheck( 'wpDownload' ), 'label-message' => 'export-download', ], ]; if ( $config->get( MainConfigNames::ExportAllowListContributors ) ) { $formDescriptor += [ 'listauthors' => [ 'type' => 'check', 'label-message' => 'exportlistauthors', 'default' => $request->wasPosted() && $request->getCheck( 'listauthors' ), 'name' => 'listauthors', 'id' => 'listauthors', ], ]; } $htmlForm = HTMLForm::factory( 'ooui', $formDescriptor, $this->getContext() ); $htmlForm->setSubmitTextMsg( 'export-submit' ); $htmlForm->prepareForm()->displayForm( false ); $this->addHelpLink( 'Help:Export' ); } /** * @return bool */ protected function userCanOverrideExportDepth() { return $this->getAuthority()->isAllowed( 'override-export-depth' ); } /** * Do the actual page exporting * * @param string $page User input on what page(s) to export * @param int $history One of the WikiExporter history export constants * @param bool $list_authors Whether to add distinct author list (when * not returning full history) * @param bool $exportall Whether to export everything */ protected function doExport( $page, $history, $list_authors, $exportall ) { // If we are grabbing everything, enable full history and ignore the rest if ( $exportall ) { $history = WikiExporter::FULL; } else { $pageSet = []; // Inverted index of all pages to look up // Split up and normalize input foreach ( explode( "\n", $page ) as $pageName ) { $pageName = trim( $pageName ); $title = Title::newFromText( $pageName ); if ( $title && !$title->isExternal() && $title->getText() !== '' ) { // Only record each page once! $pageSet[$title->getPrefixedText()] = true; } } // Set of original pages to pass on to further manipulation... $inputPages = array_keys( $pageSet ); // Look up any linked pages if asked... if ( $this->templates ) { $pageSet = $this->getTemplates( $inputPages, $pageSet ); } $pageSet = $this->getExtraPages( $inputPages, $pageSet ); $linkDepth = $this->pageLinkDepth; if ( $linkDepth ) { $pageSet = $this->getPageLinks( $inputPages, $pageSet, $linkDepth ); } $pages = array_keys( $pageSet ); // Normalize titles to the same format and remove dupes, see T19374 foreach ( $pages as $k => $v ) { $pages[$k] = str_replace( ' ', '_', $v ); } $pages = array_unique( $pages ); } /* Ok, let's get to it... */ $db = $this->dbProvider->getReplicaDatabase(); $exporter = $this->wikiExporterFactory->getWikiExporter( $db, $history ); $exporter->list_authors = $list_authors; $exporter->openStream(); if ( $exportall ) { $exporter->allPages(); } else { // @phan-suppress-next-line PhanPossiblyUndeclaredVariable foreach ( $pages as $page ) { # T10824: Only export pages the user can read $title = Title::newFromText( $page ); if ( $title === null ) { // @todo Perhaps output an tag or something. continue; } if ( !$this->getAuthority()->authorizeRead( 'read', $title ) ) { // @todo Perhaps output an tag or something. continue; } $exporter->pageByTitle( $title ); } } $exporter->closeStream(); } /** * @param PageIdentity $page * @return string[] */ protected function getPagesFromCategory( PageIdentity $page ) { $maxPages = $this->getConfig()->get( MainConfigNames::ExportPagelistLimit ); $name = $page->getDBkey(); $dbr = $this->dbProvider->getReplicaDatabase(); $res = $dbr->newSelectQueryBuilder() ->select( [ 'page_namespace', 'page_title' ] ) ->from( 'page' ) ->join( 'categorylinks', null, 'cl_from=page_id' ) ->where( [ 'cl_to' => $name ] ) ->limit( $maxPages ) ->caller( __METHOD__ )->fetchResultSet(); $pages = []; foreach ( $res as $row ) { $pages[] = Title::makeName( $row->page_namespace, $row->page_title ); } return $pages; } /** * @param int $nsindex * @return string[] */ protected function getPagesFromNamespace( $nsindex ) { $maxPages = $this->getConfig()->get( MainConfigNames::ExportPagelistLimit ); $dbr = $this->dbProvider->getReplicaDatabase(); $res = $dbr->newSelectQueryBuilder() ->select( [ 'page_namespace', 'page_title' ] ) ->from( 'page' ) ->where( [ 'page_namespace' => $nsindex ] ) ->limit( $maxPages ) ->caller( __METHOD__ )->fetchResultSet(); $pages = []; foreach ( $res as $row ) { $pages[] = Title::makeName( $row->page_namespace, $row->page_title ); } return $pages; } /** * Expand a list of pages to include templates used in those pages. * @param array $inputPages List of titles to look up * @param array $pageSet Associative array indexed by titles for output * @return array Associative array index by titles */ protected function getTemplates( $inputPages, $pageSet ) { [ $nsField, $titleField ] = $this->linksMigration->getTitleFields( 'templatelinks' ); $queryInfo = $this->linksMigration->getQueryInfo( 'templatelinks' ); $dbr = $this->dbProvider->getReplicaDatabase(); $queryBuilder = $dbr->newSelectQueryBuilder() ->caller( __METHOD__ ) ->select( [ 'namespace' => $nsField, 'title' => $titleField ] ) ->from( 'page' ) ->join( 'templatelinks', null, 'page_id=tl_from' ) ->tables( array_diff( $queryInfo['tables'], [ 'templatelinks' ] ) ) ->joinConds( $queryInfo['joins'] ); return $this->getLinks( $inputPages, $pageSet, $queryBuilder ); } /** * Add extra pages to the list of pages to export. * @param string[] $inputPages List of page titles to export * @param bool[] $pageSet Initial associative array indexed by string page titles * @return bool[] Associative array indexed by string page titles including extra pages */ private function getExtraPages( $inputPages, $pageSet ) { $extraPages = []; $this->getHookRunner()->onSpecialExportGetExtraPages( $inputPages, $extraPages ); foreach ( $extraPages as $extraPage ) { $pageSet[$this->titleFormatter->getPrefixedText( $extraPage )] = true; } return $pageSet; } /** * Validate link depth setting, if available. * @param int|null $depth * @return int */ protected function validateLinkDepth( $depth ) { if ( $depth === null || $depth < 0 ) { return 0; } if ( !$this->userCanOverrideExportDepth() ) { $maxLinkDepth = $this->getConfig()->get( MainConfigNames::ExportMaxLinkDepth ); if ( $depth > $maxLinkDepth ) { return $maxLinkDepth; } } /* * There's a HARD CODED limit of 5 levels of recursion here to prevent a * crazy-big export from being done by someone setting the depth * number too high. In other words, last resort safety net. */ return intval( min( $depth, 5 ) ); } /** * Expand a list of pages to include pages linked to from that page. * @param array $inputPages * @param array $pageSet * @param int $depth * @return array */ protected function getPageLinks( $inputPages, $pageSet, $depth ) { for ( ; $depth > 0; --$depth ) { [ $nsField, $titleField ] = $this->linksMigration->getTitleFields( 'pagelinks' ); $queryInfo = $this->linksMigration->getQueryInfo( 'pagelinks' ); $dbr = $this->dbProvider->getReplicaDatabase(); $queryBuilder = $dbr->newSelectQueryBuilder() ->caller( __METHOD__ ) ->select( [ 'namespace' => $nsField, 'title' => $titleField ] ) ->from( 'page' ) ->join( 'pagelinks', null, 'page_id=pl_from' ) ->tables( array_diff( $queryInfo['tables'], [ 'pagelinks' ] ) ) ->joinConds( $queryInfo['joins'] ); $pageSet = $this->getLinks( $inputPages, $pageSet, $queryBuilder ); $inputPages = array_keys( $pageSet ); } return $pageSet; } /** * Expand a list of pages to include items used in those pages. * @param array $inputPages Array of page titles * @param array $pageSet * @param SelectQueryBuilder $queryBuilder * @return array */ protected function getLinks( $inputPages, $pageSet, SelectQueryBuilder $queryBuilder ) { foreach ( $inputPages as $page ) { $title = Title::newFromText( $page ); if ( $title ) { $pageSet[$title->getPrefixedText()] = true; /// @todo FIXME: May or may not be more efficient to batch these /// by namespace when given multiple input pages. $result = ( clone $queryBuilder ) ->where( [ 'page_namespace' => $title->getNamespace(), 'page_title' => $title->getDBkey() ] ) ->fetchResultSet(); foreach ( $result as $row ) { $template = Title::makeTitle( $row->namespace, $row->title ); $pageSet[$template->getPrefixedText()] = true; } } } return $pageSet; } protected function getGroupName() { return 'pagetools'; } } /** @deprecated class alias since 1.41 */ class_alias( SpecialExport::class, 'SpecialExport' );