diff options
Diffstat (limited to 'maintenance/includes/TextPassDumper.php')
-rw-r--r-- | maintenance/includes/TextPassDumper.php | 116 |
1 files changed, 70 insertions, 46 deletions
diff --git a/maintenance/includes/TextPassDumper.php b/maintenance/includes/TextPassDumper.php index 067f0be620f8..ae0a4186adbb 100644 --- a/maintenance/includes/TextPassDumper.php +++ b/maintenance/includes/TextPassDumper.php @@ -30,6 +30,7 @@ require_once __DIR__ . '/SevenZipStream.php'; require_once __DIR__ . '/../../includes/export/WikiExporter.php'; use MediaWiki\MediaWikiServices; +use MediaWiki\Revision\SlotRecord; use MediaWiki\Shell\Shell; use MediaWiki\Storage\BlobAccessException; use MediaWiki\Storage\SqlBlobStore; @@ -45,6 +46,8 @@ class TextPassDumper extends BackupDumper { private $thisPage; /** @var string|bool */ private $thisRev; + /** @var string|bool */ + private $thisRole = null; // when we spend more than maxTimeAllowed seconds on this run, we continue // processing until we write out the next complete page, then save output file(s), @@ -65,7 +68,7 @@ class TextPassDumper extends BackupDumper { protected $bufferSize = 524288; // In bytes. Maximum size to read from the stub in on go. /** @var array */ - protected $php = []; + protected $php = [ PHP_BINARY ]; protected $spawn = false; /** @@ -445,6 +448,7 @@ TEXT $this->lastName = ""; $this->thisPage = 0; $this->thisRev = 0; + $this->thisRole = null; $this->thisRevModel = null; $this->thisRevFormat = null; @@ -530,6 +534,9 @@ TEXT ->exportTransform( $text, $format ); } catch ( MWException $ex ) { + wfWarn( "Unable to apply export transformation for content model '$model': " . + $ex->getMessage() ); + $this->progress( "Unable to apply export transformation for content model '$model': " . $ex->getMessage() @@ -552,16 +559,21 @@ TEXT * * @param int|string $id Content address, or text row ID. * @param string|bool|null $model The content model used to determine - * applicable export transformations. - * If $model is null, it will be determined from the database. + * applicable export transformations. If $model is null, no transformation is applied. * @param string|null $format The content format used when applying export transformations. + * @param int|null $expSize Expected length of the text, for sanity checks * - * @throws MWException * @return string The revision text for $id, or "" + * @throws MWException */ - protected function getText( $id, $model = null, $format = null ) { - global $wgContentHandlerUseDB; + protected function getText( $id, $model = null, $format = null, $expSize = null ) { + if ( !$this->isValidTextId( $id ) ) { + $msg = "Skipping bad text id " . $id . " of revision " . $this->thisRev; + $this->progress( $msg ); + return ''; + } + $model = $model ?: null; $prefetchNotTried = true; // Whether or not we already tried to get the text via prefetch. $text = false; // The candidate for a good text. false if no proper value. $failures = 0; // The number of times, this invocation of getText already failed. @@ -577,25 +589,6 @@ TEXT $oldConsecutiveFailedTextRetrievals = $consecutiveFailedTextRetrievals; $consecutiveFailedTextRetrievals = 0; - if ( $model === null && $wgContentHandlerUseDB ) { - // TODO: MCR: use content table - $row = $this->db->selectRow( - 'revision', - [ 'rev_content_model', 'rev_content_format' ], - [ 'rev_id' => $this->thisRev ], - __METHOD__ - ); - - if ( $row ) { - $model = $row->rev_content_model; - $format = $row->rev_content_format; - } - } - - if ( $model === null || $model === '' ) { - $model = false; - } - while ( $failures < $this->maxFailures ) { // As soon as we found a good text for the $id, we will return immediately. // Hence, if we make it past the try catch block, we know that we did not @@ -609,13 +602,17 @@ TEXT if ( $text === false && isset( $this->prefetch ) && $prefetchNotTried ) { $prefetchNotTried = false; $tryIsPrefetch = true; - $text = $this->prefetch->prefetch( (int)$this->thisPage, (int)$this->thisRev ); + $text = $this->prefetch->prefetch( + (int)$this->thisPage, + (int)$this->thisRev, + trim( $this->thisRole ) + ); if ( $text === null ) { $text = false; } - if ( is_string( $text ) && $model !== false ) { + if ( is_string( $text ) && $model !== null ) { // Apply export transformation to text coming from an old dump. // The purpose of this transformation is to convert up from legacy // formats, which may still be used in the older dump that is used @@ -634,7 +631,7 @@ TEXT $text = $this->getTextDb( $id ); } - if ( $text !== false && $model !== false ) { + if ( $text !== false && $model !== null ) { // Apply export transformation to text coming from the database. // Prefetched text should already have transformations applied. $text = $this->exportTransform( $text, $model, $format ); @@ -656,18 +653,8 @@ TEXT // Step 2: Checking for plausibility and return the text if it is // plausible - $revID = intval( $this->thisRev ); - if ( !isset( $this->db ) ) { - throw new MWException( "No database available" ); - } - if ( $model !== CONTENT_MODEL_WIKITEXT ) { - $revLength = strlen( $text ); - } else { - $revLength = $this->db->selectField( 'revision', 'rev_len', [ 'rev_id' => $revID ] ); - } - - if ( strlen( $text ) == $revLength ) { + if ( $expSize === null || strlen( $text ) == $expSize ) { if ( $tryIsPrefetch ) { $this->prefetchCount++; } @@ -678,7 +665,8 @@ TEXT $text = false; throw new MWException( "Received text is unplausible for id " . $id ); } catch ( Exception $e ) { - $msg = "getting/checking text " . $id . " failed (" . $e->getMessage() . ")"; + $msg = "getting/checking text " . $id . " failed (" . $e->getMessage() + . ") for revision " . $this->thisRev; if ( $failures + 1 < $this->maxFailures ) { $msg .= " (Will retry " . ( $this->maxFailures - $failures - 1 ) . " more times)"; } @@ -800,10 +788,10 @@ TEXT return false; } - list( + [ $this->spawnWrite, // -> stdin $this->spawnRead, // <- stdout - ) = $pipes; + ] = $pipes; return true; } @@ -920,18 +908,35 @@ TEXT $this->buffer = ""; $this->atStart = false; } + } elseif ( $name === 'mediawiki' ) { + if ( isset( $attribs['version'] ) ) { + if ( $attribs['version'] !== $this->schemaVersion ) { + throw new MWException( 'Mismatching schema version. ' + . 'Use the --schema-version option to set the output schema version to ' + . 'the version declared by the stub file, namely ' . $attribs['version'] ); + } + } } - if ( $name == "text" && isset( $attribs['id'] ) ) { - $id = $attribs['id']; + if ( $name == "text" && ( isset( $attribs['id'] ) || isset( $attribs['location'] ) ) ) { + $id = $attribs['location'] ?? $attribs['id']; $model = trim( $this->thisRevModel ); $format = trim( $this->thisRevFormat ); $model = $model === '' ? null : $model; $format = $format === '' ? null : $format; + $expSize = !empty( $attribs['bytes'] ) && $model === CONTENT_MODEL_WIKITEXT + ? (int)$attribs['bytes'] : null; + + $text = $this->getText( $id, $model, $format, $expSize ); + + unset( $attribs['id'] ); + unset( $attribs['location'] ); + if ( strlen( $text ) > 0 ) { + $attribs['xml:space'] = 'preserve'; + } - $text = $this->getText( $id, $model, $format ); - $this->openElement = [ $name, [ 'xml:space' => 'preserve' ] ]; + $this->openElement = [ $name, $attribs ]; if ( strlen( $text ) > 0 ) { $this->characterData( $parser, $text ); } @@ -953,6 +958,7 @@ TEXT $this->egress->writeRevision( null, $this->buffer ); $this->buffer = ""; $this->thisRev = ""; + $this->thisRole = null; $this->thisRevModel = null; $this->thisRevFormat = null; } elseif ( $name == 'page' ) { @@ -1005,6 +1011,7 @@ TEXT if ( $this->lastName == "id" ) { if ( $this->state == "revision" ) { $this->thisRev .= $data; + $this->thisRole = SlotRecord::MAIN; } elseif ( $this->state == "page" ) { $this->thisPage .= $data; } @@ -1012,6 +1019,12 @@ TEXT $this->thisRevModel .= $data; } elseif ( $this->lastName == "format" ) { $this->thisRevFormat .= $data; + } elseif ( $this->lastName == "content" ) { + $this->thisRole = ""; + $this->thisRevModel = ""; + $this->thisRevFormat = ""; + } elseif ( $this->lastName == "role" ) { + $this->thisRole .= $data; } // have to skip the newline left over from closepagetag line of @@ -1031,4 +1044,15 @@ TEXT $this->openElement = false; } } + + private function isValidTextId( $id ) { + if ( preg_match( '/:/', $id ) ) { + return $id !== 'tt:0'; + } elseif ( preg_match( '/^\d+$/', $id ) ) { + return intval( $id ) > 0; + } + + return false; + } + } |