aboutsummaryrefslogtreecommitdiffstats
path: root/maintenance/includes/TextPassDumper.php
diff options
context:
space:
mode:
Diffstat (limited to 'maintenance/includes/TextPassDumper.php')
-rw-r--r--maintenance/includes/TextPassDumper.php116
1 files changed, 70 insertions, 46 deletions
diff --git a/maintenance/includes/TextPassDumper.php b/maintenance/includes/TextPassDumper.php
index 067f0be620f8..ae0a4186adbb 100644
--- a/maintenance/includes/TextPassDumper.php
+++ b/maintenance/includes/TextPassDumper.php
@@ -30,6 +30,7 @@ require_once __DIR__ . '/SevenZipStream.php';
require_once __DIR__ . '/../../includes/export/WikiExporter.php';
use MediaWiki\MediaWikiServices;
+use MediaWiki\Revision\SlotRecord;
use MediaWiki\Shell\Shell;
use MediaWiki\Storage\BlobAccessException;
use MediaWiki\Storage\SqlBlobStore;
@@ -45,6 +46,8 @@ class TextPassDumper extends BackupDumper {
private $thisPage;
/** @var string|bool */
private $thisRev;
+ /** @var string|bool */
+ private $thisRole = null;
// when we spend more than maxTimeAllowed seconds on this run, we continue
// processing until we write out the next complete page, then save output file(s),
@@ -65,7 +68,7 @@ class TextPassDumper extends BackupDumper {
protected $bufferSize = 524288; // In bytes. Maximum size to read from the stub in on go.
/** @var array */
- protected $php = [];
+ protected $php = [ PHP_BINARY ];
protected $spawn = false;
/**
@@ -445,6 +448,7 @@ TEXT
$this->lastName = "";
$this->thisPage = 0;
$this->thisRev = 0;
+ $this->thisRole = null;
$this->thisRevModel = null;
$this->thisRevFormat = null;
@@ -530,6 +534,9 @@ TEXT
->exportTransform( $text, $format );
}
catch ( MWException $ex ) {
+ wfWarn( "Unable to apply export transformation for content model '$model': " .
+ $ex->getMessage() );
+
$this->progress(
"Unable to apply export transformation for content model '$model': " .
$ex->getMessage()
@@ -552,16 +559,21 @@ TEXT
*
* @param int|string $id Content address, or text row ID.
* @param string|bool|null $model The content model used to determine
- * applicable export transformations.
- * If $model is null, it will be determined from the database.
+ * applicable export transformations. If $model is null, no transformation is applied.
* @param string|null $format The content format used when applying export transformations.
+ * @param int|null $expSize Expected length of the text, for sanity checks
*
- * @throws MWException
* @return string The revision text for $id, or ""
+ * @throws MWException
*/
- protected function getText( $id, $model = null, $format = null ) {
- global $wgContentHandlerUseDB;
+ protected function getText( $id, $model = null, $format = null, $expSize = null ) {
+ if ( !$this->isValidTextId( $id ) ) {
+ $msg = "Skipping bad text id " . $id . " of revision " . $this->thisRev;
+ $this->progress( $msg );
+ return '';
+ }
+ $model = $model ?: null;
$prefetchNotTried = true; // Whether or not we already tried to get the text via prefetch.
$text = false; // The candidate for a good text. false if no proper value.
$failures = 0; // The number of times, this invocation of getText already failed.
@@ -577,25 +589,6 @@ TEXT
$oldConsecutiveFailedTextRetrievals = $consecutiveFailedTextRetrievals;
$consecutiveFailedTextRetrievals = 0;
- if ( $model === null && $wgContentHandlerUseDB ) {
- // TODO: MCR: use content table
- $row = $this->db->selectRow(
- 'revision',
- [ 'rev_content_model', 'rev_content_format' ],
- [ 'rev_id' => $this->thisRev ],
- __METHOD__
- );
-
- if ( $row ) {
- $model = $row->rev_content_model;
- $format = $row->rev_content_format;
- }
- }
-
- if ( $model === null || $model === '' ) {
- $model = false;
- }
-
while ( $failures < $this->maxFailures ) {
// As soon as we found a good text for the $id, we will return immediately.
// Hence, if we make it past the try catch block, we know that we did not
@@ -609,13 +602,17 @@ TEXT
if ( $text === false && isset( $this->prefetch ) && $prefetchNotTried ) {
$prefetchNotTried = false;
$tryIsPrefetch = true;
- $text = $this->prefetch->prefetch( (int)$this->thisPage, (int)$this->thisRev );
+ $text = $this->prefetch->prefetch(
+ (int)$this->thisPage,
+ (int)$this->thisRev,
+ trim( $this->thisRole )
+ );
if ( $text === null ) {
$text = false;
}
- if ( is_string( $text ) && $model !== false ) {
+ if ( is_string( $text ) && $model !== null ) {
// Apply export transformation to text coming from an old dump.
// The purpose of this transformation is to convert up from legacy
// formats, which may still be used in the older dump that is used
@@ -634,7 +631,7 @@ TEXT
$text = $this->getTextDb( $id );
}
- if ( $text !== false && $model !== false ) {
+ if ( $text !== false && $model !== null ) {
// Apply export transformation to text coming from the database.
// Prefetched text should already have transformations applied.
$text = $this->exportTransform( $text, $model, $format );
@@ -656,18 +653,8 @@ TEXT
// Step 2: Checking for plausibility and return the text if it is
// plausible
- $revID = intval( $this->thisRev );
- if ( !isset( $this->db ) ) {
- throw new MWException( "No database available" );
- }
- if ( $model !== CONTENT_MODEL_WIKITEXT ) {
- $revLength = strlen( $text );
- } else {
- $revLength = $this->db->selectField( 'revision', 'rev_len', [ 'rev_id' => $revID ] );
- }
-
- if ( strlen( $text ) == $revLength ) {
+ if ( $expSize === null || strlen( $text ) == $expSize ) {
if ( $tryIsPrefetch ) {
$this->prefetchCount++;
}
@@ -678,7 +665,8 @@ TEXT
$text = false;
throw new MWException( "Received text is unplausible for id " . $id );
} catch ( Exception $e ) {
- $msg = "getting/checking text " . $id . " failed (" . $e->getMessage() . ")";
+ $msg = "getting/checking text " . $id . " failed (" . $e->getMessage()
+ . ") for revision " . $this->thisRev;
if ( $failures + 1 < $this->maxFailures ) {
$msg .= " (Will retry " . ( $this->maxFailures - $failures - 1 ) . " more times)";
}
@@ -800,10 +788,10 @@ TEXT
return false;
}
- list(
+ [
$this->spawnWrite, // -> stdin
$this->spawnRead, // <- stdout
- ) = $pipes;
+ ] = $pipes;
return true;
}
@@ -920,18 +908,35 @@ TEXT
$this->buffer = "";
$this->atStart = false;
}
+ } elseif ( $name === 'mediawiki' ) {
+ if ( isset( $attribs['version'] ) ) {
+ if ( $attribs['version'] !== $this->schemaVersion ) {
+ throw new MWException( 'Mismatching schema version. '
+ . 'Use the --schema-version option to set the output schema version to '
+ . 'the version declared by the stub file, namely ' . $attribs['version'] );
+ }
+ }
}
- if ( $name == "text" && isset( $attribs['id'] ) ) {
- $id = $attribs['id'];
+ if ( $name == "text" && ( isset( $attribs['id'] ) || isset( $attribs['location'] ) ) ) {
+ $id = $attribs['location'] ?? $attribs['id'];
$model = trim( $this->thisRevModel );
$format = trim( $this->thisRevFormat );
$model = $model === '' ? null : $model;
$format = $format === '' ? null : $format;
+ $expSize = !empty( $attribs['bytes'] ) && $model === CONTENT_MODEL_WIKITEXT
+ ? (int)$attribs['bytes'] : null;
+
+ $text = $this->getText( $id, $model, $format, $expSize );
+
+ unset( $attribs['id'] );
+ unset( $attribs['location'] );
+ if ( strlen( $text ) > 0 ) {
+ $attribs['xml:space'] = 'preserve';
+ }
- $text = $this->getText( $id, $model, $format );
- $this->openElement = [ $name, [ 'xml:space' => 'preserve' ] ];
+ $this->openElement = [ $name, $attribs ];
if ( strlen( $text ) > 0 ) {
$this->characterData( $parser, $text );
}
@@ -953,6 +958,7 @@ TEXT
$this->egress->writeRevision( null, $this->buffer );
$this->buffer = "";
$this->thisRev = "";
+ $this->thisRole = null;
$this->thisRevModel = null;
$this->thisRevFormat = null;
} elseif ( $name == 'page' ) {
@@ -1005,6 +1011,7 @@ TEXT
if ( $this->lastName == "id" ) {
if ( $this->state == "revision" ) {
$this->thisRev .= $data;
+ $this->thisRole = SlotRecord::MAIN;
} elseif ( $this->state == "page" ) {
$this->thisPage .= $data;
}
@@ -1012,6 +1019,12 @@ TEXT
$this->thisRevModel .= $data;
} elseif ( $this->lastName == "format" ) {
$this->thisRevFormat .= $data;
+ } elseif ( $this->lastName == "content" ) {
+ $this->thisRole = "";
+ $this->thisRevModel = "";
+ $this->thisRevFormat = "";
+ } elseif ( $this->lastName == "role" ) {
+ $this->thisRole .= $data;
}
// have to skip the newline left over from closepagetag line of
@@ -1031,4 +1044,15 @@ TEXT
$this->openElement = false;
}
}
+
+ private function isValidTextId( $id ) {
+ if ( preg_match( '/:/', $id ) ) {
+ return $id !== 'tt:0';
+ } elseif ( preg_match( '/^\d+$/', $id ) ) {
+ return intval( $id ) > 0;
+ }
+
+ return false;
+ }
+
}