diff options
-rw-r--r-- | includes/AutoLoader.php | 3 | ||||
-rw-r--r-- | includes/parser/LinkHolderArray.php | 90 | ||||
-rw-r--r-- | includes/parser/Parser.php | 269 | ||||
-rw-r--r-- | includes/parser/Preprocessor_DOM.php | 2 | ||||
-rw-r--r-- | includes/parser/Preprocessor_Hash.php | 2 | ||||
-rw-r--r-- | includes/parser/StripState.php | 138 |
6 files changed, 330 insertions, 174 deletions
diff --git a/includes/AutoLoader.php b/includes/AutoLoader.php
index 019c94dfbff6..4b7c34642d25 100644
--- a/includes/AutoLoader.php
+++ b/includes/AutoLoader.php
@@ -518,7 +518,6 @@ $wgAutoloadLocalClasses = array(
 	'DateFormatter' => 'includes/parser/DateFormatter.php',
 	'LinkHolderArray' => 'includes/parser/LinkHolderArray.php',
 	'LinkMarkerReplacer' => 'includes/parser/Parser_LinkHooks.php',
-	'OnlyIncludeReplacer' => 'includes/parser/Parser.php',
 	'PPCustomFrame_Hash' => 'includes/parser/Preprocessor_Hash.php',
 	'PPCustomFrame_DOM' => 'includes/parser/Preprocessor_DOM.php',
 	'PPDAccum_Hash' => 'includes/parser/Preprocessor_Hash.php',
@@ -548,7 +547,7 @@ $wgAutoloadLocalClasses = array(
 	'Preprocessor' => 'includes/parser/Preprocessor.php',
 	'Preprocessor_DOM' => 'includes/parser/Preprocessor_DOM.php',
 	'Preprocessor_Hash' => 'includes/parser/Preprocessor_Hash.php',
-	'StripState' => 'includes/parser/Parser.php',
+	'StripState' => 'includes/parser/StripState.php',
 	'MWTidy' => 'includes/parser/Tidy.php',
 
 	# includes/search
diff --git a/includes/parser/LinkHolderArray.php b/includes/parser/LinkHolderArray.php
index 31e121f42762..1a76fa867c25 100644
--- a/includes/parser/LinkHolderArray.php
+++ b/includes/parser/LinkHolderArray.php
@@ -12,6 +12,7 @@ class LinkHolderArray {
 	var $internals = array(), $interwikis = array();
 	var $size = 0;
 	var $parent;
+	protected $tempIdOffset;
 
 	function __construct( $parent ) {
 		$this->parent = $parent;
@@ -27,6 +28,15 @@ class LinkHolderArray {
 	}
 
 	/**
+	 * Don't serialize the parent object, it is big, and not needed when it is
+	 * a parameter to mergeForeign(), which is the only application of
+	 * serializing at present.
+	 */
+	function __sleep() {
+		return array( 'internals', 'interwikis', 'size' );
+	}
+
+	/**
 	 * Merge another LinkHolderArray into this one
 	 * @param $other LinkHolderArray
 	 */
@@ -43,6 +53,86 @@ class LinkHolderArray {
 	}
 
 	/**
+	 * Merge a LinkHolderArray from another parser instance into this one. The
+	 * keys will not be preserved. Any text which went with the old
+	 * LinkHolderArray and needs to work with the new one should be passed in
+	 * the $texts array. The strings in this array will have their link holders
+	 * converted for use in the destination link holder. The resulting array of
+	 * strings will be returned.
+	 *
+	 * @param $other LinkHolderArray
+	 * @param $texts Array of strings
+	 * @return Array
+	 */
+	function mergeForeign( $other, $texts ) {
+		$this->tempIdOffset = $idOffset = $this->parent->nextLinkID();
+		$maxId = $idOffset; # never move the parent counter backwards when $other is empty
+
+		# Renumber internal links
+		foreach ( $other->internals as $ns => $nsLinks ) {
+			foreach ( $nsLinks as $key => $entry ) {
+				$newKey = $idOffset + $key;
+				$this->internals[$ns][$newKey] = $entry;
+				$maxId = $newKey > $maxId ? $newKey : $maxId;
+			}
+		}
+		$texts = preg_replace_callback( '/(<!--LINK \d+:)(\d+)(-->)/',
+			array( $this, 'mergeForeignCallback' ), $texts );
+
+		# Renumber interwiki links
+		foreach ( $other->interwikis as $key => $entry ) {
+			$newKey = $idOffset + $key;
+			$this->interwikis[$newKey] = $entry;
+			$maxId = $newKey > $maxId ? $newKey : $maxId;
+		}
+		$texts = preg_replace_callback( '/(<!--IWLINK )(\d+)(-->)/',
+			array( $this, 'mergeForeignCallback' ), $texts );
+
+		# Set the parent link ID to be beyond the highest used ID
+		$this->parent->setLinkID( $maxId + 1 );
+		$this->tempIdOffset = null;
+		return $texts;
+	}
+
+	protected function mergeForeignCallback( $m ) {
+		return $m[1] . ( $m[2] + $this->tempIdOffset ) . $m[3];
+	}
+
+	/**
+	 * Get a subset of the current LinkHolderArray which is sufficient to
+	 * interpret the given text.
+	 */
+	function getSubArray( $text ) {
+		$sub = new LinkHolderArray( $this->parent );
+
+		# Internal links
+		$pos = 0;
+		while ( $pos < strlen( $text ) ) {
+			if ( !preg_match( '/<!--LINK (\d+):(\d+)-->/',
+				$text, $m, PREG_OFFSET_CAPTURE, $pos ) )
+			{
+				break;
+			}
+			$ns = $m[1][0];
+			$key = $m[2][0];
+			$sub->internals[$ns][$key] = $this->internals[$ns][$key];
+			$pos = $m[0][1] + strlen( $m[0][0] );
+		}
+
+		# Interwiki links
+		$pos = 0;
+		while ( $pos < strlen( $text ) ) {
+			if ( !preg_match( '/<!--IWLINK (\d+)-->/', $text, $m, PREG_OFFSET_CAPTURE, $pos ) ) {
+				break;
+			}
+			$key = $m[1][0];
+			$sub->interwikis[$key] = $this->interwikis[$key];
+			$pos = $m[0][1] + strlen( $m[0][0] );
+		}
+		return $sub;
+	}
+
+	/**
 	 * Returns true if the memory requirements of this object are getting large
 	 */
 	function isBig() {
diff --git a/includes/parser/Parser.php b/includes/parser/Parser.php
index 8bbeb2db2e10..afa39345a186 100644
--- a/includes/parser/Parser.php
+++ b/includes/parser/Parser.php
@@ -55,6 +55,12 @@ class Parser {
 	 */
 	const VERSION = '1.6.4';
 
+	/**
+	 * Update this version number when the output of serializeHalfParsedText()
+	 * changes in an incompatible way
+	 */
+	const HALF_PARSED_VERSION = 2;
+
 	# Flags for Parser::setFunctionHook
 	# Also available as global constants from Defines.php
 	const SFH_NO_HASH = 1;
@@ -203,7 +209,6 @@ class Parser {
 		$this->mLastSection = '';
 		$this->mDTopen = false;
 		$this->mIncludeCount = array();
-		$this->mStripState = new StripState;
 		$this->mArgStack = false;
 		$this->mInPre = false;
 		$this->mLinkHolders = new LinkHolderArray( $this );
@@ -226,6 +231,7 @@ class Parser {
 		# $this->mUniqPrefix = "\x07UNIQ" . Parser::getRandomString();
 		# Changed to \x7f to allow XML double-parsing -- TS
 		$this->mUniqPrefix = "\x7fUNIQ" . self::getRandomString();
+		$this->mStripState = new StripState( $this->mUniqPrefix );
 
 
 		# Clear these on every parse, bug 4549
@@ -353,23 +359,7 @@ class Parser {
 
 		wfRunHooks( 'ParserBeforeTidy', array( &$this, &$text ) );
 
-//!JF Move to its own function
-
-		$uniq_prefix = $this->mUniqPrefix;
-		$matches = array();
-		$elements = array_keys( $this->mTransparentTagHooks );
-		$text = $this->extractTagsAndParams( $elements, $text, $matches, $uniq_prefix );
-
-		foreach ( $matches as $marker => $data ) {
-			list( $element, $content, $params, $tag ) = $data;
-			$tagName = strtolower( $element );
-			if ( isset( $this->mTransparentTagHooks[$tagName] ) ) {
-				$output = call_user_func_array( $this->mTransparentTagHooks[$tagName], array( $content, $params, $this ) );
-			} else {
-				$output = $tag;
-			}
-			$this->mStripState->general->setPair( $marker, $output );
-		}
+		$text = $this->replaceTransparentTags( $text );
 		$text = $this->mStripState->unstripGeneral( $text );
 
 		$text = Sanitizer::normalizeCharReferences( $text );
@@ -620,6 +610,10 @@ class Parser {
 		return $this->mLinkID++;
 	}
 
+	function setLinkID( $id ) {
+		$this->mLinkID = $id;
+	}
+
 	/**
 	 * @return Language
 	 */
@@ -793,7 +787,7 @@ class Parser {
 	function insertStripItem( $text ) {
 		$rnd = "{$this->mUniqPrefix}-item-{$this->mMarkerIndex}-" . self::MARKER_SUFFIX;
 		$this->mMarkerIndex++;
-		$this->mStripState->general->setPair( $rnd, $text );
+		$this->mStripState->addGeneral( $rnd, $text );
 		return $rnd;
 	}
 
@@ -3542,9 +3536,9 @@ class Parser {
 		if ( $markerType === 'none' ) {
 			return $output;
 		} elseif ( $markerType === 'nowiki' ) {
-			$this->mStripState->nowiki->setPair( $marker, $output );
+			$this->mStripState->addNoWiki( $marker, $output );
 		} elseif ( $markerType === 'general' ) {
-			$this->mStripState->general->setPair( $marker, $output );
+			$this->mStripState->addGeneral( $marker, $output );
 		} else {
 			throw new MWException( __METHOD__.': invalid marker type' );
 		}
@@ -4859,6 +4853,30 @@ class Parser {
 	}
 
 	/**
+	 * Replace transparent tags in $text with the values given by the callbacks.
+	 *
+	 * Transparent tag hooks are like regular XML-style tag hooks, except they
+	 * operate late in the transformation sequence, on HTML instead of wikitext.
+	 */
+	function replaceTransparentTags( $text ) {
+		$matches = array();
+		$elements = array_keys( $this->mTransparentTagHooks );
+		$text = $this->extractTagsAndParams( $elements, $text, $matches, $this->mUniqPrefix );
+
+		foreach ( $matches as $marker => $data ) {
+			list( $element, $content, $params, $tag ) = $data;
+			$tagName = strtolower( $element );
+			if ( isset( $this->mTransparentTagHooks[$tagName] ) ) {
+				$output = call_user_func_array( $this->mTransparentTagHooks[$tagName], array( $content, $params, $this ) );
+			} else {
+				$output = $tag;
+			}
+			$this->mStripState->addGeneral( $marker, $output );
+		}
+		return $text;
+	}
+
+	/**
 	 * Break wikitext input into sections, and either pull or replace
 	 * some particular section's text.
 	 *
@@ -5203,6 +5221,17 @@ class Parser {
 		return $this->testSrvus( $text, $title, $options, self::OT_PREPROCESS );
 	}
 
+	/**
+	 * Call a callback function on all regions of the given text that are not
+	 * inside strip markers, and replace those regions with the return value
+	 * of the callback. For example, with input:
+	 *
+	 *  aaa<MARKER>bbb
+	 *
+	 * This will call the callback function twice, with 'aaa' and 'bbb'. Those
+	 * two strings will be replaced with the value returned by the callback in
+	 * each case.
+	 */
 	function markerSkipCallback( $s, $callback ) {
 		$i = 0;
 		$out = '';
@@ -5227,168 +5256,68 @@ class Parser {
 		return $out;
 	}
 
-	function serialiseHalfParsedText( $text ) {
-		$data = array();
-		$data['text'] = $text;
-
-		# First, find all strip markers, and store their
-		# data in an array.
-		$stripState = new StripState;
-		$pos = 0;
-		while ( ( $start_pos = strpos( $text, $this->mUniqPrefix, $pos ) )
-			&& ( $end_pos = strpos( $text, self::MARKER_SUFFIX, $pos ) ) )
-		{
-			$end_pos += strlen( self::MARKER_SUFFIX );
-			$marker = substr( $text, $start_pos, $end_pos-$start_pos );
-
-			if ( !empty( $this->mStripState->general->data[$marker] ) ) {
-				$replaceArray = $stripState->general;
-				$stripText = $this->mStripState->general->data[$marker];
-			} elseif ( !empty( $this->mStripState->nowiki->data[$marker] ) ) {
-				$replaceArray = $stripState->nowiki;
-				$stripText = $this->mStripState->nowiki->data[$marker];
-			} else {
-				throw new MWException( "Hanging strip marker: '$marker'." );
-			}
-
-			$replaceArray->setPair( $marker, $stripText );
-			$pos = $end_pos;
-		}
-		$data['stripstate'] = $stripState;
-
-		# Now, find all of our links, and store THEIR
-		# data in an array! :)
-		$links = array( 'internal' => array(), 'interwiki' => array() );
-		$pos = 0;
-
-		# Internal links
-		while ( ( $start_pos = strpos( $text, '<!--LINK ', $pos ) ) ) {
-			list( $ns, $trail ) = explode( ':', substr( $text, $start_pos + strlen( '<!--LINK ' ) ), 2 );
-
-			$ns = trim( $ns );
-			if ( empty( $links['internal'][$ns] ) ) {
-				$links['internal'][$ns] = array();
-			}
-
-			$key = trim( substr( $trail, 0, strpos( $trail, '-->' ) ) );
-			$links['internal'][$ns][] = $this->mLinkHolders->internals[$ns][$key];
-			$pos = $start_pos + strlen( "<!--LINK $ns:$key-->" );
-		}
-
-		$pos = 0;
-
-		# Interwiki links
-		while ( ( $start_pos = strpos( $text, '<!--IWLINK ', $pos ) ) ) {
-			$data = substr( $text, $start_pos );
-			$key = trim( substr( $data, 0, strpos( $data, '-->' ) ) );
-			$links['interwiki'][] = $this->mLinkHolders->interwiki[$key];
-			$pos = $start_pos + strlen( "<!--IWLINK $key-->" );
-		}
-
-		$data['linkholder'] = $links;
-
+	/**
+	 * Save the parser state required to convert the given half-parsed text to
+	 * HTML. "Half-parsed" in this context means the output of
+	 * recursiveTagParse() or internalParse(). This output has strip markers
+	 * from replaceVariables (extensionSubstitution() etc.), and link
+	 * placeholders from replaceLinkHolders().
+	 *
+	 * Returns an array which can be serialized and stored persistently. This
+	 * array can later be loaded into another parser instance with
+	 * unserializeHalfParsedText(). The text can then be safely incorporated into
+	 * the return value of a parser hook.
+	 */
+	function serializeHalfParsedText( $text ) {
+		wfProfileIn( __METHOD__ );
+		$data = array(
+			'text' => $text,
+			'version' => self::HALF_PARSED_VERSION,
+			'stripState' => $this->mStripState->getSubState( $text ),
+			'linkHolders' => $this->mLinkHolders->getSubArray( $text )
+		);
+		wfProfileOut( __METHOD__ );
 		return $data;
 	}
 
 	/**
-	 * TODO: document
-	 * @param $data Array
-	 * @param $intPrefix String unique identifying prefix
+	 * Load the parser state given in the $data array, which is assumed to
+	 * have been generated by serializeHalfParsedText(). The text contents is
+	 * extracted from the array, and its markers are transformed into markers
+	 * appropriate for the current Parser instance. This transformed text is
+	 * returned, and can be safely included in the return value of a parser
+	 * hook.
+	 *
+	 * If the $data array has been stored persistently, the caller should first
+	 * check whether it is still valid, by calling isValidHalfParsedData().
+	 *
+	 * @param $data Serialized data
 	 * @return String
 	 */
-	function unserialiseHalfParsedText( $data, $intPrefix = null ) {
-		if ( !$intPrefix ) {
-			$intPrefix = self::getRandomString();
+	function unserializeHalfParsedText( $data ) {
+		if ( !isset( $data['version'] ) || $data['version'] != self::HALF_PARSED_VERSION ) {
+			throw new MWException( __METHOD__.': invalid version' );
 		}
 
 		# First, extract the strip state.
-		$stripState = $data['stripstate'];
-		$this->mStripState->general->merge( $stripState->general );
-		$this->mStripState->nowiki->merge( $stripState->nowiki );
-
-		# Now, extract the text, and renumber links
-		$text = $data['text'];
-		$links = $data['linkholder'];
-
-		# Internal...
-		foreach ( $links['internal'] as $ns => $nsLinks ) {
-			foreach ( $nsLinks as $key => $entry ) {
-				$newKey = $intPrefix . '-' . $key;
-				$this->mLinkHolders->internals[$ns][$newKey] = $entry;
+		$texts = array( $data['text'] );
+		$texts = $this->mStripState->merge( $data['stripState'], $texts );
 
-				$text = str_replace( "<!--LINK $ns:$key-->", "<!--LINK $ns:$newKey-->", $text );
-			}
-		}
-
-		# Interwiki...
-		foreach ( $links['interwiki'] as $key => $entry ) {
-			$newKey = "$intPrefix-$key";
-			$this->mLinkHolders->interwikis[$newKey] = $entry;
-
-			$text = str_replace( "<!--IWLINK $key-->", "<!--IWLINK $newKey-->", $text );
-		}
+		# Now renumber links
+		$texts = $this->mLinkHolders->mergeForeign( $data['linkHolders'], $texts );
 
 		# Should be good to go.
-		return $text;
-	}
-}
-
-/**
- * @todo document, briefly.
- * @ingroup Parser
- */
-class StripState {
-	var $general, $nowiki;
-
-	function __construct() {
-		$this->general = new ReplacementArray;
-		$this->nowiki = new ReplacementArray;
+		return $texts[0];
 	}
 
-	function unstripGeneral( $text ) {
-		wfProfileIn( __METHOD__ );
-		do {
-			$oldText = $text;
-			$text = $this->general->replace( $text );
-		} while ( $text !== $oldText );
-		wfProfileOut( __METHOD__ );
-		return $text;
-	}
-
-	function unstripNoWiki( $text ) {
-		wfProfileIn( __METHOD__ );
-		do {
-			$oldText = $text;
-			$text = $this->nowiki->replace( $text );
-		} while ( $text !== $oldText );
-		wfProfileOut( __METHOD__ );
-		return $text;
-	}
-
-	function unstripBoth( $text ) {
-		wfProfileIn( __METHOD__ );
-		do {
-			$oldText = $text;
-			$text = $this->general->replace( $text );
-			$text = $this->nowiki->replace( $text );
-		} while ( $text !== $oldText );
-		wfProfileOut( __METHOD__ );
-		return $text;
-	}
-}
-
-/**
- * @todo document, briefly.
- * @ingroup Parser
- */
-class OnlyIncludeReplacer {
-	var $output = '';
-
-	function replace( $matches ) {
-		if ( substr( $matches[1], -1 ) === "\n" ) {
-			$this->output .= substr( $matches[1], 0, -1 );
-		} else {
-			$this->output .= $matches[1];
-		}
+	/**
+	 * Returns true if the given array, presumed to be generated by
+	 * serializeHalfParsedText(), is compatible with the current version of the
+	 * parser.
+	 *
+	 * @param $data Array.
+	 */
+	function isValidHalfParsedData( $data ) {
+		return isset( $data['version'] ) && $data['version'] == self::HALF_PARSED_VERSION;
 	}
 }
diff --git a/includes/parser/Preprocessor_DOM.php b/includes/parser/Preprocessor_DOM.php
index 3dfd3e11ed0b..0ba580575eaa 100644
--- a/includes/parser/Preprocessor_DOM.php
+++ b/includes/parser/Preprocessor_DOM.php
@@ -1070,7 +1070,7 @@ class PPFrame_DOM implements PPFrame {
 						$marker = "{$this->parser->mUniqPrefix}-h-$serial-" . Parser::MARKER_SUFFIX;
 						$count = $contextNode->getAttribute( 'level' );
 						$s = substr( $s, 0, $count ) . $marker . substr( $s, $count );
-						$this->parser->mStripState->general->setPair( $marker, '' );
+						$this->parser->mStripState->addGeneral( $marker, '' );
 					}
 					$out .= $s;
 				} else {
diff --git a/includes/parser/Preprocessor_Hash.php b/includes/parser/Preprocessor_Hash.php
index 45af9ee15654..8a51a6b96cd7 100644
--- a/includes/parser/Preprocessor_Hash.php
+++ b/includes/parser/Preprocessor_Hash.php
@@ -1018,7 +1018,7 @@ class PPFrame_Hash implements PPFrame {
 					$serial = count( $this->parser->mHeadings ) - 1;
 					$marker = "{$this->parser->mUniqPrefix}-h-$serial-" . Parser::MARKER_SUFFIX;
 					$s = substr( $s, 0, $bits['level'] ) . $marker . substr( $s, $bits['level'] );
-					$this->parser->mStripState->general->setPair( $marker, '' );
+					$this->parser->mStripState->addGeneral( $marker, '' );
 					$out .= $s;
 				} else {
 					# Expand in virtual stack
diff --git a/includes/parser/StripState.php b/includes/parser/StripState.php
new file mode 100644
index 000000000000..0bf9e17acee5
--- /dev/null
+++ b/includes/parser/StripState.php
@@ -0,0 +1,138 @@
+<?php
+
+/**
+ * Stores the values hidden behind strip markers, in 'nowiki' and 'general' groups keyed by marker ID.
+ * @ingroup Parser
+ */
+class StripState {
+	protected $prefix;
+	protected $data;
+	protected $regex;
+
+	protected $tempType, $tempMergePrefix;
+
+	function __construct( $prefix ) {
+		$this->prefix = $prefix;
+		$this->data = array(
+			'nowiki' => array(),
+			'general' => array()
+		);
+		$this->regex = '/' . preg_quote( $this->prefix, '/' ) . "([^\x7f]+)" . preg_quote( Parser::MARKER_SUFFIX, '/' ) . '/';
+	}
+
+	/**
+	 * Add a nowiki strip item
+	 */
+	function addNoWiki( $marker, $value ) {
+		$this->addItem( 'nowiki', $marker, $value );
+	}
+
+	function addGeneral( $marker, $value ) {
+		$this->addItem( 'general', $marker, $value );
+	}
+
+	protected function addItem( $type, $marker, $value ) {
+		if ( !preg_match( $this->regex, $marker, $m ) ) {
+			throw new MWException( "Invalid marker: $marker" );
+		}
+
+		$this->data[$type][$m[1]] = $value;
+	}
+
+	function unstripGeneral( $text ) {
+		return $this->unstripType( 'general', $text );
+	}
+
+	function unstripNoWiki( $text ) {
+		return $this->unstripType( 'nowiki', $text );
+	}
+
+	function unstripBoth( $text ) {
+		$text = $this->unstripType( 'general', $text );
+		$text = $this->unstripType( 'nowiki', $text );
+		return $text;
+	}
+
+	protected function unstripType( $type, $text ) {
+		// Shortcut
+		if ( !count( $this->data[$type] ) ) {
+			return $text;
+		}
+
+		wfProfileIn( __METHOD__ );
+		$this->tempType = $type;
+		$out = preg_replace_callback( $this->regex, array( $this, 'unstripCallback' ), $text );
+		$this->tempType = null;
+		wfProfileOut( __METHOD__ );
+		return $out;
+	}
+
+	protected function unstripCallback( $m ) {
+		if ( isset( $this->data[$this->tempType][$m[1]] ) ) {
+			return $this->data[$this->tempType][$m[1]];
+		} else {
+			return $m[0];
+		}
+	}
+
+	/**
+	 * Get a StripState object which is sufficient to unstrip the given text.
+	 * It will contain the minimum subset of strip items necessary.
+	 */
+	function getSubState( $text ) {
+		$subState = new StripState( $this->prefix );
+		$pos = 0;
+		while ( true ) {
+			$startPos = strpos( $text, $this->prefix, $pos );
+			$endPos = ( $startPos === false ) ? false : strpos( $text, Parser::MARKER_SUFFIX, $startPos + strlen( $this->prefix ) );
+			if ( $startPos === false || $endPos === false ) {
+				break;
+			}
+
+			$endPos += strlen( Parser::MARKER_SUFFIX );
+			$marker = substr( $text, $startPos, $endPos - $startPos );
+			$pos = $endPos; # advance before the check below so an unrecognised marker cannot loop forever
+			if ( !preg_match( $this->regex, $marker, $m ) ) {
+				continue;
+			}
+
+			$key = $m[1];
+			if ( isset( $this->data['nowiki'][$key] ) ) {
+				$subState->data['nowiki'][$key] = $this->data['nowiki'][$key];
+			} elseif ( isset( $this->data['general'][$key] ) ) {
+				$subState->data['general'][$key] = $this->data['general'][$key];
+			}
+		}
+		return $subState;
+	}
+
+	/**
+	 * Merge another StripState object into this one. The strip marker keys
+	 * will not be preserved. The strings in the $texts array will have their
+	 * strip markers rewritten, the resulting array of strings will be returned.
+	 *
+	 * @param $otherState StripState
+	 * @param $texts Array
+	 * @return Array
+	 */
+	function merge( $otherState, $texts ) {
+		$mergePrefix = Parser::getRandomString();
+
+		foreach ( $otherState->data as $type => $items ) {
+			foreach ( $items as $key => $value ) {
+				$this->data[$type]["$mergePrefix-$key"] = $value;
+			}
+		}
+
+		$this->tempMergePrefix = $mergePrefix;
+		$texts = preg_replace_callback( $otherState->regex, array( $this, 'mergeCallback' ), $texts );
+		$this->tempMergePrefix = null;
+		return $texts;
+	}
+
+	protected function mergeCallback( $m ) {
+		$key = $m[1];
+		return "{$this->prefix}{$this->tempMergePrefix}-$key" . Parser::MARKER_SUFFIX;
+	}
+}
+