aboutsummaryrefslogtreecommitdiffstats
path: root/includes/parser
diff options
context:
space:
mode:
authorSubramanya Sastry <ssastry@wikimedia.org>2025-01-03 14:42:02 -0600
committerSubramanya Sastry <ssastry@wikimedia.org>2025-01-17 08:38:15 -0600
commitb727f290f5ea827b9b2d9820fb4007bbf29101c4 (patch)
tree4ae1b30637c52c8a8b2aba5ce3e7b4ca63e89c2c /includes/parser
parent56590fc471aef3920d93d556d371cef4bca33a7f (diff)
downloadmediawikicore-b727f290f5ea827b9b2d9820fb4007bbf29101c4.tar.gz
mediawikicore-b727f290f5ea827b9b2d9820fb4007bbf29101c4.zip
For Parsoid calls, treat preprocessing as starting in SOL state
* If Parsoid calls the preprocessor, initialize lineStart to true. Track this through: - parser function calls that return expandable template messages. The int: parser function is an example in core; extensions seem to define a number of other such parser functions. - template-arg substitutions. So {{templatename|mytemplate}} with text {{{{{1}}}}}, which is effectively a call to {{mytemplate}}, continues to set sol-state to true across the expansion. See test "Preprocessor precedence 5: tplarg takes precedence over template" in preprocessor.txt which exercises this use case. - However, note that this is a best-effort attempt because this flag is set while building the preprocessor DOM tree before templates are expanded. So, this is mostly a source syntax flag, and constructs that expand to empty strings can blind the preprocessor to the true value of SOL state in the expanded string. This is true for both the legacy parser and Parsoid, and as such T2529 behavior is a hack with a set of associated edge cases. * Parsoid models templates as independent documents that always start in start-of-line state (and does some patch up for b/c reasons where this assumption fails). So, there is no reason to add newlines for some set of wikitext characters (per T2529) when Parsoid is involved. * This lets us eliminate some hacks in Parsoid that strip these added newlines when Parsoid was already in SOL state, but which in turn introduce edge cases. See discussion in T382464 where Parsoid currently cannot distinguish between a couple of test cases. * But, with this change, where Parsoid no longer gets a newline added, Parsoid doesn't have to heuristically remove the newline (and incorrectly, as in the edge case in the bug report), which eliminates the edge case from the bug report. * This change has to be backed by a change in Parsoid to undo the T2529 newline removal hack in TokenStreamPatcher to ensure Parsoid CI doesn't break with this change.
* To let us safely test this in Parsoid's round-trip testing and safely (and conservatively) roll this out to production, this change is backed by a new config flag (ParsoidNewTemplateExpansionMode) which defaults to false. We unconditionally set this to true in the ParserTestRunner for all parser tests. This flag will be removed once we roll out this change and the Parsoid change to production. Bug: T382464 Change-Id: I194a9550500bf7ece215791c51d6feb78a80b1a8
Diffstat (limited to 'includes/parser')
-rw-r--r--includes/parser/Parser.php54
-rw-r--r--includes/parser/Parsoid/Config/DataAccess.php7
-rw-r--r--includes/parser/Preprocessor.php5
-rw-r--r--includes/parser/Preprocessor_Hash.php3
4 files changed, 55 insertions, 14 deletions
diff --git a/includes/parser/Parser.php b/includes/parser/Parser.php
index d7562dd5e412..5558f011dfe0 100644
--- a/includes/parser/Parser.php
+++ b/includes/parser/Parser.php
@@ -2934,10 +2934,16 @@ class Parser {
* double-brace expansion.
* @param bool $stripExtTags When true, put extension tags in general strip state; when
* false extension tags are skipped during OT_PREPROCESS
+ * @param bool $parsoidTopLevelCall Is this coming from Parsoid for top-level templates?
+ * This is used to set start-of-line flag to true for template expansions since that
+ * is how Parsoid models templates.
+ *
* @return string
* @since 1.24 method is public
*/
- public function replaceVariables( $text, $frame = false, $argsOnly = false, $stripExtTags = true ) {
+ public function replaceVariables(
+ $text, $frame = false, $argsOnly = false, $stripExtTags = true, bool $parsoidTopLevelCall = false
+ ) {
# Is there any text? Also, Prevent too big inclusions!
$textSize = strlen( $text );
if ( $textSize < 1 || $textSize > $this->mOptions->getMaxIncludeSize() ) {
@@ -2955,7 +2961,7 @@ class Parser {
$frame = $this->getPreprocessor()->newCustomFrame( $frame );
}
- $dom = $this->preprocessToDom( $text );
+ $dom = $this->preprocessToDom( $text, $parsoidTopLevelCall ? Preprocessor::START_IN_SOL_STATE : 0 );
$flags = $argsOnly ? PPFrame::NO_TEMPLATES : 0;
[ $stripExtTags, $this->mStripExtTags ] = [ $this->mStripExtTags, $stripExtTags ];
$text = $frame->expand( $dom, $flags );
@@ -3052,6 +3058,8 @@ class Parser {
$sawDeprecatedTemplateEquals = false; // T91154
+ $isParsoid = $this->mOptions->getUseParsoid();
+
# SUBST
// @phan-suppress-next-line PhanImpossibleCondition
if ( !$found ) {
@@ -3125,7 +3133,9 @@ class Parser {
$funcArgs[] = $args->item( $i );
}
- $result = $this->callParserFunction( $frame, $func, $funcArgs );
+ $result = $this->callParserFunction(
+ $frame, $func, $funcArgs, $isParsoid && $piece['lineStart']
+ );
// Extract any forwarded flags
if ( isset( $result['title'] ) ) {
@@ -3243,7 +3253,7 @@ class Parser {
": template inclusion denied for " . $title->getPrefixedDBkey()
);
} else {
- [ $text, $title ] = $this->getTemplateDom( $title );
+ [ $text, $title ] = $this->getTemplateDom( $title, $isParsoid && $piece['lineStart'] );
if ( $text !== false ) {
$found = true;
$isChildObj = true;
@@ -3274,7 +3284,8 @@ class Parser {
} else {
$text = $this->interwikiTransclude( $title, 'raw' );
# Preprocess it like a template
- $text = $this->preprocessToDom( $text, Preprocessor::DOM_FOR_INCLUSION );
+ $sol = ( $isParsoid && $piece['lineStart'] ) ? Preprocessor::START_IN_SOL_STATE : 0;
+ $text = $this->preprocessToDom( $text, Preprocessor::DOM_FOR_INCLUSION | $sol );
$isChildObj = true;
}
$found = true;
@@ -3351,9 +3362,12 @@ class Parser {
&& !$piece['lineStart']
&& preg_match( '/^(?:{\\||:|;|#|\*)/', $text )
) {
- # T2529: if the template begins with a table or block-level
- # element, it should be treated as beginning a new line.
- # This behavior is somewhat controversial.
+ // T2529: if the template begins with a table or block-level
+ // element, it should be treated as beginning a new line.
+ // This behavior is somewhat controversial.
+ //
+ // T382464: Parsoid sets $piece['lineStart'] at top-level when
+ // expanding templates, so this hack is restricted to nested expansions.
$text = "\n" . $text;
}
@@ -3398,9 +3412,15 @@ class Parser {
* @param PPFrame $frame The current frame, contains template arguments
* @param string $function Function name
* @param array $args Arguments to the function
+ * @param bool $inSolState Is the template processing starting in Start-Of-Line (SOL) position?
+ * Preprocessing (on behalf of Parsoid) uses this flag to set lineStart property on
+ * preprocessor DOM tree nodes. Since the preprocessor tree doesn't rely on expanded templates,
+ * this flag is a best guess since {{expands-to-empty-string}} can blind it to SOL context.
+ * This flag is always false for legacy parser template expansions.
+ *
* @return array
*/
- public function callParserFunction( PPFrame $frame, $function, array $args = [] ) {
+ public function callParserFunction( PPFrame $frame, $function, array $args = [], bool $inSolState = false ) {
# Case sensitive functions
if ( isset( $this->mFunctionSynonyms[1][$function] ) ) {
$function = $this->mFunctionSynonyms[1][$function];
@@ -3474,6 +3494,7 @@ class Parser {
}
if ( !$noparse ) {
+ $preprocessFlags |= ( $inSolState ? Preprocessor::START_IN_SOL_STATE : 0 );
$result['text'] = $this->preprocessToDom( $result['text'], $preprocessFlags );
$result['isChildObj'] = true;
}
@@ -3486,11 +3507,16 @@ class Parser {
* and its redirect destination title. Cached.
*
* @param LinkTarget $title
+ * @param bool $inSolState Is the template processing starting in Start-Of-Line (SOL) position?
+ * Preprocessing (on behalf of Parsoid) uses this flag to set lineStart property on
+ * preprocessor DOM tree nodes. Since the preprocessor tree doesn't rely on expanded templates,
+ * this flag is a best guess since {{expands-to-empty-string}} can blind it to SOL context.
+ * This flag is always false for legacy parser template expansions.
*
* @return array
* @since 1.12
*/
- public function getTemplateDom( LinkTarget $title ) {
+ public function getTemplateDom( LinkTarget $title, bool $inSolState = false ) {
$cacheTitle = $title;
$titleKey = CacheKeyHelper::getKeyForPage( $title );
@@ -3499,11 +3525,16 @@ class Parser {
$title = Title::makeTitle( $ns, $dbk );
$titleKey = CacheKeyHelper::getKeyForPage( $title );
}
+
+ // Factor in sol-state in the cache key
+ $titleKey = "$titleKey:sol=" . ( $inSolState ? "0" : "1" );
if ( isset( $this->mTplDomCache[$titleKey] ) ) {
return [ $this->mTplDomCache[$titleKey], $title ];
}
# Cache miss, go to the database
+ // FIXME T383919: if $title is changed by this call, caching below
+ // will be ineffective.
[ $text, $title ] = $this->fetchTemplateAndTitle( $title );
if ( $text === false ) {
@@ -3511,7 +3542,8 @@ class Parser {
return [ false, $title ];
}
- $dom = $this->preprocessToDom( $text, Preprocessor::DOM_FOR_INCLUSION );
+ $flags = Preprocessor::DOM_FOR_INCLUSION | ( $inSolState ? Preprocessor::START_IN_SOL_STATE : 0 );
+ $dom = $this->preprocessToDom( $text, $flags );
$this->mTplDomCache[$titleKey] = $dom;
if ( !$title->isSamePageAs( $cacheTitle ) ) {
diff --git a/includes/parser/Parsoid/Config/DataAccess.php b/includes/parser/Parsoid/Config/DataAccess.php
index ef61ed855f74..96ad6c23170c 100644
--- a/includes/parser/Parsoid/Config/DataAccess.php
+++ b/includes/parser/Parsoid/Config/DataAccess.php
@@ -58,6 +58,7 @@ use Wikimedia\Rdbms\ReadOnlyMode;
class DataAccess extends IDataAccess {
public const CONSTRUCTOR_OPTIONS = [
MainConfigNames::ParsoidFragmentSupport,
+ MainConfigNames::ParsoidNewTemplateExpansionMode,
MainConfigNames::SVGMaxSize,
];
@@ -388,14 +389,16 @@ class DataAccess extends IDataAccess {
# $wikitext is passed by reference and mutated
$parser, $wikitext, $parser->getStripState()
);
+ $parsoidNewTemplateExpansionMode = $this->config->get( MainConfigNames::ParsoidNewTemplateExpansionMode );
if ( $this->config->get( MainConfigNames::ParsoidFragmentSupport ) === false ) {
// Original support: just unstrip (T289545)
- $wikitext = $parser->replaceVariables( $wikitext, $this->ppFrame );
+ $wikitext = $parser->replaceVariables(
+ $wikitext, $this->ppFrame, false, true, $parsoidNewTemplateExpansionMode );
$wikitext = $parser->getStripState()->unstripBoth( $wikitext );
} else {
// New PFragment-based support (T374616)
$wikitext = $parser->replaceVariables(
- $wikitext, $this->ppFrame, false, false
+ $wikitext, $this->ppFrame, false, false, $parsoidNewTemplateExpansionMode
);
// Where the result has strip state markers, tunnel this content
// through Parsoid as a PFragment type.
diff --git a/includes/parser/Preprocessor.php b/includes/parser/Preprocessor.php
index 47d75eca67f1..07fa5205f746 100644
--- a/includes/parser/Preprocessor.php
+++ b/includes/parser/Preprocessor.php
@@ -35,6 +35,11 @@ abstract class Preprocessor {
public const DOM_LANG_CONVERSION_DISABLED = 2;
/** Preprocessor cache bypass flag for Preprocessor::preprocessToObj */
public const DOM_UNCACHED = 4;
+ // Does preprocessing start in Start-Of-Line(SOL) state? Only relevant for Parsoid
+ // content, since Parsoid models templates as independent documents in SOL start.
+ // This flag is never set by the legacy parser (but see T2529 which has a similar
+ // effect).
+ public const START_IN_SOL_STATE = 8;
/** @var Parser */
public $parser;
diff --git a/includes/parser/Preprocessor_Hash.php b/includes/parser/Preprocessor_Hash.php
index efad96514454..e37b8a78c5d5 100644
--- a/includes/parser/Preprocessor_Hash.php
+++ b/includes/parser/Preprocessor_Hash.php
@@ -149,6 +149,7 @@ class Preprocessor_Hash extends Preprocessor {
* @return array JSON-serializable document object model array
*/
private function buildDomTreeArrayFromText( $text, $flags ) {
+ $textStartsInSOLState = $flags & self::START_IN_SOL_STATE;
$forInclusion = ( $flags & self::DOM_FOR_INCLUSION );
$langConversionDisabled = ( $flags & self::DOM_LANG_CONVERSION_DISABLED );
@@ -606,7 +607,7 @@ class Preprocessor_Hash extends Preprocessor {
: strspn( $text, $curChar, $i );
$savedPrefix = '';
- $lineStart = $i > 0 && $text[$i - 1] === "\n";
+ $lineStart = ( $i === 0 ) ? $textStartsInSOLState : ( $text[$i - 1] === "\n" );
if ( $curChar === "-{" && $count > $curLen ) {
// -{ => {{ transition because rightmost wins