Add XPath parser/evaluator (#34463)

* Add XPath parser/evaluator Signed-off-by: Ville Lindholm <ville@lindholm.dev> * Correctly annotate XPathEvaluator IDL Signed-off-by: Ville Lindholm <ville@lindholm.dev> * [PR review]: have bindings pass in `can_gc` Signed-off-by: Ville Lindholm <ville@lindholm.dev> * [PR review]: add docstrings Signed-off-by: Ville Lindholm <ville@lindholm.dev> * [PR review]: implement PartialEq for Value for readability Signed-off-by: Ville Lindholm <ville@lindholm.dev> * [PR review]: add docstrings for CoreFunctions Signed-off-by: Ville Lindholm <ville@lindholm.dev> * [PR review]: simplify node test code Signed-off-by: Ville Lindholm <ville@lindholm.dev> * [PR review]: add unit tests for string handling xpath functions Signed-off-by: Ville Lindholm <ville@lindholm.dev> * put xpath features behind dom.xpath.enabled pref Signed-off-by: Ville Lindholm <ville@lindholm.dev> * [PR review] remove rstest and insta dev-deps Signed-off-by: Ville Lindholm <ville@lindholm.dev> * update wpt test expectations Signed-off-by: Ville Lindholm <ville@lindholm.dev> * [PR review]: tweak metadata files Signed-off-by: Ville Lindholm <ville@lindholm.dev> * update wpt test expectations AGAIN Signed-off-by: Ville Lindholm <ville@lindholm.dev> --------- Signed-off-by: Ville Lindholm <ville@lindholm.dev>
author: Ville Lindholm <ville@lindholm.dev> 2024-12-08 04:01:50 +0200
committer: GitHub <noreply@github.com> 2024-12-08 02:01:50 +0000
commit: bc7fe41a02b044b52a54c7347f347557a4bd0e8d (patch)
tree: 328133ab7eea5f406965c24a3fd2c6cc4858c55c /components/script/xpath/eval.rs
parent: 264c0f972fd1a732ee1d1490d78a1d26a65c6f5f (diff)
download: servo-bc7fe41a02b044b52a54c7347f347557a4bd0e8d.tar.gz
servo-bc7fe41a02b044b52a54c7347f347557a4bd0e8d.zip
1 files changed, 589 insertions, 0 deletions
diff --git a/components/script/xpath/eval.rs b/components/script/xpath/eval.rs
new file mode 100644
index 00000000000..d880e2f3930
--- /dev/null
+++ b/components/script/xpath/eval.rs
@@ -0,0 +1,589 @@
+/* This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at https://mozilla.org/MPL/2.0/. */
+
+use std::fmt;
+
+use html5ever::{local_name, namespace_prefix, namespace_url, ns, QualName};
+
+use super::parser::{
+    AdditiveOp, Axis, EqualityOp, Expr, FilterExpr, KindTest, Literal, MultiplicativeOp, NodeTest,
+    NumericLiteral, PathExpr, PredicateExpr, PredicateListExpr, PrimaryExpr,
+    QName as ParserQualName, RelationalOp, StepExpr, UnaryOp,
+};
+use super::{EvaluationCtx, Value};
+use crate::dom::bindings::codegen::Bindings::NodeBinding::NodeMethods;
+use crate::dom::bindings::inheritance::{Castable, CharacterDataTypeId, NodeTypeId};
+use crate::dom::bindings::root::DomRoot;
+use crate::dom::bindings::xmlname::validate_and_extract;
+use crate::dom::element::Element;
+use crate::dom::node::{Node, ShadowIncluding};
+use crate::dom::processinginstruction::ProcessingInstruction;
+
+#[derive(Clone, Debug, PartialEq)]
+pub enum Error {
+    NotANodeset,
+    InvalidPath,
+    UnknownFunction { name: QualName },
+    UnknownVariable { name: QualName },
+    UnknownNamespace { prefix: String },
+    InvalidQName { qname: ParserQualName },
+    FunctionEvaluation { fname: String },
+    Internal { msg: String },
+}
+
+impl std::fmt::Display for Error {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        match self {
+            Error::NotANodeset => write!(f, "expression did not evaluate to a nodeset"),
+            Error::InvalidPath => write!(f, "invalid path expression"),
+            Error::UnknownFunction { name } => write!(f, "unknown function {:?}", name),
+            Error::UnknownVariable { name } => write!(f, "unknown variable {:?}", name),
+            Error::UnknownNamespace { prefix } => {
+                write!(f, "unknown namespace prefix {:?}", prefix)
+            },
+            Error::InvalidQName { qname } => {
+                write!(f, "invalid QName {:?}", qname)
+            },
+            Error::FunctionEvaluation { fname } => {
+                write!(f, "error while evaluating function: {}", fname)
+            },
+            Error::Internal { msg } => {
+                write!(f, "internal error: {}", msg)
+            },
+        }
+    }
+}
+
+impl std::error::Error for Error {}
+
+pub fn try_extract_nodeset(v: Value) -> Result<Vec<DomRoot<Node>>, Error> {
+    match v {
+        Value::Nodeset(ns) => Ok(ns),
+        _ => Err(Error::NotANodeset),
+    }
+}
+
+pub trait Evaluatable: fmt::Debug {
+    fn evaluate(&self, context: &EvaluationCtx) -> Result<Value, Error>;
+    /// Returns true if this expression evaluates to a primitive value, without needing to touch the DOM
+    fn is_primitive(&self) -> bool;
+}
+
+impl<T: ?Sized> Evaluatable for Box<T>
+where
+    T: Evaluatable,
+{
+    fn evaluate(&self, context: &EvaluationCtx) -> Result<Value, Error> {
+        (**self).evaluate(context)
+    }
+
+    fn is_primitive(&self) -> bool {
+        (**self).is_primitive()
+    }
+}
+
+impl Evaluatable for Expr {
+    fn evaluate(&self, context: &EvaluationCtx) -> Result<Value, Error> {
+        match self {
+            Expr::And(left, right) => {
+                let left_bool = left.evaluate(context)?.boolean();
+                let v = left_bool && right.evaluate(context)?.boolean();
+                Ok(Value::Boolean(v))
+            },
+            Expr::Or(left, right) => {
+                let left_bool = left.evaluate(context)?.boolean();
+                let v = left_bool || right.evaluate(context)?.boolean();
+                Ok(Value::Boolean(v))
+            },
+            Expr::Equality(left, equality_op, right) => {
+                let left_val = left.evaluate(context)?;
+                let right_val = right.evaluate(context)?;
+
+                let v = match equality_op {
+                    EqualityOp::Eq => left_val == right_val,
+                    EqualityOp::NotEq => left_val != right_val,
+                };
+
+                Ok(Value::Boolean(v))
+            },
+            Expr::Relational(left, relational_op, right) => {
+                let left_val = left.evaluate(context)?.number();
+                let right_val = right.evaluate(context)?.number();
+
+                let v = match relational_op {
+                    RelationalOp::Lt => left_val < right_val,
+                    RelationalOp::Gt => left_val > right_val,
+                    RelationalOp::LtEq => left_val <= right_val,
+                    RelationalOp::GtEq => left_val >= right_val,
+                };
+                Ok(Value::Boolean(v))
+            },
+            Expr::Additive(left, additive_op, right) => {
+                let left_val = left.evaluate(context)?.number();
+                let right_val = right.evaluate(context)?.number();
+
+                let v = match additive_op {
+                    AdditiveOp::Add => left_val + right_val,
+                    AdditiveOp::Sub => left_val - right_val,
+                };
+                Ok(Value::Number(v))
+            },
+            Expr::Multiplicative(left, multiplicative_op, right) => {
+                let left_val = left.evaluate(context)?.number();
+                let right_val = right.evaluate(context)?.number();
+
+                let v = match multiplicative_op {
+                    MultiplicativeOp::Mul => left_val * right_val,
+                    MultiplicativeOp::Div => left_val / right_val,
+                    MultiplicativeOp::Mod => left_val % right_val,
+                };
+                Ok(Value::Number(v))
+            },
+            Expr::Unary(unary_op, expr) => {
+                let v = expr.evaluate(context)?.number();
+
+                match unary_op {
+                    UnaryOp::Minus => Ok(Value::Number(-v)),
+                }
+            },
+            Expr::Union(left, right) => {
+                let as_nodes = |e: &Expr| e.evaluate(context).and_then(try_extract_nodeset);
+
+                let mut left_nodes = as_nodes(left)?;
+                let right_nodes = as_nodes(right)?;
+
+                left_nodes.extend(right_nodes);
+                Ok(Value::Nodeset(left_nodes))
+            },
+            Expr::Path(path_expr) => path_expr.evaluate(context),
+        }
+    }
+
+    fn is_primitive(&self) -> bool {
+        match self {
+            Expr::Or(left, right) => left.is_primitive() && right.is_primitive(),
+            Expr::And(left, right) => left.is_primitive() && right.is_primitive(),
+            Expr::Equality(left, _, right) => left.is_primitive() && right.is_primitive(),
+            Expr::Relational(left, _, right) => left.is_primitive() && right.is_primitive(),
+            Expr::Additive(left, _, right) => left.is_primitive() && right.is_primitive(),
+            Expr::Multiplicative(left, _, right) => left.is_primitive() && right.is_primitive(),
+            Expr::Unary(_, expr) => expr.is_primitive(),
+            Expr::Union(_, _) => false,
+            Expr::Path(path_expr) => path_expr.is_primitive(),
+        }
+    }
+}
+
+impl Evaluatable for PathExpr {
+    fn evaluate(&self, context: &EvaluationCtx) -> Result<Value, Error> {
+        let mut current_nodes = vec![context.context_node.clone()];
+
+        // If path starts with '//', add an implicit descendant-or-self::node() step
+        if self.is_descendant {
+            current_nodes = current_nodes
+                .iter()
+                .flat_map(|n| n.traverse_preorder(ShadowIncluding::No))
+                .collect();
+        }
+
+        trace!("[PathExpr] Evaluating path expr: {:?}", self);
+
+        let have_multiple_steps = self.steps.len() > 1;
+
+        for step in &self.steps {
+            let mut next_nodes = Vec::new();
+            for node in current_nodes {
+                let step_context = context.subcontext_for_node(&node);
+                let step_result = step.evaluate(&step_context)?;
+                match (have_multiple_steps, step_result) {
+                    (_, Value::Nodeset(mut nodes)) => {
+                        // as long as we evaluate to nodesets, keep going
+                        next_nodes.append(&mut nodes);
+                    },
+                    (false, value) => {
+                        trace!("[PathExpr] Got single primitive value: {:?}", value);
+                        return Ok(value);
+                    },
+                    (true, value) => {
+                        error!(
+                        "Expected nodeset from step evaluation, got: {:?} node: {:?}, step: {:?}",
+                        value, node, step
+                    );
+                        return Ok(value);
+                    },
+                }
+            }
+            current_nodes = next_nodes;
+        }
+
+        trace!("[PathExpr] Got nodes: {:?}", current_nodes);
+
+        Ok(Value::Nodeset(current_nodes))
+    }
+
+    fn is_primitive(&self) -> bool {
+        !self.is_absolute &&
+            !self.is_descendant &&
+            self.steps.len() == 1 &&
+            self.steps[0].is_primitive()
+    }
+}
+
+impl TryFrom<&ParserQualName> for QualName {
+    type Error = Error;
+
+    fn try_from(qname: &ParserQualName) -> Result<Self, Self::Error> {
+        let qname_as_str = qname.to_string();
+        if let Ok((ns, prefix, local)) = validate_and_extract(None, &qname_as_str) {
+            Ok(QualName { prefix, ns, local })
+        } else {
+            Err(Error::InvalidQName {
+                qname: qname.clone(),
+            })
+        }
+    }
+}
+
+pub enum NameTestComparisonMode {
+    /// Namespaces must match exactly
+    XHtml,
+    /// Missing namespace information is treated as the HTML namespace
+    Html,
+}
+
+pub fn element_name_test(
+    expected_name: QualName,
+    element_qualname: QualName,
+    comparison_mode: NameTestComparisonMode,
+) -> bool {
+    let is_wildcard = expected_name.local == local_name!("*");
+
+    let test_prefix = expected_name
+        .prefix
+        .clone()
+        .unwrap_or(namespace_prefix!(""));
+    let test_ns_uri = match test_prefix {
+        namespace_prefix!("*") => ns!(*),
+        namespace_prefix!("html") => ns!(html),
+        namespace_prefix!("xml") => ns!(xml),
+        namespace_prefix!("xlink") => ns!(xlink),
+        namespace_prefix!("svg") => ns!(svg),
+        namespace_prefix!("mathml") => ns!(mathml),
+        namespace_prefix!("") => {
+            if matches!(comparison_mode, NameTestComparisonMode::XHtml) {
+                ns!()
+            } else {
+                ns!(html)
+            }
+        },
+        _ => {
+            // We don't support custom namespaces, use fallback or panic depending on strictness
+            if matches!(comparison_mode, NameTestComparisonMode::XHtml) {
+                panic!("Unrecognized namespace prefix: {}", test_prefix)
+            } else {
+                ns!(html)
+            }
+        },
+    };
+
+    if is_wildcard {
+        test_ns_uri == element_qualname.ns
+    } else {
+        test_ns_uri == element_qualname.ns && expected_name.local == element_qualname.local
+    }
+}
+
+fn apply_node_test(test: &NodeTest, node: &Node) -> Result<bool, Error> {
+    let result = match test {
+        NodeTest::Name(qname) => {
+            // Convert the unvalidated "parser QualName" into the proper QualName structure
+            let wanted_name: QualName = qname.try_into()?;
+            if matches!(node.type_id(), NodeTypeId::Element(_)) {
+                let element = node.downcast::<Element>().unwrap();
+                let comparison_mode = if node.owner_doc().is_xhtml_document() {
+                    NameTestComparisonMode::XHtml
+                } else {
+                    NameTestComparisonMode::Html
+                };
+                let element_qualname = QualName::new(
+                    element.prefix().as_ref().cloned(),
+                    element.namespace().clone(),
+                    element.local_name().clone(),
+                );
+                element_name_test(wanted_name, element_qualname, comparison_mode)
+            } else {
+                false
+            }
+        },
+        NodeTest::Wildcard => true,
+        NodeTest::Kind(kind) => match kind {
+            KindTest::PI(target) => {
+                if NodeTypeId::CharacterData(CharacterDataTypeId::ProcessingInstruction) ==
+                    node.type_id()
+                {
+                    let pi = node.downcast::<ProcessingInstruction>().unwrap();
+                    match (target, pi.target()) {
+                        (Some(target_name), node_target_name)
+                            if target_name == &node_target_name.to_string() =>
+                        {
+                            true
+                        },
+                        (Some(_), _) => false,
+                        (None, _) => true,
+                    }
+                } else {
+                    false
+                }
+            },
+            KindTest::Comment => matches!(
+                node.type_id(),
+                NodeTypeId::CharacterData(CharacterDataTypeId::Comment)
+            ),
+            KindTest::Text => matches!(
+                node.type_id(),
+                NodeTypeId::CharacterData(CharacterDataTypeId::Text(_))
+            ),
+            KindTest::Node => true,
+        },
+    };
+    Ok(result)
+}
+
+impl Evaluatable for StepExpr {
+    fn evaluate(&self, context: &EvaluationCtx) -> Result<Value, Error> {
+        match self {
+            StepExpr::Filter(filter_expr) => filter_expr.evaluate(context),
+            StepExpr::Axis(axis_step) => {
+                let nodes: Vec<DomRoot<Node>> = match axis_step.axis {
+                    Axis::Child => context.context_node.children().collect(),
+                    Axis::Descendant => context
+                        .context_node
+                        .traverse_preorder(ShadowIncluding::No)
+                        .skip(1)
+                        .collect(),
+                    Axis::Parent => vec![context.context_node.GetParentNode()]
+                        .into_iter()
+                        .flatten()
+                        .collect(),
+                    Axis::Ancestor => context.context_node.ancestors().collect(),
+                    Axis::Following => context
+                        .context_node
+                        .following_nodes(&context.context_node)
+                        .skip(1)
+                        .collect(),
+                    Axis::Preceding => context
+                        .context_node
+                        .preceding_nodes(&context.context_node)
+                        .skip(1)
+                        .collect(),
+                    Axis::FollowingSibling => context.context_node.following_siblings().collect(),
+                    Axis::PrecedingSibling => context.context_node.preceding_siblings().collect(),
+                    Axis::Attribute => {
+                        if matches!(Node::type_id(&context.context_node), NodeTypeId::Element(_)) {
+                            let element = context.context_node.downcast::<Element>().unwrap();
+                            element
+                                .attrs()
+                                .iter()
+                                .map(|attr| attr.upcast::<Node>())
+                                .map(DomRoot::from_ref)
+                                .collect()
+                        } else {
+                            vec![]
+                        }
+                    },
+                    Axis::Self_ => vec![context.context_node.clone()],
+                    Axis::DescendantOrSelf => context
+                        .context_node
+                        .traverse_preorder(ShadowIncluding::No)
+                        .collect(),
+                    Axis::AncestorOrSelf => context
+                        .context_node
+                        .inclusive_ancestors(ShadowIncluding::No)
+                        .collect(),
+                    Axis::Namespace => Vec::new(), // Namespace axis is not commonly implemented
+                };
+
+                trace!("[StepExpr] Axis {:?} got nodes {:?}", axis_step.axis, nodes);
+
+                // Filter nodes according to the step's node_test. Will error out if any NodeTest
+                // application errors out.
+                let filtered_nodes: Vec<DomRoot<Node>> = nodes
+                    .into_iter()
+                    .map(|node| {
+                        apply_node_test(&axis_step.node_test, &node)
+                            .map(|matches| matches.then_some(node))
+                    })
+                    .collect::<Result<Vec<_>, _>>()?
+                    .into_iter()
+                    .flatten()
+                    .collect();
+
+                trace!("[StepExpr] Filtering got nodes {:?}", filtered_nodes);
+
+                if axis_step.predicates.predicates.is_empty() {
+                    trace!(
+                        "[StepExpr] No predicates, returning nodes {:?}",
+                        filtered_nodes
+                    );
+                    Ok(Value::Nodeset(filtered_nodes))
+                } else {
+                    // Apply predicates
+                    let predicate_list_subcontext = context
+                        .update_predicate_nodes(filtered_nodes.iter().map(|n| &**n).collect());
+                    axis_step.predicates.evaluate(&predicate_list_subcontext)
+                }
+            },
+        }
+    }
+
+    fn is_primitive(&self) -> bool {
+        match self {
+            StepExpr::Filter(filter_expr) => filter_expr.is_primitive(),
+            StepExpr::Axis(_) => false,
+        }
+    }
+}
+
+impl Evaluatable for PredicateListExpr {
+    fn evaluate(&self, context: &EvaluationCtx) -> Result<Value, Error> {
+        if let Some(ref predicate_nodes) = context.predicate_nodes {
+            // Initializing: every node the predicates act on is matched
+            let mut matched_nodes: Vec<DomRoot<Node>> = predicate_nodes.clone();
+
+            // apply each predicate to the nodes matched by the previous predicate
+            for predicate_expr in &self.predicates {
+                let context_for_predicate =
+                    context.update_predicate_nodes(matched_nodes.iter().map(|n| &**n).collect());
+
+                let narrowed_nodes = predicate_expr
+                    .evaluate(&context_for_predicate)
+                    .and_then(try_extract_nodeset)?;
+                matched_nodes = narrowed_nodes;
+                trace!(
+                    "[PredicateListExpr] Predicate {:?} matched nodes {:?}",
+                    predicate_expr,
+                    matched_nodes
+                );
+            }
+            Ok(Value::Nodeset(matched_nodes))
+        } else {
+            Err(Error::Internal {
+                msg: "[PredicateListExpr] No nodes on stack for predicate to operate on"
+                    .to_string(),
+            })
+        }
+    }
+
+    fn is_primitive(&self) -> bool {
+        self.predicates.len() == 1 && self.predicates[0].is_primitive()
+    }
+}
+
+impl Evaluatable for PredicateExpr {
+    fn evaluate(&self, context: &EvaluationCtx) -> Result<Value, Error> {
+        let narrowed_nodes: Result<Vec<DomRoot<Node>>, Error> = context
+            .subcontext_iter_for_nodes()
+            .filter_map(|ctx| {
+                if let Some(predicate_ctx) = ctx.predicate_ctx {
+                    let eval_result = self.expr.evaluate(&ctx);
+
+                    let v = match eval_result {
+                        Ok(Value::Number(v)) => Ok(predicate_ctx.index == v as usize),
+                        Ok(v) => Ok(v.boolean()),
+                        Err(e) => Err(e),
+                    };
+
+                    match v {
+                        Ok(true) => Some(Ok(ctx.context_node)),
+                        Ok(false) => None,
+                        Err(e) => Some(Err(e)),
+                    }
+                } else {
+                    Some(Err(Error::Internal {
+                        msg: "[PredicateExpr] No predicate context set".to_string(),
+                    }))
+                }
+            })
+            .collect();
+
+        Ok(Value::Nodeset(narrowed_nodes?))
+    }
+
+    fn is_primitive(&self) -> bool {
+        self.expr.is_primitive()
+    }
+}
+
+impl Evaluatable for FilterExpr {
+    fn evaluate(&self, context: &EvaluationCtx) -> Result<Value, Error> {
+        let primary_result = self.primary.evaluate(context)?;
+        let have_predicates = !self.predicates.predicates.is_empty();
+
+        match (have_predicates, &primary_result) {
+            (false, _) => {
+                trace!(
+                    "[FilterExpr] No predicates, returning primary result: {:?}",
+                    primary_result
+                );
+                Ok(primary_result)
+            },
+            (true, Value::Nodeset(vec)) => {
+                let predicate_list_subcontext =
+                    context.update_predicate_nodes(vec.iter().map(|n| &**n).collect());
+                let result_filtered_by_predicates =
+                    self.predicates.evaluate(&predicate_list_subcontext);
+                trace!(
+                    "[FilterExpr] Result filtered by predicates: {:?}",
+                    result_filtered_by_predicates
+                );
+                result_filtered_by_predicates
+            },
+            // You can't use filtering expressions `[]` on other than node-sets
+            (true, _) => Err(Error::NotANodeset),
+        }
+    }
+
+    fn is_primitive(&self) -> bool {
+        self.predicates.predicates.is_empty() && self.primary.is_primitive()
+    }
+}
+
+impl Evaluatable for PrimaryExpr {
+    fn evaluate(&self, context: &EvaluationCtx) -> Result<Value, Error> {
+        match self {
+            PrimaryExpr::Literal(literal) => literal.evaluate(context),
+            PrimaryExpr::Variable(_qname) => todo!(),
+            PrimaryExpr::Parenthesized(expr) => expr.evaluate(context),
+            PrimaryExpr::ContextItem => Ok(Value::Nodeset(vec![context.context_node.clone()])),
+            PrimaryExpr::Function(core_function) => core_function.evaluate(context),
+        }
+    }
+
+    fn is_primitive(&self) -> bool {
+        match self {
+            PrimaryExpr::Literal(_) => true,
+            PrimaryExpr::Variable(_qname) => false,
+            PrimaryExpr::Parenthesized(expr) => expr.is_primitive(),
+            PrimaryExpr::ContextItem => false,
+            PrimaryExpr::Function(_) => false,
+        }
+    }
+}
+
+impl Evaluatable for Literal {
+    fn evaluate(&self, _context: &EvaluationCtx) -> Result<Value, Error> {
+        match self {
+            Literal::Numeric(numeric_literal) => match numeric_literal {
+                // We currently make no difference between ints and floats
+                NumericLiteral::Integer(v) => Ok(Value::Number(*v as f64)),
+                NumericLiteral::Decimal(v) => Ok(Value::Number(*v)),
+            },
+            Literal::String(s) => Ok(Value::String(s.into())),
+        }
+    }
+
+    fn is_primitive(&self) -> bool {
+        true
+    }
+}
author	Ville Lindholm <ville@lindholm.dev>	2024-12-08 04:01:50 +0200
committer	GitHub <noreply@github.com>	2024-12-08 02:01:50 +0000
commit	bc7fe41a02b044b52a54c7347f347557a4bd0e8d (patch)
tree	328133ab7eea5f406965c24a3fd2c6cc4858c55c /components/script/xpath/eval.rs
parent	264c0f972fd1a732ee1d1490d78a1d26a65c6f5f (diff)
download	servo-bc7fe41a02b044b52a54c7347f347557a4bd0e8d.tar.gz servo-bc7fe41a02b044b52a54c7347f347557a4bd0e8d.zip