author     Emilio Cobos Álvarez <emilio@crisal.io>    2023-05-24 06:36:08 +0000
committer  Martin Robinson <mrobinson@igalia.com>     2023-11-24 08:57:14 +0100
commit     d49b014c78e77de890e199541759cad0f12a1aa2 (patch)
tree       6b2d3da40e1434ce86b3e906a1dbb3c87c5fc7a1
parent     ea3fcce25f933e139f786da516ef806d1ed0a109 (diff)
style: Make style parallel traversal more tunable at runtime
This adds two prefs to configure the parallel traversal work unit size
and kick-off threshold, but otherwise shouldn't change behavior.
While at it, I switched from iterator generics to a plain slice, mostly
for simplicity, but there is a trade-off:
* When switching from the sequential to the parallel traversal, we may
  pay the cost of memmoving the VecDeque contents around once to make
  them a contiguous slice.
* However, we win in the common case where the batch is smaller than the
  work unit size, since we no longer need to copy the nodes into a
  WorkUnit. So overall this should be an improvement.
Differential Revision: https://phabricator.services.mozilla.com/D178656
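
To make the trade-off concrete, here is a minimal standalone sketch (not Servo
code; the element type and the process function are made up) contrasting the
old drain-and-collect shape with handing out a borrowed slice via
VecDeque::make_contiguous:

    use std::collections::VecDeque;

    // Hypothetical stand-in for parallel::traverse_nodes, which now takes a slice.
    fn process(nodes: &[u32]) {
        println!("processing {} nodes", nodes.len());
    }

    fn main() {
        // Old shape: drain the queue and copy every element into an owned
        // buffer, even when the batch is small.
        let mut discovered: VecDeque<u32> = (0..32).collect();
        let work: Vec<u32> = discovered.drain(..).collect();
        process(&work);

        // New shape: rearrange the ring buffer in place once (a potential
        // memmove) and borrow it as a contiguous slice, with no per-element copy.
        let mut discovered: VecDeque<u32> = (0..32).collect();
        process(discovered.make_contiguous());
    }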
-rw-r--r--  components/style/driver.rs   | 105
-rw-r--r--  components/style/parallel.rs |  72
2 files changed, 89 insertions(+), 88 deletions(-)
diff --git a/components/style/driver.rs b/components/style/driver.rs
index b7e33454f41..c02924b73b3 100644
--- a/components/style/driver.rs
+++ b/components/style/driver.rs
@@ -11,7 +11,7 @@ use crate::context::{PerThreadTraversalStatistics, StyleContext};
 use crate::context::{ThreadLocalStyleContext, TraversalStatistics};
 use crate::dom::{SendNode, TElement, TNode};
 use crate::parallel;
-use crate::parallel::{DispatchMode, WORK_UNIT_MAX};
+use crate::parallel::{work_unit_max, DispatchMode};
 use crate::scoped_tls::ScopedTLS;
 use crate::traversal::{DomTraversal, PerLevelTraversalData, PreTraverseToken};
 use rayon;
@@ -48,17 +48,19 @@ fn report_statistics(stats: &PerThreadTraversalStatistics) {
     gecko_stats.mStylesReused += stats.styles_reused;
 }
 
-/// Do a DOM traversal for top-down and (optionally) bottom-up processing,
-/// generic over `D`.
+fn parallelism_threshold() -> usize {
+    static_prefs::pref!("layout.css.stylo-parallelism-threshold") as usize
+}
+
+/// Do a DOM traversal for top-down and (optionally) bottom-up processing, generic over `D`.
 ///
-/// We use an adaptive traversal strategy. We start out with simple sequential
-/// processing, until we arrive at a wide enough level in the DOM that the
-/// parallel traversal would parallelize it. If a thread pool is provided, we
-/// then transfer control over to the parallel traversal.
+/// We use an adaptive traversal strategy. We start out with simple sequential processing, until we
+/// arrive at a wide enough level in the DOM that the parallel traversal would parallelize it.
+/// If a thread pool is provided, we then transfer control over to the parallel traversal.
 ///
-/// Returns true if the traversal was parallel, and also returns the statistics
-/// object containing information on nodes traversed (on nightly only). Not
-/// all of its fields will be initialized since we don't call finish().
+/// Returns true if the traversal was parallel, and also returns the statistics object containing
+/// information on nodes traversed (on nightly only). Not all of its fields will be initialized
+/// since we don't call finish().
 pub fn traverse_dom<E, D>(
     traversal: &D,
     token: PreTraverseToken<E>,
@@ -100,7 +102,9 @@ where
     // Process the nodes breadth-first, just like the parallel traversal does.
     // This helps keep similar traversal characteristics for the style sharing
     // cache.
-    let mut discovered = VecDeque::<SendNode<E::ConcreteNode>>::with_capacity(WORK_UNIT_MAX * 2);
+    let work_unit_max = work_unit_max();
+    let parallelism_threshold = parallelism_threshold();
+    let mut discovered = VecDeque::<SendNode<E::ConcreteNode>>::with_capacity(work_unit_max * 2);
     let mut depth = root.depth();
     let mut nodes_remaining_at_current_depth = 1;
     discovered.push_back(unsafe { SendNode::new(root.as_node()) });
@@ -122,45 +126,48 @@ where
         );
         nodes_remaining_at_current_depth -= 1;
 
-        if nodes_remaining_at_current_depth == 0 {
-            depth += 1;
-            // If there is enough work to parallelize over, and the caller allows
-            // parallelism, switch to the parallel driver. We do this only when
-            // moving to the next level in the dom so that we can pass the same
-            // depth for all the children.
-            if pool.is_some() && discovered.len() > WORK_UNIT_MAX {
-                let pool = pool.unwrap();
-                let tls = ScopedTLS::<ThreadLocalStyleContext<E>>::new(pool);
-                let root_opaque = root.as_node().opaque();
-                let drain = discovered.drain(..);
-                pool.scope_fifo(|scope| {
-                    // Enable a breadth-first rayon traversal. This causes the work
-                    // queue to be always FIFO, rather than FIFO for stealers and
-                    // FILO for the owner (which is what rayon does by default). This
-                    // ensures that we process all the elements at a given depth before
-                    // proceeding to the next depth, which is important for style sharing.
-                    #[cfg(feature = "gecko")]
-                    gecko_profiler_label!(Layout, StyleComputation);
-                    parallel::traverse_nodes(
-                        drain,
-                        DispatchMode::TailCall,
-                        /* recursion_ok = */ true,
-                        root_opaque,
-                        PerLevelTraversalData {
-                            current_dom_depth: depth,
-                        },
-                        scope,
-                        pool,
-                        traversal,
-                        &tls,
-                    );
-                });
-
-                tls_slots = Some(tls.into_slots());
-                break;
-            }
-            nodes_remaining_at_current_depth = discovered.len();
+
+        // If there is enough work to parallelize over, and the caller allows parallelism, switch
+        // to the parallel driver. We do this only when moving to the next level in the dom so that
+        // we can pass the same depth for all the children.
+        if nodes_remaining_at_current_depth != 0 {
+            continue;
+        }
+        depth += 1;
+        if pool.is_some() &&
+            discovered.len() > parallelism_threshold &&
+            parallelism_threshold > 0
+        {
+            let pool = pool.unwrap();
+            let tls = ScopedTLS::<ThreadLocalStyleContext<E>>::new(pool);
+            let root_opaque = root.as_node().opaque();
+            pool.scope_fifo(|scope| {
+                // Enable a breadth-first rayon traversal. This causes the work
+                // queue to be always FIFO, rather than FIFO for stealers and
+                // FILO for the owner (which is what rayon does by default). This
+                // ensures that we process all the elements at a given depth before
+                // proceeding to the next depth, which is important for style sharing.
+                #[cfg(feature = "gecko")]
+                gecko_profiler_label!(Layout, StyleComputation);
+                parallel::traverse_nodes(
+                    discovered.make_contiguous(),
+                    DispatchMode::TailCall,
+                    /* recursion_ok = */ true,
+                    root_opaque,
+                    PerLevelTraversalData {
+                        current_dom_depth: depth,
+                    },
+                    scope,
+                    pool,
+                    traversal,
+                    &tls,
+                );
+            });
+
+            tls_slots = Some(tls.into_slots());
+            break;
         }
+        nodes_remaining_at_current_depth = discovered.len();
     }
 
     // Collect statistics from thread-locals if requested.
diff --git a/components/style/parallel.rs b/components/style/parallel.rs
index e578c6bbe51..815393d050d 100644
--- a/components/style/parallel.rs
+++ b/components/style/parallel.rs
@@ -26,8 +26,6 @@ use crate::context::{StyleContext, ThreadLocalStyleContext};
 use crate::dom::{OpaqueNode, SendNode, TElement};
 use crate::scoped_tls::ScopedTLS;
 use crate::traversal::{DomTraversal, PerLevelTraversalData};
-use arrayvec::ArrayVec;
-use itertools::Itertools;
 use rayon;
 use smallvec::SmallVec;
@@ -59,23 +57,10 @@ pub const STYLE_THREAD_STACK_SIZE_KB: usize = 512;
 ///
 pub const STACK_SAFETY_MARGIN_KB: usize = 168;
 
-/// The maximum number of child nodes that we will process as a single unit.
-///
-/// Larger values will increase style sharing cache hits and general DOM
-/// locality at the expense of decreased opportunities for parallelism. There
-/// are some measurements in
-/// https://bugzilla.mozilla.org/show_bug.cgi?id=1385982#c11 and comments 12
-/// and 13 that investigate some slightly different values for the work unit
-/// size. If the size is significantly increased, make sure to adjust the
-/// condition for kicking off a new work unit in top_down_dom, because
-/// otherwise we're likely to end up doing too much work serially. For
-/// example, the condition there could become some fraction of WORK_UNIT_MAX
-/// instead of WORK_UNIT_MAX.
-pub const WORK_UNIT_MAX: usize = 16;
-
-/// A set of nodes, sized to the work unit. This gets copied when sent to other
-/// threads, so we keep it compact.
-type WorkUnit<N> = ArrayVec<SendNode<N>, WORK_UNIT_MAX>;
+/// See documentation of the pref for performance characteristics.
+pub fn work_unit_max() -> usize {
+    static_prefs::pref!("layout.css.stylo-work-unit-size") as usize
+}
 
 /// A callback to create our thread local context. This needs to be
 /// out of line so we don't allocate stack space for the entire struct
@@ -115,14 +100,15 @@ fn top_down_dom<'a, 'scope, E, D>(
     E: TElement + 'scope,
     D: DomTraversal<E>,
 {
-    debug_assert!(nodes.len() <= WORK_UNIT_MAX);
+    let work_unit_max = work_unit_max();
+    debug_assert!(nodes.len() <= work_unit_max);
 
     // We set this below, when we have a borrow of the thread-local-context
     // available.
     let recursion_ok;
 
     // Collect all the children of the elements in our work unit. This will
-    // contain the combined children of up to WORK_UNIT_MAX nodes, which may
+    // contain the combined children of up to work_unit_max nodes, which may
     // be numerous. As such, we store it in a large SmallVec to minimize heap-
     // spilling, and never move it.
     let mut discovered_child_nodes = SmallVec::<[SendNode<E::ConcreteNode>; 128]>::new();
@@ -171,19 +157,19 @@ fn top_down_dom<'a, 'scope, E, D>(
             // following.
             //
             // The worst case behavior for waiting until we have a full work
-            // item is a deep tree which has WORK_UNIT_MAX "linear" branches,
-            // hence WORK_UNIT_MAX elements at each level. Such a tree would
+            // item is a deep tree which has work_unit_max "linear" branches,
+            // hence work_unit_max elements at each level. Such a tree would
            // end up getting processed entirely sequentially, because we would
             // process each level one at a time as a single work unit, whether
             // via our end-of-loop tail call or not. If we kicked off a
             // traversal as soon as we discovered kids, we would instead
             // process such a tree more or less with a thread-per-branch,
             // multiplexed across our actual threadpool.
-            if discovered_child_nodes.len() >= WORK_UNIT_MAX {
+            if discovered_child_nodes.len() >= work_unit_max {
                 let mut traversal_data_copy = traversal_data.clone();
                 traversal_data_copy.current_dom_depth += 1;
                 traverse_nodes(
-                    discovered_child_nodes.drain(..),
+                    &discovered_child_nodes,
                     DispatchMode::NotTailCall,
                     recursion_ok,
                     root,
@@ -193,6 +179,7 @@ fn top_down_dom<'a, 'scope, E, D>(
                     traversal,
                     tls,
                 );
+                discovered_child_nodes.clear();
             }
 
             let node = **n;
@@ -213,7 +200,7 @@ fn top_down_dom<'a, 'scope, E, D>(
     if !discovered_child_nodes.is_empty() {
         traversal_data.current_dom_depth += 1;
         traverse_nodes(
-            discovered_child_nodes.drain(..),
+            &discovered_child_nodes,
             DispatchMode::TailCall,
             recursion_ok,
             root,
@@ -245,8 +232,8 @@ impl DispatchMode {
 
 /// Enqueues |nodes| for processing, possibly on this thread if the tail call
 /// conditions are met.
 #[inline]
-pub fn traverse_nodes<'a, 'scope, E, D, I>(
-    nodes: I,
+pub fn traverse_nodes<'a, 'scope, E, D>(
+    nodes: &[SendNode<E::ConcreteNode>],
     mode: DispatchMode,
     recursion_ok: bool,
     root: OpaqueNode,
@@ -258,7 +245,6 @@ pub fn traverse_nodes<'a, 'scope, E, D, I>(
 ) where
     E: TElement + 'scope,
     D: DomTraversal<E>,
-    I: ExactSizeIterator<Item = SendNode<E::ConcreteNode>>,
 {
     debug_assert_ne!(nodes.len(), 0);
@@ -272,29 +258,37 @@ pub fn traverse_nodes<'a, 'scope, E, D, I>(
     let may_dispatch_tail = mode.is_tail_call() &&
         recursion_ok &&
         !pool.current_thread_has_pending_tasks().unwrap();
 
-    // In the common case, our children fit within a single work unit, in which
-    // case we can pass the SmallVec directly and avoid extra allocation.
-    if nodes.len() <= WORK_UNIT_MAX {
-        let work: WorkUnit<E::ConcreteNode> = nodes.collect();
+    let work_unit_max = work_unit_max();
+    // In the common case, our children fit within a single work unit, in which case we can pass
+    // the nodes directly and avoid extra allocation.
+    if nodes.len() <= work_unit_max {
         if may_dispatch_tail {
-            top_down_dom(&work, root, traversal_data, scope, pool, traversal, tls);
+            top_down_dom(&nodes, root, traversal_data, scope, pool, traversal, tls);
         } else {
+            let work = nodes.to_vec();
             scope.spawn_fifo(move |scope| {
                 #[cfg(feature = "gecko")]
                 gecko_profiler_label!(Layout, StyleComputation);
-                let work = work;
                 top_down_dom(&work, root, traversal_data, scope, pool, traversal, tls);
             });
         }
     } else {
-        for chunk in nodes.chunks(WORK_UNIT_MAX).into_iter() {
-            let nodes: WorkUnit<E::ConcreteNode> = chunk.collect();
+        for chunk in nodes.chunks(work_unit_max) {
+            let work = chunk.to_vec();
             let traversal_data_copy = traversal_data.clone();
             scope.spawn_fifo(move |scope| {
                 #[cfg(feature = "gecko")]
                 gecko_profiler_label!(Layout, StyleComputation);
-                let n = nodes;
-                top_down_dom(&*n, root, traversal_data_copy, scope, pool, traversal, tls)
+                let work = work;
+                top_down_dom(
+                    &work,
+                    root,
+                    traversal_data_copy,
+                    scope,
+                    pool,
+                    traversal,
+                    tls,
+                )
             });
         }
     }
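
The dispatch pattern introduced in traverse_nodes can be read in isolation. The
sketch below uses assumed shapes only: the node type, the work-unit size, and
top_down stand in for Servo's SendNode, the layout.css.stylo-work-unit-size
pref, and top_down_dom. It shows how a slice is either processed in place or
split into pref-sized chunks that are copied and spawned on a rayon FIFO scope
(the actual patch additionally avoids the spawn for the tail-call case).

    // Cargo.toml dependency assumed: rayon = "1"

    // Stand-in for the runtime pref read (work_unit_max in the patch).
    fn work_unit_max() -> usize {
        16
    }

    // Stand-in for the per-work-unit traversal (top_down_dom in the patch).
    fn top_down(work: &[u32]) {
        println!(
            "unit of {} nodes on {:?}",
            work.len(),
            std::thread::current().id()
        );
    }

    fn traverse(nodes: &[u32], pool: &rayon::ThreadPool) {
        let unit = work_unit_max();
        pool.scope_fifo(|scope| {
            if nodes.len() <= unit {
                // Common case: the whole batch fits in one work unit, so we can
                // process the borrowed slice directly without copying.
                top_down(nodes);
            } else {
                // Otherwise copy each chunk into an owned Vec so the spawned
                // closure can take ownership and outlive this stack frame.
                for chunk in nodes.chunks(unit) {
                    let work = chunk.to_vec();
                    scope.spawn_fifo(move |_| top_down(&work));
                }
            }
        });
    }

    fn main() {
        let pool = rayon::ThreadPoolBuilder::new()
            .num_threads(4)
            .build()
            .unwrap();
        let nodes: Vec<u32> = (0..100).collect();
        traverse(&nodes, &pool);
    }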