/* This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this
 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */

//! Implements parallel traversal over the DOM tree.
//!
//! This traversal is based on Rayon, and therefore its safety is largely
//! verified by the type system.
//!
//! The primary trickiness and fine print for the above relates to the
//! thread safety of the DOM nodes themselves. Accessing a DOM element
//! concurrently on multiple threads is actually mostly "safe", since all
//! the mutable state is protected by an AtomicRefCell, and so we'll
//! generally panic if something goes wrong. Still, we try to enforce our
//! thread invariants at compile time whenever possible. As such, TNode and
//! TElement are not Send, so ordinary style system code cannot accidentally
//! share them with other threads. In the parallel traversal, we explicitly
//! invoke |unsafe { SendNode::new(n) }| to put nodes in containers that may
//! be sent to other threads. This occurs in only a handful of places and is
//! easy to grep for. At the time of this writing, there is no other unsafe
//! code in the parallel traversal.

#![deny(missing_docs)]

use arrayvec::ArrayVec;
use context::{StyleContext, ThreadLocalStyleContext, TraversalStatistics};
use dom::{OpaqueNode, SendNode, TElement, TNode};
use rayon;
use scoped_tls::ScopedTLS;
use smallvec::SmallVec;
use std::borrow::Borrow;
use time;
use traversal::{DomTraversal, PerLevelTraversalData, PreTraverseToken};

/// The maximum number of child nodes that we will process as a single unit.
///
/// Larger values will increase style sharing cache hits and general DOM locality
/// at the expense of decreased opportunities for parallelism. This value has not
/// been measured and could potentially be tuned.
pub const WORK_UNIT_MAX: usize = 16;

/// A set of nodes, sized to the work unit. This gets copied when sent to other
/// threads, so we keep it compact.
type WorkUnit<N> = ArrayVec<[SendNode<N>; WORK_UNIT_MAX]>;

/// Entry point for the parallel traversal.
#[allow(unsafe_code)]
pub fn traverse_dom<E, D>(traversal: &D,
                          root: E,
                          token: PreTraverseToken,
                          pool: &rayon::ThreadPool)
    where E: TElement,
          D: DomTraversal<E>,
{
    let dump_stats = traversal.shared_context().options.dump_style_statistics;
    let start_time = if dump_stats { Some(time::precise_time_s()) } else { None };

    // Set up the SmallVec. We need to move this, and in most cases this is just
    // one node, so keep it small.
    let mut nodes = SmallVec::<[SendNode<E::ConcreteNode>; 8]>::new();

    debug_assert!(traversal.is_parallel());

    // Handle Gecko's eager initial styling. We don't currently support it
    // in conjunction with bottom-up traversal. If we did, we'd need to put
    // it on the context to make it available to the bottom-up phase.
    let depth = if token.traverse_unstyled_children_only() {
        debug_assert!(!D::needs_postorder_traversal());
        for kid in root.as_node().traversal_children() {
            if kid.as_element().map_or(false, |el| el.get_data().is_none()) {
                nodes.push(unsafe { SendNode::new(kid) });
            }
        }
        root.depth() + 1
    } else {
        nodes.push(unsafe { SendNode::new(root.as_node()) });
        root.depth()
    };

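    // In the unstyled-children-only case we may discover nothing to style;
    // bail out early rather than entering the thread pool with no work.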
    if nodes.is_empty() {
        return;
    }

    let traversal_data = PerLevelTraversalData {
        current_dom_depth: depth,
    };
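
    // Each worker thread lazily creates its ThreadLocalStyleContext in its
    // TLS slot the first time it runs a work item (see the tls.ensure() call
    // in top_down_dom); the slots are read back below when dumping statistics.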
    let tls = ScopedTLS::<ThreadLocalStyleContext<E>>::new(pool);
    let root = root.as_node().opaque();
    pool.install(|| {
        rayon::scope(|scope| {
            let nodes = nodes;
            traverse_nodes(&*nodes,
                           DispatchMode::TailCall,
                           0,
                           root,
                           traversal_data,
                           scope,
                           pool,
                           traversal,
                           &tls);
        });
    });

    // Dump statistics to stdout if requested.
    if dump_stats {
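        // rayon::scope() does not return until all spawned work has completed,
        // so by this point no worker thread can still be borrowing its slot;
        // sum each thread's statistics into a single aggregate.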
        let slots = unsafe { tls.unsafe_get() };
        let mut aggregate = slots.iter().fold(TraversalStatistics::default(), |acc, t| {
            match *t.borrow() {
                None => acc,
                Some(ref cx) => &cx.borrow().statistics + &acc,
            }
        });
        aggregate.finish(traversal, start_time.unwrap());
        if aggregate.is_large_traversal() {
            println!("{}", aggregate);
        }
    }
}

/// A callback to create our thread local context. This needs to be
/// out of line so we don't allocate stack space for the entire struct
/// in the caller.
#[inline(never)]
fn create_thread_local_context<'scope, E, D>(
    traversal: &'scope D,
    slot: &mut Option<ThreadLocalStyleContext<E>>)
    where E: TElement + 'scope,
          D: DomTraversal<E>
{
    *slot = Some(ThreadLocalStyleContext::new(traversal.shared_context()));
}

/// A parallel top-down DOM traversal.
///
/// This algorithm traverses the DOM in a breadth-first, top-down manner. The
/// goals are:
/// * Never process a child before its parent (since child style depends on
///   parent style). If this were to happen, the styling algorithm would panic.
/// * Prioritize discovering nodes as quickly as possible to maximize
///   opportunities for parallelism.
/// * Style all the children of a given node (i.e. all sibling nodes) on
///   a single thread (with an upper bound to handle nodes with an
///   abnormally large number of children). This is important because we use
///   a thread-local cache to share styles between siblings.
#[inline(always)]
#[allow(unsafe_code)]
fn top_down_dom<'a, 'scope, E, D>(nodes: &'a [SendNode<E::ConcreteNode>],
                                  recursion_depth: usize,
                                  root: OpaqueNode,
                                  mut traversal_data: PerLevelTraversalData,
                                  scope: &'a rayon::Scope<'scope>,
                                  pool: &'scope rayon::ThreadPool,
                                  traversal: &'scope D,
                                  tls: &'scope ScopedTLS<'scope, ThreadLocalStyleContext<E>>)
    where E: TElement + 'scope,
          D: DomTraversal<E>,
{
    debug_assert!(nodes.len() <= WORK_UNIT_MAX);

    // Collect all the children of the elements in our work unit. This will
    // contain the combined children of up to WORK_UNIT_MAX nodes, which may
    // be numerous. As such, we store it in a large SmallVec to minimize heap-
    // spilling, and never move it.
    let mut discovered_child_nodes = SmallVec::<[SendNode<E::ConcreteNode>; 128]>::new();
    {
        // Scope the borrow of the TLS so that the borrow is dropped before
        // a potential recursive call when we pass TailCall.
        let mut tlc = tls.ensure(
            |slot: &mut Option<ThreadLocalStyleContext<E>>| create_thread_local_context(traversal, slot));
        let mut context = StyleContext {
            shared: traversal.shared_context(),
            thread_local: &mut *tlc,
        };

        for n in nodes {
            // If the last node we processed produced children, spawn them off
            // into a work item. We do this at the beginning of the loop (rather
            // than at the end) so that we can traverse the children of the last
            // sibling directly on this thread without a spawn call.
            //
            // This has the important effect of removing the allocation and
            // context-switching overhead of the parallel traversal for perfectly
            // linear regions of the DOM, i.e.:
            //
            // <russian><doll><tag><nesting></nesting></tag></doll></russian>
            //
            // Which are not at all uncommon.
            if !discovered_child_nodes.is_empty() {
                let mut traversal_data_copy = traversal_data.clone();
                traversal_data_copy.current_dom_depth += 1;
                traverse_nodes(&*discovered_child_nodes,
                               DispatchMode::NotTailCall,
                               recursion_depth,
                               root,
                               traversal_data_copy,
                               scope,
                               pool,
                               traversal,
                               tls);
                discovered_child_nodes.clear();
            }

            let node = **n;
            let mut children_to_process = 0isize;
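            // Style |node| itself. The preorder callback fires once per child
            // that still needs traversal, so |children_to_process| records how
            // many children were discovered for the postorder bookkeeping below.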
            traversal.process_preorder(&traversal_data, &mut context, node, |n| {
                children_to_process += 1;
                let send_n = unsafe { SendNode::new(n) };
                discovered_child_nodes.push(send_n);
            });

            traversal.handle_postorder_traversal(&mut context, root, node,
                                                 children_to_process);
        }
    }

    // Handle the children of the last element in this work unit. If any exist,
    // we can process them (or at least one work unit's worth of them) directly
    // on this thread by passing TailCall.
    if !discovered_child_nodes.is_empty() {
        traversal_data.current_dom_depth += 1;
        traverse_nodes(&discovered_child_nodes,
                       DispatchMode::TailCall,
                       recursion_depth,
                       root,
                       traversal_data,
                       scope,
                       pool,
                       traversal,
                       tls);
    }
}

/// Controls whether traverse_nodes may make a recursive call to continue
/// doing work, or whether it should always dispatch work asynchronously.
#[derive(Clone, Copy, PartialEq)]
enum DispatchMode {
    TailCall,
    NotTailCall,
}

impl DispatchMode {
    fn is_tail_call(&self) -> bool { matches!(*self, DispatchMode::TailCall) }
}

// On x86_64-linux, a recursive cycle requires 3472 bytes of stack. Limiting
// the depth to 150 therefore should keep the stack use by the recursion to
// 520800 bytes, which would give a generously conservative margin should we
// decide to reduce the thread stack size from its default of 2MB down to 1MB.
const RECURSION_DEPTH_LIMIT: usize = 150;

#[inline]
fn traverse_nodes<'a, 'scope, E, D>(nodes: &[SendNode<E::ConcreteNode>],
                                    mode: DispatchMode,
                                    recursion_depth: usize,
                                    root: OpaqueNode,
                                    traversal_data: PerLevelTraversalData,
                                    scope: &'a rayon::Scope<'scope>,
                                    pool: &'scope rayon::ThreadPool,
                                    traversal: &'scope D,
                                    tls: &'scope ScopedTLS<'scope, ThreadLocalStyleContext<E>>)
    where E: TElement + 'scope,
          D: DomTraversal<E>,
{
    debug_assert!(!nodes.is_empty());

    // This is a tail call from the perspective of the caller. However, we only
    // want to actually dispatch the job as a tail call if there's nothing left
    // in our local queue. Otherwise we need to return to it to maintain proper
    // breadth-first ordering. We also need to take care to avoid stack
    // overflow due to excessive tail recursion. Hitting the recursion limit
    // isn't observable to content -- we're still completely correct, just not
    // using tail recursion any more. See bug 1368302.
    debug_assert!(recursion_depth <= RECURSION_DEPTH_LIMIT);
    let may_dispatch_tail = mode.is_tail_call() &&
                            recursion_depth != RECURSION_DEPTH_LIMIT &&
                            !pool.current_thread_has_pending_tasks().unwrap();

    // In the common case, our children fit within a single work unit, in which
    // case we can pass the SmallVec directly and avoid extra allocation.
    if nodes.len() <= WORK_UNIT_MAX {
        let work = nodes.iter().cloned().collect::<WorkUnit<E::ConcreteNode>>();
        if may_dispatch_tail {
            top_down_dom(&work, recursion_depth + 1, root,
                         traversal_data, scope, pool, traversal, tls);
        } else {
            scope.spawn(move |scope| {
                let work = work;
                top_down_dom(&work, 0, root,
                             traversal_data, scope, pool, traversal, tls);
            });
        }
    } else {
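        // Too many nodes for a single work unit: split into WORK_UNIT_MAX-sized
        // chunks and spawn each one. These are never dispatched as tail calls,
        // and the recursion depth restarts at zero because the spawn breaks the
        // synchronous call chain that the depth limit guards against.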
        for chunk in nodes.chunks(WORK_UNIT_MAX) {
            let nodes = chunk.iter().cloned().collect::<WorkUnit<E::ConcreteNode>>();
            let traversal_data_copy = traversal_data.clone();
            scope.spawn(move |scope| {
                let n = nodes;
                top_down_dom(&*n, 0, root,
                             traversal_data_copy, scope, pool, traversal, tls)
            });
        }
    }
}