3 files changed, 96 insertions, 20 deletions
diff --git a/components/style/context.rs b/components/style/context.rs
index c7c5d2ae354..3183cd05747 100644
--- a/components/style/context.rs
+++ b/components/style/context.rs
@@ -16,6 +16,7 @@ use euclid::Size2D;
 use fnv::FnvHashMap;
 use font_metrics::FontMetricsProvider;
 #[cfg(feature = "gecko")] use gecko_bindings::structs;
+use parallel::STYLE_THREAD_STACK_SIZE_KB;
 #[cfg(feature = "servo")] use parking_lot::RwLock;
 use properties::ComputedValues;
 #[cfg(feature = "servo")] use properties::PropertyId;
@@ -605,6 +606,61 @@ where
     }
 }
 
+
+/// A helper type for stack limit checking.  This assumes that stacks grow
+/// down, which is true for all non-ancient CPU architectures.
+pub struct StackLimitChecker {
+   lower_limit: usize
+}
+
+impl StackLimitChecker {
+    /// Create a new limit checker, for this thread, allowing further use
+    /// of up to |stack_size| bytes beyond (below) the current stack pointer.
+    #[inline(never)]
+    pub fn new(stack_size_limit: usize) -> Self {
+        StackLimitChecker {
+            lower_limit: StackLimitChecker::get_sp() - stack_size_limit
+        }
+    }
+
+    /// Checks whether the previously stored stack limit has now been exceeded.
+    #[inline(never)]
+    pub fn limit_exceeded(&self) -> bool {
+        let curr_sp = StackLimitChecker::get_sp();
+
+        // Try to assert if we're called from a different thread than the
+        // one that originally created this object.  This is a bit subtle
+        // and relies on wraparound behaviour of unsigned integers.
+        //
+        // * If we're called from a thread whose stack has a higher address
+        //   than the one that created this object, then
+        //   |curr_sp - self.lower_limit| will (almost certainly) be larger
+        //   than the thread stack size, so the check will fail.
+        //
+        // * If we're called from a thread whose stack has a lower address
+        //   than the one that created this object, then
+        //   |curr_sp - self.lower_limit| will be negative, which will look
+        //   like a very large unsigned value, so the check will also fail.
+        //
+        // The correctness of depends on the assumption that no stack wraps
+        // around the end of the address space.
+        debug_assert!(curr_sp - self.lower_limit
+                      <= STYLE_THREAD_STACK_SIZE_KB * 1024);
+
+        // The actual bounds check.
+        curr_sp <= self.lower_limit
+    }
+
+    // Technically, rustc can optimize this away, but shouldn't for now.
+    // We should fix this once black_box is stable.
+    #[inline(always)]
+    fn get_sp() -> usize {
+        let mut foo: usize = 42;
+        (&mut foo as *mut usize) as usize
+    }
+}
+
+
 /// A thread-local style context.
 ///
 /// This context contains data that needs to be used during restyling, but is
@@ -639,6 +695,9 @@ pub struct ThreadLocalStyleContext<E: TElement> {
     /// The struct used to compute and cache font metrics from style
     /// for evaluation of the font-relative em/ch units and font-size
     pub font_metrics_provider: E::FontMetricsProvider,
+    /// A checker used to ensure that parallel.rs does not recurse indefinitely
+    /// even on arbitrarily deep trees.  See Gecko bug 1376883.
+    pub stack_limit_checker: StackLimitChecker,
 }
 
 impl<E: TElement> ThreadLocalStyleContext<E> {
@@ -654,6 +713,8 @@ impl<E: TElement> ThreadLocalStyleContext<E> {
             statistics: TraversalStatistics::default(),
             current_element_info: None,
             font_metrics_provider: E::FontMetricsProvider::create_from(shared),
+            stack_limit_checker: StackLimitChecker::new(
+                (STYLE_THREAD_STACK_SIZE_KB - 40) * 1024),
         }
     }
 
@@ -668,6 +729,15 @@ impl<E: TElement> ThreadLocalStyleContext<E> {
             statistics: TraversalStatistics::default(),
             current_element_info: None,
             font_metrics_provider: E::FontMetricsProvider::create_from(shared),
+            // Threads in the styling pool have small stacks, and we have to
+            // be careful not to run out of stack during recursion in
+            // parallel.rs.  Therefore set up a stack limit checker, in
+            // which we reserve 40KB of stack as a safety buffer.  Currently
+            // the stack size is 128KB, so this allows 88KB for recursive
+            // DOM traversal, which encompasses 53 levels of recursion before
+            // the limiter kicks in, on x86_64-Linux.  See Gecko bug 1376883.
+            stack_limit_checker: StackLimitChecker::new(
+                (STYLE_THREAD_STACK_SIZE_KB - 40) * 1024),
         }
     }
 
diff --git a/components/style/gecko/global_style_data.rs b/components/style/gecko/global_style_data.rs
index b5698e02193..8ba55191c03 100644
--- a/components/style/gecko/global_style_data.rs
+++ b/components/style/gecko/global_style_data.rs
@@ -9,6 +9,7 @@ use gecko_bindings::bindings;
 use gecko_bindings::bindings::{Gecko_RegisterProfilerThread, Gecko_UnregisterProfilerThread};
 use gecko_bindings::bindings::Gecko_SetJemallocThreadLocalArena;
 use num_cpus;
+use parallel::STYLE_THREAD_STACK_SIZE_KB;
 use rayon;
 use shared_lock::SharedRwLock;
 use std::cmp;
@@ -92,7 +93,9 @@ lazy_static! {
                 .breadth_first()
                 .thread_name(thread_name)
                 .start_handler(thread_startup)
-                .exit_handler(thread_shutdown);
+                .exit_handler(thread_shutdown)
+                // Set thread stack size to 128KB.  See Gecko bug 1376883.
+                .stack_size(STYLE_THREAD_STACK_SIZE_KB * 1024);
             let pool = rayon::ThreadPool::new(configuration).ok();
             pool
         };
diff --git a/components/style/parallel.rs b/components/style/parallel.rs
index db5466e12a3..7e9e8154122 100644
--- a/components/style/parallel.rs
+++ b/components/style/parallel.rs
@@ -32,6 +32,9 @@ use std::borrow::Borrow;
 use time;
 use traversal::{DomTraversal, PerLevelTraversalData, PreTraverseToken};
 
+/// The minimum stack size for a thread in the styling pool, in kilobytes.
+pub const STYLE_THREAD_STACK_SIZE_KB: usize = 128;
+
 /// The maximum number of child nodes that we will process as a single unit.
 ///
 /// Larger values will increase style sharing cache hits and general DOM
@@ -77,7 +80,7 @@ pub fn traverse_dom<E, D>(traversal: &D,
             let root_opaque = root.opaque();
             traverse_nodes(&[root],
                            DispatchMode::TailCall,
-                           0,
+                           true,
                            root_opaque,
                            traversal_data,
                            scope,
@@ -132,7 +135,6 @@ fn create_thread_local_context<'scope, E, D>(
 #[inline(always)]
 #[allow(unsafe_code)]
 fn top_down_dom<'a, 'scope, E, D>(nodes: &'a [SendNode<E::ConcreteNode>],
-                                  recursion_depth: usize,
                                   root: OpaqueNode,
                                   mut traversal_data: PerLevelTraversalData,
                                   scope: &'a rayon::Scope<'scope>,
@@ -144,6 +146,10 @@ fn top_down_dom<'a, 'scope, E, D>(nodes: &'a [SendNode<E::ConcreteNode>],
 {
     debug_assert!(nodes.len() <= WORK_UNIT_MAX);
 
+    // We set this below, when we have a borrow of the thread-local-context
+    // available.
+    let recursion_ok;
+
     // Collect all the children of the elements in our work unit. This will
     // contain the combined children of up to WORK_UNIT_MAX nodes, which may
     // be numerous. As such, we store it in a large SmallVec to minimize heap-
@@ -154,6 +160,10 @@ fn top_down_dom<'a, 'scope, E, D>(nodes: &'a [SendNode<E::ConcreteNode>],
         // a potential recursive call when we pass TailCall.
         let mut tlc = tls.ensure(
             |slot: &mut Option<ThreadLocalStyleContext<E>>| create_thread_local_context(traversal, slot));
+
+        // Check that we're not in danger of running out of stack.
+        recursion_ok = !tlc.stack_limit_checker.limit_exceeded();
+
         let mut context = StyleContext {
             shared: traversal.shared_context(),
             thread_local: &mut *tlc,
@@ -202,7 +212,7 @@ fn top_down_dom<'a, 'scope, E, D>(nodes: &'a [SendNode<E::ConcreteNode>],
                 traversal_data_copy.current_dom_depth += 1;
                 traverse_nodes(&*discovered_child_nodes,
                                DispatchMode::NotTailCall,
-                               recursion_depth,
+                               recursion_ok,
                                root,
                                traversal_data_copy,
                                scope,
@@ -232,7 +242,7 @@ fn top_down_dom<'a, 'scope, E, D>(nodes: &'a [SendNode<E::ConcreteNode>],
         traversal_data.current_dom_depth += 1;
         traverse_nodes(&discovered_child_nodes,
                        DispatchMode::TailCall,
-                       recursion_depth,
+                       recursion_ok,
                        root,
                        traversal_data,
                        scope,
@@ -254,16 +264,10 @@ impl DispatchMode {
     fn is_tail_call(&self) -> bool { matches!(*self, DispatchMode::TailCall) }
 }
 
-// On x86_64-linux, a recursive cycle requires 3472 bytes of stack.  Limiting
-// the depth to 150 therefore should keep the stack use by the recursion to
-// 520800 bytes, which would give a generously conservative margin should we
-// decide to reduce the thread stack size from its default of 2MB down to 1MB.
-const RECURSION_DEPTH_LIMIT: usize = 150;
-
 #[inline]
 fn traverse_nodes<'a, 'scope, E, D>(nodes: &[SendNode<E::ConcreteNode>],
                                     mode: DispatchMode,
-                                    recursion_depth: usize,
+                                    recursion_ok: bool,
                                     root: OpaqueNode,
                                     traversal_data: PerLevelTraversalData,
                                     scope: &'a rayon::Scope<'scope>,
@@ -279,12 +283,11 @@ fn traverse_nodes<'a, 'scope, E, D>(nodes: &[SendNode<E::ConcreteNode>],
     // want to actually dispatch the job as a tail call if there's nothing left
     // in our local queue. Otherwise we need to return to it to maintain proper
     // breadth-first ordering. We also need to take care to avoid stack
-    // overflow due to excessive tail recursion. The stack overflow isn't
-    // observable to content -- we're still completely correct, just not
-    // using tail recursion any more. See bug 1368302.
-    debug_assert!(recursion_depth <= RECURSION_DEPTH_LIMIT);
+    // overflow due to excessive tail recursion. The stack overflow avoidance
+    // isn't observable to content -- we're still completely correct, just not
+    // using tail recursion any more. See Gecko bugs 1368302 and 1376883.
     let may_dispatch_tail = mode.is_tail_call() &&
-        recursion_depth != RECURSION_DEPTH_LIMIT &&
+        recursion_ok &&
         !pool.current_thread_has_pending_tasks().unwrap();
 
     // In the common case, our children fit within a single work unit, in which
@@ -292,12 +295,12 @@ fn traverse_nodes<'a, 'scope, E, D>(nodes: &[SendNode<E::ConcreteNode>],
     if nodes.len() <= WORK_UNIT_MAX {
         let work = nodes.iter().cloned().collect::<WorkUnit<E::ConcreteNode>>();
         if may_dispatch_tail {
-            top_down_dom(&work, recursion_depth + 1, root,
+            top_down_dom(&work, root,
                          traversal_data, scope, pool, traversal, tls);
         } else {
             scope.spawn(move |scope| {
                 let work = work;
-                top_down_dom(&work, 0, root,
+                top_down_dom(&work, root,
                              traversal_data, scope, pool, traversal, tls);
             });
         }
@@ -307,7 +310,7 @@ fn traverse_nodes<'a, 'scope, E, D>(nodes: &[SendNode<E::ConcreteNode>],
             let traversal_data_copy = traversal_data.clone();
             scope.spawn(move |scope| {
                 let n = nodes;
-                top_down_dom(&*n, 0, root,
+                top_down_dom(&*n, root,
                              traversal_data_copy, scope, pool, traversal, tls)
             });
         }