The GC should be parallel
https://bugs.webkit.org/show_bug.cgi?id=70995

Source/JavaScriptCore: 

Reviewed by Geoff Garen.
        
Added parallel tracing to the GC. Each marking thread has its own local
mark stack, and there is a single shared global one. A thread donates
cells from its local mark stack to the shared one when its heuristics
say that doing so is affordable. A thread that has depleted its local
mark stack tries to steal cells from the shared one.
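
As a rough illustration of the donation/stealing protocol, here is a
minimal sketch assuming a mutex-guarded shared pool of cells. The names
(SharedCellPool, donate, steal) are illustrative only; the real logic
lives in MarkStackArray::donateSomeCellsTo(),
MarkStackArray::stealSomeCellsFrom(), and SlotVisitor::drainFromShared(),
and operates on segmented mark stacks rather than vectors.

    #include <algorithm>
    #include <mutex>
    #include <vector>

    struct Cell;

    class SharedCellPool {
    public:
        // Donate roughly half of an over-full local stack to the shared pool.
        void donate(std::vector<Cell*>& local)
        {
            std::lock_guard<std::mutex> lock(m_lock);
            size_t count = local.size() / 2;
            m_cells.insert(m_cells.end(), local.end() - count, local.end());
            local.resize(local.size() - count);
        }

        // Refill an empty local stack from the shared pool; returns false if
        // there is nothing to steal.
        bool steal(std::vector<Cell*>& local, size_t maximum)
        {
            std::lock_guard<std::mutex> lock(m_lock);
            if (m_cells.empty())
                return false;
            size_t count = std::min(maximum, m_cells.size());
            local.insert(local.end(), m_cells.end() - count, m_cells.end());
            m_cells.resize(m_cells.size() - count);
            return true;
        }

    private:
        std::mutex m_lock;
        std::vector<Cell*> m_cells;
    };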

Marking is now done using an atomic weak relaxed compare-and-swap (CAS).
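
In sketch form, the marking CAS works roughly as follows. This uses
std::atomic purely for illustration, and the helper name is an
assumption; the actual primitives are WTF::weakCompareAndSwap(),
Bitmap::concurrentTestAndSet(), and MarkedBlock::testAndSetMarked().

    #include <atomic>
    #include <cstdint>

    // Atomically set one mark bit in a bitmap word. Returns true if the bit
    // was already set (the cell was already marked by some thread), false if
    // this call set it. compare_exchange_weak may fail spuriously, so we
    // loop; on failure it reloads the current word value into oldWord.
    inline bool concurrentTestAndSetBit(std::atomic<uint32_t>& word, unsigned bit)
    {
        uint32_t mask = 1u << bit;
        uint32_t oldWord = word.load(std::memory_order_relaxed);
        for (;;) {
            if (oldWord & mask)
                return true;
            if (word.compare_exchange_weak(oldWord, oldWord | mask, std::memory_order_relaxed))
                return false;
        }
    }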
        
With 4 marking threads this is a 23% speed-up on V8-splay, which
translates into a 3.5% speed-up on the V8 benchmark overall.
        
It also appears that this reduces GC pause times on real websites by
more than half.

* JavaScriptCore.exp:
* JavaScriptCore.vcproj/JavaScriptCore/JavaScriptCore.def:
* heap/Heap.cpp:
(JSC::Heap::Heap):
(JSC::Heap::~Heap):
(JSC::Heap::markRoots):
* heap/Heap.h:
* heap/MarkStack.cpp:
(JSC::MarkStackSegmentAllocator::MarkStackSegmentAllocator):
(JSC::MarkStackSegmentAllocator::~MarkStackSegmentAllocator):
(JSC::MarkStackSegmentAllocator::allocate):
(JSC::MarkStackSegmentAllocator::release):
(JSC::MarkStackSegmentAllocator::shrinkReserve):
(JSC::MarkStackArray::MarkStackArray):
(JSC::MarkStackArray::~MarkStackArray):
(JSC::MarkStackArray::expand):
(JSC::MarkStackArray::refill):
(JSC::MarkStackArray::donateSomeCellsTo):
(JSC::MarkStackArray::stealSomeCellsFrom):
(JSC::MarkStackThreadSharedData::markingThreadMain):
(JSC::MarkStackThreadSharedData::markingThreadStartFunc):
(JSC::MarkStackThreadSharedData::MarkStackThreadSharedData):
(JSC::MarkStackThreadSharedData::~MarkStackThreadSharedData):
(JSC::MarkStackThreadSharedData::reset):
(JSC::MarkStack::reset):
(JSC::SlotVisitor::donateSlow):
(JSC::SlotVisitor::drain):
(JSC::SlotVisitor::drainFromShared):
(JSC::MarkStack::mergeOpaqueRoots):
(JSC::SlotVisitor::harvestWeakReferences):
* heap/MarkStack.h:
(JSC::MarkStackSegment::data):
(JSC::MarkStackSegment::capacityFromSize):
(JSC::MarkStackSegment::sizeFromCapacity):
(JSC::MarkStackArray::postIncTop):
(JSC::MarkStackArray::preDecTop):
(JSC::MarkStackArray::setTopForFullSegment):
(JSC::MarkStackArray::setTopForEmptySegment):
(JSC::MarkStackArray::top):
(JSC::MarkStackArray::validatePrevious):
(JSC::MarkStack::addWeakReferenceHarvester):
(JSC::MarkStack::mergeOpaqueRootsIfNecessary):
(JSC::MarkStack::mergeOpaqueRootsIfProfitable):
(JSC::MarkStack::MarkStack):
(JSC::MarkStack::addOpaqueRoot):
(JSC::MarkStack::containsOpaqueRoot):
(JSC::MarkStack::opaqueRootCount):
(JSC::MarkStackArray::append):
(JSC::MarkStackArray::canRemoveLast):
(JSC::MarkStackArray::removeLast):
(JSC::MarkStackArray::isEmpty):
(JSC::MarkStackArray::canDonateSomeCells):
(JSC::MarkStackArray::size):
(JSC::ParallelModeEnabler::ParallelModeEnabler):
(JSC::ParallelModeEnabler::~ParallelModeEnabler):
* heap/MarkedBlock.h:
(JSC::MarkedBlock::testAndSetMarked):
* heap/SlotVisitor.h:
(JSC::SlotVisitor::donate):
(JSC::SlotVisitor::donateAndDrain):
(JSC::SlotVisitor::donateKnownParallel):
(JSC::SlotVisitor::SlotVisitor):
* heap/WeakReferenceHarvester.h:
* runtime/Heuristics.cpp:
(JSC::Heuristics::initializeHeuristics):
* runtime/Heuristics.h:
* wtf/Atomics.h:
(WTF::weakCompareAndSwap):
* wtf/Bitmap.h:
(WTF::::Bitmap):
(WTF::::get):
(WTF::::set):
(WTF::::testAndSet):
(WTF::::testAndClear):
(WTF::::concurrentTestAndSet):
(WTF::::concurrentTestAndClear):
(WTF::::clear):
(WTF::::clearAll):
(WTF::::nextPossiblyUnset):
(WTF::::findRunOfZeros):
(WTF::::count):
(WTF::::isEmpty):
(WTF::::isFull):
* wtf/MainThread.h:
(WTF::isMainThreadOrGCThread):
* wtf/Platform.h:
* wtf/ThreadSpecific.h:
(WTF::::isSet):
* wtf/mac/MainThreadMac.mm:
(WTF::initializeGCThreads):
(WTF::initializeMainThreadPlatform):
(WTF::initializeMainThreadToProcessMainThreadPlatform):
(WTF::registerGCThread):
(WTF::isMainThreadOrGCThread):

Source/WebCore: 

Reviewed by Geoff Garen.
        
Added parallel tracing to the GC. This required loosening some assertions,
since some of this code may now be called from GC marking threads rather
than only from the main thread.
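
As a rough sketch of the kind of predicate the loosened assertions rely
on, assuming a thread-local GC-thread flag (the actual implementation
uses WTF::ThreadSpecific and lives in wtf/MainThread.h and
wtf/mac/MainThreadMac.mm; the code below is illustrative only):

    #include <thread>

    // Whichever thread runs static initializers (normally the main thread)
    // becomes the recorded main thread.
    static const std::thread::id s_mainThreadID = std::this_thread::get_id();
    static thread_local bool s_isGCThread = false;

    // Each GC marking thread calls this once at startup.
    inline void registerGCThread() { s_isGCThread = true; }

    inline bool isMainThread() { return std::this_thread::get_id() == s_mainThreadID; }

    // DOM accessors such as TreeShared::parent() can assert this instead of
    // isMainThread(), since they may now be reached during parallel marking.
    inline bool isMainThreadOrGCThread() { return isMainThread() || s_isGCThread; }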

No new tests, since no behavior was changed.

* platform/TreeShared.h:
(WebCore::TreeShared::parent):



git-svn-id: http://svn.webkit.org/repository/webkit/trunk@98937 268f45cc-cd09-0410-ab3c-d52691b4dbfc
diff --git a/Source/JavaScriptCore/heap/Heap.cpp b/Source/JavaScriptCore/heap/Heap.cpp
index f2dc3003..e804ceb 100644
--- a/Source/JavaScriptCore/heap/Heap.cpp
+++ b/Source/JavaScriptCore/heap/Heap.cpp
@@ -32,6 +32,7 @@
 #include "JSONObject.h"
 #include "Tracing.h"
 #include <algorithm>
+#include <wtf/CurrentTime.h>
 
 
 using namespace std;
@@ -315,7 +316,8 @@
     , m_markListSet(0)
     , m_activityCallback(DefaultGCActivityCallback::create(this))
     , m_machineThreads(this)
-    , m_slotVisitor(globalData->jsArrayVPtr, globalData->jsFinalObjectVPtr, globalData->jsStringVPtr)
+    , m_sharedData(globalData)
+    , m_slotVisitor(m_sharedData, globalData->jsArrayVPtr, globalData->jsFinalObjectVPtr, globalData->jsStringVPtr)
     , m_handleHeap(globalData)
     , m_isSafeToCollect(false)
     , m_globalData(globalData)
@@ -324,19 +326,20 @@
     (*m_activityCallback)();
     m_numberOfFreeBlocks = 0;
     m_blockFreeingThread = createThread(blockFreeingThreadStartFunc, this, "JavaScriptCore::BlockFree");
+    
     ASSERT(m_blockFreeingThread);
 }
 
 Heap::~Heap()
 {
-    // destroy our thread
+    // Destroy our block freeing thread.
     {
         MutexLocker locker(m_freeBlockLock);
         m_blockFreeingThreadShouldQuit = true;
         m_freeBlockCondition.broadcast();
     }
     waitForThreadCompletion(m_blockFreeingThread, 0);
-    
+
     // The destroy function must already have been called, so assert this.
     ASSERT(!m_globalData);
 }
@@ -589,74 +592,84 @@
     SlotVisitor& visitor = m_slotVisitor;
     HeapRootVisitor heapRootVisitor(visitor);
 
-#if ENABLE(GGC)
     {
-        size_t dirtyCellCount = dirtyCells.size();
-        GCPHASE(VisitDirtyCells);
-        GCCOUNTER(DirtyCellCount, dirtyCellCount);
-        for (size_t i = 0; i < dirtyCellCount; i++) {
-            heapRootVisitor.visitChildren(dirtyCells[i]);
-            visitor.drain();
+        ParallelModeEnabler enabler(visitor);
+#if ENABLE(GGC)
+        {
+            size_t dirtyCellCount = dirtyCells.size();
+            GCPHASE(VisitDirtyCells);
+            GCCOUNTER(DirtyCellCount, dirtyCellCount);
+            for (size_t i = 0; i < dirtyCellCount; i++) {
+                heapRootVisitor.visitChildren(dirtyCells[i]);
+                visitor.donateAndDrain();
+            }
         }
-    }
 #endif
     
-    if (m_globalData->codeBlocksBeingCompiled.size()) {
-        GCPHASE(VisitActiveCodeBlock);
-        for (size_t i = 0; i < m_globalData->codeBlocksBeingCompiled.size(); i++)
-            m_globalData->codeBlocksBeingCompiled[i]->visitAggregate(visitor);
-    }
-    
-    {
-        GCPHASE(VisitMachineRoots);
-        visitor.append(machineThreadRoots);
-        visitor.drain();
-    }
-    {
-        GCPHASE(VisitRegisterFileRoots);
-        visitor.append(registerFileRoots);
-        visitor.drain();
-    }
-    {
-        GCPHASE(VisitProtectedObjects);
-        markProtectedObjects(heapRootVisitor);
-        visitor.drain();
-    }
-    {
-        GCPHASE(VisitTempSortVectors);
-        markTempSortVectors(heapRootVisitor);
-        visitor.drain();
-    }
-
-    {
-        GCPHASE(MarkingArgumentBuffers);
-        if (m_markListSet && m_markListSet->size()) {
-            MarkedArgumentBuffer::markLists(heapRootVisitor, *m_markListSet);
-            visitor.drain();
+        if (m_globalData->codeBlocksBeingCompiled.size()) {
+            GCPHASE(VisitActiveCodeBlock);
+            for (size_t i = 0; i < m_globalData->codeBlocksBeingCompiled.size(); i++)
+                m_globalData->codeBlocksBeingCompiled[i]->visitAggregate(visitor);
         }
-    }
-    if (m_globalData->exception) {
-        GCPHASE(MarkingException);
-        heapRootVisitor.visit(&m_globalData->exception);
-        visitor.drain();
-    }
     
-    {
-        GCPHASE(VisitStrongHandles);
-        m_handleHeap.visitStrongHandles(heapRootVisitor);
-        visitor.drain();
-    }
+        {
+            GCPHASE(VisitMachineRoots);
+            visitor.append(machineThreadRoots);
+            visitor.donateAndDrain();
+        }
+        {
+            GCPHASE(VisitRegisterFileRoots);
+            visitor.append(registerFileRoots);
+            visitor.donateAndDrain();
+        }
+        {
+            GCPHASE(VisitProtectedObjects);
+            markProtectedObjects(heapRootVisitor);
+            visitor.donateAndDrain();
+        }
+        {
+            GCPHASE(VisitTempSortVectors);
+            markTempSortVectors(heapRootVisitor);
+            visitor.donateAndDrain();
+        }
+
+        {
+            GCPHASE(MarkingArgumentBuffers);
+            if (m_markListSet && m_markListSet->size()) {
+                MarkedArgumentBuffer::markLists(heapRootVisitor, *m_markListSet);
+                visitor.donateAndDrain();
+            }
+        }
+        if (m_globalData->exception) {
+            GCPHASE(MarkingException);
+            heapRootVisitor.visit(&m_globalData->exception);
+            visitor.donateAndDrain();
+        }
     
-    {
-        GCPHASE(HandleStack);
-        m_handleStack.visit(heapRootVisitor);
-        visitor.drain();
-    }
+        {
+            GCPHASE(VisitStrongHandles);
+            m_handleHeap.visitStrongHandles(heapRootVisitor);
+            visitor.donateAndDrain();
+        }
     
-    {
-        GCPHASE(TraceCodeBlocks);
-        m_jettisonedCodeBlocks.traceCodeBlocks(visitor);
-        visitor.drain();
+        {
+            GCPHASE(HandleStack);
+            m_handleStack.visit(heapRootVisitor);
+            visitor.donateAndDrain();
+        }
+    
+        {
+            GCPHASE(TraceCodeBlocks);
+            m_jettisonedCodeBlocks.traceCodeBlocks(visitor);
+            visitor.donateAndDrain();
+        }
+    
+#if ENABLE(PARALLEL_GC)
+        {
+            GCPHASE(Convergence);
+            visitor.drainFromShared(SlotVisitor::MasterDrain);
+        }
+#endif
     }
 
     // Weak handles must be marked last, because their owners use the set of
@@ -667,12 +680,19 @@
         do {
             lastOpaqueRootCount = visitor.opaqueRootCount();
             m_handleHeap.visitWeakHandles(heapRootVisitor);
-            visitor.drain();
+            {
+                ParallelModeEnabler enabler(visitor);
+                visitor.donateAndDrain();
+#if ENABLE(PARALLEL_GC)
+                visitor.drainFromShared(SlotVisitor::MasterDrain);
+#endif
+            }
             // If the set of opaque roots has grown, more weak handles may have become reachable.
         } while (lastOpaqueRootCount != visitor.opaqueRootCount());
     }
     GCCOUNTER(VisitedValueCount, visitor.visitCount());
     visitor.reset();
+    m_sharedData.reset();
 
     m_operationInProgress = NoOperation;
 }