blob: e9338eb8f183dc9423b3cbef24b56c705ac7a8f9 [file] [log] [blame]
/*
* Copyright (C) 2019 Apple Inc. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY
* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR
* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
* OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "config.h"
#include "B3ReduceLoopStrength.h"
#if ENABLE(B3_JIT)
#include "B3BasicBlockInlines.h"
#include "B3BlockInsertionSet.h"
#include "B3ConstPtrValue.h"
#include "B3EnsureLoopPreHeaders.h"
#include "B3InsertionSet.h"
#include "B3NaturalLoops.h"
#include "B3PhaseScope.h"
#include "B3ProcedureInlines.h"
#include "B3ValueInlines.h"
#include "B3Variable.h"
#include "B3VariableValue.h"
#include <wtf/GraphNodeWorklist.h>
#include <wtf/IndexSet.h>
#include <wtf/SmallPtrSet.h>
#include <wtf/Vector.h>
namespace JSC { namespace B3 {
#if CPU(X86_64)
/*
* The semantics of B3 require that loads and stores are not reduced in granularity.
* If we would use system memcpy, then it is possible that we would get a byte copy loop,
* and this could cause GC tearing.
*/
void fastForwardCopy32(uint32_t* dst, const uint32_t* src, size_t size)
{
uint64_t i;
int64_t counter;
uint64_t tmp, tmp2;
asm volatile(
"test %q[size], %q[size]\t\n"
"jz 900f\t\n" // exit
"xorq %q[i], %q[i]\t\n"
// if size < 8
"cmp $8, %q[size]\t\n"
"jl 100f\t\n" // backporch
// if (dst + size*4 <= src || src + size*4 <= dst)
"leaq (%q[src], %q[size], 4), %q[tmp]\t\n"
"cmp %q[tmp], %q[dst]\t\n"
"jae 500f\t\n" // simd no overlap
"leaq (%q[dst], %q[size], 4), %q[tmp]\t\n"
"cmp %q[tmp], %q[src]\t\n"
"jae 500f\t\n" // simd no overlap
"jmp 100f\t\n" // Backporch
// Backporch (assumes we can write at least one int)
"100:\t\n"
// counter = (size - i)%4
"mov %q[size], %q[counter]\t\n"
"subq %q[i], %q[counter]\t\n"
// If we are a multiple of 4, unroll the loop
// We already know we are nonzero
"and $3, %q[counter]\t\n"
"jz 200f\t\n" // unrolled loop
"negq %q[counter]\t\n"
".balign 32\t\n"
"101:\t\n"
"mov (%q[src], %q[i], 4), %k[tmp]\t\n"
"mov %k[tmp], (%q[dst], %q[i], 4)\t\n"
"incq %q[i]\t\n"
"incq %q[counter]\t\n"
"jnz 101b\t\n"
// Do we still have anything left?
"cmpq %q[size], %q[i]\t\n"
"je 900f\t\n" // exit
// Then go to the unrolled loop
"jmp 200f\t\n"
// Unrolled loop (assumes multiple of 4, can do one iteration)
"200:\t\n"
"shr $2, %q[counter]\t\n"
"negq %q[counter]\t\n"
".balign 32\t\n"
"201:\t\n"
"mov (%q[src], %q[i], 4), %k[tmp]\t\n"
"mov %k[tmp], (%q[dst], %q[i], 4)\t\n"
"mov 4(%q[src], %q[i], 4), %k[tmp2]\t\n"
"mov %k[tmp2], 4(%q[dst], %q[i], 4)\t\n"
"mov 8(%q[src], %q[i], 4), %k[tmp2]\t\n"
"mov %k[tmp2], 8(%q[dst], %q[i], 4)\t\n"
"mov 12(%q[src], %q[i], 4), %k[tmp]\t\n"
"mov %k[tmp], 12(%q[dst], %q[i], 4)\t\n"
"addq $4, %q[i]\t\n"
"cmpq %q[size], %q[i]\t\n"
"jb 201b\t\n"
"jmp 900f\t\n" // exit
// simd no overlap
// counter = -(size/8);
"500:\t\n"
"mov %q[size], %q[counter]\t\n"
"shrq $3, %q[counter]\t\n"
"negq %q[counter]\t\n"
".balign 32\t\n"
"501:\t\n"
"movups (%q[src], %q[i], 4), %%xmm0\t\n"
"movups 16(%q[src], %q[i], 4), %%xmm1\t\n"
"movups %%xmm0, (%q[dst], %q[i], 4)\t\n"
"movups %%xmm1, 16(%q[dst], %q[i], 4)\t\n"
"addq $8, %q[i]\t\n"
"incq %q[counter]\t\n"
"jnz 501b\t\n"
// if (i == size)
"cmpq %q[size], %q[i]\t\n"
"je 900f\t\n" // end
"jmp 100b\t\n" // backporch
"900:\t\n"
: [i] "=&c" (i), [counter] "=&r" (counter), [tmp] "=&a" (tmp), [tmp2] "=&r" (tmp2)
: [dst] "D" (dst), [src] "S" (src), [size] "r" (size)
: "xmm0", "xmm1");
}
#endif
class ReduceLoopStrength {
static constexpr bool verbose = false;
struct AddrInfo {
Value* appendAddr(Procedure& proc, BasicBlock* block, Value* addr)
{
block->appendNew<Value>(proc, Identity, addr->origin(), addr);
return addr;
}
Value* insideLoopAddr;
Value* arrayBase;
Value* index;
};
public:
ReduceLoopStrength(Procedure& proc)
: m_proc(proc)
, m_insertionSet(proc)
, m_blockInsertionSet(proc)
, m_root(proc.at(0))
, m_phiChildren(proc)
{
}
#if CPU(X86_64)
void reduceByteCopyLoopsToMemcpy()
{
HashSet<Value*> patternMatchedValues;
Value* store = m_value;
if (store->opcode() != Store)
return;
patternMatchedValues.add(store);
Value* load = store->child(0);
uint32_t elemSize = sizeofType(load->type());
if (load->opcode() != Load || elemSize != 4)
return;
patternMatchedValues.add(load);
if (verbose)
dataLogLn("Found store/load:", *store);
auto extractArrayInfo = [&] (Value* value) -> Optional<AddrInfo> {
patternMatchedValues.add(value);
Optional<AddrInfo> addr = { AddrInfo() };
Value* sum = value;
if (value->opcode() == ZExt32)
sum = sum->child(0);
if (sum->opcode() != Add || sum->numChildren() != 2)
return { };
addr->insideLoopAddr = sum;
auto extractLoopIndexInSubtree = [&] (Value* value) -> Value* {
// Match arrBase[i*elemSize]
if (value->opcode() == Shl
&& value->child(0)->opcode() == Phi
&& value->child(1)->isInt(WTF::fastLog2(elemSize)))
return value->child(0);
if (value->opcode() == Shl
&& value->child(0)->opcode() == ZExt32
&& value->child(0)->child(0)->opcode() == Phi
&& value->child(1)->isInt(WTF::fastLog2(elemSize)))
return value->child(0)->child(0);
return nullptr;
};
// Try to find a known pattern applied to a single phi, which we can assume is our loop index.
Value* left = extractLoopIndexInSubtree(sum->child(0));
Value* right = extractLoopIndexInSubtree(sum->child(1));
if (left && !right) {
addr->index = left;
addr->arrayBase = sum->child(1);
} else if (right && !left) {
addr->index = right;
addr->arrayBase = sum->child(0);
} else
return { };
patternMatchedValues.add(addr->index);
patternMatchedValues.add(addr->arrayBase);
return addr;
};
auto destination = extractArrayInfo(store->child(1));
auto source = extractArrayInfo(load->child(0));
if (!source || !destination || source->index != destination->index)
return;
if (destination->arrayBase->type() != Int64 || source->arrayBase->type() != Int64)
return;
if (verbose)
dataLogLn("Found array info: ", *source->arrayBase, "[", *source->index, "] = ", *destination->arrayBase, "[", *destination->index, "]");
// Find the loop header, footer and inner blocks.
// Verify that there is one entrance to and one exit from the loop.
auto* loop = m_proc.naturalLoops().innerMostLoopOf(m_block);
if (!loop)
return;
BasicBlock* loopHead = loop->header();
BasicBlock* loopFoot = nullptr;
BasicBlock* loopPreheader = nullptr;
BasicBlock* loopPostfooter = nullptr;
SmallPtrSet<BasicBlock*> loopInnerBlocks;
{
for (unsigned i = 0; i < loop->size(); ++i)
loopInnerBlocks.add(loop->at(i));
if (!loopInnerBlocks.contains(load->owner))
return;
if (!loopInnerBlocks.contains(store->owner))
return;
SmallPtrSet<BasicBlock*> loopEntrances;
SmallPtrSet<BasicBlock*> loopExits;
for (auto* block : loopInnerBlocks) {
for (auto successor : block->successors()) {
if (!loopInnerBlocks.contains(successor.block()))
loopExits.add(successor.block());
}
for (auto* predecessor : block->predecessors()) {
if (!loopInnerBlocks.contains(predecessor))
loopEntrances.add(predecessor);
}
}
if (loopExits.size() != 1) {
if (verbose) {
dataLog("Loop has incorrect number of exits: ");
for (BasicBlock* block : loopExits)
dataLog(*block, " ");
dataLogLn();
}
return;
}
loopPostfooter = *loopExits.begin();
if (loopEntrances.size() != 1) {
if (verbose) {
dataLog("Loop has incorrect number of entrances: ");
for (BasicBlock* block : loopEntrances)
dataLog(*block, " ");
dataLogLn();
}
return;
}
loopPreheader = *loopEntrances.begin();
if (!loopHead->predecessors().contains(loopPreheader)) {
if (verbose)
dataLogLn("Loop head does not follow preheader");
return;
}
for (BasicBlock* predecessor : loopPostfooter->predecessors()) {
if (loopInnerBlocks.contains(predecessor)) {
// There is only one loop exit.
RELEASE_ASSERT(!loopFoot);
loopFoot = predecessor;
}
}
}
if (verbose) {
dataLog("Found loop head:", *loopHead, " foot:", *loopFoot, " prehead ", *loopPreheader, " postfoot ", *loopPostfooter, " inner blocks: ");
for (BasicBlock* block : loopInnerBlocks)
dataLog(*block, " ");
dataLogLn();
}
// Verify that the index is a monotonic inductive variable.
Value* startingIndex = nullptr;
{
if (m_phiChildren.at(source->index).size() != 2)
return;
Value* updateIndex = nullptr;
for (Value* up : m_phiChildren.at(source->index)) {
if (m_proc.dominators().dominates(up->owner, loopPreheader))
startingIndex = up;
else
updateIndex = up;
}
if (!updateIndex || !startingIndex || !loopInnerBlocks.contains(updateIndex->owner))
return;
patternMatchedValues.add(updateIndex);
patternMatchedValues.add(startingIndex);
updateIndex = updateIndex->child(0);
startingIndex = startingIndex->child(0);
if (updateIndex->opcode() != Add || !updateIndex->child(1)->isInt(1) || updateIndex->child(0) != source->index)
return;
if (!startingIndex->isInt(0))
return;
}
if (verbose)
dataLogLn("Found starting index:", *startingIndex);
// Identify the loop exit condition.
Value* exitCondition = nullptr;
// Is this an exit condition or a continuation condition?
bool conditionInverted = false;
// Do we check the index before or after using it?
bool checkBeforeWrite = false;
{
Value* branch = loopFoot->last();
if (branch->opcode() != Branch)
return;
patternMatchedValues.add(branch);
exitCondition = branch->child(0);
conditionInverted = loopPostfooter != loopFoot->successor(0).block();
checkBeforeWrite = m_proc.dominators().strictlyDominates(loopFoot, m_block);
}
if (verbose)
dataLogLn("Found exit condition: ", *exitCondition, " inverted: ", conditionInverted, " checks before the first write: ", checkBeforeWrite);
// Idenfity the loop bound by matching loop bound expressions.
Value* loopBound = nullptr;
{
auto extractLoopBound = [&] (Value* index, Value* bound) -> Value* {
// Match i+1 when we do a write first followed by the check for the next iteration
if (!checkBeforeWrite && index->opcode() == Add && index->child(0) == source->index && index->child(1)->isInt(1))
return bound;
return nullptr;
};
if (exitCondition->opcode() == GreaterThan && conditionInverted) {
// enter loop again if size > i
loopBound = extractLoopBound(exitCondition->child(1), exitCondition->child(0));
} else if (exitCondition->opcode() == LessThan && !conditionInverted) {
// enter loop again if size < i
loopBound = extractLoopBound(exitCondition->child(1), exitCondition->child(0));
}
if (!loopBound) {
if (verbose)
dataLogLn("Unable to extract bound from exit condition ", *exitCondition);
return;
}
}
// Make sure there are no other effectful things happening.
// This also guarantees that we found the only branch in the loop. This means that our
// load/store must happen iff our index update and branch do.
for (BasicBlock* block : loopInnerBlocks) {
for (unsigned index = 0; index < block->size(); ++index) {
Value* value = block->at(index);
if (patternMatchedValues.contains(value) || value->opcode() == Jump)
continue;
Effects effects = value->effects();
effects.readsLocalState = false;
if (effects != Effects::none()) {
if (verbose)
dataLogLn("Not byte copy loop, node ", *value, " has effects");
return;
}
}
}
if (!hoistValue(loopPreheader, source->arrayBase, false)
|| !hoistValue(loopPreheader, destination->arrayBase, false)
|| !hoistValue(loopPreheader, loopBound, false))
return;
if (verbose)
dataLogLn("Found byte copy loop: ", *source->arrayBase, "[", *source->index, "] = ", *destination->arrayBase, "[", *destination->index, "] from index ", *startingIndex, " to max index ", *loopBound, " stride: ", elemSize);
bool hoistResult = hoistValue(loopPreheader, source->arrayBase);
hoistResult &= hoistValue(loopPreheader, destination->arrayBase);
hoistResult &= hoistValue(loopPreheader, loopBound);
ASSERT_UNUSED(hoistResult, hoistResult);
auto origin = loopPostfooter->at(0)->origin();
BasicBlock* memcpy = m_blockInsertionSet.insertBefore(loopPostfooter, loopPostfooter->frequency());
memcpy->setSuccessors(FrequentedBlock(loopPostfooter));
memcpy->addPredecessor(loopPreheader);
for (BasicBlock* pred : loopPostfooter->predecessors())
loopPostfooter->removePredecessor(pred);
loopPostfooter->addPredecessor(memcpy);
loopPreheader->setSuccessors(memcpy);
Effects effects = Effects::forCall();
memcpy->appendNew<CCallValue>(m_proc, B3::Void, origin, effects,
memcpy->appendNew<ConstPtrValue>(m_proc, origin, tagCFunctionPtr<void*>(fastForwardCopy32, B3CCallPtrTag)),
destination->appendAddr(m_proc, memcpy, destination->arrayBase),
source->appendAddr(m_proc, memcpy, source->arrayBase),
loopBound);
memcpy->appendNew<Value>(m_proc, Jump, origin);
}
bool hoistValue(BasicBlock* into, Value* value, bool canMutate = true)
{
if (m_proc.dominators().dominates(value->owner, into))
return true;
Effects effects = value->effects();
if (effects != Effects::none()) {
if (verbose)
dataLogLn("Cannot hoist ", *value, " into ", *into, " since it has effects");
return false;
}
for (Value* child : value->children()) {
if (!hoistValue(into, child, false))
return false;
}
if (canMutate) {
for (Value* child : value->children())
hoistValue(into, child);
value->owner->at(value->index()) = m_proc.add<Value>(Nop, Void, value->origin());
into->appendNonTerminal(value);
}
return true;
}
bool run()
{
if (verbose)
dataLogLn("ReduceLoopStrength examining proc: ", m_proc);
bool result = false;
do {
m_changed = false;
for (BasicBlock* block : m_proc.blocksInPreOrder()) {
m_block = block;
for (m_index = 0; m_index < block->size(); ++m_index) {
m_value = m_block->at(m_index);
m_value->performSubstitution();
reduceByteCopyLoopsToMemcpy();
m_insertionSet.execute(m_block);
m_changed |= m_blockInsertionSet.execute();
if (m_changed) {
m_proc.resetReachability();
m_proc.invalidateCFG();
m_proc.resetValueOwners();
result = true;
break;
}
}
if (m_changed)
break;
}
} while (m_changed && m_proc.optLevel() >= 2);
if (result && verbose)
dataLogLn("ReduceLoopStrength produced proc: ", m_proc);
return result;
}
#else
bool run()
{
return false;
}
#endif
Procedure& m_proc;
InsertionSet m_insertionSet;
BlockInsertionSet m_blockInsertionSet;
BasicBlock* m_root { nullptr };
BasicBlock* m_block { nullptr };
unsigned m_index { 0 };
Value* m_value { nullptr };
bool m_changed { false };
PhiChildren m_phiChildren;
};
bool reduceLoopStrength(Procedure& proc)
{
PhaseScope phaseScope(proc, "reduceLoopStrength");
return ReduceLoopStrength(proc).run();
}
} } // namespace JSC::B3
#endif // ENABLE(B3_JIT)