/*
Copyright (c) 2017, NVIDIA Corporation
All rights reserved.
Redistribution and use in source and binary forms, with or without modification,
are permitted provided that the following conditions are met:
1. Redistributions of source code must retain the above copyright notice, this
list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright notice,
this list of conditions and the following disclaimer in the documentation
and/or other materials provided with the distribution.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifdef __semaphore_cuda
namespace cuda
{
#else
#include "semaphore"
namespace std
{
#endif //__semaphore_cuda
namespace experimental
{
inline namespace v1 {
namespace details
{
#ifdef __semaphore_cuda
#else
#ifdef __linux__
// On Linux, we make use of the kernel memory wait operations (the futex syscall), which have been available since the 2.6 kernel series.
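// The futex interface operates on a naturally aligned 32-bit word, so the address of an
// atomic narrower than int is rounded down to its enclosing int-aligned word, and that
// whole word is snapshotted for use as the futex's expected value.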
template <class A>
inline const void *__semaphore_fixalign(A &a)
{
    static_assert(sizeof(A) <= sizeof(int), "Linux only supports 'int' for Futex.");
    return (const void *)((intptr_t)&a & ~(sizeof(int) - 1));
}
inline int __semaphore_readint(const void *p)
{
    int i;
    memcpy(&i, p, sizeof(int));
    return i;
}
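// Wait protocol: snapshot the aligned futex word, issue a compiler barrier, re-check the
// atomic, then call FUTEX_WAIT with the snapshot as the expected value; the kernel blocks
// only while the word still holds that value, so a concurrent change cannot be missed.
// Spurious returns are possible and callers re-check their predicate. The non-volatile
// overloads use the _PRIVATE (process-local) futex flavor; the volatile overloads below
// use the plain flavor, presumably to permit process-shared placement.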
template <class A, class V>
inline void __semaphore_wait(A &a, V v)
{
    auto p = __semaphore_fixalign(a);
    auto i = __semaphore_readint(p);
    asm volatile("" ::: "memory"); // compiler barrier between the raw read and the atomic re-check
    if (a.load(memory_order_relaxed) != v)
        return;
    syscall(SYS_futex, p, FUTEX_WAIT_PRIVATE, i, 0, 0, 0);
}
template <class A, class V, class Rep, class Period>
void __semaphore_wait_timed(A &a, V v, const chrono::duration<Rep, Period> &t)
{
    auto p = __semaphore_fixalign(a);
    auto i = __semaphore_readint(p);
    asm volatile("" ::: "memory");
    if (a.load(memory_order_relaxed) != v)
        return;
    syscall(SYS_futex, p, FUTEX_WAIT_PRIVATE, i, __semaphore_to_timespec(t), 0, 0);
}
template <class A>
inline void __semaphore_wake_one(A &a)
{
    syscall(SYS_futex, __semaphore_fixalign(a), FUTEX_WAKE_PRIVATE, 1, 0, 0, 0);
}
template <class A>
inline void __semaphore_wake_all(A &a)
{
    syscall(SYS_futex, __semaphore_fixalign(a), FUTEX_WAKE_PRIVATE, INT_MAX, 0, 0, 0);
}
template <class A, class V>
inline void __semaphore_wait(volatile A &a, V v)
{
    auto p = __semaphore_fixalign(a);
    auto i = __semaphore_readint(p);
    asm volatile("" ::: "memory");
    if (a.load(memory_order_relaxed) != v)
        return;
    syscall(SYS_futex, p, FUTEX_WAIT, i, 0, 0, 0);
}
template <class A, class V, class Rep, class Period>
void __semaphore_wait_timed(volatile A &a, V v, const chrono::duration<Rep, Period> &t)
{
    auto p = __semaphore_fixalign(a);
    auto i = __semaphore_readint(p);
    asm volatile("" ::: "memory");
    if (a.load(memory_order_relaxed) != v)
        return;
    syscall(SYS_futex, p, FUTEX_WAIT, i, details::__semaphore_to_timespec(t), 0, 0);
}
template <class A>
inline void __semaphore_wake_one(volatile A &a)
{
    syscall(SYS_futex, __semaphore_fixalign(a), FUTEX_WAKE, 1, 0, 0, 0);
}
template <class A>
inline void __semaphore_wake_all(volatile A &a)
{
    syscall(SYS_futex, __semaphore_fixalign(a), FUTEX_WAKE, INT_MAX, 0, 0, 0);
}
#endif // __linux__
#if defined(WIN32) && _WIN32_WINNT >= 0x0602
// On Windows, we make use of the kernel memory wait operations as well. These first became available with Windows 8.
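// WaitOnAddress blocks while the value at the target address equals the caller-supplied
// compare value; WakeByAddressSingle/WakeByAddressAll wake waiters on that address.
// Spurious returns are possible here as well, so callers re-check their predicate.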
template <class A, class V>
void __semaphore_wait(A &a, V v)
{
    static_assert(sizeof(V) <= 8, "Windows only allows sizes between 1B and 8B for WaitOnAddress.");
    WaitOnAddress((PVOID)&a, (PVOID)&v, sizeof(v), -1);
}
template <class A, class V, class Rep, class Period>
void __semaphore_wait_timed(A &a, V v, chrono::duration<Rep, Period> const &delta)
{
    static_assert(sizeof(V) <= 8, "Windows only allows sizes between 1B and 8B for WaitOnAddress.");
    WaitOnAddress((PVOID)&a, (PVOID)&v, sizeof(v), (DWORD)chrono::duration_cast<chrono::milliseconds>(delta).count());
}
template <class A>
inline void __semaphore_wake_one(A &a)
{
    WakeByAddressSingle((PVOID)&a);
}
template <class A>
inline void __semaphore_wake_all(A &a)
{
    WakeByAddressAll((PVOID)&a);
}
#endif // defined(WIN32) && _WIN32_WINNT >= 0x0602
#endif // __semaphore_cuda
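// Shared slow path for binary_semaphore acquire. Each waiter takes a ticket and sleeps in
// proportion to its distance from the ticket currently being served (tick - tock), which
// keeps wakeups roughly FIFO. Under __semaphore_fast_path, once about 64K ns of backoff
// have accumulated, the waiter publishes binary_semaphore::__slowbit and may defer to fn,
// which blocks on the platform wait primitive (or enforces a deadline in the timed
// variant) and returns false to abandon the acquire. Acquisition itself is the CAS that
// sets __valubit and retires this waiter's __contbit if one was registered. The unnamed
// bool parameter receives the __stolen flag at the call sites but is not used here.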
template<class Fn>
__semaphore_abi bool __binary_semaphore_acquire_slow(
    atomic<binary_semaphore::count_type>& atom, atomic<binary_semaphore::count_type>& ticket,
    atomic<binary_semaphore::count_type>& tocket, bool const&, Fn fn) noexcept
{
    uint32_t const tick = ticket.fetch_add(1, std::memory_order_relaxed);
    uint32_t tock = tocket.load(std::memory_order_relaxed);
    uint32_t contbit = 0u;
#ifdef __semaphore_fast_path
    uint32_t sum = 0u;
    while (1) {
        if (sum < 64 * 1024) {
#else
    while (1) {
#endif
            uint32_t const delta = (tick - tock) * 128;
#if !defined(__CUDA_ARCH__)
            std::this_thread::sleep_for(std::chrono::nanoseconds(delta));
#elif defined(__has_cuda_nanosleep)
            details::__mme_nanosleep(delta);
#endif
#ifdef __semaphore_fast_path
            sum += delta;
        }
        else
        {
            uint32_t old = atom.fetch_or(binary_semaphore::__slowbit, std::memory_order_relaxed) | binary_semaphore::__slowbit;
            if ((old & binary_semaphore::__valubit) != 0) {
                atomic_thread_fence(std::memory_order_seq_cst);
                if (!fn(old))
                    return false;
            }
        }
        uint32_t old = atom.load(std::memory_order_relaxed);
#else
        uint32_t old = atom.load(std::memory_order_relaxed);
        if (!fn(old))
            return false;
#endif
        tock = tocket.load(std::memory_order_relaxed);
        if (tock != tick)
            continue;
        while ((old & binary_semaphore::__valubit) == 0) {
            old &= ~binary_semaphore::__lockbit;
            uint32_t next = old - contbit + binary_semaphore::__valubit;
            if (atom.compare_exchange_weak(old, next, std::memory_order_acquire, std::memory_order_relaxed))
                return true;
        }
        if (contbit == 0)
            atom.fetch_add(contbit = binary_semaphore::__contbit, std::memory_order_relaxed);
    }
}
} // namespace details
#ifdef __semaphore_fast_path
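// Slow release (__semaphore_fast_path builds only): one CAS clears __valubit and
// __slowbit and, if __slowbit was set (some waiter may be asleep), also takes __lockbit
// as a transient wake lock; the sleepers are then woken and the lock bit is released.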
__semaphore_abi void binary_semaphore::__release_slow(count_type old) noexcept
{
    count_type lock = 0;
    do {
        old &= ~__lockbit;
        lock = (old & __slowbit) ? __lockbit : 0;
    } while (!__atom.compare_exchange_weak(old, (old | lock) & ~(__valubit | __slowbit), std::memory_order_release, std::memory_order_relaxed));
    if (lock != 0)
    {
        atomic_thread_fence(std::memory_order_seq_cst);
        details::__semaphore_wake_all(__atom);
        __atom.fetch_and(~__lockbit, std::memory_order_release);
    }
}
#endif
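// Untimed acquire: the callback parks on __atom via the platform wait primitive under
// __semaphore_fast_path (and is a no-op otherwise), always telling the shared slow path
// to keep trying.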
__semaphore_abi void binary_semaphore::__acquire_slow() noexcept
{
    auto const fn = [=] __semaphore_abi (uint32_t old) -> bool {
#ifdef __semaphore_fast_path
        details::__semaphore_wait(__atom, old);
#else
        (void)old;
#endif
        return true;
    };
    details::__binary_semaphore_acquire_slow(__atom, __ticket, __tocket, __stolen, fn);
}
#ifndef __semaphore_cuda
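// Timed acquire (not compiled when __semaphore_cuda is defined): the callback waits at
// most until abs_time and reports whether the deadline has passed, so the shared slow
// path gives up once the clock runs out.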
__semaphore_abi bool binary_semaphore::__acquire_slow_timed(std::chrono::time_point<details::__semaphore_clock, details::__semaphore_duration> const& abs_time) noexcept
{
    auto const fn = [=](uint32_t old) __semaphore_abi -> bool {
#ifdef __semaphore_fast_path
        auto rel_time = abs_time - details::__semaphore_clock::now();
        if (rel_time > std::chrono::microseconds(0))
            details::__semaphore_wait_timed(__atom, old, rel_time);
#else
        (void)old;
#endif
        return details::__semaphore_clock::now() < abs_time;
    };
    return details::__binary_semaphore_acquire_slow(__atom, __ticket, __tocket, __stolen, fn);
}
#endif
#ifndef __semaphore_sem
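// Tries to take one unit from the packed count: a CAS loop that keeps retrying while the
// count field (the bits above __shift) is at least 1, ignoring transient __lockmask bits.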
__semaphore_abi bool counting_semaphore::__fetch_sub_if_slow(counting_semaphore::count_type old, std::memory_order order) noexcept
{
    do
    {
        old &= ~__lockmask;
        if (atom.compare_exchange_weak(old, old - (1 << __shift), order, std::memory_order_relaxed))
            return true;
    } while ((old >> __shift) >= 1);
    return false;
}
#ifdef __semaphore_fast_path
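// Slow release for the counting semaphore (__semaphore_fast_path builds only): adds term
// to the packed count and, when waiters are recorded in __contmask and a notification was
// requested, holds __lockmask across the wake (one waiter or all) before clearing it.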
void counting_semaphore::__fetch_add_slow(counting_semaphore::count_type term, counting_semaphore::count_type old, std::memory_order order, semaphore_notify notify) noexcept
{
    while (1)
    {
        bool const apply_lock = ((old & __contmask) != 0) && (notify != semaphore_notify::none);
        int const set = ((old & __valumask) + (term << __shift)) | (apply_lock ? __lockmask : 0);
        old &= ~__lockmask;
        if (atom.compare_exchange_weak(old, set, order, std::memory_order_relaxed))
        {
            if (apply_lock)
            {
                switch (notify)
                {
                case semaphore_notify::all:
                    details::__semaphore_wake_all(atom);
                    break;
                case semaphore_notify::one:
                    details::__semaphore_wake_one(atom);
                    break;
                case semaphore_notify::none:
                    break;
                }
                atom.fetch_and(~__lockmask, std::memory_order_relaxed);
            }
            break;
        }
    }
}
#endif
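// Acquire slow path: backoff-and-poll the count; under __semaphore_fast_path the polling
// is capped at two rounds, after which the waiter advertises itself via __contmask and
// blocks until the count becomes positive. On the way out it spins until any transient
// __lockmask left by a releaser has cleared.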
__semaphore_abi void counting_semaphore::__acquire_slow(std::memory_order order) noexcept
{
    int old;
    details::__semaphore_exponential_backoff b;
#ifdef __semaphore_fast_path
    for (int i = 0; i < 2; ++i)
    {
#else
    while (1)
    {
#endif
        b.sleep();
        old = atom.load(order);
        if ((old >> __shift) >= 1)
            goto done;
    }
#ifdef __semaphore_fast_path
    while (1)
    {
        old = atom.fetch_or(__contmask, std::memory_order_relaxed) | __contmask;
        if ((old >> __shift) >= 1)
            goto done;
        details::__semaphore_wait(atom, old);
        old = atom.load(order);
        if ((old >> __shift) >= 1)
            goto done;
    }
#endif
done:
#ifdef __semaphore_fast_path
    while (old & __lockmask)
        old = atom.load(std::memory_order_relaxed);
#else
    ;
#endif
}
#endif
#ifndef __semaphore_cuda
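// Backing store for atomic waits: a fixed, 64-byte-aligned table of 1024 synchronic
// slots. An address is mapped to a slot by dividing out the entry size and masking with
// the table size, so distinct addresses may share a slot.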
static constexpr int __atomic_wait_table_entry_size = sizeof(synchronic) > alignof(synchronic) ? sizeof(synchronic) : alignof(synchronic);
static constexpr int __atomic_wait_table_entry_count = 1024;
__semaphore_managed alignas(64) unsigned char __atomic_wait_table[__atomic_wait_table_entry_count][__atomic_wait_table_entry_size] = { 0 };
__semaphore_abi size_t __atomic_wait_table_index(void const* ptr) {
    return ((uintptr_t)ptr / __atomic_wait_table_entry_size) & (__atomic_wait_table_entry_count - 1);
}
__semaphore_abi synchronic *__atomic_wait_get_semaphore(void const *a)
{
    return (synchronic *)&__atomic_wait_table[__atomic_wait_table_index(a)][0];
}
#endif
} // namespace v1
} // namespace experimental
} // namespace std (namespace cuda when __semaphore_cuda is defined)